In [1]:
import json
import operator

# Define Pooling Depth for Each Question

In [2]:
# Maximum number of unique doc ids pooled per question; passed to
# top_pool() as its `pooling_level` argument by the pooling cells below.
pool_depth = 100

# Define Number of Questions to be Evaluated

In [3]:
# Number of questions to be evaluated, keyed by question type.
# NOTE(review): not referenced by any other cell visible in this notebook.
question_depth = {
    'Cluster Identification': 7,
    'Cluster Facet': 7,
    'Cluster Aggregate': 15,
}

In [4]:
# Relative locations of the inputs read by the cells below.
submission_path = '../../team_submissions/'  # prefix for every team submission file
questions_path = '../../../questions/'  # prefix for the question (seed) files
previous_data_file = 'cluster_annotation_data.json'  # earlier hand-off data, loaded into old_clusters

In [5]:
def extract_seed(question):
    """Return the seed value embedded in a question's SPARQL query.

    The seed is taken positionally: the first SPARQL string is split on
    newlines, the fifth line is split on single spaces, and the second
    token of that line is returned with surrounding single quotes removed.

    Args:
        question: mapping whose 'SPARQL' entry is a sequence of query strings.

    Returns:
        The seed as a ``str``.

    Raises:
        IndexError: if the query has fewer than five lines or the fifth
            line has fewer than two space-separated tokens.
    """
    query_text = question['SPARQL'][0]
    fifth_line = query_text.split('\n')[4]
    quoted_seed = fifth_line.split(' ')[1]
    return str(quoted_seed).strip("'")

In [6]:
def top_pool(pooling_level, id_pos, score_pos, ans_length, id_length, responses,
             max_score=100):
    """Collect the first ``pooling_level`` unique doc ids from ranked answers.

    Elements of ``responses`` that are not lists of exactly ``ans_length``
    items are skipped (they are assumed to be aggregate answers). Scanning
    stops early, keeping whatever was collected so far, when

    * a doc id whose length differs from ``id_length`` is found, or
    * a score is higher than the score of the previous kept answer
      (i.e. the answers are not in non-increasing score order).

    Both conditions print a diagnostic message before stopping.

    Args:
        pooling_level: number of unique doc ids to collect.
        id_pos: index of the doc id inside each answer.
        score_pos: index of the score inside each answer.
        ans_length: expected length of a per-document answer.
        id_length: expected length of a doc id.
        responses: iterable of answers, expected in rank order.
        max_score: ceiling the first score is compared against
            (defaults to 100, the value previously hard-coded).

    Returns:
        List of unique doc ids in rank order; at most ``pooling_level``
        entries when ``pooling_level`` is positive.
    """
    previous_score = max_score
    pooled = []            # unique doc ids, in rank order
    pooled_lookup = set()  # same ids, for O(1) membership tests
    for answer in responses:
        # Non-list elements, or lists of the wrong length, are assumed to
        # be aggregate answers and are ignored.
        if not isinstance(answer, list) or len(answer) != ans_length:
            continue
        doc_id = answer[id_pos]
        # Confirming doc id is where we expect
        if len(doc_id) != id_length:
            print("NONSTANDARD DOC ID DETECTED")
            print(answer)
            break
        # Explicitly check for score ordering
        if answer[score_pos] > previous_score:
            # Double parentheses keep the tuple output identical on
            # Python 2 and Python 3.
            print((previous_score, answer[score_pos]))
            print("RANK ORDER ISSUE")
            break
        previous_score = answer[score_pos]
        if doc_id not in pooled_lookup:
            # Found another unique doc id
            pooled_lookup.add(doc_id)
            pooled.append(doc_id)
            if len(pooled) == pooling_level:
                # Found top N docs
                break
    # Fewer than N unique ids is still a valid result
    return pooled

# Determine Seed for Every Question

In [7]:
# Maps question type -> {question id: seed}; filled in by the cells below.
seeds = {}

### Cluster Identification

In [8]:
# Record the seed of every Cluster Identification question, keyed by
# question id. The questions file holds one JSON object per line.
seeds['Cluster Identification'] = {}

file_path = questions_path + 'post_cluster_identification.json'
# `with` guarantees the file is closed even if a line fails to parse.
with open(file_path, 'r') as f:
    for line in f:
        temp = json.loads(line)
        seed = extract_seed(temp)
        seeds['Cluster Identification'][temp['id']] = seed

### Cluster Facet

In [9]:
# Record the seed of every Cluster Facet question, keyed by question id.
# The questions file holds one JSON object per line.
seeds['Cluster Facet'] = {}

file_path = questions_path + 'post_cluster_facet.json'
# `with` guarantees the file is closed even if a line fails to parse.
with open(file_path, 'r') as f:
    for line in f:
        temp = json.loads(line)
        seed = extract_seed(temp)
        seeds['Cluster Facet'][temp['id']] = seed

### Cluster Aggregate

In [10]:
# Record the seed of every Cluster Aggregate question, keyed by question id.
# The questions file holds one JSON object per line.
seeds['Cluster Aggregate'] = {}

file_path = questions_path + 'post_aggregate_V2.json'
# `with` guarantees the file is closed even if a line fails to parse.
with open(file_path, 'r') as f:
    for line in f:
        temp = json.loads(line)
        seed = extract_seed(temp)
        seeds['Cluster Aggregate'][temp['id']] = seed

In [39]:
# Load the previously generated annotation data into `old_clusters`.
# SECURITY / NOTE(review): eval() executes whatever the file contains. The
# file name ends in .json and the hand-off cell below writes its output with
# json.dump, so json.load(f) is presumably a safe replacement -- confirm the
# file format before switching.
with open(previous_data_file, 'r') as f:
    old_clusters = eval(f.read())

In [15]:
# Pooled answers, with one (initially empty) sub-dict for each of the
# 'HG' and 'JPL' keys; the cells below add teams and question types.
new_answers = {'HG': {}, 'JPL': {}}

# ISI

## HG

### ISI HG-Cluster Identification

In [16]:
# Container for ISI's pooled answers under 'HG', keyed by question type.
new_answers['HG']['ISI'] = {}

In [23]:
# Pool the top `pool_depth` unique doc ids per question from ISI's HG
# Cluster Identification submission.
clus_type = 'Cluster Identification'
new_answers['HG']['ISI'][clus_type] = {}

file_path = submission_path + ('ISI/DomainDiscovery/'
                               'hg_all_asnwers/'
                               'properly_formatted_submissions/'
                               'formatted_post_cluster_identification-parsed_fixed_all_answers.json')
# SECURITY: eval() executes whatever the submission file contains; only run
# this on trusted files.
with open(file_path, 'r') as f:
    data = eval(f.read())

# Layout of one answer, as passed to top_pool()
id_pos = 1
score_pos = 2
ans_length = 3
id_length = 64

for entry in data:
    # The question id is the part of 'question_id' before the first '-'
    qid = entry['question_id'].split('-')[0]
    top_ids = top_pool(pool_depth, id_pos, score_pos,
                       ans_length, id_length, entry['answer'])
    # Reconfirming no dupes
    if len(top_ids) != len(set(top_ids)):
        print("PROBLEM WITH DUPLICATED DOC IDS")
        break
    new_answers['HG']['ISI'][clus_type][qid] = top_ids

### ISI HG-Cluster Facet

In [25]:
# Pool the top `pool_depth` unique doc ids per question from ISI's HG
# Cluster Facet submission.
clus_type = 'Cluster Facet'
new_answers['HG']['ISI'][clus_type] = {}

file_path = submission_path + ('ISI/DomainDiscovery/'
                               'hg_all_asnwers/'
                               'properly_formatted_submissions/'
                               'formatted_post_cluster_facet_parsed_fixed_all_answers.json')
# SECURITY: eval() executes whatever the submission file contains; only run
# this on trusted files.
with open(file_path, 'r') as f:
    data = eval(f.read())

# Layout of one answer, as passed to top_pool()
id_pos = 1
score_pos = 2
ans_length = 3
id_length = 64

for entry in data:
    # The question id is the part of 'question_id' before the first '-'
    qid = entry['question_id'].split('-')[0]
    top_ids = top_pool(pool_depth, id_pos, score_pos,
                       ans_length, id_length, entry['answer'])
    # Reconfirming no dupes
    if len(top_ids) != len(set(top_ids)):
        print("PROBLEM WITH DUPLICATED DOC IDS")
        break
    new_answers['HG']['ISI'][clus_type][qid] = top_ids

### ISI HG-Cluster Aggregate

In [26]:
# Pool the top `pool_depth` unique doc ids per question from ISI's HG
# Cluster Aggregate submission.
clus_type = 'Cluster Aggregate'
new_answers['HG']['ISI'][clus_type] = {}

file_path = submission_path + ('ISI/DomainDiscovery/'
                               'hg_all_asnwers/'
                               'properly_formatted_submissions/'
                               'formatted_post_aggregate_parsed_fixed_all_answers.json')
# SECURITY: eval() executes whatever the submission file contains; only run
# this on trusted files.
with open(file_path, 'r') as f:
    data = eval(f.read())

# Layout of one answer, as passed to top_pool()
id_pos = 1
score_pos = 2
ans_length = 3
id_length = 64

for entry in data:
    # The question id is the part of 'question_id' before the first '-'
    qid = entry['question_id'].split('-')[0]
    top_ids = top_pool(pool_depth, id_pos, score_pos,
                       ans_length, id_length, entry['answer'])
    # Reconfirming no dupes
    if len(top_ids) != len(set(top_ids)):
        print("PROBLEM WITH DUPLICATED DOC IDS")
        break
    new_answers['HG']['ISI'][clus_type][qid] = top_ids

## JPL

### ISI JPL-Cluster Identification

In [29]:
# Container for ISI's pooled answers under 'JPL', keyed by question type.
new_answers['JPL']['ISI'] = {}

In [30]:
# Pool the top `pool_depth` unique doc ids per question from ISI's JPL
# Cluster Identification submission.
clus_type = 'Cluster Identification'
new_answers['JPL']['ISI'][clus_type] = {}

file_path = submission_path + ('ISI/DomainDiscovery/'
                               'jpl_answers_isi/'
                               'properly_formatted_submissions/'
                               'formatted_cluster-identification-queries-parsed_all_answers.json')
# SECURITY: eval() executes whatever the submission file contains; only run
# this on trusted files.
with open(file_path, 'r') as f:
    data = eval(f.read())

# Layout of one answer, as passed to top_pool()
id_pos = 1
score_pos = 2
ans_length = 3
id_length = 64

for entry in data:
    # The question id is the part of 'question_id' before the first '-'
    qid = entry['question_id'].split('-')[0]
    top_ids = top_pool(pool_depth, id_pos, score_pos,
                       ans_length, id_length, entry['answer'])
    # Reconfirming no dupes
    if len(top_ids) != len(set(top_ids)):
        print("PROBLEM WITH DUPLICATED DOC IDS")
        break
    new_answers['JPL']['ISI'][clus_type][qid] = top_ids

### ISI JPL-Cluster Facet

In [31]:
# Pool the top `pool_depth` unique doc ids per question from ISI's JPL
# Cluster Facet submission.
clus_type = 'Cluster Facet'
new_answers['JPL']['ISI'][clus_type] = {}

file_path = submission_path + ('ISI/DomainDiscovery/'
                               'jpl_answers_isi/'
                               'properly_formatted_submissions/'
                               'formatted_cluster-facet-queries-parsed_all_answers.json')
# SECURITY: eval() executes whatever the submission file contains; only run
# this on trusted files.
with open(file_path, 'r') as f:
    data = eval(f.read())

# Layout of one answer, as passed to top_pool()
id_pos = 1
score_pos = 2
ans_length = 3
id_length = 64

for entry in data:
    # The question id is the part of 'question_id' before the first '-'
    qid = entry['question_id'].split('-')[0]
    top_ids = top_pool(pool_depth, id_pos, score_pos,
                       ans_length, id_length, entry['answer'])
    # Reconfirming no dupes
    if len(top_ids) != len(set(top_ids)):
        print("PROBLEM WITH DUPLICATED DOC IDS")
        break
    new_answers['JPL']['ISI'][clus_type][qid] = top_ids

### ISI JPL-Cluster Aggregate

In [32]:
# Pool the top `pool_depth` unique doc ids per question from ISI's JPL
# Cluster Aggregate submission.
clus_type = 'Cluster Aggregate'
new_answers['JPL']['ISI'][clus_type] = {}

file_path = submission_path + ('ISI/DomainDiscovery/'
                               'jpl_answers_isi/'
                               'properly_formatted_submissions/'
                               'formatted_aggregate-queries-parsed_all_answers.json')
# SECURITY: eval() executes whatever the submission file contains; only run
# this on trusted files.
with open(file_path, 'r') as f:
    data = eval(f.read())

# Layout of one answer, as passed to top_pool()
id_pos = 1
score_pos = 2
ans_length = 3
id_length = 64

for entry in data:
    # The question id is the part of 'question_id' before the first '-'
    qid = entry['question_id'].split('-')[0]
    top_ids = top_pool(pool_depth, id_pos, score_pos,
                       ans_length, id_length, entry['answer'])
    # Reconfirming no dupes
    if len(top_ids) != len(set(top_ids)):
        print("PROBLEM WITH DUPLICATED DOC IDS")
        break
    new_answers['JPL']['ISI'][clus_type][qid] = top_ids

In [40]:
# Display the previously saved entries stored under 'HG'.
# NOTE(review): this shows the entire list; consider slicing before sharing.
old_clusters['HG']

[['7026023157',
  '375F2DCC59394A63BC9A78EBF3507C0D850D359C5275F2F16FE239874E0FA455'],
 ['2125675378',
  '51232D9F1629FA958813280CAAECCDFB99CD5ED237658323CA20D6FA67F05E2F'],
 ['6477878168',
  '4153D1943066E15F35257A52496B5D69BD6F45F762F394300CF5D488E49667BC'],
 ['7202565497',
  '0DF0F01F52FA85DC9E68F9609EE7498FA33477280174AAB76EA895BF593CB210'],
 ['7202565497',
  '96AE3433A4B9868F30BD6CBFB1D164FD33A6FC3023F7A90B6E3133393C52E63F'],
 ['4164557000',
  'AD78C01C0B509C03CA4A3E139BED5D6B6A845D2C69C99C46A996F9C9E06A4753'],
 ['6126696637',
  '66162ABD2C8F95A13119FDD4B9B15D7E0B7F7443BF990767211CCBE06BCF8DB0'],
 ['6477932052',
  '1C3A9DC30DB38636FBB5F8CE43061FFD8AFD96EF888E8283A9C29A546C46108F'],
 ['aylinakkayam@gmail.com',
  '7943FA0AE5A0D9DA7A4E3AA0A88E74DD3810FBAF57EAC93378364E358F45BA67'],
 ['6477878168',
  '908BA47F5B8D37F097456CD3DCD714F9D82E9130839489085568F67B7EE4CCDD'],
 ['7202565497',
  '958EBDD51D3AC29E666E8F99A8B0F5E73C68D9CC1917CA3CEA00CDF7CD0CE520'],
 ['4164557000',
  'BF997B0C9E17

# Prepare Data for Handoff to Uncharted

In [None]:
# Build [seed, doc_id] pairs for every pooled answer to a chosen question,
# then de-duplicate them per index.
# NOTE(review): `answers` and `chosen` are not defined in any cell visible in
# this notebook (the pooling cells above fill `new_answers`); they must
# already exist in the kernel -- confirm where they come from.
temp_clusters = {}
for index, teams in answers.items():
    temp_clusters[index] = []
    for team, question_types in teams.items():
        for qtype, pooled_by_qid in question_types.items():
            for qid, doc_ids in pooled_by_qid.items():
                if qid not in chosen[index][qtype]:
                    continue
                # Check seed: the chosen question's seed should agree with
                # the one parsed from the questions file.
                seed = chosen[index][qtype][qid]['seed']
                seed2 = seeds[qtype][qid]
                if seed != seed2:
                    print("SEED TROUBLE")
                for doc_id in doc_ids:
                    temp_clusters[index].append([seed, doc_id])

# Re-re-forming uniq IDs
uniq_clusters = {}
for index, pairs in temp_clusters.items():
    # Pair counts before and after de-duplication
    print(len(pairs))
    uniq_clusters[index] = list(set(tuple(pair) for pair in pairs))
    print(len(uniq_clusters[index]))
    print(" ")

In [None]:
output_file = 'new_cluster_annotation_data.json'
# The file will be renamed manually to cluster_annotation_data.json;
# writing under a different name avoids overwriting the existing data.
with open(output_file, 'w') as f:
    json.dump(uniq_clusters, f, indent=2)

# Sanity Check on Chosen Doc IDs 

## NYU Cluster Identification

In [None]:
# Sanity check: every doc id among each team's first `pool_depth` unique
# answers to the chosen NYU 'Cluster Identification' questions should be one
# of the hand-off candidates in `uniq_clusters`.
# NOTE(review): `chosen` is not defined in any cell visible in this notebook;
# it and `uniq_clusters` must already exist in the kernel.
try:
    del data
except NameError:
    # `data` was never loaded in this session; nothing to release.
    pass
missing = []

index = 'NYU'
qtype = 'Cluster Identification'

# DATA
# SECURITY: eval() executes whatever the submission files contain; only run
# this on trusted files.
file_path = submission_path + 'Georgetown/DomainDiscovery/NYU_CI.json'
with open(file_path, 'r') as f:
    georgetown_data = eval(f.read())

file_path = submission_path + ('ISI/DomainDiscovery/'
                               'isi-nyu-answers-dig-extractions/'
                               'properly_formatted_submissions/'
                               'formatted_post_cluster_identification'
                               '-parsed_fixed_all_answers.json')
with open(file_path, 'r') as f:
    isi_data = eval(f.read())

file_path = submission_path + ('Uncharted/DomainDiscovery/uncharted_NYU_DD.json')
with open(file_path, 'r') as f:
    all_data = eval(f.read())
uncharted_data = [entry for entry in all_data
                  if entry['questionType'] == qtype]

# One row per team, checked in this order for every question:
# (label, entries, question-id getter, answers key, id_pos, score_pos)
# Uncharted entries without an 'answers' field can never match a question.
team_checks = [
    ('Georgetown', georgetown_data,
     lambda entry: entry['id'], 'answer', 0, 1),
    ('ISI', isi_data,
     lambda entry: entry['question_id'].split('-')[0], 'answer', 1, 2),
    ('Uncharted', [entry for entry in uncharted_data if 'answers' in entry],
     lambda entry: entry['question_id'], 'answers', 0, 1),
]

# MAIN LOOP
for qid in chosen[index][qtype].keys():
    # Get list of ad candidates
    seed = chosen[index][qtype][qid]['seed']
    candidates = [clus_entry[1] for clus_entry in uniq_clusters[index]
                  if clus_entry[0] == seed]
    if len(candidates) == 0:
        print("NO CANDIDATES")

    for team, entries, question_of, answers_key, id_pos, score_pos in team_checks:
        found = 0
        for entry in entries:
            if question_of(entry) != qid:
                continue
            found = 1
            seen = []
            # NOTE(review): `score` is never updated, so this only flags
            # scores above 100, not rank-order violations -- confirm intent.
            score = 100
            for ans in entry[answers_key]:
                if ans[score_pos] > score:
                    print("TROUBLE!")
                # Keep only the first `pool_depth` unique doc ids
                if ans[id_pos] not in seen and len(seen) < pool_depth:
                    seen.append(ans[id_pos])
            if len(seen) == 0:
                print((team, qid))
            for doc in seen:
                if doc not in candidates:
                    missing.append(doc)
                    print((team, doc))
        if found == 0:
            print("No submission {0}, {1}".format(team, qid))

if len(missing) > 0:
    print("MISSING DOCS")

## NYU Cluster Facet

In [None]:
# Sanity check: every doc id among each team's first `pool_depth` unique
# answers to the chosen NYU 'Cluster Facet' questions should be one of the
# hand-off candidates in `uniq_clusters`.
# NOTE(review): `chosen` is not defined in any cell visible in this notebook;
# it and `uniq_clusters` must already exist in the kernel.
try:
    del data
except NameError:
    # `data` was never loaded in this session; nothing to release.
    pass
missing = []

index = 'NYU'
qtype = 'Cluster Facet'

# DATA
# SECURITY: eval() executes whatever the submission files contain; only run
# this on trusted files.
file_path = submission_path + 'Georgetown/DomainDiscovery/NYU_CF.json'
with open(file_path, 'r') as f:
    georgetown_data = eval(f.read())

file_path = submission_path + ('ISI/DomainDiscovery/'
                               'isi-nyu-answers-dig-extractions/'
                               'properly_formatted_submissions/'
                               'formatted_post_cluster_facet'
                               '_parsed_fixed_all_answers.json')
with open(file_path, 'r') as f:
    isi_data = eval(f.read())

file_path = submission_path + ('Uncharted/DomainDiscovery/uncharted_NYU_DD.json')
with open(file_path, 'r') as f:
    all_data = eval(f.read())
uncharted_data = [entry for entry in all_data
                  if entry['questionType'] == qtype]

# One row per team, checked in this order for every question:
# (label, entries, question-id getter, answers key, id_pos, score_pos,
#  echo_qid) -- echo_qid also prints the question id after a missing doc.
# Uncharted entries without an 'answers' field can never match a question.
team_checks = [
    ('Georgetown', georgetown_data,
     lambda entry: entry['id'], 'answer', 1, 2, False),
    ('ISI', isi_data,
     lambda entry: entry['question_id'].split('-')[0], 'answer', 1, 2, False),
    ('Uncharted', [entry for entry in uncharted_data if 'answers' in entry],
     lambda entry: entry['question_id'], 'answers', 1, 2, True),
]

# MAIN LOOP
for qid in chosen[index][qtype].keys():
    # Get list of ad candidates
    seed = chosen[index][qtype][qid]['seed']
    candidates = [clus_entry[1] for clus_entry in uniq_clusters[index]
                  if clus_entry[0] == seed]
    if len(candidates) == 0:
        print("NO CANDIDATES")

    for (team, entries, question_of, answers_key,
         id_pos, score_pos, echo_qid) in team_checks:
        found = 0
        for entry in entries:
            if question_of(entry) != qid:
                continue
            found = 1
            seen = []
            # NOTE(review): `score` is never updated, so this only flags
            # scores above 100, not rank-order violations -- confirm intent.
            score = 100
            for ans in entry[answers_key]:
                if ans[score_pos] > score:
                    print("TROUBLE!")
                # Keep only the first `pool_depth` unique doc ids
                if ans[id_pos] not in seen and len(seen) < pool_depth:
                    seen.append(ans[id_pos])
            if len(seen) == 0:
                print((team, qid))
            for doc in seen:
                if doc not in candidates:
                    missing.append(doc)
                    print((team, doc))
                    if echo_qid:
                        print(qid)
        if found == 0:
            print("No submission {0}, {1}".format(team, qid))

if len(missing) > 0:
    print("MISSING DOCS")

## NYU Cluster Aggregate

In [None]:
# Sanity check: every doc id among each team's first `pool_depth` unique
# answers to the chosen NYU 'Cluster Aggregate' questions should be one of
# the hand-off candidates in `uniq_clusters`.
# NOTE(review): `chosen` is not defined in any cell visible in this notebook;
# it and `uniq_clusters` must already exist in the kernel.
try:
    del data
except NameError:
    # `data` was never loaded in this session; nothing to release.
    pass
missing = []

index = 'NYU'
qtype = 'Cluster Aggregate'

# DATA
# SECURITY: eval() executes whatever the submission files contain; only run
# this on trusted files.
file_path = submission_path + 'Georgetown/DomainDiscovery/NYU_aggregate.json'
with open(file_path, 'r') as f:
    georgetown_data = eval(f.read())

file_path = submission_path + ('ISI/DomainDiscovery/'
                               'isi-nyu-answers-dig-extractions/'
                               'properly_formatted_submissions/'
                               'formatted_post_aggregate'
                               '_parsed_fixed_all_answers.json')
with open(file_path, 'r') as f:
    isi_data = eval(f.read())

file_path = submission_path + ('Uncharted/DomainDiscovery/uncharted_NYU_DD.json')
with open(file_path, 'r') as f:
    all_data = eval(f.read())
# Uncharted labels aggregate questions by operation instead of by `qtype`
uncharted_data = [entry for entry in all_data
                  if entry['questionType'] in ['AVG', 'MAX', 'MIN', 'MODE']]

# One row per team, checked in this order for every question:
# (label, entries, question-id getter, answers key, id_pos, score_pos,
#  answer filter, echo_qid)
# The answer filter selects which elements of the answer list are examined
# (Georgetown additionally requires exactly three fields); echo_qid also
# prints the question id after a missing doc.
# Uncharted entries without an 'answers' field can never match a question.
team_checks = [
    ('Georgetown', georgetown_data,
     lambda entry: entry['id'], 'answer', 1, 2,
     lambda ans: isinstance(ans, list) and len(ans) == 3, False),
    ('ISI', isi_data,
     lambda entry: entry['question_id'].split('-')[0], 'answer', 1, 2,
     lambda ans: isinstance(ans, list), False),
    ('Uncharted', [entry for entry in uncharted_data if 'answers' in entry],
     lambda entry: entry['question_id'], 'answers', 1, 2,
     lambda ans: isinstance(ans, list), True),
]

# MAIN LOOP
for qid in chosen[index][qtype].keys():
    # Get list of ad candidates
    seed = chosen[index][qtype][qid]['seed']
    candidates = [clus_entry[1] for clus_entry in uniq_clusters[index]
                  if clus_entry[0] == seed]
    if len(candidates) == 0:
        print("NO CANDIDATES")

    for (team, entries, question_of, answers_key,
         id_pos, score_pos, is_doc_answer, echo_qid) in team_checks:
        found = 0
        for entry in entries:
            if question_of(entry) != qid:
                continue
            found = 1
            seen = []
            # NOTE(review): `score` is never updated, so this only flags
            # scores above 100, not rank-order violations -- confirm intent.
            score = 100
            for ans in entry[answers_key]:
                if not is_doc_answer(ans):
                    continue
                if ans[score_pos] > score:
                    print("TROUBLE!")
                # Keep only the first `pool_depth` unique doc ids
                if ans[id_pos] not in seen and len(seen) < pool_depth:
                    seen.append(ans[id_pos])
            if len(seen) == 0:
                print((team, qid))
            for doc in seen:
                if doc not in candidates:
                    missing.append(doc)
                    print((team, doc))
                    if echo_qid:
                        print(qid)
        if found == 0:
            print("No submission {0}, {1}".format(team, qid))

if len(missing) > 0:
    print("MISSING DOCS")

## JPL Cluster Identification

In [None]:
# Sanity check: every doc id among Uncharted's first `pool_depth` unique
# answers to the chosen JPL 'Cluster Identification' questions should be one
# of the hand-off candidates in `uniq_clusters`.
# NOTE(review): `chosen` is not defined in any cell visible in this notebook;
# it and `uniq_clusters` must already exist in the kernel.
try:
    del data
except NameError:
    # `data` was never loaded in this session; nothing to release.
    pass
missing = []

index = 'JPL'
qtype = 'Cluster Identification'

file_path = submission_path + ('Uncharted/DomainDiscovery/uncharted_JPL_DD.json')
# SECURITY: eval() executes whatever the submission file contains; only run
# this on trusted files.
with open(file_path, 'r') as f:
    all_data = eval(f.read())
uncharted_data = [entry for entry in all_data
                  if entry['questionType'] == qtype]

# MAIN LOOP
for qid in chosen[index][qtype].keys():
    # Get list of ad candidates
    seed = chosen[index][qtype][qid]['seed']
    candidates = [clus_entry[1] for clus_entry in uniq_clusters[index]
                  if clus_entry[0] == seed]
    if len(candidates) == 0:
        print("NO CANDIDATES")

    # Uncharted
    id_pos = 0
    score_pos = 1
    found = 0
    for entry in uncharted_data:
        if 'answers' not in entry or entry['question_id'] != qid:
            continue
        found = 1
        seen = []
        # NOTE(review): `score` is never updated, so this only flags scores
        # above 100, not rank-order violations -- confirm intent.
        score = 100
        for ans in entry['answers']:
            if ans[score_pos] > score:
                print("TROUBLE!")
            # Keep only the first `pool_depth` unique doc ids
            if ans[id_pos] not in seen and len(seen) < pool_depth:
                seen.append(ans[id_pos])
        if len(seen) == 0:
            print(('Uncharted', qid))
        for doc in seen:
            if doc not in candidates:
                missing.append(doc)
                print(('Uncharted', doc))
    if found == 0:
        print("No submission Uncharted, {0}".format(qid))

if len(missing) > 0:
    print("MISSING DOCS")

## JPL Cluster Facet

In [None]:
# Sanity check: every doc id among Uncharted's first `pool_depth` unique
# answers to the chosen JPL 'Cluster Facet' questions should be one of the
# hand-off candidates in `uniq_clusters`.
# NOTE(review): `chosen` is not defined in any cell visible in this notebook;
# it and `uniq_clusters` must already exist in the kernel.
try:
    del data
except NameError:
    # `data` was never loaded in this session; nothing to release.
    pass
missing = []

index = 'JPL'
qtype = 'Cluster Facet'

# DATA
file_path = submission_path + ('Uncharted/DomainDiscovery/uncharted_JPL_DD.json')
# SECURITY: eval() executes whatever the submission file contains; only run
# this on trusted files.
with open(file_path, 'r') as f:
    all_data = eval(f.read())
uncharted_data = [entry for entry in all_data
                  if entry['questionType'] == qtype]

# MAIN LOOP
for qid in chosen[index][qtype].keys():
    # Get list of ad candidates
    seed = chosen[index][qtype][qid]['seed']
    candidates = [clus_entry[1] for clus_entry in uniq_clusters[index]
                  if clus_entry[0] == seed]
    if len(candidates) == 0:
        print("NO CANDIDATES")

    # Uncharted
    id_pos = 1
    score_pos = 2
    found = 0
    for entry in uncharted_data:
        if 'answers' not in entry or entry['question_id'] != qid:
            continue
        found = 1
        seen = []
        # NOTE(review): `score` is never updated, so this only flags scores
        # above 100, not rank-order violations -- confirm intent.
        score = 100
        for ans in entry['answers']:
            if ans[score_pos] > score:
                print("TROUBLE!")
            # Keep only the first `pool_depth` unique doc ids
            if ans[id_pos] not in seen and len(seen) < pool_depth:
                seen.append(ans[id_pos])
        if len(seen) == 0:
            print(('Uncharted', qid))
        for doc in seen:
            if doc not in candidates:
                missing.append(doc)
                print(('Uncharted', doc))
                print(qid)
    if found == 0:
        print("No submission Uncharted, {0}".format(qid))
        # NOTE(review): this message is printed for ANY question without a
        # submission, although its text names question 26 specifically.
        print("Confirmed, no Uncharted submission for JPL: Cluster Facet: 26")
if len(missing) > 0:
    print("MISSING DOCS")

## JPL Cluster Aggregate

In [None]:
# Sanity check: every doc id among Uncharted's first `pool_depth` unique
# answers to the chosen JPL 'Cluster Aggregate' questions should be one of
# the hand-off candidates in `uniq_clusters`.
# NOTE(review): `chosen` is not defined in any cell visible in this notebook;
# it and `uniq_clusters` must already exist in the kernel.
try:
    del data
except NameError:
    # `data` was never loaded in this session; nothing to release.
    pass
missing = []

index = 'JPL'
qtype = 'Cluster Aggregate'

file_path = submission_path + ('Uncharted/DomainDiscovery/uncharted_JPL_DD.json')
# SECURITY: eval() executes whatever the submission file contains; only run
# this on trusted files.
with open(file_path, 'r') as f:
    all_data = eval(f.read())
# Uncharted labels aggregate questions by operation instead of by `qtype`
uncharted_data = [entry for entry in all_data
                  if entry['questionType'] in ['AVG', 'MAX', 'MIN', 'MODE']]

# MAIN LOOP
for qid in chosen[index][qtype].keys():
    # Get list of ad candidates
    seed = chosen[index][qtype][qid]['seed']
    candidates = [clus_entry[1] for clus_entry in uniq_clusters[index]
                  if clus_entry[0] == seed]
    if len(candidates) == 0:
        print("NO CANDIDATES")

    # Uncharted
    id_pos = 1
    score_pos = 2
    found = 0
    for entry in uncharted_data:
        if 'answers' not in entry or entry['question_id'] != qid:
            continue
        found = 1
        seen = []
        # NOTE(review): `score` is never updated, so this only flags scores
        # above 100, not rank-order violations -- confirm intent.
        score = 100
        for ans in entry['answers']:
            # Only list elements are examined; anything else is skipped
            if not isinstance(ans, list):
                continue
            if ans[score_pos] > score:
                print("TROUBLE!")
            # Keep only the first `pool_depth` unique doc ids
            if ans[id_pos] not in seen and len(seen) < pool_depth:
                seen.append(ans[id_pos])
        if len(seen) == 0:
            print(('Uncharted', qid))
        for doc in seen:
            if doc not in candidates:
                missing.append(doc)
                print(('Uncharted', doc))
                print(qid)
    if found == 0:
        print("No submission Uncharted, {0}".format(qid))

if len(missing) > 0:
    print("MISSING DOCS")

## HG Cluster Identification

In [None]:
# Sanity check: every doc id among Uncharted's first `pool_depth` unique
# answers to the chosen HG 'Cluster Identification' questions should be one
# of the hand-off candidates in `uniq_clusters`.
# NOTE(review): `chosen` is not defined in any cell visible in this notebook;
# it and `uniq_clusters` must already exist in the kernel.
try:
    del data
except NameError:
    # `data` was never loaded in this session; nothing to release.
    pass
missing = []

index = 'HG'
qtype = 'Cluster Identification'

file_path = submission_path + ('Uncharted/DomainDiscovery/uncharted_HG_DD.json')
# SECURITY: eval() executes whatever the submission file contains; only run
# this on trusted files.
with open(file_path, 'r') as f:
    all_data = eval(f.read())
uncharted_data = [entry for entry in all_data
                  if entry['questionType'] == qtype]

# MAIN LOOP
for qid in chosen[index][qtype].keys():
    # Get list of ad candidates
    seed = chosen[index][qtype][qid]['seed']
    candidates = [clus_entry[1] for clus_entry in uniq_clusters[index]
                  if clus_entry[0] == seed]
    if len(candidates) == 0:
        print("NO CANDIDATES")

    # Uncharted
    id_pos = 0
    score_pos = 1
    found = 0
    for entry in uncharted_data:
        if 'answers' not in entry or entry['question_id'] != qid:
            continue
        found = 1
        seen = []
        # NOTE(review): `score` is never updated, so this only flags scores
        # above 100, not rank-order violations -- confirm intent.
        score = 100
        for ans in entry['answers']:
            if ans[score_pos] > score:
                print("TROUBLE!")
            # Keep only the first `pool_depth` unique doc ids
            if ans[id_pos] not in seen and len(seen) < pool_depth:
                seen.append(ans[id_pos])
        if len(seen) == 0:
            print(('Uncharted', qid))
        for doc in seen:
            if doc not in candidates:
                missing.append(doc)
                print(('Uncharted', doc))
    if found == 0:
        print("No submission Uncharted, {0}".format(qid))

if len(missing) > 0:
    print("MISSING DOCS")

## HG Cluster Facet

In [None]:
# Sanity check: every doc id among Uncharted's first `pool_depth` unique
# answers to the chosen HG 'Cluster Facet' questions should be one of the
# hand-off candidates in `uniq_clusters`.
# NOTE(review): `chosen` is not defined in any cell visible in this notebook;
# it and `uniq_clusters` must already exist in the kernel.
try:
    del data
except NameError:
    # `data` was never loaded in this session; nothing to release.
    pass
missing = []

index = 'HG'
qtype = 'Cluster Facet'

# DATA
file_path = submission_path + ('Uncharted/DomainDiscovery/uncharted_HG_DD.json')
# SECURITY: eval() executes whatever the submission file contains; only run
# this on trusted files.
with open(file_path, 'r') as f:
    all_data = eval(f.read())
uncharted_data = [entry for entry in all_data
                  if entry['questionType'] == qtype]

# MAIN LOOP
for qid in chosen[index][qtype].keys():
    # Get list of ad candidates
    seed = chosen[index][qtype][qid]['seed']
    candidates = [clus_entry[1] for clus_entry in uniq_clusters[index]
                  if clus_entry[0] == seed]
    if len(candidates) == 0:
        print("NO CANDIDATES")

    # Uncharted
    id_pos = 1
    score_pos = 2
    found = 0
    for entry in uncharted_data:
        if 'answers' not in entry or entry['question_id'] != qid:
            continue
        found = 1
        seen = []
        # NOTE(review): `score` is never updated, so this only flags scores
        # above 100, not rank-order violations -- confirm intent.
        score = 100
        for ans in entry['answers']:
            if ans[score_pos] > score:
                print("TROUBLE!")
            # Keep only the first `pool_depth` unique doc ids
            if ans[id_pos] not in seen and len(seen) < pool_depth:
                seen.append(ans[id_pos])
        if len(seen) == 0:
            print(('Uncharted', qid))
        for doc in seen:
            if doc not in candidates:
                missing.append(doc)
                print(('Uncharted', doc))
                print(qid)
    if found == 0:
        print("No submission Uncharted, {0}".format(qid))
        # NOTE(review): this cell checks the HG index, yet the message names
        # JPL and question 26 and is printed for ANY question without a
        # submission -- looks copy-pasted from the JPL cell; confirm wording.
        print("Confirmed, no Uncharted submission for JPL: Cluster Facet: 26")
if len(missing) > 0:
    print("MISSING DOCS")

## HG Cluster Aggregate

In [None]:
# Sanity check: every doc id among Uncharted's first `pool_depth` unique
# answers to the chosen HG 'Cluster Aggregate' questions should be one of
# the hand-off candidates in `uniq_clusters`.
# NOTE(review): `chosen` is not defined in any cell visible in this notebook;
# it and `uniq_clusters` must already exist in the kernel.
try:
    del data
except NameError:
    # `data` was never loaded in this session; nothing to release.
    pass
missing = []

index = 'HG'
qtype = 'Cluster Aggregate'

file_path = submission_path + ('Uncharted/DomainDiscovery/uncharted_HG_DD.json')
# SECURITY: eval() executes whatever the submission file contains; only run
# this on trusted files.
with open(file_path, 'r') as f:
    all_data = eval(f.read())
# Uncharted labels aggregate questions by operation instead of by `qtype`
uncharted_data = [entry for entry in all_data
                  if entry['questionType'] in ['AVG', 'MAX', 'MIN', 'MODE']]

# MAIN LOOP
for qid in chosen[index][qtype].keys():
    # Get list of ad candidates
    seed = chosen[index][qtype][qid]['seed']
    candidates = [clus_entry[1] for clus_entry in uniq_clusters[index]
                  if clus_entry[0] == seed]
    if len(candidates) == 0:
        print("NO CANDIDATES")

    # Uncharted
    id_pos = 1
    score_pos = 2
    found = 0
    for entry in uncharted_data:
        if 'answers' not in entry or entry['question_id'] != qid:
            continue
        found = 1
        seen = []
        # NOTE(review): `score` is never updated, so this only flags scores
        # above 100, not rank-order violations -- confirm intent.
        score = 100
        for ans in entry['answers']:
            # Only list elements are examined; anything else is skipped
            if not isinstance(ans, list):
                continue
            if ans[score_pos] > score:
                print("TROUBLE!")
            # Keep only the first `pool_depth` unique doc ids
            if ans[id_pos] not in seen and len(seen) < pool_depth:
                seen.append(ans[id_pos])
        if len(seen) == 0:
            print(('Uncharted', qid))
        for doc in seen:
            if doc not in candidates:
                missing.append(doc)
                print(('Uncharted', doc))
                print(qid)
    if found == 0:
        print("No submission Uncharted, {0}".format(qid))

if len(missing) > 0:
    print("MISSING DOCS")

In [None]:
# Display the question ids chosen for NYU 'Cluster Aggregate'.
# NOTE(review): `chosen` is not defined in any cell visible in this notebook.
chosen['NYU']['Cluster Aggregate'].keys()