In [1]:
import json
import operator

# Define Pooling Depth for Each Question

In [2]:
pool_depth = 100

# Define Number of Questions to be Evaluated

In [3]:
question_depth = {
    'Cluster Identification': 7,
    'Cluster Facet': 7,
    'Cluster Aggregate': 15,
}

In [4]:
submission_path = '../../team_submissions/'
questions_path = '../../../questions/'

In [5]:
def extract_seed(question):
    lines = question['SPARQL'][0].split('\n')
    seed = str(lines[4].split(' ')[1]).strip("'")
        
    return seed

In [6]:
def top_pool(pooling_level, id_pos, score_pos, ans_length, id_length, responses):
    # Explicitly check for score ordering
    score = 100
    count = 0
    seen = []
    for answer in responses:
        # Non-Array, leading elements may happen to have correct
        # length
        if type(answer) == list:
            # Assume elements not matching expected length
            # are aggregate answers
            if len(answer) == ans_length:
                # Confirming doc id is where we expect
                if len(answer[id_pos]) != id_length:
                    print "NONSTANDARD DOC ID DETECTED"
                    print answer
                    break
                if answer[score_pos] > score:
                    print (score, answer[score_pos])
                    print "RANK ORDER ISSUE"
                    break
                score = answer[score_pos]
                if answer[id_pos] not in seen:
                    # Found another unique doc id
                    seen.append(answer[id_pos])
                    count += 1
                    if count == pooling_level:
                        # Found top N docs
                        # Confirm no duplicates
                        len1 = len(seen)
                        uniq_seen = list(set(seen))
                        if len1 != len(uniq_seen):
                            print "PROBLEM WITH DUPLICATED DOC IDS"
                            break
                        return uniq_seen
    # Or even if you don't get N uniq
    uniq_seen = list(set(seen))
    return uniq_seen

# Determine Seed for Every Question

In [7]:
seeds = {}

### Cluster Identification

In [8]:
seeds['Cluster Identification'] = {}

file_path = questions_path + 'post_cluster_identification.json'
f = open(file_path, 'r')
for line in f:
    temp = json.loads(line)
    seed = extract_seed(temp)
    seeds['Cluster Identification'][temp['id']] = seed
f.close()

### Cluster Facet

In [9]:
seeds['Cluster Facet'] = {}

file_path = questions_path + 'post_cluster_facet.json'
f = open(file_path, 'r')
for line in f:
    temp = json.loads(line)
    seed = extract_seed(temp)
    seeds['Cluster Facet'][temp['id']] = seed
f.close()

### Cluster Facet

In [10]:
seeds['Cluster Aggregate'] = {}

file_path = questions_path + 'post_aggregate_V2.json'
f = open(file_path, 'r')
for line in f:
    temp = json.loads(line)
    seed = extract_seed(temp)
    seeds['Cluster Aggregate'][temp['id']] = seed
f.close()

In [11]:
answers = {}

# Georgetown

## NYU

In [12]:
answers['NYU'] = {}

### Georgetown Cluster Identification
NOTE: Response missing for Question #6

In [13]:
answers['NYU']['Georgetown'] = {}
clus_type = 'Cluster Identification'
answers['NYU']['Georgetown'][clus_type] = {}

In [14]:
file_path = submission_path + 'Georgetown/DomainDiscovery/NYU_CI.json'
f = open(file_path, 'r')
data = eval(f.read())

id_pos = 0
score_pos = 1
ans_length = 2 
id_length = 64

for entry in data:
    top_ids = top_pool(pool_depth, id_pos, score_pos,
                       ans_length, id_length, entry['answer'])
    # Reconfirming no dupes
    if len(top_ids) != len(list(set(top_ids))):
        print "PROBLEM WITH DUPLICATED DOC IDS"
        break
    answers['NYU']['Georgetown'][clus_type][entry['id']] = top_ids

### Georgetown Cluster Facet

In [15]:
clus_type = 'Cluster Facet'
answers['NYU']['Georgetown'][clus_type] = {}

In [16]:
file_path = submission_path + 'Georgetown/DomainDiscovery/NYU_CF.json'
f = open(file_path, 'r')
data = eval(f.read())

id_pos = 1
score_pos = 2
ans_length = 3 
id_length = 64

for entry in data:
    top_ids = top_pool(pool_depth, id_pos, score_pos,
                       ans_length, id_length, entry['answer'])
    # Reconfirming no dupes
    if len(top_ids) != len(list(set(top_ids))):
        print "PROBLEM WITH DUPLICATED DOC IDS"
        break
    answers['NYU']['Georgetown'][clus_type][entry['id']] = top_ids

### Georgetown Cluster Aggregate

In [17]:
clus_type = 'Cluster Aggregate'
answers['NYU']['Georgetown'][clus_type] = {}

In [18]:
file_path = submission_path + 'Georgetown/DomainDiscovery/NYU_aggregate.json'
f = open(file_path, 'r')
data = eval(f.read())

id_pos = 1
score_pos = 2
ans_length = 3 
id_length = 64

for entry in data:
    # Cluster Aggregate Question 94 was Removed
    if entry['id'] != '94':
        top_ids = top_pool(pool_depth, id_pos, score_pos,
                       ans_length, id_length, entry['answer'])
        # Reconfirming no dupes
        if len(top_ids) != len(list(set(top_ids))):
            print "PROBLEM WITH DUPLICATED DOC IDS"
            break
    answers['NYU']['Georgetown'][clus_type][entry['id']] = top_ids

# ISI

## NYU

### ISI Cluster Identification

In [19]:
answers['NYU']['ISI'] = {}
clus_type = 'Cluster Identification'
answers['NYU']['ISI'][clus_type] = {}

In [20]:
file_path = submission_path + ('ISI/DomainDiscovery/'
                               'isi-nyu-answers-dig-extractions/'
                               'properly_formatted_submissions/'
                               'formatted_post_cluster_identification'
                               '-parsed_fixed_all_answers.json')
f = open(file_path, 'r')
data = eval(f.read())

id_pos = 1
score_pos = 2
ans_length = 3 
id_length = 64

for entry in data:
    qid = entry['question_id'].split('-')[0]
    top_ids = top_pool(pool_depth, id_pos, score_pos,
                       ans_length, id_length, entry['answer'])
    # Reconfirming no dupes
    if len(top_ids) != len(list(set(top_ids))):
        print "PROBLEM WITH DUPLICATED DOC IDS"
        break
    answers['NYU']['ISI'][clus_type][qid] = top_ids

### ISI Cluster Facet

In [21]:
clus_type = 'Cluster Facet'
answers['NYU']['ISI'][clus_type] = {}

In [22]:
file_path = submission_path + ('ISI/DomainDiscovery/'
                               'isi-nyu-answers-dig-extractions/'
                               'properly_formatted_submissions/'
                               'formatted_post_cluster_facet'
                               '_parsed_fixed_all_answers.json')
f = open(file_path, 'r')
data = eval(f.read())

id_pos = 1
score_pos = 2
ans_length = 3 
id_length = 64

for entry in data:
    qid = entry['question_id'].split('-')[0]
    top_ids = top_pool(pool_depth, id_pos, score_pos,
                       ans_length, id_length, entry['answer'])
    # Reconfirming no dupes
    if len(top_ids) != len(list(set(top_ids))):
        print "PROBLEM WITH DUPLICATED DOC IDS"
        break
    answers['NYU']['ISI'][clus_type][qid] = top_ids

### ISI Cluster Aggregate

In [23]:
clus_type = 'Cluster Aggregate'
answers['NYU']['ISI'][clus_type] = {}

In [24]:
file_path = submission_path + ('ISI/DomainDiscovery/'
                               'isi-nyu-answers-dig-extractions/'
                               'properly_formatted_submissions/'
                               'formatted_post_aggregate'
                               '_parsed_fixed_all_answers.json')
f = open(file_path, 'r')
data = eval(f.read())

id_pos = 1
score_pos = 2
ans_length = 3 
id_length = 64

for entry in data:
    qid = entry['question_id'].split('-')[0]
    top_ids = top_pool(pool_depth, id_pos, score_pos,
                       ans_length, id_length, entry['answer'])
    # Reconfirming no dupes
    if len(top_ids) != len(list(set(top_ids))):
        print "PROBLEM WITH DUPLICATED DOC IDS"
        break
    answers['NYU']['ISI'][clus_type][qid] = top_ids

# Uncharted

## NYU

### Uncharted NYU-Cluster Identification

In [25]:
answers['NYU']['Uncharted'] = {}
clus_type = 'Cluster Identification'
answers['NYU']['Uncharted'][clus_type] = {}

In [26]:
file_path = submission_path + ('Uncharted/DomainDiscovery/uncharted_NYU_DD.json')
f = open(file_path, 'r')
all_data = eval(f.read())
data = []
for entry in all_data:
    if entry['questionType'] == clus_type:
        data.append(entry)
        
id_pos = 0
score_pos = 1
ans_length = 2 
id_length = 64

for entry in data:
    # Not all entries have 'answers'
    if 'answers' in entry.keys():
        top_ids = top_pool(pool_depth, id_pos, score_pos,
                           ans_length, id_length, entry['answers'])
        # Reconfirming no dupes
        if len(top_ids) != len(list(set(top_ids))):
            print "PROBLEM WITH DUPLICATED DOC IDS"
            break
    answers['NYU']['Uncharted'][clus_type][entry['question_id']] = top_ids

### Uncharted NYU-Cluster Facet

In [27]:
clus_type = 'Cluster Facet'
answers['NYU']['Uncharted'][clus_type] = {}

In [28]:
data = []
for entry in all_data:
    if entry['questionType'] == clus_type:
        data.append(entry)
        
id_pos = 1
score_pos = 2
ans_length = 3 
id_length = 64

for entry in data:
    # Not all entries have 'answers'
    if 'answers' in entry.keys():
        top_ids = top_pool(pool_depth, id_pos, score_pos,
                           ans_length, id_length, entry['answers'])
        # Reconfirming no dupes
        if len(top_ids) != len(list(set(top_ids))):
            print "PROBLEM WITH DUPLICATED DOC IDS"
            break
    answers['NYU']['Uncharted'][clus_type][entry['question_id']] = top_ids

### Uncharted NYU-Cluster Aggregate

In [29]:
clus_type = 'Cluster Aggregate'
answers['NYU']['Uncharted'][clus_type] = {}

In [30]:
data = []
for entry in all_data:
    if entry['questionType'] in ['AVG', 'MIN', 'MAX', 'MODE']:
        data.append(entry)
        
id_pos = 1
score_pos = 2
ans_length = 3 
id_length = 64

for entry in data:
    # Not all entries have 'answers'
    if 'answers' in entry.keys():
        # Cluster Aggregate Question 94 was Removed
        if entry['question_id'] != '94':
            top_ids = top_pool(pool_depth, id_pos, score_pos,
                               ans_length, id_length, entry['answers'])
            # Reconfirming no dupes
            if len(top_ids) != len(list(set(top_ids))):
                print "PROBLEM WITH DUPLICATED DOC IDS"
                break
        answers['NYU']['Uncharted'][clus_type][entry['question_id']] = top_ids

## Hyperion Gray

### Uncharted HG-Cluster Identification

In [31]:
answers ['HG'] = {}
answers['HG']['Uncharted'] = {}
clus_type = 'Cluster Identification'
answers['HG']['Uncharted'][clus_type] = {}

In [32]:
file_path = submission_path + ('Uncharted/DomainDiscovery/uncharted_HG_DD.json')
f = open(file_path, 'r')
all_data = eval(f.read())
data = []
for entry in all_data:
    if entry['questionType'] == clus_type:
        data.append(entry)
        
id_pos = 0
score_pos = 1
ans_length = 2 
id_length = 64

for entry in data:
    # Not all entries have 'answers'
    if 'answers' in entry.keys():
        top_ids = top_pool(pool_depth, id_pos, score_pos,
                           ans_length, id_length, entry['answers'])
        # Reconfirming no dupes
        if len(top_ids) != len(list(set(top_ids))):
            print "PROBLEM WITH DUPLICATED DOC IDS"
            break
    answers['HG']['Uncharted'][clus_type][entry['question_id']] = top_ids

### Uncharted HG-Cluster Facet

In [33]:
clus_type = 'Cluster Facet'
answers['HG']['Uncharted'][clus_type] = {}

In [34]:
data = []
for entry in all_data:
    if entry['questionType'] == clus_type:
        data.append(entry)
        
id_pos = 1
score_pos = 2
ans_length = 3 
id_length = 64

for entry in data:
    # Not all entries have 'answers'
    if 'answers' in entry.keys():
        top_ids = top_pool(pool_depth, id_pos, score_pos,
                           ans_length, id_length, entry['answers'])
        # Reconfirming no dupes
        if len(top_ids) != len(list(set(top_ids))):
            print "PROBLEM WITH DUPLICATED DOC IDS"
            break
    answers['HG']['Uncharted'][clus_type][entry['question_id']] = top_ids

### Uncharted HG-Cluster Aggregate

In [35]:
clus_type = 'Cluster Aggregate'
answers['HG']['Uncharted'][clus_type] = {}

In [36]:
data = []
for entry in all_data:
    if entry['questionType'] in ['AVG', 'MIN', 'MAX', 'MODE']:
        data.append(entry)
        
id_pos = 1
score_pos = 2
ans_length = 3 
id_length = 64

for entry in data:
    # Not all entries have 'answers'
    if 'answers' in entry.keys():
        # Cluster Aggregate Question 94 was Removed
        if entry['question_id'] != '94':
            top_ids = top_pool(pool_depth, id_pos, score_pos,
                               ans_length, id_length, entry['answers'])
            # Reconfirming no dupes
            if len(top_ids) != len(list(set(top_ids))):
                print "PROBLEM WITH DUPLICATED DOC IDS"
                break
        answers['HG']['Uncharted'][clus_type][entry['question_id']] = top_ids

## JPL

### Uncharted JPL-Cluster Identification

In [37]:
answers ['JPL'] = {}
answers['JPL']['Uncharted'] = {}
clus_type = 'Cluster Identification'
answers['JPL']['Uncharted'][clus_type] = {}

In [38]:
file_path = submission_path + ('Uncharted/DomainDiscovery/uncharted_JPL_DD.json')
f = open(file_path, 'r')
all_data = eval(f.read())
data = []
for entry in all_data:
    if entry['questionType'] == clus_type:
        data.append(entry)
        
id_pos = 0
score_pos = 1
ans_length = 2 
id_length = 64

for entry in data:
    # Not all entries have 'answers'
    if 'answers' in entry.keys():
        top_ids = top_pool(pool_depth, id_pos, score_pos,
                           ans_length, id_length, entry['answers'])
        # Reconfirming no dupes
        if len(top_ids) != len(list(set(top_ids))):
            print "PROBLEM WITH DUPLICATED DOC IDS"
            break
    answers['JPL']['Uncharted'][clus_type][entry['question_id']] = top_ids

### Uncharted JPL-Cluster Facet

In [39]:
clus_type = 'Cluster Facet'
answers['JPL']['Uncharted'][clus_type] = {}

In [40]:
data = []
for entry in all_data:
    if entry['questionType'] == clus_type:
        data.append(entry)
        
id_pos = 1
score_pos = 2
ans_length = 3 
id_length = 64

for entry in data:
    # Not all entries have 'answers'
    if 'answers' in entry.keys():
        top_ids = top_pool(pool_depth, id_pos, score_pos,
                           ans_length, id_length, entry['answers'])
        # Reconfirming no dupes
        if len(top_ids) != len(list(set(top_ids))):
            print "PROBLEM WITH DUPLICATED DOC IDS"
            break
    answers['JPL']['Uncharted'][clus_type][entry['question_id']] = top_ids

### Uncharted JPL-Cluster Aggregate

In [41]:
clus_type = 'Cluster Aggregate'
answers['JPL']['Uncharted'][clus_type] = {}

In [42]:
data = []
for entry in all_data:
    if entry['questionType'] in ['AVG', 'MIN', 'MAX', 'MODE']:
        data.append(entry)
        
id_pos = 1
score_pos = 2
ans_length = 3 
id_length = 64

for entry in data:
    # Not all entries have 'answers'
    if 'answers' in entry.keys():
        # Cluster Aggregate Question 94 was Removed
        if entry['question_id'] != '94':
            top_ids = top_pool(pool_depth, id_pos, score_pos,
                               ans_length, id_length, entry['answers'])
            # Reconfirming no dupes
            if len(top_ids) != len(list(set(top_ids))):
                print "PROBLEM WITH DUPLICATED DOC IDS"
                break
        answers['JPL']['Uncharted'][clus_type][entry['question_id']] = top_ids

# Question Selection
Here we look for questions that have the greatest overlap of document ids (i.e., the highest number of repeated document ids submitted by two or more teams) in the NYU index.  A greater overlap is used as an indicator that two or more teams were able to find relevant ads for the question.

In [43]:
# Note: the quantity reflected in this dictionary is not neccessarily 
# the "number of duplicated ids" since 1 document appearing 3 times would
# count the same as 2 documents appearing 2 times each.  Perhaps a better
# description of this quantity is something like "the number of times
# two different teams submitted the same document ID".
overlap = {}
for qtype in seeds.keys():
    overlap[qtype] = {}
    for qid in seeds[qtype].keys():
        submissions = []
        for team in answers['NYU'].keys():
            # Only choose questions each team answered
            if qid not in answers['NYU'][team][qtype].keys():
                break
            if len(answers['NYU'][team][qtype][qid]) == 0:
                break
            team_sub = answers['NYU'][team][qtype][qid]
            submissions.extend(team_sub)
        first = len(submissions)
        uniq_submissions = list(set(submissions))
        second = len(uniq_submissions)
        overlap[qtype][qid] = first - second

In [44]:
chosen = {}
chosen['NYU'] = {}
chosen['HG'] = {}
chosen['JPL'] = {}
for qtype in overlap.keys():
    chosen['NYU'][qtype] = {}
    chosen['JPL'][qtype] = {}
    chosen['HG'][qtype] = {}
    sorted_sub = sorted(overlap[qtype].items(), key=operator.itemgetter(1), reverse=True)
    top_overlaps = sorted_sub[0:question_depth[qtype]]
    for entry in top_overlaps:
        qid = str(entry[0])
        score = entry[1]
        # NYU 
        chosen['NYU'][qtype][qid] = {}
        chosen['NYU'][qtype][qid]['overlap'] = score
        chosen['NYU'][qtype][qid]['seed'] = seeds[qtype][qid]
        # Repeat the above uniq submissions routine for certainty
        submissions = []
        for team in answers['NYU'].keys():
            chosen['NYU'][qtype][qid][team] = {}
            chosen['NYU'][qtype][qid][team]['number submissions'] = len(answers['NYU'][team][qtype][qid])
            submissions.extend(answers['NYU'][team][qtype][qid])
        first = len(submissions)
        uniq_submissions = list(set(submissions))
        second = len(uniq_submissions)
        if (first - second) !=  score:
            print "TROUBLE"
        chosen['NYU'][qtype][qid]['submissions'] = uniq_submissions
        chosen['NYU'][qtype][qid]['number uniq submissions'] = len(uniq_submissions)
        
        # JPL
        chosen['JPL'][qtype][qid] = {}
        chosen['JPL'][qtype][qid]['seed'] = seeds[qtype][qid]
        submissions = []
        submissions.extend(answers['JPL']['Uncharted'][qtype][qid])
        # Shouldn't have any dupes
        first = len(submissions)
        uniq_submissions = list(set(submissions))
        second = len(uniq_submissions)
        if first != second:
            print "DUPLICATES IN JPL SUBMISSIONS"
        chosen['JPL'][qtype][qid]['submissions'] = uniq_submissions
        chosen['JPL'][qtype][qid]['number uniq submissions'] = len(uniq_submissions)
        
        # HG
        chosen['HG'][qtype][qid] = {}
        chosen['HG'][qtype][qid]['seed'] = seeds[qtype][qid]
        submissions = []
        submissions.extend(answers['HG']['Uncharted'][qtype][qid])
        # Shouldn't have any dupes
        first = len(submissions)
        uniq_submissions = list(set(submissions))
        second = len(uniq_submissions)
        if first != second:
            print "DUPLICATES IN HG SUBMISSIONS"
        chosen['HG'][qtype][qid]['submissions'] = uniq_submissions
        chosen['HG'][qtype][qid]['number uniq submissions'] = len(uniq_submissions)

## Check Seeds

In [45]:
# Note just as with the original set of questions, some seeds have been repeated in the chosen questions.
# However, based on the criteria for choosing questions which are most likely to
# produce relevant ads, it should be expected that questions with clusters represented in the DD data set
# should repeatedly match that criteria.
repeated_seeds = []
for index in chosen.keys():
    chosen_seeds = []
    for qtype in chosen[index].keys():
        for qid in chosen[index][qtype].keys():
            # Check seed
            if seeds[qtype][qid] != chosen[index][qtype][qid]['seed']:
                print "SEED TROUBLE"
            if chosen[index][qtype][qid]['seed'] in chosen_seeds:
                repeated_seeds.append(chosen[index][qtype][qid]['seed'])
            chosen_seeds.append(chosen[index][qtype][qid]['seed'])

print set(repeated_seeds)

set(['6126696637', '4164557000', '3345579838', '7026023157', '6477932052'])


In [46]:
output_file = 'chosen_questions.json'
with open(output_file, 'w') as f:
    json.dump(chosen, f, indent=2)

# Prepare Data for Handoff to Uncharted

In [47]:
temp_clusters = {}
for index in answers.keys():
    temp_clusters[index] = []
    for team in answers[index].keys():
        for qtype in answers[index][team].keys():
            for qid in answers[index][team][qtype].keys():
                if qid in chosen[index][qtype].keys():
                    # Check seed
                    seed = chosen[index][qtype][qid]['seed']
                    seed2 = seeds[qtype][qid]
                    if seed != seed2:
                        print "SEED TROUBLE"
                    for doc_id in answers[index][team][qtype][qid]:
                        temp = [seed, doc_id]
                        temp_clusters[index].append(temp)

# Re-re-forming uniq IDs
uniq_clusters = {}
for index in temp_clusters.keys():
    print len(temp_clusters[index])
    uniq_clusters[index] = list(set(tuple(i) for i in temp_clusters[index]))
    print len(uniq_clusters[index])
    print " "

3486
2402
 
2534
2337
 
2777
2556
 


In [48]:
output_file = 'cluster_annotation_data.json'
with open(output_file, 'w') as f:
    json.dump(uniq_clusters, f, indent=2)

# Sanity Check on Chosen Doc IDs 

## NYU Cluster Identification

In [49]:
try:
    del(data)
except:
    pass
missing = []

index = 'NYU'
qtype = 'Cluster Identification'

# DATA
georgetown_data = []
file_path = submission_path + 'Georgetown/DomainDiscovery/NYU_CI.json'
f = open(file_path, 'r')
georgetown_data = eval(f.read())

isi_data = []
file_path = submission_path + ('ISI/DomainDiscovery/'
                               'isi-nyu-answers-dig-extractions/'
                               'properly_formatted_submissions/'
                               'formatted_post_cluster_identification'
                               '-parsed_fixed_all_answers.json')
f = open(file_path, 'r')
isi_data = eval(f.read())

uncharted_data = []
all_data = []
file_path = submission_path + ('Uncharted/DomainDiscovery/uncharted_NYU_DD.json')
f = open(file_path, 'r')
all_data = eval(f.read())
for entry in all_data:
    if entry['questionType'] == qtype:
        uncharted_data.append(entry)

# MAIN LOOP
for qid in chosen[index][qtype].keys():
    # Get list of ad candidtes
    candidates = []
    for clus_entry in uniq_clusters[index]:
        if clus_entry[0] == chosen[index][qtype][qid]['seed']:
            candidates.append(clus_entry[1])
    if len(candidates) == 0:
        print "NO CANDIDATES"

    # Georgetown
    id_pos = 0
    score_pos = 1
    found = 0
    for entry in georgetown_data:
        seen = []
        count = 0
        score = 100
        if entry['id'] == qid:
            found = 1
            for ans in entry['answer']:
                if ans[score_pos] > score:
                    print "TROUBLE!"
                if ans[id_pos] not in seen:
                    count += 1
                    if count <= 100:
                        seen.append(ans[id_pos])
            if len(seen) == 0:
                print ('Georgetown', qid)
            for doc in seen:
                if doc not in candidates:
                    missing.append(doc)
                    print ('Georgetown', doc)
    if found == 0:
        print "No submission Georgetown, {0}".format(qid)
            
    # ISI
    id_pos = 1
    score_pos = 2
    found = 0
    for entry in isi_data:
        seen = []
        count = 0
        score = 100
        if entry['question_id'].split('-')[0] == qid:
            found = 1
            for ans in entry['answer']:
                if ans[score_pos] > score:
                    print "TROUBLE!"
                if ans[id_pos] not in seen:
                    count += 1
                    if count <= 100:
                        seen.append(ans[id_pos])
            if len(seen) == 0:
                print ('ISI', qid)
            for doc in seen:
                if doc not in candidates:
                    missing.append(doc)
                    print ('ISI', doc)
    if found == 0:
        print "No submission ISI, {0}".format(qid)
            
    # Uncharted
    id_pos = 0
    score_pos = 1
    found = 0
    for entry in uncharted_data:
        seen = []
        count = 0
        score = 100
        if 'answers' in entry.keys():
            if entry['question_id'] == qid:
                found = 1
                for ans in entry['answers']:
                    if ans[score_pos] > score:
                        print "TROUBLE!"
                    if ans[id_pos] not in seen:
                        count += 1
                        if count <= 100:
                            seen.append(ans[id_pos])
                if len(seen) == 0:
                    print ('Uncharted', qid)
                for doc in seen:
                    if doc not in candidates:
                        missing.append(doc)
                        print ('Uncharted', doc)
    if found == 0:
        print "No submission Uncharted, {0}".format(qid)
        
if len(missing) > 0:
    print "MISSING DOCS"

## NYU Cluster Facet

In [50]:
try:
    del(data)
except:
    pass
missing = []

index = 'NYU'
qtype = 'Cluster Facet'

# DATA
georgetown_data = []
file_path = submission_path + 'Georgetown/DomainDiscovery/NYU_CF.json'
f = open(file_path, 'r')
georgetown_data = eval(f.read())

isi_data = []
file_path = submission_path + ('ISI/DomainDiscovery/'
                               'isi-nyu-answers-dig-extractions/'
                               'properly_formatted_submissions/'
                               'formatted_post_cluster_facet'
                               '_parsed_fixed_all_answers.json')
f = open(file_path, 'r')
isi_data = eval(f.read())
    
uncharted_data = []
all_data = []
file_path = submission_path + ('Uncharted/DomainDiscovery/uncharted_NYU_DD.json')
f = open(file_path, 'r')
all_data = eval(f.read())
for entry in all_data:
    if entry['questionType'] == qtype:
        uncharted_data.append(entry)

# MAIN LOOP
for qid in chosen[index][qtype].keys():
    # Get list of ad candidtes
    candidates = []
    for clus_entry in uniq_clusters[index]:
        if clus_entry[0] == chosen[index][qtype][qid]['seed']:
            candidates.append(clus_entry[1])
    if len(candidates) == 0:
        print "NO CANDIDATES"
        
    # Georgetown
    id_pos = 1
    score_pos = 2
    found = 0
    for entry in georgetown_data:
        seen = []
        count = 0
        score = 100
        if entry['id'] == qid:
            found = 1
            for ans in entry['answer']:
                if ans[score_pos] > score:
                    print "TROUBLE!"
                if ans[id_pos] not in seen:
                    count += 1
                    if count <= 100:
                        seen.append(ans[id_pos])
            if len(seen) == 0:
                print ('Georgetown', qid)
            for doc in seen:
                if doc not in candidates:
                    missing.append(doc)
                    print ('Georgetown', doc)
    if found == 0:
        print "No submission Georgetown, {0}".format(qid)
            
    # ISI
    id_pos = 1
    score_pos = 2
    found = 0
    for entry in isi_data:
        seen = []
        count = 0
        score = 100
        if entry['question_id'].split('-')[0] == qid:
            found = 1
            for ans in entry['answer']:
                if ans[score_pos] > score:
                    print "TROUBLE!"
                if ans[id_pos] not in seen:
                    count += 1
                    if count <= 100:
                        seen.append(ans[id_pos])
            if len(seen) == 0:
                print ('ISI', qid)
            for doc in seen:
                if doc not in candidates:
                    missing.append(doc)
                    print ('ISI', doc)
    if found == 0:
        print "No submission ISI, {0}".format(qid)
            
    # Uncharted
    id_pos = 1
    score_pos = 2
    found = 0
    for entry in uncharted_data:
        seen = []
        count = 0
        score = 100
        if 'answers' in entry.keys():
            if entry['question_id'] == qid:
                found = 1
                for ans in entry['answers']:
                    if ans[score_pos] > score:
                        print "TROUBLE!"
                    if ans[id_pos] not in seen:
                        count += 1
                        if count <= 100:
                            seen.append(ans[id_pos])
                if len(seen) == 0:
                    print ('Uncharted', qid)
                for doc in seen:
                    if doc not in candidates:
                        missing.append(doc)
                        print ('Uncharted', doc)
                        print qid
    if found == 0:
        print "No submission Uncharted, {0}".format(qid)

if len(missing) > 0:
    print "MISSING DOCS"

## NYU Cluster Aggregate

In [51]:
try:
    del(data)
except:
    pass
missing = []

index = 'NYU'
qtype = 'Cluster Aggregate'

# DATA
georgetown_data = []
file_path = submission_path + 'Georgetown/DomainDiscovery/NYU_aggregate.json'
f = open(file_path, 'r')
georgetown_data = eval(f.read())

isi_data = []
file_path = submission_path + ('ISI/DomainDiscovery/'
                               'isi-nyu-answers-dig-extractions/'
                               'properly_formatted_submissions/'
                               'formatted_post_aggregate'
                               '_parsed_fixed_all_answers.json')
f = open(file_path, 'r')
isi_data = eval(f.read())
    
uncharted_data = []
all_data = []
file_path = submission_path + ('Uncharted/DomainDiscovery/uncharted_NYU_DD.json')
f = open(file_path, 'r')
all_data = eval(f.read())
for entry in all_data:
    if entry['questionType'] in ['AVG', 'MAX', 'MIN', 'MODE']:
        uncharted_data.append(entry)

# MAIN LOOP
for qid in chosen[index][qtype].keys():
    # Get list of ad candidtes
    candidates = []
    for clus_entry in uniq_clusters[index]:
        if clus_entry[0] == chosen[index][qtype][qid]['seed']:
            candidates.append(clus_entry[1])
    if len(candidates) == 0:
        print "NO CANDIDATES"
        
    # Georgetown
    id_pos = 1
    score_pos = 2
    found = 0
    for entry in georgetown_data:
        seen = []
        count = 0
        score = 100
        if entry['id'] == qid:
            found = 1
            for ans in entry['answer']:
                if type(ans) == list:
                    if len(ans) == 3:
                        if ans[score_pos] > score:
                            print "TROUBLE!"
                        if ans[id_pos] not in seen:
                            count += 1
                            if count <= 100:
                                seen.append(ans[id_pos])
            if len(seen) == 0:
                print ('Georgetown', qid)
            for doc in seen:
                if doc not in candidates:
                    missing.append(doc)
                    print ('Georgetown', doc)
    if found == 0:
        print "No submission Georgetown, {0}".format(qid)
            
    # ISI
    id_pos = 1
    score_pos = 2
    found = 0
    for entry in isi_data:
        seen = []
        count = 0
        score = 100
        if entry['question_id'].split('-')[0] == qid:
            found = 1
            for ans in entry['answer']:
                if type(ans) == list:
                    if ans[score_pos] > score:
                        print "TROUBLE!"
                    if ans[id_pos] not in seen:
                        count += 1
                        if count <= 100:
                            seen.append(ans[id_pos])
            if len(seen) == 0:
                print ('ISI', qid)
            for doc in seen:
                if doc not in candidates:
                    missing.append(doc)
                    print ('ISI', doc)
    if found == 0:
        print "No submission ISI, {0}".format(qid)
            
    # Uncharted
    id_pos = 1
    score_pos = 2
    found = 0
    for entry in uncharted_data:
        seen = []
        count = 0
        score = 100
        if 'answers' in entry.keys():
            if entry['question_id'] == qid:
                found = 1
                for ans in entry['answers']:
                    if type(ans) == list:
                        if ans[score_pos] > score:
                            print "TROUBLE!"
                        if ans[id_pos] not in seen:
                            count += 1
                            if count <= 100:
                                seen.append(ans[id_pos])
                if len(seen) == 0:
                    print ('Uncharted', qid)
                for doc in seen:
                    if doc not in candidates:
                        missing.append(doc)
                        print ('Uncharted', doc)
                        print qid
    if found == 0:
        print "No submission Uncharted, {0}".format(qid)

if len(missing) > 0:
    print "MISSING DOCS"

## JPL Cluster Identification

In [52]:
try:
    del(data)
except:
    pass
missing = []

index = 'JPL'
qtype = 'Cluster Identification'

uncharted_data = []
all_data = []
file_path = submission_path + ('Uncharted/DomainDiscovery/uncharted_JPL_DD.json')
f = open(file_path, 'r')
all_data = eval(f.read())
for entry in all_data:
    if entry['questionType'] == qtype:
        uncharted_data.append(entry)

# MAIN LOOP
for qid in chosen[index][qtype].keys():
    # Get list of ad candidtes
    candidates = []
    for clus_entry in uniq_clusters[index]:
        if clus_entry[0] == chosen[index][qtype][qid]['seed']:
            candidates.append(clus_entry[1])
    if len(candidates) == 0:
        print "NO CANDIDATES"
            
    # Uncharted
    id_pos = 0
    score_pos = 1
    found = 0
    for entry in uncharted_data:
        seen = []
        count = 0
        score = 100
        if 'answers' in entry.keys():
            if entry['question_id'] == qid:
                found = 1
                for ans in entry['answers']:
                    if ans[score_pos] > score:
                        print "TROUBLE!"
                    if ans[id_pos] not in seen:
                        count += 1
                        if count <= 100:
                            seen.append(ans[id_pos])
                if len(seen) == 0:
                    print ('Uncharted', qid)
                for doc in seen:
                    if doc not in candidates:
                        missing.append(doc)
                        print ('Uncharted', doc)
    if found == 0:
        print "No submission Uncharted, {0}".format(qid)
        
if len(missing) > 0:
    print "MISSING DOCS"

## JPL Cluster Facet

In [53]:
try:
    del(data)
except:
    pass
missing = []

index = 'JPL'
qtype = 'Cluster Facet'

# DATA
uncharted_data = []
all_data = []
file_path = submission_path + ('Uncharted/DomainDiscovery/uncharted_JPL_DD.json')
f = open(file_path, 'r')
all_data = eval(f.read())
for entry in all_data:
    if entry['questionType'] == qtype:
        uncharted_data.append(entry)

# MAIN LOOP
for qid in chosen[index][qtype].keys():
    # Get list of ad candidtes
    candidates = []
    for clus_entry in uniq_clusters[index]:
        if clus_entry[0] == chosen[index][qtype][qid]['seed']:
            candidates.append(clus_entry[1])
    if len(candidates) == 0:
        print "NO CANDIDATES"

    # Uncharted
    id_pos = 1
    score_pos = 2
    found = 0
    for entry in uncharted_data:
        seen = []
        count = 0
        score = 100
        if 'answers' in entry.keys():
            if entry['question_id'] == qid:
                found = 1
                for ans in entry['answers']:
                    if ans[score_pos] > score:
                        print "TROUBLE!"
                    if ans[id_pos] not in seen:
                        count += 1
                        if count <= 100:
                            seen.append(ans[id_pos])
                if len(seen) == 0:
                    print ('Uncharted', qid)
                for doc in seen:
                    if doc not in candidates:
                        missing.append(doc)
                        print ('Uncharted', doc)
                        print qid
    if found == 0:
        print "No submission Uncharted, {0}".format(qid)
        print "Confirmed, no Uncharted submission for JPL: Cluster Facet: 26"
if len(missing) > 0:
    print "MISSING DOCS"

No submission Uncharted, 26
Confirmed, no Uncharted submission for JPL: Cluster Facet: 26


## JPL Cluster Aggregate

In [54]:
try:
    del(data)
except:
    pass
missing = []

index = 'JPL'
qtype = 'Cluster Aggregate'

uncharted_data = []
all_data = []
file_path = submission_path + ('Uncharted/DomainDiscovery/uncharted_JPL_DD.json')
f = open(file_path, 'r')
all_data = eval(f.read())
for entry in all_data:
    if entry['questionType'] in ['AVG', 'MAX', 'MIN', 'MODE']:
        uncharted_data.append(entry)

# MAIN LOOP
for qid in chosen[index][qtype].keys():
    # Get list of ad candidtes
    candidates = []
    for clus_entry in uniq_clusters[index]:
        if clus_entry[0] == chosen[index][qtype][qid]['seed']:
            candidates.append(clus_entry[1])
    if len(candidates) == 0:
        print "NO CANDIDATES"
             
    # Uncharted
    id_pos = 1
    score_pos = 2
    found = 0
    for entry in uncharted_data:
        seen = []
        count = 0
        score = 100
        if 'answers' in entry.keys():
            if entry['question_id'] == qid:
                found = 1
                for ans in entry['answers']:
                    if type(ans) == list:
                        if ans[score_pos] > score:
                            print "TROUBLE!"
                        if ans[id_pos] not in seen:
                            count += 1
                            if count <= 100:
                                seen.append(ans[id_pos])
                if len(seen) == 0:
                    print ('Uncharted', qid)
                for doc in seen:
                    if doc not in candidates:
                        missing.append(doc)
                        print ('Uncharted', doc)
                        print qid
    if found == 0:
        print "No submission Uncharted, {0}".format(qid)

if len(missing) > 0:
    print "MISSING DOCS"

## HG Cluster Identification

In [55]:
try:
    del(data)
except:
    pass
missing = []

index = 'HG'
qtype = 'Cluster Identification'

uncharted_data = []
all_data = []
file_path = submission_path + ('Uncharted/DomainDiscovery/uncharted_HG_DD.json')
f = open(file_path, 'r')
all_data = eval(f.read())
for entry in all_data:
    if entry['questionType'] == qtype:
        uncharted_data.append(entry)

# MAIN LOOP
for qid in chosen[index][qtype].keys():
    # Get list of ad candidtes
    candidates = []
    for clus_entry in uniq_clusters[index]:
        if clus_entry[0] == chosen[index][qtype][qid]['seed']:
            candidates.append(clus_entry[1])
    if len(candidates) == 0:
        print "NO CANDIDATES"
            
    # Uncharted
    id_pos = 0
    score_pos = 1
    found = 0
    for entry in uncharted_data:
        seen = []
        count = 0
        score = 100
        if 'answers' in entry.keys():
            if entry['question_id'] == qid:
                found = 1
                for ans in entry['answers']:
                    if ans[score_pos] > score:
                        print "TROUBLE!"
                    if ans[id_pos] not in seen:
                        count += 1
                        if count <= 100:
                            seen.append(ans[id_pos])
                if len(seen) == 0:
                    print ('Uncharted', qid)
                for doc in seen:
                    if doc not in candidates:
                        missing.append(doc)
                        print ('Uncharted', doc)
    if found == 0:
        print "No submission Uncharted, {0}".format(qid)
        
if len(missing) > 0:
    print "MISSING DOCS"

## HG Cluster Facect

In [56]:
try:
    del(data)
except:
    pass
missing = []

index = 'HG'
qtype = 'Cluster Facet'

# DATA
uncharted_data = []
all_data = []
file_path = submission_path + ('Uncharted/DomainDiscovery/uncharted_HG_DD.json')
f = open(file_path, 'r')
all_data = eval(f.read())
for entry in all_data:
    if entry['questionType'] == qtype:
        uncharted_data.append(entry)

# MAIN LOOP
for qid in chosen[index][qtype].keys():
    # Get list of ad candidtes
    candidates = []
    for clus_entry in uniq_clusters[index]:
        if clus_entry[0] == chosen[index][qtype][qid]['seed']:
            candidates.append(clus_entry[1])
    if len(candidates) == 0:
        print "NO CANDIDATES"

    # Uncharted
    id_pos = 1
    score_pos = 2
    found = 0
    for entry in uncharted_data:
        seen = []
        count = 0
        score = 100
        if 'answers' in entry.keys():
            if entry['question_id'] == qid:
                found = 1
                for ans in entry['answers']:
                    if ans[score_pos] > score:
                        print "TROUBLE!"
                    if ans[id_pos] not in seen:
                        count += 1
                        if count <= 100:
                            seen.append(ans[id_pos])
                if len(seen) == 0:
                    print ('Uncharted', qid)
                for doc in seen:
                    if doc not in candidates:
                        missing.append(doc)
                        print ('Uncharted', doc)
                        print qid
    if found == 0:
        print "No submission Uncharted, {0}".format(qid)
        print "Confirmed, no Uncharted submission for JPL: Cluster Facet: 26"
if len(missing) > 0:
    print "MISSING DOCS"

## HG Cluster Aggregate

In [57]:
try:
    del(data)
except:
    pass
missing = []

index = 'HG'
qtype = 'Cluster Aggregate'

uncharted_data = []
all_data = []
file_path = submission_path + ('Uncharted/DomainDiscovery/uncharted_HG_DD.json')
f = open(file_path, 'r')
all_data = eval(f.read())
for entry in all_data:
    if entry['questionType'] in ['AVG', 'MAX', 'MIN', 'MODE']:
        uncharted_data.append(entry)

# MAIN LOOP
for qid in chosen[index][qtype].keys():
    # Get list of ad candidtes
    candidates = []
    for clus_entry in uniq_clusters[index]:
        if clus_entry[0] == chosen[index][qtype][qid]['seed']:
            candidates.append(clus_entry[1])
    if len(candidates) == 0:
        print "NO CANDIDATES"
             
    # Uncharted
    id_pos = 1
    score_pos = 2
    found = 0
    for entry in uncharted_data:
        seen = []
        count = 0
        score = 100
        if 'answers' in entry.keys():
            if entry['question_id'] == qid:
                found = 1
                for ans in entry['answers']:
                    if type(ans) == list:
                        if ans[score_pos] > score:
                            print "TROUBLE!"
                        if ans[id_pos] not in seen:
                            count += 1
                            if count <= 100:
                                seen.append(ans[id_pos])
                if len(seen) == 0:
                    print ('Uncharted', qid)
                for doc in seen:
                    if doc not in candidates:
                        missing.append(doc)
                        print ('Uncharted', doc)
                        print qid
    if found == 0:
        print "No submission Uncharted, {0}".format(qid)

if len(missing) > 0:
    print "MISSING DOCS"