In [1]:
import json
# Pooling level: passed to top_pool() as the number of unique doc ids to keep per question.
pool_depth = 100

In [2]:
# Relative directory prefixes; file names are appended to these with plain string concatenation,
# so both must keep their trailing slash.
submission_path = '../../team_submissions/'
questions_path = '../../../questions/'

In [3]:
def extract_seed(question):
    """Return the seed token found in a question's first SPARQL query.

    The fifth line of ``question['SPARQL'][0]`` is split on single spaces and
    its second field is returned as a ``str`` with leading/trailing single
    quotes stripped. An ``IndexError`` propagates if the query has fewer than
    five lines or that line has fewer than two space-separated fields.
    """
    fifth_line = question['SPARQL'][0].split('\n')[4]
    quoted_token = fifth_line.split(' ')[1]
    return str(quoted_token).strip("'")

In [4]:
def top_pool(pooling_level, id_pos, score_pos, ans_length, id_length, responses,
             max_score=100):
    """Collect the first ``pooling_level`` unique doc ids from a ranked answer list.

    Parameters
    ----------
    pooling_level : int
        Number of unique doc ids to collect before stopping.
    id_pos, score_pos : int
        Positions of the doc id and of the score inside each answer list.
    ans_length : int
        Expected length of a per-document answer. Elements that are not lists,
        or lists of any other length, are skipped (assumed to be aggregate
        answers).
    id_length : int
        Expected length of a doc id. The first id of another length stops the
        scan with a printed warning.
    responses : iterable
        Answers, expected in non-increasing score order.
    max_score : number, optional
        Upper bound used as the "previous score" for the first answer
        (default 100, the value this function has always used). A first score
        above it is reported as a rank-order issue.

    Returns
    -------
    list
        Unique doc ids in the order they were first encountered. Fewer than
        ``pooling_level`` ids are returned when the answers run out or when the
        scan stops early on a non-standard id or a score that increases.
    """
    previous_score = max_score
    seen = set()        # O(1) membership test; doc ids must be hashable
    ranked_ids = []     # unique ids, kept in rank order
    for answer in responses:
        # Non-list elements and lists of unexpected length are not
        # per-document answers; skip them.
        if not isinstance(answer, list) or len(answer) != ans_length:
            continue
        doc_id = answer[id_pos]
        # Confirm the doc id is where we expect it to be.
        if len(doc_id) != id_length:
            print("NONSTANDARD DOC ID DETECTED")
            print(answer)
            break
        current_score = answer[score_pos]
        # Explicitly check score ordering: scores must never increase.
        if current_score > previous_score:
            print((previous_score, current_score))
            print("RANK ORDER ISSUE")
            break
        previous_score = current_score
        if doc_id not in seen:
            seen.add(doc_id)
            ranked_ids.append(doc_id)
            if len(ranked_ids) == pooling_level:
                # Found the top N docs.
                break
    # ranked_ids is unique by construction, so no extra de-duplication is needed.
    return ranked_ids

# Determine Seed for Every Question

In [5]:
# question type -> {question id -> seed}; filled by the three loading cells that follow.
seeds = {}

### Cluster Identification

In [6]:
seeds['Cluster Identification'] = {}

file_path = questions_path + 'post_cluster_identification.json'
# The file is read as one JSON-encoded question per line. The `with` block
# closes the handle even if a line fails to parse.
with open(file_path, 'r') as f:
    for line in f:
        temp = json.loads(line)
        seed = extract_seed(temp)
        seeds['Cluster Identification'][temp['id']] = seed

### Cluster Facet

In [7]:
seeds['Cluster Facet'] = {}

file_path = questions_path + 'post_cluster_facet.json'
# The file is read as one JSON-encoded question per line. The `with` block
# closes the handle even if a line fails to parse.
with open(file_path, 'r') as f:
    for line in f:
        temp = json.loads(line)
        seed = extract_seed(temp)
        seeds['Cluster Facet'][temp['id']] = seed

### Cluster Aggregate

In [8]:
seeds['Cluster Aggregate'] = {}

file_path = questions_path + 'post_aggregate_V2.json'
# The file is read as one JSON-encoded question per line. The `with` block
# closes the handle even if a line fails to parse.
with open(file_path, 'r') as f:
    for line in f:
        temp = json.loads(line)
        seed = extract_seed(temp)
        seeds['Cluster Aggregate'][temp['id']] = seed

In [11]:
# Pooled results, nested as answers[index][team][question type][seed] -> list of doc ids.
answers = {}

# Georgetown

## NYU

In [12]:
# Container for every team's pooled results under the 'NYU' key.
answers['NYU'] = {}

### Georgetown Cluster Identification
NOTE: Response missing for Question #6

In [13]:
# Start Georgetown's results under 'NYU'; `clus_type` selects the question
# type that the next cell reads seeds for and writes results under.
answers['NYU']['Georgetown'] = {}
clus_type = 'Cluster Identification'
answers['NYU']['Georgetown'][clus_type] = {}

In [14]:
file_path = submission_path + 'Georgetown/DomainDiscovery/NYU_CI.json'
# NOTE(review): eval() executes whatever expression the submission file
# contains. If the file holds only literals, a literal-only parser would be
# safer -- confirm the file format before switching.
with open(file_path, 'r') as f:
    data = eval(f.read())

# Layout of one answer in this file: [doc id, score]
id_pos = 0
score_pos = 1
ans_length = 2
id_length = 64

for entry in data:
    qseed = seeds[clus_type][entry['id']]
    top_ids = top_pool(pool_depth, id_pos, score_pos,
                       ans_length, id_length, entry['answer'])
    # Reconfirming no dupes
    if len(top_ids) != len(set(top_ids)):
        print("PROBLEM WITH DUPLICATED DOC IDS")
        break
    answers['NYU']['Georgetown'][clus_type][qseed] = top_ids

### Georgetown Cluster Facet

In [15]:
# Switch to the Cluster Facet question type; `clus_type` is read by the next cell.
clus_type = 'Cluster Facet'
answers['NYU']['Georgetown'][clus_type] = {}

In [16]:
file_path = submission_path + 'Georgetown/DomainDiscovery/NYU_CF.json'
# NOTE(review): eval() executes whatever expression the submission file
# contains. If the file holds only literals, a literal-only parser would be
# safer -- confirm the file format before switching.
with open(file_path, 'r') as f:
    data = eval(f.read())

# Layout of one answer in this file: three elements, doc id second, score third
id_pos = 1
score_pos = 2
ans_length = 3
id_length = 64

for entry in data:
    qseed = seeds[clus_type][entry['id']]
    top_ids = top_pool(pool_depth, id_pos, score_pos,
                       ans_length, id_length, entry['answer'])
    # Reconfirming no dupes
    if len(top_ids) != len(set(top_ids)):
        print("PROBLEM WITH DUPLICATED DOC IDS")
        break
    answers['NYU']['Georgetown'][clus_type][qseed] = top_ids

### Georgetown Cluster Aggregate

In [17]:
# Switch to the Cluster Aggregate question type; `clus_type` is read by the next cell.
clus_type = 'Cluster Aggregate'
answers['NYU']['Georgetown'][clus_type] = {}

In [18]:
file_path = submission_path + 'Georgetown/DomainDiscovery/NYU_aggregate.json'
# NOTE(review): eval() executes whatever expression the submission file
# contains. If the file holds only literals, a literal-only parser would be
# safer -- confirm the file format before switching.
with open(file_path, 'r') as f:
    data = eval(f.read())

# Layout of one answer in this file: three elements, doc id second, score third
id_pos = 1
score_pos = 2
ans_length = 3
id_length = 64

for entry in data:
    # Cluster Aggregate Question 94 was Removed
    if entry['id'] == '94':
        # Skip it entirely so no values left over from an earlier entry
        # get stored again.
        continue
    qseed = seeds[clus_type][entry['id']]
    top_ids = top_pool(pool_depth, id_pos, score_pos,
                       ans_length, id_length, entry['answer'])
    # Reconfirming no dupes
    if len(top_ids) != len(set(top_ids)):
        print("PROBLEM WITH DUPLICATED DOC IDS")
        break
    answers['NYU']['Georgetown'][clus_type][qseed] = top_ids

# ISI

## NYU

### ISI Cluster Identification

In [19]:
# Start ISI's results under 'NYU'; `clus_type` selects the question type
# that the next cell reads seeds for and writes results under.
answers['NYU']['ISI'] = {}
clus_type = 'Cluster Identification'
answers['NYU']['ISI'][clus_type] = {}

In [20]:
file_path = submission_path + ('ISI/DomainDiscovery/'
                               'isi-nyu-answers-dig-extractions/'
                               'properly_formatted_submissions/'
                               'formatted_post_cluster_identification'
                               '-parsed_fixed_all_answers.json')
# NOTE(review): eval() executes whatever expression the submission file
# contains. If the file holds only literals, a literal-only parser would be
# safer -- confirm the file format before switching.
with open(file_path, 'r') as f:
    data = eval(f.read())

# Layout of one answer in this file: three elements, doc id second, score third
id_pos = 1
score_pos = 2
ans_length = 3
id_length = 64

for entry in data:
    # Only the part of the question id before the first '-' is the seeds key.
    qid = entry['question_id'].split('-')[0]
    qseed = seeds[clus_type][qid]
    top_ids = top_pool(pool_depth, id_pos, score_pos,
                       ans_length, id_length, entry['answer'])
    # Reconfirming no dupes
    if len(top_ids) != len(set(top_ids)):
        print("PROBLEM WITH DUPLICATED DOC IDS")
        break
    answers['NYU']['ISI'][clus_type][qseed] = top_ids

### ISI Cluster Facet

In [21]:
# Switch to the Cluster Facet question type; `clus_type` is read by the next cell.
clus_type = 'Cluster Facet'
answers['NYU']['ISI'][clus_type] = {}

In [22]:
file_path = submission_path + ('ISI/DomainDiscovery/'
                               'isi-nyu-answers-dig-extractions/'
                               'properly_formatted_submissions/'
                               'formatted_post_cluster_facet'
                               '_parsed_fixed_all_answers.json')
# NOTE(review): eval() executes whatever expression the submission file
# contains. If the file holds only literals, a literal-only parser would be
# safer -- confirm the file format before switching.
with open(file_path, 'r') as f:
    data = eval(f.read())

# Layout of one answer in this file: three elements, doc id second, score third
id_pos = 1
score_pos = 2
ans_length = 3
id_length = 64

for entry in data:
    # Only the part of the question id before the first '-' is the seeds key.
    qid = entry['question_id'].split('-')[0]
    qseed = seeds[clus_type][qid]
    top_ids = top_pool(pool_depth, id_pos, score_pos,
                       ans_length, id_length, entry['answer'])
    # Reconfirming no dupes
    if len(top_ids) != len(set(top_ids)):
        print("PROBLEM WITH DUPLICATED DOC IDS")
        break
    answers['NYU']['ISI'][clus_type][qseed] = top_ids

### ISI Cluster Aggregate

In [23]:
# Switch to the Cluster Aggregate question type; `clus_type` is read by the next cell.
clus_type = 'Cluster Aggregate'
answers['NYU']['ISI'][clus_type] = {}

In [24]:
file_path = submission_path + ('ISI/DomainDiscovery/'
                               'isi-nyu-answers-dig-extractions/'
                               'properly_formatted_submissions/'
                               'formatted_post_aggregate'
                               '_parsed_fixed_all_answers.json')
# NOTE(review): eval() executes whatever expression the submission file
# contains. If the file holds only literals, a literal-only parser would be
# safer -- confirm the file format before switching.
with open(file_path, 'r') as f:
    data = eval(f.read())

# Layout of one answer in this file: three elements, doc id second, score third
id_pos = 1
score_pos = 2
ans_length = 3
id_length = 64

for entry in data:
    # Only the part of the question id before the first '-' is the seeds key.
    qid = entry['question_id'].split('-')[0]
    qseed = seeds[clus_type][qid]
    top_ids = top_pool(pool_depth, id_pos, score_pos,
                       ans_length, id_length, entry['answer'])
    # Reconfirming no dupes
    if len(top_ids) != len(set(top_ids)):
        print("PROBLEM WITH DUPLICATED DOC IDS")
        break
    answers['NYU']['ISI'][clus_type][qseed] = top_ids

# Uncharted

## NYU

### Uncharted NYU-Cluster Identification

In [25]:
# Start Uncharted's results under 'NYU'; `clus_type` selects the question
# type that the next cell filters on and writes results under.
answers['NYU']['Uncharted'] = {}
clus_type = 'Cluster Identification'
answers['NYU']['Uncharted'][clus_type] = {}

In [26]:
file_path = submission_path + ('Uncharted/DomainDiscovery/uncharted_NYU_DD.json')
# NOTE(review): eval() executes whatever expression the submission file
# contains. If the file holds only literals, a literal-only parser would be
# safer -- confirm the file format before switching.
with open(file_path, 'r') as f:
    all_data = eval(f.read())
# The file mixes question types; keep only the entries of the current one.
data = [entry for entry in all_data if entry['questionType'] == clus_type]

# Layout of one answer for this question type: [doc id, score]
id_pos = 0
score_pos = 1
ans_length = 2
id_length = 64

for entry in data:
    # Not all entries have 'answers'. Skip those entirely so that no
    # qseed/top_ids left over from an earlier entry or cell gets stored.
    if 'answers' not in entry:
        continue
    qseed = seeds[clus_type][entry['question_id']]
    top_ids = top_pool(pool_depth, id_pos, score_pos,
                       ans_length, id_length, entry['answers'])
    # Reconfirming no dupes
    if len(top_ids) != len(set(top_ids)):
        print("PROBLEM WITH DUPLICATED DOC IDS")
        break
    answers['NYU']['Uncharted'][clus_type][qseed] = top_ids

### Uncharted NYU-Cluster Facet

In [27]:
# Switch to the Cluster Facet question type; `clus_type` is read by the next cell.
clus_type = 'Cluster Facet'
answers['NYU']['Uncharted'][clus_type] = {}

In [28]:
# Reuses `all_data` loaded by the Uncharted NYU Cluster Identification cell.
data = [entry for entry in all_data if entry['questionType'] == clus_type]

# Layout of one answer for this question type: three elements, doc id second, score third
id_pos = 1
score_pos = 2
ans_length = 3
id_length = 64

for entry in data:
    # Not all entries have 'answers'. Skip those entirely so that no
    # qseed/top_ids left over from an earlier entry or cell gets stored.
    if 'answers' not in entry:
        continue
    qseed = seeds[clus_type][entry['question_id']]
    top_ids = top_pool(pool_depth, id_pos, score_pos,
                       ans_length, id_length, entry['answers'])
    # Reconfirming no dupes
    if len(top_ids) != len(set(top_ids)):
        print("PROBLEM WITH DUPLICATED DOC IDS")
        break
    answers['NYU']['Uncharted'][clus_type][qseed] = top_ids

### Uncharted NYU-Cluster Aggregate

In [29]:
# Switch to the Cluster Aggregate question type; `clus_type` is read by the next cell.
clus_type = 'Cluster Aggregate'
answers['NYU']['Uncharted'][clus_type] = {}

In [30]:
# Reuses `all_data` loaded by the Uncharted NYU Cluster Identification cell.
# Aggregate questions are tagged with the aggregate function, not with `clus_type`.
data = [entry for entry in all_data
        if entry['questionType'] in ['AVG', 'MIN', 'MAX', 'MODE']]

# Layout of one answer for this question type: three elements, doc id second, score third
id_pos = 1
score_pos = 2
ans_length = 3
id_length = 64

for entry in data:
    # Not all entries have 'answers'
    if 'answers' not in entry:
        continue
    # Cluster Aggregate Question 94 was Removed. Skip it entirely so no
    # values left over from an earlier entry get stored again.
    if entry['question_id'] == '94':
        continue
    qseed = seeds[clus_type][entry['question_id']]
    top_ids = top_pool(pool_depth, id_pos, score_pos,
                       ans_length, id_length, entry['answers'])
    # Reconfirming no dupes
    if len(top_ids) != len(set(top_ids)):
        print("PROBLEM WITH DUPLICATED DOC IDS")
        break
    answers['NYU']['Uncharted'][clus_type][qseed] = top_ids

## Hyperion Gray

### Uncharted HG-Cluster Identification

In [31]:
# Start the 'HG' results with Uncharted's submission; `clus_type` selects the
# question type that the next cell filters on and writes results under.
answers ['HG'] = {}
answers['HG']['Uncharted'] = {}
clus_type = 'Cluster Identification'
answers['HG']['Uncharted'][clus_type] = {}

In [32]:
file_path = submission_path + ('Uncharted/DomainDiscovery/uncharted_HG_DD.json')
# NOTE(review): eval() executes whatever expression the submission file
# contains. If the file holds only literals, a literal-only parser would be
# safer -- confirm the file format before switching.
with open(file_path, 'r') as f:
    all_data = eval(f.read())
# The file mixes question types; keep only the entries of the current one.
data = [entry for entry in all_data if entry['questionType'] == clus_type]

# Layout of one answer for this question type: [doc id, score]
id_pos = 0
score_pos = 1
ans_length = 2
id_length = 64

for entry in data:
    # Not all entries have 'answers'. Skip those entirely so that no
    # qseed/top_ids left over from an earlier entry or cell gets stored.
    if 'answers' not in entry:
        continue
    qseed = seeds[clus_type][entry['question_id']]
    top_ids = top_pool(pool_depth, id_pos, score_pos,
                       ans_length, id_length, entry['answers'])
    # Reconfirming no dupes
    if len(top_ids) != len(set(top_ids)):
        print("PROBLEM WITH DUPLICATED DOC IDS")
        break
    answers['HG']['Uncharted'][clus_type][qseed] = top_ids

### Uncharted HG-Cluster Facet

In [33]:
# Switch to the Cluster Facet question type; `clus_type` is read by the next cell.
clus_type = 'Cluster Facet'
answers['HG']['Uncharted'][clus_type] = {}

In [34]:
# Reuses `all_data` loaded by the Uncharted HG Cluster Identification cell.
data = [entry for entry in all_data if entry['questionType'] == clus_type]

# Layout of one answer for this question type: three elements, doc id second, score third
id_pos = 1
score_pos = 2
ans_length = 3
id_length = 64

for entry in data:
    # Not all entries have 'answers'. Skip those entirely so that no
    # qseed/top_ids left over from an earlier entry or cell gets stored.
    if 'answers' not in entry:
        continue
    qseed = seeds[clus_type][entry['question_id']]
    top_ids = top_pool(pool_depth, id_pos, score_pos,
                       ans_length, id_length, entry['answers'])
    # Reconfirming no dupes
    if len(top_ids) != len(set(top_ids)):
        print("PROBLEM WITH DUPLICATED DOC IDS")
        break
    answers['HG']['Uncharted'][clus_type][qseed] = top_ids

### Uncharted HG-Cluster Aggregate

In [35]:
# Switch to the Cluster Aggregate question type; `clus_type` is read by the next cell.
clus_type = 'Cluster Aggregate'
answers['HG']['Uncharted'][clus_type] = {}

In [36]:
# Reuses `all_data` loaded by the Uncharted HG Cluster Identification cell.
# Aggregate questions are tagged with the aggregate function, not with `clus_type`.
data = [entry for entry in all_data
        if entry['questionType'] in ['AVG', 'MIN', 'MAX', 'MODE']]

# Layout of one answer for this question type: three elements, doc id second, score third
id_pos = 1
score_pos = 2
ans_length = 3
id_length = 64

for entry in data:
    # Not all entries have 'answers'
    if 'answers' not in entry:
        continue
    # Cluster Aggregate Question 94 was Removed. Skip it entirely so no
    # values left over from an earlier entry get stored again.
    if entry['question_id'] == '94':
        continue
    qseed = seeds[clus_type][entry['question_id']]
    top_ids = top_pool(pool_depth, id_pos, score_pos,
                       ans_length, id_length, entry['answers'])
    # Reconfirming no dupes
    if len(top_ids) != len(set(top_ids)):
        print("PROBLEM WITH DUPLICATED DOC IDS")
        break
    answers['HG']['Uncharted'][clus_type][qseed] = top_ids

## JPL

### Uncharted JPL-Cluster Identification

In [37]:
# Start the 'JPL' results with Uncharted's submission; `clus_type` selects the
# question type that the next cell filters on and writes results under.
answers ['JPL'] = {}
answers['JPL']['Uncharted'] = {}
clus_type = 'Cluster Identification'
answers['JPL']['Uncharted'][clus_type] = {}

In [38]:
file_path = submission_path + ('Uncharted/DomainDiscovery/uncharted_JPL_DD.json')
# NOTE(review): eval() executes whatever expression the submission file
# contains. If the file holds only literals, a literal-only parser would be
# safer -- confirm the file format before switching.
with open(file_path, 'r') as f:
    all_data = eval(f.read())
# The file mixes question types; keep only the entries of the current one.
data = [entry for entry in all_data if entry['questionType'] == clus_type]

# Layout of one answer for this question type: [doc id, score]
id_pos = 0
score_pos = 1
ans_length = 2
id_length = 64

for entry in data:
    # Not all entries have 'answers'. Skip those entirely so that no
    # qseed/top_ids left over from an earlier entry or cell gets stored.
    if 'answers' not in entry:
        continue
    qseed = seeds[clus_type][entry['question_id']]
    top_ids = top_pool(pool_depth, id_pos, score_pos,
                       ans_length, id_length, entry['answers'])
    # Reconfirming no dupes
    if len(top_ids) != len(set(top_ids)):
        print("PROBLEM WITH DUPLICATED DOC IDS")
        break
    answers['JPL']['Uncharted'][clus_type][qseed] = top_ids

### Uncharted JPL-Cluster Facet

In [39]:
# Switch to the Cluster Facet question type; `clus_type` is read by the next cell.
clus_type = 'Cluster Facet'
answers['JPL']['Uncharted'][clus_type] = {}

In [40]:
# Reuses `all_data` loaded by the Uncharted JPL Cluster Identification cell.
data = [entry for entry in all_data if entry['questionType'] == clus_type]

# Layout of one answer for this question type: three elements, doc id second, score third
id_pos = 1
score_pos = 2
ans_length = 3
id_length = 64

for entry in data:
    # Not all entries have 'answers'. Skip those entirely so that no
    # qseed/top_ids left over from an earlier entry or cell gets stored.
    if 'answers' not in entry:
        continue
    qseed = seeds[clus_type][entry['question_id']]
    top_ids = top_pool(pool_depth, id_pos, score_pos,
                       ans_length, id_length, entry['answers'])
    # Reconfirming no dupes
    if len(top_ids) != len(set(top_ids)):
        print("PROBLEM WITH DUPLICATED DOC IDS")
        break
    answers['JPL']['Uncharted'][clus_type][qseed] = top_ids

### Uncharted JPL-Cluster Aggregate

In [41]:
# Switch to the Cluster Aggregate question type; `clus_type` is read by the next cell.
clus_type = 'Cluster Aggregate'
answers['JPL']['Uncharted'][clus_type] = {}

In [42]:
# Reuses `all_data` loaded by the Uncharted JPL Cluster Identification cell.
# Aggregate questions are tagged with the aggregate function, not with `clus_type`.
data = [entry for entry in all_data
        if entry['questionType'] in ['AVG', 'MIN', 'MAX', 'MODE']]

# Layout of one answer for this question type: three elements, doc id second, score third
id_pos = 1
score_pos = 2
ans_length = 3
id_length = 64

for entry in data:
    # Not all entries have 'answers'
    if 'answers' not in entry:
        continue
    # Cluster Aggregate Question 94 was Removed. Skip it entirely so no
    # values left over from an earlier entry get stored again.
    if entry['question_id'] == '94':
        continue
    qseed = seeds[clus_type][entry['question_id']]
    top_ids = top_pool(pool_depth, id_pos, score_pos,
                       ans_length, id_length, entry['answers'])
    # Reconfirming no dupes
    if len(top_ids) != len(set(top_ids)):
        print("PROBLEM WITH DUPLICATED DOC IDS")
        break
    answers['JPL']['Uncharted'][clus_type][qseed] = top_ids

## Prepare Data for Handoff to Uncharted

In [43]:
# Flatten answers[index][team][qtype][seed] -> doc ids into one list of
# [seed, doc_id] pairs per index.
temp_clusters = {}
for index, by_team in answers.items():
    pairs = []
    for by_qtype in by_team.values():
        for by_seed in by_qtype.values():
            for seed, doc_ids in by_seed.items():
                pairs.extend([seed, doc_id] for doc_id in doc_ids)
    temp_clusters[index] = pairs

# Collapse pairs repeated across teams / question types; for each index print
# the pair count before and after de-duplication.
uniq_clusters = {}
for index, pairs in temp_clusters.items():
    print(len(pairs))
    uniq_clusters[index] = list(set(tuple(pair) for pair in pairs))
    print(len(uniq_clusters[index]))
    print(" ")

11188
9798
 
11648
10636
 
14268
12226
 


In [44]:
# Serialise the de-duplicated (seed, doc id) pairs for handoff, written to
# the current working directory.
output_file = 'cluster_annotation_data.json'
with open(output_file, 'w') as f:
    f.write(json.dumps(uniq_clusters, indent=2))

# Sanity Check on Chosen Doc IDs 

## NYU Cluster Identification

In [216]:
# Drop the `data` list left over from the pooling cells, if there is one.
try:
    del data
except NameError:
    pass
missing = []

qtype = 'Cluster Identification'

# DATA
# NOTE(review): eval() executes whatever expression the submission files
# contain. If they hold only literals, a literal-only parser would be
# safer -- confirm the file format before switching.
file_path = submission_path + 'Georgetown/DomainDiscovery/NYU_CI.json'
with open(file_path, 'r') as f:
    georgetown_data = eval(f.read())

file_path = submission_path + ('ISI/DomainDiscovery/'
                               'isi-nyu-answers-dig-extractions/'
                               'properly_formatted_submissions/'
                               'formatted_post_cluster_identification'
                               '-parsed_fixed_all_answers.json')
with open(file_path, 'r') as f:
    isi_data = eval(f.read())

file_path = submission_path + ('Uncharted/DomainDiscovery/uncharted_NYU_DD.json')
with open(file_path, 'r') as f:
    all_data = eval(f.read())
uncharted_data = [entry for entry in all_data if entry['questionType'] == qtype]

# MAIN LOOP
# For every question, recompute each team's first `pool_depth` unique doc ids
# straight from the raw submission and confirm each one is among the pooled
# candidates for that question's seed.
for qid in seeds[qtype].keys():
    # Get list of ad candidates
    candidates = [clus_entry[1] for clus_entry in uniq_clusters['NYU']
                  if clus_entry[0] == seeds[qtype][qid]]

    # Georgetown
    id_pos = 0
    score_pos = 1
    for entry in georgetown_data:
        if entry['id'] != qid:
            continue
        seen = []
        count = 0
        score = 100
        for ans in entry['answer']:
            if ans[score_pos] > score:
                print("TROUBLE!")
            # Carry the score forward so consecutive answers are compared.
            score = ans[score_pos]
            if ans[id_pos] not in seen:
                count += 1
                if count <= pool_depth:
                    seen.append(ans[id_pos])
        if len(seen) == 0:
            print(('Georgetown', qid))
        for doc in seen:
            if doc not in candidates:
                missing.append(doc)
                print(('Georgetown', doc))

    # ISI
    id_pos = 1
    score_pos = 2
    for entry in isi_data:
        # The ISI pooling cells key on the part of 'question_id' before the
        # first '-'; match the same way here.
        if entry['question_id'].split('-')[0] != qid:
            continue
        seen = []
        count = 0
        score = 100
        for ans in entry['answer']:
            if ans[score_pos] > score:
                print("TROUBLE!")
            score = ans[score_pos]
            if ans[id_pos] not in seen:
                count += 1
                if count <= pool_depth:
                    seen.append(ans[id_pos])
        if len(seen) == 0:
            print(('ISI', qid))
        for doc in seen:
            if doc not in candidates:
                missing.append(doc)
                print(('ISI', doc))

    # Uncharted
    id_pos = 0
    score_pos = 1
    for entry in uncharted_data:
        # Not all entries have 'answers'
        if 'answers' not in entry or entry['question_id'] != qid:
            continue
        seen = []
        count = 0
        score = 100
        for ans in entry['answers']:
            if ans[score_pos] > score:
                print("TROUBLE!")
            score = ans[score_pos]
            if ans[id_pos] not in seen:
                count += 1
                if count <= pool_depth:
                    seen.append(ans[id_pos])
        if len(seen) == 0:
            print(('Uncharted', qid))
        for doc in seen:
            if doc not in candidates:
                missing.append(doc)
                print(('Uncharted', doc))
if missing:
    print("MISSING DOCS")

('Georgetown', u'25')
('Georgetown', u'20')
('Georgetown', u'22')
('Georgetown', u'47')
('Georgetown', u'28')
('Georgetown', u'40')
('Georgetown', u'41')
('Georgetown', u'8')
('Georgetown', u'39')
('Georgetown', u'11')
('Georgetown', u'13')
('Georgetown', u'16')
('Georgetown', u'31')
('Georgetown', u'30')
('Georgetown', u'33')
('Georgetown', u'32')


In [215]:
# Checkout questions that returned NO uniq doc ids
# Inspect Georgetown's entry for question 32, one of the questions for
# which the check found no unique doc ids.
for entry in georgetown_data:
    if entry['id'] != '32':
        continue
    temp = entry
    # A non-empty answer list here means the pooling should have found ids.
    if len(temp['answer']) > 0:
        print("SHOULD HAVE CAUGHT THIS")
        print(temp['answer'])
    print(temp)

{'answer': [], 'type': 'Cluster Identification', 'id': '32'}


## NYU Cluster Facet

In [220]:
# Drop the `data` list left over from the pooling cells, if there is one.
try:
    del data
except NameError:
    pass
missing = []

qtype = 'Cluster Facet'

# DATA
# NOTE(review): eval() executes whatever expression the submission files
# contain. If they hold only literals, a literal-only parser would be
# safer -- confirm the file format before switching.
file_path = submission_path + 'Georgetown/DomainDiscovery/NYU_CF.json'
with open(file_path, 'r') as f:
    georgetown_data = eval(f.read())

file_path = submission_path + ('ISI/DomainDiscovery/'
                               'isi-nyu-answers-dig-extractions/'
                               'properly_formatted_submissions/'
                               'formatted_post_cluster_facet'
                               '_parsed_fixed_all_answers.json')
with open(file_path, 'r') as f:
    isi_data = eval(f.read())

file_path = submission_path + ('Uncharted/DomainDiscovery/uncharted_NYU_DD.json')
with open(file_path, 'r') as f:
    all_data = eval(f.read())
uncharted_data = [entry for entry in all_data if entry['questionType'] == qtype]

# MAIN LOOP
# For every question, recompute each team's first `pool_depth` unique doc ids
# straight from the raw submission and confirm each one is among the pooled
# candidates for that question's seed.
for qid in seeds[qtype].keys():
    # Get list of ad candidates
    candidates = [clus_entry[1] for clus_entry in uniq_clusters['NYU']
                  if clus_entry[0] == seeds[qtype][qid]]

    # Georgetown
    id_pos = 1
    score_pos = 2
    for entry in georgetown_data:
        if entry['id'] != qid:
            continue
        seen = []
        count = 0
        score = 100
        for ans in entry['answer']:
            if ans[score_pos] > score:
                print("TROUBLE!")
            # Carry the score forward so consecutive answers are compared.
            score = ans[score_pos]
            if ans[id_pos] not in seen:
                count += 1
                if count <= pool_depth:
                    seen.append(ans[id_pos])
        if len(seen) == 0:
            print(('Georgetown', qid))
        for doc in seen:
            if doc not in candidates:
                missing.append(doc)
                print(('Georgetown', doc))

    # ISI
    id_pos = 1
    score_pos = 2
    for entry in isi_data:
        # The ISI pooling cells key on the part of 'question_id' before the
        # first '-'; match the same way here.
        if entry['question_id'].split('-')[0] != qid:
            continue
        seen = []
        count = 0
        score = 100
        for ans in entry['answer']:
            if ans[score_pos] > score:
                print("TROUBLE!")
            score = ans[score_pos]
            if ans[id_pos] not in seen:
                count += 1
                if count <= pool_depth:
                    seen.append(ans[id_pos])
        if len(seen) == 0:
            print(('ISI', qid))
        for doc in seen:
            if doc not in candidates:
                missing.append(doc)
                print(('ISI', doc))

    # Uncharted
    id_pos = 1
    score_pos = 2
    for entry in uncharted_data:
        # Not all entries have 'answers'
        if 'answers' not in entry or entry['question_id'] != qid:
            continue
        seen = []
        count = 0
        score = 100
        for ans in entry['answers']:
            if ans[score_pos] > score:
                print("TROUBLE!")
            score = ans[score_pos]
            if ans[id_pos] not in seen:
                count += 1
                if count <= pool_depth:
                    seen.append(ans[id_pos])
        if len(seen) == 0:
            print(('Uncharted', qid))
        for doc in seen:
            if doc not in candidates:
                missing.append(doc)
                print(('Uncharted', doc))
                print(qid)
if missing:
    print("MISSING DOCS")

('Georgetown', u'42')
('Georgetown', u'48')
('Georgetown', u'43')
('Georgetown', u'49')
('Georgetown', u'24')
('Georgetown', u'27')
('Georgetown', u'20')
('Georgetown', u'21')
('Georgetown', u'46')
('Georgetown', u'44')
('Georgetown', u'45')
('Georgetown', u'28')
('Georgetown', u'29')
('Georgetown', u'40')
('Georgetown', u'41')
('Georgetown', u'1')
('Georgetown', u'3')
('Georgetown', u'2')
('Georgetown', u'5')
('Georgetown', u'4')
('Georgetown', u'7')
('Georgetown', u'6')
('Georgetown', u'39')
('Georgetown', u'38')
('Georgetown', u'11')
('Georgetown', u'10')
('Georgetown', u'13')
('Uncharted', '90412E1895E93C4F413560E857ADBDF0FA48B6592FD67704777A2144E9C5AD8C')
13
('Uncharted', 'A1A86AD0637E27049BF7D84C601232DF10DDBC5B2891F3E0FD2AC2267D126C91')
13
('Uncharted', '3060A8FC06ADBEC27E758B4326EBE7566A5B3F47FEAEF4F60974F52371BCC8AC')
13
('Uncharted', 'D6BE25B9CCD26699D2774A633AC8AA093385F3068DAB123454D5BBBAB16A6C56')
13
('Uncharted', 'D5DFAE89604331EAAF758DFB7D6B152A2D18237DB34461754E11CE475F

In [219]:
# Display the doc ids collected in `missing` (ids not found among the pooled candidates).
missing

['90412E1895E93C4F413560E857ADBDF0FA48B6592FD67704777A2144E9C5AD8C',
 'A1A86AD0637E27049BF7D84C601232DF10DDBC5B2891F3E0FD2AC2267D126C91',
 '3060A8FC06ADBEC27E758B4326EBE7566A5B3F47FEAEF4F60974F52371BCC8AC',
 'D6BE25B9CCD26699D2774A633AC8AA093385F3068DAB123454D5BBBAB16A6C56',
 'D5DFAE89604331EAAF758DFB7D6B152A2D18237DB34461754E11CE475F87BDEC',
 'B43AF2809831CA31C4E0A15EFEA20F14BCF27BCF2F287EDE870E0F3E0B495D82',
 'C83D06ABE357ABBF37991B3AC67380926267C0EE447A830C925D8B86148D6D0C',
 '8718C5E2641DBE85C730CFC4C2E2053BAE3DB287864E61F61271C720892BF00E',
 '94457D5A17EFA333799A1154DA7AD23D579AF6922B97EB798FDFCFEE9C7B8D73',
 '37FFDB6696233373445A17445F549F7F4F228E81047C8207A579B2AA6B334337',
 '66D718D5C1F15C9D8C7954E2264D40AD780AC961600E6F03627B79E228CEE8B1',
 '8286AE73D3A2284873892AF38D2EBA9624DA69470FA496625A8464172E032933',
 '05CF3D9927187C7043C4AAE484C1EFF00130A14194A98F207993306577175803',
 'F39D24E0621BC27BF0B53C280F75706AD2EF72E814CC42324379E9B5B5DB76E2',
 '72EC4B2F03535F8FEF9B66772B2A01D5

In [224]:
# Checkout questions that returned NO uniq doc ids
# Inspect Uncharted's entry for question 13, whose doc ids were reported
# as missing from the pooled candidates.
for entry in uncharted_data:
    if entry['question_id'] != '13':
        continue
    temp = entry
    # A non-empty answer list here means the pooling should have found ids.
    if len(temp['answers']) > 0:
        print("SHOULD HAVE CAUGHT THIS")
        print(temp['answers'])
    print(temp)

SHOULD HAVE CAUGHT THIS
[['latin', '90412E1895E93C4F413560E857ADBDF0FA48B6592FD67704777A2144E9C5AD8C', 0.5], ['white', '90412E1895E93C4F413560E857ADBDF0FA48B6592FD67704777A2144E9C5AD8C', 0.5], ['black', '90412E1895E93C4F413560E857ADBDF0FA48B6592FD67704777A2144E9C5AD8C', 0.5], ['asian', '90412E1895E93C4F413560E857ADBDF0FA48B6592FD67704777A2144E9C5AD8C', 0.5], ['caucasian', '90412E1895E93C4F413560E857ADBDF0FA48B6592FD67704777A2144E9C5AD8C', 0.5], ['latin', 'A1A86AD0637E27049BF7D84C601232DF10DDBC5B2891F3E0FD2AC2267D126C91', 0.49564034], ['white', 'A1A86AD0637E27049BF7D84C601232DF10DDBC5B2891F3E0FD2AC2267D126C91', 0.49564034], ['black', 'A1A86AD0637E27049BF7D84C601232DF10DDBC5B2891F3E0FD2AC2267D126C91', 0.49564034], ['asian', 'A1A86AD0637E27049BF7D84C601232DF10DDBC5B2891F3E0FD2AC2267D126C91', 0.49564034], ['caucasian', 'A1A86AD0637E27049BF7D84C601232DF10DDBC5B2891F3E0FD2AC2267D126C91', 0.49564034], ['latin', '3060A8FC06ADBEC27E758B4326EBE7566A5B3F47FEAEF4F60974F52371BCC8AC', 0.49564034], [