In [1]:
import json
import csv

In [2]:
def csv_line(qid, entry):
    """Build the CSV row dictionary for one question.

    Args:
        qid: Question identifier; stored under the 'Question' column.
        entry: Mapping of counts for that question. Must contain
            'Ground Truth' and 'CDR'. If it contains 'HG', it must also
            contain 'NYU' and 'JPL'.

    Returns:
        A dict with 'Question', 'Ground Truth' and 'CDR', plus 'NYU', 'JPL'
        and 'HG' when `entry` has an 'HG' key.

    Raises:
        KeyError: if a required key is missing from `entry`.
    """
    row = {'Question': qid}
    for column in ('Ground Truth', 'CDR'):
        row[column] = entry[column]

    # The presence of 'HG' is the signal that the per-dataset counts exist.
    if 'HG' in entry:
        row.update((column, entry[column]) for column in ('NYU', 'JPL', 'HG'))

    return row

# Ground Truth

### The purpose of the first part of this notebook is to generate a dictionary of all relevant ground truth docs for each question. This dictionary will be used to query the CDR to determine which of these documents exist in the CDR.

See memex-analysis/cdr

In [3]:
# Answer-key file for each question type.
gt_files = {
    'Cluster Identification': '../data/posted_GT/gt_answer_key/id_quest2ads.json',
    'Cluster Facet': '../data/posted_GT/gt_answer_key/facet_quest2ads.json',
    'Cluster Aggregate': '../data/posted_GT/gt_answer_key/agg_quest2ads.json',
    'Pure Aggregate': '../data/posted_GT/gt_answer_key/pure_agg_gt_quest2ads.json',
}

# gt maps question type -> parsed contents of that type's answer-key file.
gt = {}

for qtype, data_file in gt_files.items():
    with open(data_file, 'r') as f:
        # NOTE(review): eval() executes the file contents as Python code, so
        # only run this on trusted files. json.load would be the safer parser
        # if these files are strict JSON -- confirm before switching.
        gt[qtype] = eval(f.read())

In [4]:
point_fact_file = '../data/posted_GT/ground_truth_pf_submissions_relevant_V4.json'

with open(point_fact_file, 'r') as f:
    # NOTE(review): eval() executes the file contents as Python code, so only
    # run this on trusted files. json.load would be the safer parser if this
    # file is strict JSON -- confirm before switching.
    data = eval(f.read())

# For each question keep only the second-level keys of the point-fact file.
# They are materialized as lists because on Python 3 dict.keys() returns a
# view object, which json.dump cannot serialize when `gt` is saved.
temp_gt = {}
for qid in data.keys():
    temp_gt[qid] = list(data[qid].keys())

gt['Point Fact'] = temp_gt

## Save File

In [5]:
# Write the combined `gt` dictionary to disk as indented JSON.
save_file = 'all_gt_relevant_docs.json'
with open(save_file, 'w') as outfile:
    json.dump(gt, outfile, indent=2)

# Note: this file was then processed in another script, and the output was returned as gt_cdr_docs.json

In [6]:
gt_docs_file = 'gt_cdr_docs.json'
gt_docs = {}

# Read the whole file first, then parse it once the handle is closed.
with open(gt_docs_file, 'r') as f:
    gt_docs_text = f.read()

# NOTE(review): eval() executes the file contents as Python code, so only run
# this on trusted files. json.load would be the safer parser if this file is
# strict JSON -- confirm before switching.
gt_docs = eval(gt_docs_text)

### The purpose of the second part of this notebook is to count how many pages were annotated as "relevant" for each question in each dataset.

# Ground Truth

In [7]:
# relevant_docs[qtype][qid] collects the per-question counts. It starts with
# the CDR count read from gt_docs and the number of ground-truth entries.
relevant_docs = {}

for qtype, questions in gt.items():
    per_question = {}
    for qid, gt_entries in questions.items():
        per_question[qid] = {
            'CDR': gt_docs[qtype][qid]['cdr_count'],
            'Ground Truth': len(gt_entries),
        }
    relevant_docs[qtype] = per_question

# Domain Discovery

In [8]:
# Post-hoc annotation file for each question type.
dd_relevant_files = {
    'Point Fact': '../data/post_hoc_annotations/post_hoc_point_fact_combined.json',
    'Cluster Facet': '../data/post_hoc_annotations/post_hoc_cluster_facet.json',
    'Cluster Aggregate': '../data/post_hoc_annotations/post_hoc_cluster_aggregate-cleaned.json',
    'Pure Aggregate': '../data/post_hoc_annotations/post_hoc_pure_agg.json',
}

for qtype, dd_file in dd_relevant_files.items():
    # Release the previous file's annotations before loading the next file,
    # so two files' worth of data are never held in memory at once.
    dd_relevant = None
    with open(dd_file, 'r') as f:
        # NOTE(review): eval() executes the file contents as Python code, so
        # only run this on trusted files. json.load would be the safer parser
        # if these files are strict JSON -- confirm before switching.
        dd_relevant = eval(f.read())

    for qid, annotations in dd_relevant.items():
        counts = relevant_docs[qtype][qid]

        # Default every dataset to zero so all three keys always exist, even
        # when the annotation file has no entry for a dataset.
        for dataset in ('HG', 'NYU', 'JPL'):
            counts[dataset] = 0

        # Dataset names are upper-cased to match the 'HG'/'NYU'/'JPL' keys
        # (presumably the files use a different case -- TODO confirm).
        for dataset, relevant_ads in annotations['relevant_ads'].items():
            counts[dataset.upper()] = len(relevant_ads)

## Cluster Identification

In [9]:
cluster_id_breakdown_file = '../evaluation/dd_evaluation/cluster_identification/Cluster_Identification_Crawl_Team_Breakdown.json'

with open(cluster_id_breakdown_file, 'r') as f:
    # NOTE(review): eval() executes the file contents as Python code, so only
    # run this on trusted files. json.load would be the safer parser if this
    # file is strict JSON -- confirm before switching.
    gt_length = eval(f.read())

# Copy the per-dataset values for each Cluster Identification question
# straight from the breakdown file.
for qid, breakdown in gt_length.items():
    for dataset in ('HG', 'NYU', 'JPL'):
        relevant_docs['Cluster Identification'][qid][dataset] = breakdown[dataset]

## Save File

In [10]:
# Write the complete `relevant_docs` dictionary to disk as indented JSON.
save_file = 'all_relevant_docs_totals.json'
with open(save_file, 'w') as outfile:
    json.dump(relevant_docs, outfile, indent=2)

## Generate CSVs

These CSVs can be manually combined into one file.

In [11]:
# Column order shared by every per-question-type CSV.
fieldnames = ['Question', 'Ground Truth', 'CDR', 'NYU', 'JPL', 'HG']

# One CSV per question type, one row per question.
for qtype, questions in relevant_docs.items():
    file_name = '{}_relevant_docs_by_question.csv'.format(qtype.replace(' ', '_'))

    # NOTE(review): on Python 3 the csv docs recommend opening the file with
    # newline='' to avoid extra blank lines on Windows; left unchanged here.
    with open(file_name, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for qid, entry in questions.items():
            writer.writerow(csv_line(qid, entry))