In [None]:
%load_ext autoreload
%autoreload 2

# Merge scored results

Merge previous scored-results with question-results from notebook 56 and write the result to merged-results.

The next steps are to upload merged-results into a Google spreadsheet, score the unscored results, download the spreadsheet, and replace scored-results with the downloaded spreadsheet.

In [None]:
from collections import defaultdict
import csv

In [None]:
# Text file with one indexed result id per line; rows whose ResultId is not listed are skipped
indexed_ids_path = "../data/exports/indexed-ids-2023-12-21.txt"
# CSV of previously scored results; read first and used as the base of the merge
scored_results_path = "../references/scored-results-45.csv"
# scored_results_path = "../data/exports/merged-results3.csv"
# CSV of question results to merge in; new rows are added with an empty score
question_results_path = "../data/exports/question-results-hyde-openai.csv"
# Question results with ResultRank greater than top_k are not merged
top_k = 10

# Destination CSV for the merged (scored + still-unscored) results
merged_results_path = "../data/exports/merged-results.csv"

## Read indexed ids

In [None]:
# Collect every indexed result id into a set for fast membership checks.
indexed_ids = set()
with open(indexed_ids_path, 'r') as ids_file:
    # one id per line; strip the trailing newline and any surrounding whitespace
    indexed_ids.update(raw_line.strip() for raw_line in ids_file)

## Read scored results

In [None]:
# Load the previously scored results into merged_results, a dictionary keyed by
# question whose values are dictionaries keyed by result id. Keying on result id
# means a (question, result id) pair can appear only once in the merged results;
# a later duplicate row overwrites an earlier one.
merged_results = defaultdict(dict)
n_unindexed_ids = 0  # rows skipped because their result id is not in indexed_ids
# newline='' is required by the csv module so that line breaks embedded inside
# quoted fields are passed to the parser unchanged
with open(scored_results_path, 'r', newline='') as file:
    reader = csv.DictReader(file)
    for row in reader:
        # `row.get(key) or ''` covers both a missing column and a short row:
        # DictReader fills fields absent from a short row with None, which a
        # plain `.get(key, '')` would let through as if it were real data
        question = row.get('Question') or ''
        result_id = row.get('ResultId') or ''
        score = row.get('Score') or ''  # '' means "not scored yet"
        if not question or not result_id:
            print('ERROR - missing data', row)
            continue
        if result_id not in indexed_ids:
            # the result is no longer in the index, so there is nothing to score
            n_unindexed_ids += 1
            continue
        merged_results[question][result_id] = {
            'ResultScore': row.get('ResultScore', ''),
            'ResultURL': row.get('ResultURL', ''),
            'ResultTitle': row.get('ResultTitle', ''),
            'ResultRank': row.get('ResultRank', ''),
            'ResultText': row.get('ResultText', ''),
            'score': score,
        }

In [None]:
# Summary of what was loaded from the scored-results file.
# A result counts as scored when its "score" field is anything but the empty string.
print('Questions', len(merged_results))
print('Unindexed ids', n_unindexed_ids)
print('Scored ids', sum(
    entry["score"] != ''
    for per_question in merged_results.values()
    for entry in per_question.values()
))
print('Unscored ids', sum(
    entry["score"] == ''
    for per_question in merged_results.values()
    for entry in per_question.values()
))

## Merge question-results into scored-results

The question-results file is the output from running notebook 56.

In [None]:
# Add question results that are not already present in merged_results.
# Newly added results get an empty score so they can be scored later.
# newline='' is required by the csv module so that line breaks embedded inside
# quoted fields are passed to the parser unchanged
with open(question_results_path, 'r', newline='') as file:
    reader = csv.DictReader(file)
    for row in reader:
        # `row.get(key) or ''` covers both a missing column and a short row
        # (DictReader fills fields absent from a short row with None)
        question = row.get('Question') or ''
        result_id = row.get('ResultId') or ''
        result_rank = row.get('ResultRank') or ''
        if not question or not result_id or not result_rank:
            print('ERROR - missing data', row)
            continue
        if result_id not in indexed_ids:
            print('WARNING - missing result id', question, result_id)
            continue
        # Look the question up with .get(): indexing the defaultdict here would
        # insert an empty entry for a question whose rows are all skipped below,
        # which would inflate the question count
        if result_id in merged_results.get(question, {}):
            # result already exists
            continue
        try:
            rank = int(result_rank)
        except ValueError:
            # report and skip a malformed rank instead of aborting mid-merge
            print('ERROR - invalid rank', row)
            continue
        if rank > top_k:
            # only score the top_k results
            continue
        merged_results[question][result_id] = {
            'ResultScore': row.get('ResultScore', ''),
            'ResultURL': row.get('ResultURL', ''),
            'ResultTitle': row.get('ResultTitle', ''),
            'ResultRank': result_rank,
            'ResultText': row.get('ResultText', ''),
            'score': '',
        }

In [None]:
# Summary after merging in the question results.
# A result counts as scored when its "score" field is anything but the empty string.
print('Questions', len(merged_results))
print('Scored ids', sum(
    entry["score"] != ''
    for per_question in merged_results.values()
    for entry in per_question.values()
))
print('Unscored ids', sum(
    entry["score"] == ''
    for per_question in merged_results.values()
    for entry in per_question.values()
))

## Write merged scores

In [None]:
# Write the merged results as one CSV row per (question, result id) pair.
# newline='' is required by the csv module: the writer emits its own '\r\n'
# row terminator, and without it platforms that translate '\n' on write
# (Windows) would produce '\r\r\n', i.e. a blank line after every row
with open(merged_results_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Question', 'ResultId', 'ResultScore', 'ResultURL', 'ResultTitle', 'ResultRank', 'ResultText', 'Score'])  # header row
    # column order below must match the header row above
    writer.writerows(
        [
            question,
            result_id,
            result["ResultScore"],
            result["ResultURL"],
            result["ResultTitle"],
            result["ResultRank"],
            result["ResultText"],
            result["score"],
        ]
        for question, results in merged_results.items()
        for result_id, result in results.items()
    )