## Compute NDCG Scores

In [None]:
import csv

import matplotlib.pyplot as plt

from models.rank_eval import get_ndcg

In [None]:
# Define file paths
# Ground-truth relevance scores per question; loaded with read_true_csv
true_results_file = '../references/scored-results-45.csv'
# Ranked search results to evaluate; loaded with read_predicted_csv
predicted_results_file = '../data/exports/question-results-hyde-openai.csv'

## Write functions to read the two different CSV files: scored-results.csv and question-results.csv

In [None]:
# Function to read the first CSV file with "Question," "ResultId," and "Score" columns
def read_true_csv(file_path):
    """Read ground-truth relevance scores from a CSV file, grouped by question.

    Args:
        file_path: Path to a CSV file with "Question", "ResultId" and "Score"
            columns.

    Returns:
        A dict mapping each question to a list of
        ``{'id': <ResultId>, 'score': <Score as float>}`` dicts in file order.
        Rows lacking a question or a result id are skipped; a missing or
        blank score counts as 0.0.

    Raises:
        ValueError: If a kept row has a score that is neither blank nor numeric.
    """
    data_dict = {}
    # newline='' lets the csv module handle newlines embedded in quoted fields;
    # utf-8-sig drops a leading byte-order mark, which would otherwise become
    # part of the first header name and cause every row to be skipped.
    with open(file_path, 'r', newline='', encoding='utf-8-sig') as file:
        for row in csv.DictReader(file):
            # DictReader fills the fields of short rows with None, hence "or ''"
            question = row.get('Question') or ''
            result_id = row.get('ResultId') or ''
            if not (question and result_id):
                continue
            score_text = (row.get('Score') or '').strip()
            score = float(score_text) if score_text else 0.0
            data_dict.setdefault(question, []).append({'id': result_id, 'score': score})
    return data_dict

# Function to read the second CSV file with "Question," "ResultId," "ResultRank," and "ResultScore" columns
def read_predicted_csv(file_path):
    """Read ranked search results from a CSV file, grouped by question.

    Args:
        file_path: Path to a CSV file with "Question", "ResultId",
            "ResultRank" and "ResultScore" columns.

    Returns:
        A dict mapping each question to a list of
        ``{'id': <ResultId>, 'score': <ResultRank as int>,
        'similarity': <ResultScore as float>}`` dicts in file order. Note that
        'score' holds the rank here. Rows lacking a question or a result id
        are skipped; a missing or blank rank counts as 0 and a missing or
        blank similarity as 0.0.

    Raises:
        ValueError: If a kept row has a non-blank rank that is not an integer
            or a non-blank similarity that is not numeric.
    """
    data_dict = {}
    # newline='' lets the csv module handle newlines embedded in quoted fields;
    # utf-8-sig drops a leading byte-order mark, which would otherwise become
    # part of the first header name and cause every row to be skipped.
    with open(file_path, 'r', newline='', encoding='utf-8-sig') as file:
        for row in csv.DictReader(file):
            # DictReader fills the fields of short rows with None, hence "or ''"
            question = row.get('Question') or ''
            result_id = row.get('ResultId') or ''
            if not (question and result_id):
                continue
            rank_text = (row.get('ResultRank') or '').strip()
            similarity_text = (row.get('ResultScore') or '').strip()
            data_dict.setdefault(question, []).append({
                'id': result_id,
                'score': int(rank_text) if rank_text else 0,
                'similarity': float(similarity_text) if similarity_text else 0.0,
            })
    return data_dict



## Write a function to compute the NDCG scores

In [None]:
# Function to convert ranks 1..20 to scores 20..1
# get_ndcg needs the top-ranked item (rank=1) to have the highest score (20)
# and the bottom-ranked item (rank=20) to have the lowest score (1)
def ranks_to_scores(ranks):
    """Convert 1-based ranks into descending scores.

    Args:
        ranks: List of dicts with an 'id' and a 1-based rank stored under
            'score'.

    Returns:
        A new list of ``{'id': ..., 'score': ...}`` dicts in the same order,
        where rank 1 maps to ``len(ranks)`` and rank ``len(ranks)`` maps to 1.
        Any other keys of the input dicts are dropped.
    """
    total = len(ranks)
    # "+ 1" keeps the scores in total..1; without it they would be total-1..0
    return [{'id': rank['id'], 'score': total - rank['score'] + 1} for rank in ranks]

In [None]:
# Function to calculate NDCG scores
def calculate_ndcg(true_results, predicted_results, k_values):
    """Compute one NDCG score per cutoff in k_values for every scored question.

    Args:
        true_results: Dict mapping a question to its list of true-result dicts.
        predicted_results: Dict mapping a question to its list of predicted
            result dicts, whose 'score' holds the rank. A question absent from
            this dict is treated as having no predictions.
        k_values: Cutoffs passed to get_ndcg, one call per cutoff.

    Returns:
        A dict mapping each question to a list of get_ndcg results, ordered
        like k_values. Questions with exactly one true result are left out of
        the dict and printed instead.
    """
    scores_by_question = {}
    lone_document_entries = []  # questions whose true list holds a single document

    for question, judged in true_results.items():
        retrieved = predicted_results.get(question, [])

        if len(judged) != 1:
            # Usual case: score the question at every requested cutoff
            scores_by_question[question] = [
                get_ndcg(judged, ranks_to_scores(retrieved), k) for k in k_values
            ]
            continue

        # Single-document question: record it for the report, do not score it
        first_prediction = retrieved[0] if retrieved else None
        lone_document_entries.append({
            'question': question,
            'true_result': judged[0],
            'predicted_result': first_prediction,
        })

    # Report the questions that were skipped
    if lone_document_entries:
        print("Questions with Only One Document:")
    for entry in lone_document_entries:
        print(f"Question: {entry['question']}")
        print(f"True Result: {entry['true_result']}")
        print(f"Predicted Result: {entry['predicted_result']}")
        print()

    return scores_by_question

## Call the functions to read the csv files and to compute NDCG scores

In [None]:
# Load the true results and the predicted results to evaluate
true_results = read_true_csv(true_results_file)
predicted_results = read_predicted_csv(predicted_results_file)

# Cutoffs k at which NDCG is computed for every question
k_values = [5, 3, 10]
ndcg_scores = calculate_ndcg(true_results, predicted_results, k_values)

# Print the per-question scores; each list is ordered like k_values
print("NDCG Scores:")
for question in ndcg_scores:
    print(f"Question: {question}, NDCG Scores: {ndcg_scores[question]}")

In [None]:
# Number of questions that received NDCG scores
len(ndcg_scores)

## Average NDCG scores

In [None]:
# Average the per-question NDCG scores separately for each cutoff k
per_k_columns = zip(*ndcg_scores.values())  # transpose: one tuple of scores per cutoff
avg_ndcg_scores = {}
for k, column in zip(k_values, per_k_columns):
    avg_ndcg_scores[k] = sum(column) / len(column)

print("\nAverage NDCG Scores:")
for k, avg_score in avg_ndcg_scores.items():
    print(f"At k={k}: {avg_score}")

## Look at an individual question

In [None]:
# Question to inspect and the cutoff used for its NDCG score
q = "Why do you serve missions?"
k = 5

true_for_q = true_results[q]
predicted_for_q = predicted_results[q]

ndcg = get_ndcg(true_for_q, ranks_to_scores(predicted_for_q), k)
print(f"NDCG={ndcg}")

# For ranks 1..k: the rank, the true score of the result at that rank, and its id
# (0 is printed when no result holds that rank or the result has no true score)
print(f"Scores of the top {k} search results")
scores = {}
for id_score in true_for_q:
    scores[id_score['id']] = id_score['score']
rankings = {}
for id_score in predicted_for_q:
    rankings[id_score['score']] = id_score['id']
for ix in range(1, k + 1):
    _id = rankings.get(ix, 0)
    print(ix, scores.get(_id, 0), _id)

# True results from highest to lowest score, with a separator wherever the
# score drops; stops at the first drop at or beyond position k, so results
# tied with the k-th one are still listed
print("Ideal top search results")
sorted_scores = sorted(true_for_q, key=lambda x: x['score'], reverse=True)
curr_score = 3
for position, id_score in enumerate(sorted_scores, start=1):
    if id_score['score'] < curr_score:
        if position > k:
            break
        print('-----')
    print(position, id_score['score'], id_score['id'])
    curr_score = id_score['score']

## Graph scores of relevant vs irrelevant results

In [None]:
# Collect the similarity of every predicted result into a list chosen by the
# true score (3, 2, 1 or 0) of that result
scores_3 = []
scores_2 = []
scores_1 = []
scores_0 = []
similarities_by_score = {3.0: scores_3, 2.0: scores_2, 1.0: scores_1, 0.0: scores_0}
not_found_results = 0  # predicted results that have no true score
for question in ndcg_scores:
    # Index the true scores by result id once per question instead of scanning
    # the list for every predicted result; setdefault keeps the first entry
    # when an id is listed more than once
    true_score_by_id = {}
    for true_result in true_results[question]:
        true_score_by_id.setdefault(true_result['id'], true_result['score'])

    # .get() so that a question without predicted results contributes nothing
    # instead of raising KeyError
    for predicted_result in predicted_results.get(question, []):
        result_id = predicted_result['id']
        if result_id not in true_score_by_id:
            not_found_results += 1
            print('result not found', result_id, question)
            continue

        score = true_score_by_id[result_id]
        bucket = similarities_by_score.get(score)
        if bucket is None:
            print('unexpected score', score, result_id, question)
        else:
            bucket.append(predicted_result['similarity'])
print(not_found_results)

In [None]:
# Overlay one translucent histogram of similarities per true score
plt.figure(figsize=(10, 6))
histogram_series = (
    (scores_3, "3", 'blue'),
    (scores_2, "2", 'green'),
    (scores_1, "1", 'yellow'),
    (scores_0, "0", 'red'),
)
for values, label, color in histogram_series:
    plt.hist(values, bins=20, alpha=0.25, label=label, color=color)
plt.title('Overlapping Histogram')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.legend(loc='upper right')

# Show the plot
plt.tight_layout()
plt.show()