# Read data

## Read run files

In [1]:
# Build the input records expected by `eval_recommendation.EvalRecommendation`

import os
import json

def _read_jsonl(path):
    """Return the JSON objects stored one-per-line in `path`.

    The file is opened in a `with` block so the handle is always closed, is
    decoded as UTF-8 regardless of the platform default, and blank lines
    (e.g. a trailing newline at the end of the file) are skipped instead of
    raising `json.JSONDecodeError`.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def data_loader(seed: int, topic: str, dir_to_runs: str = './data/runs/'):
    """Load the proposed (Memory) run, the baseline (Standard) run, and the reference data.
    Then, transform the data into the format that `eval_recommendation.EvalRecommendation` requires.

    Args:
        seed: Run seed; must be in `range(10)`.
        topic: Either 'movie' or 'recipe'.
        dir_to_runs: Directory holding `run_memory_{topic}_seed{seed}.jsonl`,
            `run_standard_{topic}_seed{seed}.jsonl` and `reference_{topic}.jsonl`.
            Defaults to the notebook's original location.

    Returns:
        A list with one dict per prediction, carrying the shared keys
        (`worker_id`, `session_ind`, `turn_number`), both predicted texts
        (`pred_text_proposed`, `pred_text_baseline`), and the reference fields
        (`reference`, `preferences_for_each_session`).

    Raises:
        AssertionError: If `topic`/`seed` are invalid, the three files differ in
            length, or row i of the three files does not describe the same
            worker / session / turn.
    """

    assert topic in ['movie', 'recipe'], 'topic must be either "movie" or "recipe"'
    assert seed in range(10), 'seed must be in range(10)'

    # read run data for proposed (Memory) method
    proposed = _read_jsonl(os.path.join(dir_to_runs, f'run_memory_{topic}_seed{seed}.jsonl'))

    # read run data for baseline (Standard) method
    baseline = _read_jsonl(os.path.join(dir_to_runs, f'run_standard_{topic}_seed{seed}.jsonl'))

    # read reference data (shared across seeds, hence no seed in the file name)
    reference = _read_jsonl(os.path.join(dir_to_runs, f'reference_{topic}.jsonl'))

    assert len(proposed) == len(baseline) == len(reference), 'The number of runs must be the same'

    # create recommendation file
    recommendations = []
    for prop, base, ref in zip(proposed, baseline, reference):
        # The three files are joined purely by row position, so verify that each
        # row really refers to the same worker / session / turn before merging.
        assert prop['worker_id'] == base['worker_id'] == ref['worker_id'], 'worker_id must be the same'
        assert prop['session_ind'] == base['session_ind'] == ref['session_ind'], 'session_ind must be the same'
        assert prop['turn_number'] == base['turn_number'] == ref['turn_number'], 'turn_number must be the same'
        recommendations.append({
            'worker_id': prop['worker_id'],
            'session_ind': prop['session_ind'],
            'turn_number': prop['turn_number'],
            'pred_text_proposed': prop['pred_text'],
            'pred_text_baseline': base['pred_text'],
            'reference': ref['reference'],
            'preferences_for_each_session': ref['preferences_for_each_session']
        })
    return recommendations

# Evaluation

In [2]:
import eval_recommendation
# NOTE(review): `importlib` is not used in the visible cells — presumably kept for
# `importlib.reload(eval_recommendation)` while developing; confirm before removing.
import importlib
# Short alias so later cells can construct the evaluator as `EvalRecommendation(...)`.
EvalRecommendation = eval_recommendation.EvalRecommendation

In [3]:
import pandas as pd
def calculate_combined_prf(topic, target_session_lens, seeds=None):
    """Average precision/recall/F over seeds after pooling counts across target sessions.

    For every seed, the tp/fp/fn lists produced by `EvalRecommendation` for each
    entry of `target_session_lens` are concatenated, one PRF score per category
    is computed from the pooled counts via
    `eval_recommendation.calculate_overall_prf_scores`, and the per-seed scores
    are then averaged (unweighted mean over seeds).

    Args:
        topic: Topic name forwarded to `data_loader`.
        target_session_lens: Iterable of session lengths; each is passed as
            `target_session_len` to `EvalRecommendation`.
        seeds: Optional iterable of seeds to evaluate. Defaults to
            `range(10)`, the original hard-coded behaviour. Each seed is passed
            to `data_loader`.

    Returns:
        `{'Standard': {'p', 'r', 'f'}, 'Memory': {'p', 'r', 'f'}}` where
        'Standard' holds the averaged 'baseline' scores and 'Memory' the
        averaged 'pred' scores. The 'gold' category is computed but dropped.

    Raises:
        ValueError: If `seeds` is empty (there would be nothing to average).
    """
    categories = ['pred', 'baseline', 'gold']
    count_keys = ['tp', 'fp', 'fn']
    score_keys = ['p', 'r', 'f']

    seeds = list(range(10)) if seeds is None else list(seeds)
    if not seeds:
        raise ValueError('seeds must contain at least one seed')

    all_prf_scores = []

    for seed in seeds:
        # Fresh accumulators per seed: counts are pooled across session lengths,
        # never across seeds.
        combined_counts = {
            category: {metric: [] for metric in count_keys} for category in categories
        }

        for session_len in target_session_lens:
            print(f'Calculating tp/fp/fn for seed {seed} and current session (session_len) {session_len} ...')
            # NOTE(review): the data is reloaded for every session length. It cannot
            # be told from here whether `EvalRecommendation` mutates its input, so
            # the reload is kept; hoist it above this loop once that is confirmed.
            recommendations = data_loader(seed, topic)
            eval_session = EvalRecommendation(recommendations, target_session_len=session_len)
            eval_session.calc_metrics()
            # NOTE: Uncomment the two lines below to see the PRF scores for each target session
            # eval_session.calculate_prf_scores(flag_print=False)
            # print(f'seed {seed}, session_len {session_len}: {eval_session.prf_scores}')
            for category in categories:
                for metric in count_keys:
                    combined_counts[category][metric].extend(eval_session.counts[category][metric])

        # Calculate PRF scores for this seed
        print(f'Calculating overall PRF scores for seed {seed} ...')
        prf_scores = eval_recommendation.calculate_overall_prf_scores([combined_counts], flag_print=False)
        all_prf_scores.append(prf_scores)
        print(f'Done for seed {seed}', end='\n\n')

    # Average the PRF scores across seeds
    n_seeds = len(all_prf_scores)
    averaged = {
        category: {
            key: sum(scores[category][key] for scores in all_prf_scores) / n_seeds
            for key in score_keys
        }
        for category in categories
    }

    # replace pred with Memory and baseline with Standard, and remove gold
    averaged_prf_scores = {
        'Standard': averaged['baseline'],
        'Memory': averaged['pred'],
    }

    # Display the averaged PRF scores
    print('Averaged PRF scores for both session lengths:')
    display(pd.DataFrame(averaged_prf_scores).T.round(3))

    return averaged_prf_scores

In [4]:
# Recipe topic: pool the tp/fp/fn counts of target sessions 2 and 3, then average PRF over seeds.
prf_mean_overall_recipe_for_both_2_3 = calculate_combined_prf('recipe', target_session_lens=[2, 3])

Calculating tp/fp/fn for seed 0 and current session (session_len) 2 ...
Calculating tp/fp/fn for seed 0 and current session (session_len) 3 ...
Calculating overall PRF scores for seed 0 ...
Done for seed 0

Calculating tp/fp/fn for seed 1 and current session (session_len) 2 ...
Calculating tp/fp/fn for seed 1 and current session (session_len) 3 ...
Calculating overall PRF scores for seed 1 ...
Done for seed 1

Calculating tp/fp/fn for seed 2 and current session (session_len) 2 ...
Calculating tp/fp/fn for seed 2 and current session (session_len) 3 ...
Calculating overall PRF scores for seed 2 ...
Done for seed 2

Calculating tp/fp/fn for seed 3 and current session (session_len) 2 ...
Calculating tp/fp/fn for seed 3 and current session (session_len) 3 ...
Calculating overall PRF scores for seed 3 ...
Done for seed 3

Calculating tp/fp/fn for seed 4 and current session (session_len) 2 ...
Calculating tp/fp/fn for seed 4 and current session (session_len) 3 ...
Calculating overall PRF scor

Unnamed: 0,p,r,f
Standard,0.554,0.311,0.398
Memory,0.47,0.411,0.438


In [5]:
# Movie topic: pool the tp/fp/fn counts of target sessions 2 and 3, then average PRF over seeds.
prf_mean_overall_movie_for_both_2_3 = calculate_combined_prf('movie', target_session_lens=[2, 3])

Calculating tp/fp/fn for seed 0 and current session (session_len) 2 ...
Calculating tp/fp/fn for seed 0 and current session (session_len) 3 ...
Calculating overall PRF scores for seed 0 ...
Done for seed 0

Calculating tp/fp/fn for seed 1 and current session (session_len) 2 ...
Calculating tp/fp/fn for seed 1 and current session (session_len) 3 ...
Calculating overall PRF scores for seed 1 ...
Done for seed 1

Calculating tp/fp/fn for seed 2 and current session (session_len) 2 ...
Calculating tp/fp/fn for seed 2 and current session (session_len) 3 ...
Calculating overall PRF scores for seed 2 ...
Done for seed 2

Calculating tp/fp/fn for seed 3 and current session (session_len) 2 ...
Calculating tp/fp/fn for seed 3 and current session (session_len) 3 ...
Calculating overall PRF scores for seed 3 ...
Done for seed 3

Calculating tp/fp/fn for seed 4 and current session (session_len) 2 ...
Calculating tp/fp/fn for seed 4 and current session (session_len) 3 ...
Calculating overall PRF scor

Unnamed: 0,p,r,f
Standard,0.508,0.364,0.424
Memory,0.443,0.397,0.419
