# Import and Set Up


In [3]:
import pandas as pd
import os 
import ast
import numpy as np
from sklearn.model_selection import train_test_split


In [8]:
# Locate the data directory relative to the kernel's working directory:
# when launched from a directory whose path ends in "notebooks" the data
# sits one level up, otherwise it is expected alongside the working dir.
current_dir = os.getcwd()
prefix = "../" if current_dir.endswith("notebooks") else "./"
base_dir = os.path.join(prefix, "summeval-data")

# Input table with the mistral_* rating columns.
# Alternative input kept for reference: "summeval_og.csv" in the same directory.
ratings_csv_path = os.path.join(base_dir, "summeval_with_mistral_ratings.csv")
df = pd.read_csv(ratings_csv_path)

print(df.columns)

Index(['summary', 'expert_annotations', 'turker_annotations', 'references',
       'model_id', 'raw', 'mistral_relevance', 'mistral_fluency',
       'mistral_coherence', 'mistral_consistency'],
      dtype='object')


In [9]:
# Rating dimensions: the keys looked up in each annotation dict and the
# suffixes of the mistral_* columns in the loaded table.
properties = ["coherence", "consistency", "fluency", "relevance"]


# Processing SummEval Data

In [10]:
# Generate all_annotations column
# 3 experts, 5 turkers (crowdsourced)

def _parse_annotations(value):
    """Return the list of rating dicts stored in one annotation cell.

    After loading from CSV each cell is the string repr of a list of dicts,
    which is parsed with ast.literal_eval. Values that are not strings
    (e.g. lists produced by an earlier run of this cell) are returned
    unchanged, so re-running the cell does not raise ValueError from
    ast.literal_eval being handed an already-parsed list.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


for source_col in ("expert_annotations", "turker_annotations"):
    df[source_col] = df[source_col].apply(_parse_annotations)

# Adding two Series of lists concatenates them row by row:
# expert annotations first, turker annotations after.
df["all_annotations"] = df["expert_annotations"] + df["turker_annotations"]

print(df["all_annotations"][0])

[{'coherence': 2, 'consistency': 1, 'fluency': 4, 'relevance': 2}, {'coherence': 1, 'consistency': 1, 'fluency': 2, 'relevance': 1}, {'coherence': 1, 'consistency': 1, 'fluency': 3, 'relevance': 2}, {'coherence': 3, 'consistency': 3, 'fluency': 4, 'relevance': 3}, {'coherence': 3, 'consistency': 3, 'fluency': 4, 'relevance': 3}, {'coherence': 3, 'consistency': 3, 'fluency': 4, 'relevance': 3}, {'coherence': 3, 'consistency': 3, 'fluency': 4, 'relevance': 3}, {'coherence': 3, 'consistency': 3, 'fluency': 4, 'relevance': 3}]


In [None]:
# All human ratings are range 1~5, disc column to discretize to {0, 1}
DISC_THRESHOLD = 3

# New columns:
# Scores by property: scores_{prop}_{expert/turker/all}
# Variances: var_{prop}_{expert/turker/all}
# Means: mean_{prop}_{expert/turker/all}
# Variances of thresholded ratings: var_{prop}_{expert/turker/all}_disc
# Difference of mean_{prop}_all and Mistral (LLM) score: diff_{prop} = mean_{prop}_all - mistral_{prop}
# NOTE: Mistral score is a preliminary score generated (1-5) prior to the proper heuristics.ipynb run across 5 judges

# Annotator group -> column holding that group's list of rating dicts.
# Insertion order (expert, turker, all) fixes the order of the new columns.
ANNOTATION_SOURCES = {
    "expert": "expert_annotations",
    "turker": "turker_annotations",
    "all": "all_annotations",
}


def _scores_for(annotations, prop):
    """Collect the `prop` rating from every annotation dict that contains it."""
    return [ann[prop] for ann in annotations if prop in ann]


def _mean_or_nan(scores):
    """Return the mean of `scores`, or NaN when the list is empty."""
    return np.mean(scores) if scores else np.nan


def _sample_var_or_nan(scores):
    """Return the sample variance (ddof=1) of `scores`, or NaN for fewer than two values."""
    return np.var(scores, ddof=1) if len(scores) > 1 else np.nan


def _disc_var_or_nan(scores):
    """Return the sample variance of `scores` after thresholding.

    Each rating becomes True when it is >= DISC_THRESHOLD and False otherwise;
    NaN is returned for fewer than two ratings.
    """
    return _sample_var_or_nan([score >= DISC_THRESHOLD for score in scores])


for prop in properties:
    # One Series of per-row score lists for each annotator group. Building the
    # columns with Series.apply keeps them aligned to df's own index and avoids
    # row-by-row scalar writes.
    scores_by_group = {
        group: df[source_col].apply(_scores_for, args=(prop,))
        for group, source_col in ANNOTATION_SOURCES.items()
    }

    # Columns are added statistic by statistic so the resulting column order is
    # scores_*, var_*, mean_*, var_*_disc, diff_* within each property.
    for group, scores in scores_by_group.items():
        df[f"scores_{prop}_{group}"] = scores
    for group, scores in scores_by_group.items():
        df[f"var_{prop}_{group}"] = scores.apply(_sample_var_or_nan)
    for group, scores in scores_by_group.items():
        df[f"mean_{prop}_{group}"] = scores.apply(_mean_or_nan)
    for group, scores in scores_by_group.items():
        df[f"var_{prop}_{group}_disc"] = scores.apply(_disc_var_or_nan)

    # NaN in either operand propagates, so rows with no human scores or no
    # Mistral score end up with NaN; a missing mistral_{prop} column yields an
    # all-NaN diff column.
    mistral_col = f"mistral_{prop}"
    if mistral_col in df.columns:
        df[f"diff_{prop}"] = df[f"mean_{prop}_all"] - df[mistral_col]
    else:
        df[f"diff_{prop}"] = np.nan


In [12]:
# Sanity check: list every column currently present in df.
print(df.columns)

Index(['summary', 'expert_annotations', 'turker_annotations', 'references',
       'model_id', 'raw', 'mistral_relevance', 'mistral_fluency',
       'mistral_coherence', 'mistral_consistency', 'all_annotations',
       'scores_coherence_expert', 'scores_coherence_turker',
       'scores_coherence_all', 'var_coherence_expert', 'var_coherence_turker',
       'var_coherence_all', 'mean_coherence_expert', 'mean_coherence_turker',
       'mean_coherence_all', 'var_coherence_expert_disc',
       'var_coherence_turker_disc', 'var_coherence_all_disc', 'diff_coherence',
       'scores_consistency_expert', 'scores_consistency_turker',
       'scores_consistency_all', 'var_consistency_expert',
       'var_consistency_turker', 'var_consistency_all',
       'mean_consistency_expert', 'mean_consistency_turker',
       'mean_consistency_all', 'var_consistency_expert_disc',
       'var_consistency_turker_disc', 'var_consistency_all_disc',
       'diff_consistency', 'scores_fluency_expert', 'scores_flu

In [13]:
# Spot-check the values of the first row (positional index 0).
print(df.iloc[0])

summary                      paul merson was brought on with only seven min...
expert_annotations           [{'coherence': 2, 'consistency': 1, 'fluency':...
turker_annotations           [{'coherence': 3, 'consistency': 3, 'fluency':...
references                   ["Andros Townsend an 83rd minute sub in Totten...
model_id                                                                   M11
                                                   ...                        
mean_relevance_all                                                         2.5
var_relevance_expert_disc                                                  0.0
var_relevance_turker_disc                                                  0.0
var_relevance_all_disc                                                0.267857
diff_relevance                                                             0.5
Name: 0, Length: 63, dtype: object


In [14]:
# Write the whole processed frame as JSON Lines: one record (row) per line.
full_jsonl_path = os.path.join(base_dir, "summeval_processed_full.jsonl")
df.to_json(full_jsonl_path, orient="records", lines=True)


# Split df into Train, Val, Holdout Sets

These split sets are used by the HypotheSAEs code, not by the heuristics code.

In [15]:
# Train 0.7, Val 0.15, Holdout 0.15
# First hold back 30% of the rows, then cut that remainder in half to get the
# validation and holdout sets. Both splits use a fixed random_state.
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
val_df, holdout_df = train_test_split(temp_df, test_size=0.5, random_state=42)

print(f"Train: {len(train_df)}, Validation: {len(val_df)}, Holdout: {len(holdout_df)}")

# Write each split as JSON Lines under base_dir, in train / val / holdout order.
for split_name, split_df in (("train", train_df), ("val", val_df), ("holdout", holdout_df)):
    split_path = os.path.join(base_dir, f"summeval_processed_{split_name}.jsonl")
    split_df.to_json(split_path, orient="records", lines=True)

Train: 1120, Validation: 240, Holdout: 240
