# Interannotator Agreement

In [1]:
import pandas as pd
from statsmodels.stats.contingency_tables import mcnemar 
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import simpledorff
import math


In [2]:
# Load Combined_Data.xlsx
combined_data = pd.read_excel('Combined_Data.xlsx')

# GSM8k data - limit to task = "gsm8k"
gsm8k_data = combined_data[combined_data['task'] == 'gsm8k']

# Creative writing data - limit to task = "cw"
cw_data = combined_data[combined_data['task'] == 'cw']

# Load direct_prompting_comparison.xlsx
direct_prompting_comparison = pd.read_excel('direct_prompting_comparison.xlsx')


### A closer look at creative writing scores

In [3]:
# Summary statistics for variables: coherence_1_incoherent_10_very_coherent, Prediction_Based_On_First_10, Prediction_Based_On_Last_10, Aggregated_Prediction
print(cw_data[['coherence_1_incoherent_10_very_coherent', 'Prediction_Based_On_First_10', 'Prediction_Based_On_Last_10', 'Aggregated_Prediction']].describe())


KeyError: "['Aggregated_Prediction'] not in index"

In [None]:
# Differences between coherence_1_incoherent_10_very_coherent and Aggregated_Prediction
print(stats.describe(cw_data['coherence_1_incoherent_10_very_coherent'] - cw_data['Aggregated_Prediction']))

# Histogram
plt.hist(cw_data['coherence_1_incoherent_10_very_coherent'] - cw_data['Aggregated_Prediction'], bins = 1000)


In [None]:
print(cw_data.columns)


In [None]:
# Krippendorff's alpha
# I am using the simpledorff package
# experiment_col should be the conversation (row of the dataframe) - create model_task_method_conversation_id
# annotator_col should be 'Human', 'GPT-3.5 - First 10', 'GPT-3.5 - Last 10'
# class col should be the score from coherence_1_incoherent_10_very_coherent, Prediction_Based_On_First_10, Prediction_Based_On_Last_10

# Manipulate and reshape dataframe
cw_data['model_task_method_conversation_id'] = cw_data['model'] + '_' + cw_data['task'] + '_' + cw_data['method'] + '_' + cw_data['conversation_number'].astype(str)
cw_data['Human'] = cw_data['coherence_1_incoherent_10_very_coherent']
cw_data['GPT-3.5 - First 10'] = cw_data['Prediction_Based_On_First_10']
cw_data['GPT-3.5 - Last 10'] = cw_data['Prediction_Based_On_Last_10']
ka_data = cw_data[['model_task_method_conversation_id', 'Human', 'GPT-3.5 - First 10', 'GPT-3.5 - Last 10']]
ka_data_melted = ka_data.melt(id_vars=['model_task_method_conversation_id'], value_vars=['Human', 'GPT-3.5 - First 10', 'GPT-3.5 - Last 10'])
ka_data_melted = ka_data_melted.rename(columns={'variable': 'grader', 'value': 'coherence_score'})

# Calculate Krippendorff's alpha
print('overall KA')
print(simpledorff.calculate_krippendorffs_alpha_for_df(ka_data_melted,
                                                 experiment_col='model_task_method_conversation_id',
                                                 annotator_col='grader',
                                                 class_col='coherence_score'))

# Version excluding first 10
ka_data_no_first_10 = cw_data[cw_data['conversation_number'] > 10][['model_task_method_conversation_id', 'Human', 'GPT-3.5 - First 10', 'GPT-3.5 - Last 10']]
ka_data_no_first_10_melted = ka_data_no_first_10.melt(id_vars=['model_task_method_conversation_id'], value_vars=['Human', 'GPT-3.5 - First 10', 'GPT-3.5 - Last 10'])
ka_data_no_first_10_melted = ka_data_no_first_10_melted.rename(columns={'variable': 'grader', 'value': 'coherence_score'})
print('no first 10 KA')
print(simpledorff.calculate_krippendorffs_alpha_for_df(ka_data_no_first_10_melted,
                                                 experiment_col='model_task_method_conversation_id',
                                                 annotator_col='grader',
                                                 class_col='coherence_score'))

# Version excluding last 10
ka_data_no_last_10 = cw_data[cw_data['conversation_number'] <= 90][['model_task_method_conversation_id', 'Human', 'GPT-3.5 - First 10', 'GPT-3.5 - Last 10']]
ka_data_no_last_10_melted = ka_data_no_last_10.melt(id_vars=['model_task_method_conversation_id'], value_vars=['Human', 'GPT-3.5 - First 10', 'GPT-3.5 - Last 10'])
ka_data_no_last_10_melted = ka_data_no_last_10_melted.rename(columns={'variable': 'grader', 'value': 'coherence_score'})
print('no last 10 KA')
print(simpledorff.calculate_krippendorffs_alpha_for_df(ka_data_no_last_10_melted,
                                                 experiment_col='model_task_method_conversation_id',
                                                 annotator_col='grader',
                                                 class_col='coherence_score'))

# Version excluding first and last 10
ka_data_no_first_last_10 = cw_data[(cw_data['conversation_number'] > 10) & (cw_data['conversation_number'] <= 90)][['model_task_method_conversation_id', 'Human', 'GPT-3.5 - First 10', 'GPT-3.5 - Last 10']]
ka_data_no_first_last_10_melted = ka_data_no_first_last_10.melt(id_vars=['model_task_method_conversation_id'], value_vars=['Human', 'GPT-3.5 - First 10', 'GPT-3.5 - Last 10'])
ka_data_no_first_last_10_melted = ka_data_no_first_last_10_melted.rename(columns={'variable': 'grader', 'value': 'coherence_score'})
print('no first and last 10 KA')
print(simpledorff.calculate_krippendorffs_alpha_for_df(ka_data_no_first_last_10_melted,
                                                 experiment_col='model_task_method_conversation_id',
                                                 annotator_col='grader',
                                                 class_col='coherence_score'))


In [None]:
# Try aggregated predictions
ka_data_agg = cw_data[['model_task_method_conversation_id', 'Human', 'Aggregated_Prediction']]
ka_data_agg_melted = ka_data_agg.melt(id_vars=['model_task_method_conversation_id'], value_vars=['Human', 'Aggregated_Prediction'])
ka_data_agg_melted = ka_data_agg_melted.rename(columns={'variable': 'grader', 'value': 'coherence_score'})
print('aggregated predictions KA')
print(simpledorff.calculate_krippendorffs_alpha_for_df(ka_data_agg_melted,
                                                 experiment_col='model_task_method_conversation_id',
                                                 annotator_col='grader',
                                                 class_col='coherence_score'))


In [None]:
# Try to demand less of the data - in ka_data_melted, bin 1-10 into 1-5
# Divide by 2 and take the ceiling
ka_data_melted['coherence_1_to_5'] = ka_data_melted.dropna()['coherence_score'].apply(lambda x: math.ceil(x/2))

# Calculate Krippendorff's alpha
print(simpledorff.calculate_krippendorffs_alpha_for_df(ka_data_melted,
                                                       experiment_col='model_task_method_conversation_id',
                                                       annotator_col='grader',
                                                       class_col='coherence_1_to_5'))


In [None]:
# Try coherence on a scale of 1 to 3
ka_data_melted['coherence_1_to_3'] = ka_data_melted.dropna()['coherence_score'].apply(lambda x: math.ceil(x/3.33))

# Calculate Krippendorff's alpha
print(simpledorff.calculate_krippendorffs_alpha_for_df(ka_data_melted,
                                                       experiment_col='model_task_method_conversation_id',
                                                       annotator_col='grader',
                                                       class_col='coherence_1_to_3'))


In [None]:
# Try coherence vs incoherence binary
ka_data_melted['coherence_binary'] = ka_data_melted.dropna()['coherence_score'].apply(lambda x: 1 if x > 5 else 0)

# Calculate Krippendorff's alpha
print(simpledorff.calculate_krippendorffs_alpha_for_df(ka_data_melted,
                                                       experiment_col='model_task_method_conversation_id',
                                                       annotator_col='grader',
                                                       class_col='coherence_binary'))
