# Compute PC survey results

This document will allow us to compute the PC values from the users and Krippendorf's alpha between them (maybe)

In [2]:
import pandas as pd
import json
import uuid

In [2]:
# data paths
csv_pc_1_path = "../../evals/human/pairwise-comparison/results/twiga-pc-1.csv"
csv_pc_2_path = "../../evals/human/pairwise-comparison/results/twiga-pc-2.csv"
csv_pc_3_path = "../../evals/human/pairwise-comparison/results/twiga-pc-3.csv"
json_pc_1_path = "../../evals/human/pairwise-comparison/pairwise-comparison-survey-part-1.json"
json_pc_2_path = "../../evals/human/pairwise-comparison/pairwise-comparison-survey-part-2.json"
json_pc_3_path = "../../evals/human/pairwise-comparison/pairwise-comparison-survey-part-3.json"

## Convert the question list to a DataFrame per questionnaire

In [5]:
"""Read JSON file of queries into three separate pc survey dataframes"""
# Extract the relevant information and store it in a dictionary
pc_queries_1_df = pd.read_json(json_pc_1_path)
pc_queries_2_df = pd.read_json(json_pc_2_path)
pc_queries_3_df = pd.read_json(json_pc_3_path)

verbose = True
if verbose:
    print(pc_queries_1_df.columns)
    print(pc_queries_2_df.columns)
    print(pc_queries_3_df.columns)

    print(len(pc_queries_1_df))
    print(len(pc_queries_2_df))
    print(len(pc_queries_3_df))

Index(['query', 'human_response', 'response', 'exercise_format', 'topic',
       'source_file'],
      dtype='object')
Index(['query', 'human_response', 'response', 'exercise_format', 'topic',
       'source_file'],
      dtype='object')
Index(['query', 'human_response', 'response', 'exercise_format', 'topic',
       'source_file'],
      dtype='object')
12
12
12


## Convert the pairwise-comparison surveys into DataFrames

In [9]:
# Read the CSV file into a DataFrame, skipping two unnecessary rows

# pc_responses_1_df = pd.read_csv(csv_pc_1_path, skiprows=[1,2])
pc_responses_2_df = pd.read_csv(csv_pc_2_path, skiprows=[1,2])
# pc_responses_3_df = pd.read_csv(csv_pc_3_path, skiprows=[1,2])

# This should be the same for all three surveys as they are essentially identical
name_column = pc_responses_2_df.columns[17]  # The title of the name column
sanity_check_columns = pc_responses_2_df.columns[21:24] # TODO: get the actual query-response pairs I made manually and show them somewhere along with these (can put in appendix)
response_columns = pc_responses_2_df.columns[24:36] # The titles of the question columns and associated responses

def fill_data(pc_df: pd.DataFrame, num_respondents:int) -> pd.DataFrame:
    data_dict = {
        "question_number": ["Q3","Q4","Q5","Q6","Q7","Q8","Q9","Q10","Q11","Q12","Q13","Q14"]*num_respondents,
        "respondent": [],
        "preferred_answer": [], # this is a 1 or a 0, if its a 1 then the second response was preferred and 0 if the first was preferred
    }
    for _, row in pc_df.iterrows():
        name = row[name_column]
        name = uuid.uuid4() # TODO: make the ID the same as in the previous survey! Can just do this manually
        responses = row[response_columns].tolist()
        
        preferred_answer_responses = [int(res)-1 for res in responses]
        
        for pa in preferred_answer_responses:
            data_dict["respondent"].append(name)
            data_dict["preferred_answer"].append(pa)
    
    return pd.DataFrame(data_dict)

# pc_1_data_df = fill_data(pc_responses_1_df, 4)
pc_2_data_df = fill_data(pc_responses_2_df, 3)
# pc_3_data_df = fill_data(pc_responses_3_df, 3)

verbose = True
if verbose:
    print(pc_2_data_df)

   question_number                            respondent  preferred_answer
0               Q3  33a7f594-9ffc-46d3-b6ae-60d543baaf30                 1
1               Q4  33a7f594-9ffc-46d3-b6ae-60d543baaf30                 0
2               Q5  33a7f594-9ffc-46d3-b6ae-60d543baaf30                 1
3               Q6  33a7f594-9ffc-46d3-b6ae-60d543baaf30                 1
4               Q7  33a7f594-9ffc-46d3-b6ae-60d543baaf30                 0
5               Q8  33a7f594-9ffc-46d3-b6ae-60d543baaf30                 1
6               Q9  33a7f594-9ffc-46d3-b6ae-60d543baaf30                 1
7              Q10  33a7f594-9ffc-46d3-b6ae-60d543baaf30                 1
8              Q11  33a7f594-9ffc-46d3-b6ae-60d543baaf30                 0
9              Q12  33a7f594-9ffc-46d3-b6ae-60d543baaf30                 0
10             Q13  33a7f594-9ffc-46d3-b6ae-60d543baaf30                 0
11             Q14  33a7f594-9ffc-46d3-b6ae-60d543baaf30                 1
12              Q3  1221f

## Merge the questions with the responses in one DataFrame

In [12]:
# First expand the queries DataFrames to be the same length as the response DataFrames
# pc_queries_1_df_expanded = pd.concat([pc_queries_1_df] * 4, ignore_index=True) # note that the first survey has 4 respondents
pc_queries_2_df_expanded = pd.concat([pc_queries_2_df] * 3, ignore_index=True)
# pc_queries_3_df_expanded = pd.concat([pc_queries_3_df] * 3, ignore_index=True)

# Concatenate the DataFrames along the columns
# pc_queries_1_complete = pd.concat([pc_queries_1_df_expanded, pc_1_data_df], axis=1)
pc_queries_2_complete = pd.concat([pc_queries_2_df_expanded, pc_2_data_df], axis=1)
# pc_queries_3_complete = pd.concat([pc_queries_3_df_expanded, pc_3_data_df], axis=1)

verbose = False
if verbose:
    # print(pc_queries_1_complete.columns)
    print(pc_queries_2_complete.columns)
    # print(pc_queries_3_complete.columns)

# pc_queries_1_complete.to_csv("../../evals/human/pairwise-comparison/results/data1.csv")
pc_queries_2_complete.to_csv("../../evals/human/pairwise-comparison/results/data2.csv")
# pc_queries_3_complete.to_csv("../../evals/human/pairwise-comparison/results/data3.csv")

## Create an index of questions per model and position per question

In [11]:
# Double check this directly in the survey
pc_survey_1_index = {
    "pipeline-gpt-3-5-turbo-16k-0613": {
        "locations": [0,2,5,11],
        "model_response_number": [1,1,0,0]
    },
    "pipeline-gpt-4-1106-preview":{
        "locations": [1,4,6,7,10],
        "model_response_number": [0,0,1,1,1]
    },
    "pipeline-llama-3-70B-instruct":{
        "locations": [3,8,9],
        "model_response_number": [1,0,0]
    }
}

# Double check this directly in the survey
pc_survey_2_index = {
    "pipeline-gpt-3-5-turbo-16k-0613": {
            "locations": [1,3,4,8,10],
            "model_response_number": [0,1,0,0,1]
        },
    "pipeline-gpt-4-1106-preview": {
            "locations": [0,9,11],
            "model_response_number": [1,0,0]
        },
    "pipeline-llama-3-70B-instruct": {
            "locations": [2,5,6,7],
            "model_response_number": [1,0,1,1]
        }
}

# Double check this directly in the survey
pc_survey_3_index = {
    "pipeline-gpt-3-5-turbo-16k-0613": {
            "locations": [6,7,9],
            "model_response_number": [1,1,0]
        },
    "pipeline-gpt-4-1106-preview": {
            "locations": [2,3,5,8],
            "model_response_number": [1,1,0,0]
        },
    "pipeline-llama-3-70B-instruct": {
            "locations": [0,1,4,10,11],
            "model_response_number": [1,0,0,1,0]
        }
}

QUESTION_ID_CONVERSION = {
    0: "Q3",
    1: "Q4",
    2: "Q5",
    3: "Q6",
    4: "Q7",
    5: "Q8",
    6: "Q9",
    7: "Q10",
    8: "Q11",
    9: "Q12",
    10: "Q13",
    11: "Q14",
}

## Extract the relevant results per pipeline (first off gpt-3-5)

In [22]:
"""Get the relevant gpt-3-5 questions from each survey"""
# List of question numbers to match in the respective surveys
pipeline_match_1 = [QUESTION_ID_CONVERSION[location] for location in pc_survey_1_index["pipeline-gpt-3-5-turbo-16k-0613"]["locations"]]
pipeline_match_2 = [QUESTION_ID_CONVERSION[location] for location in pc_survey_2_index["pipeline-gpt-3-5-turbo-16k-0613"]["locations"]]
pipeline_match_3 = [QUESTION_ID_CONVERSION[location] for location in pc_survey_3_index["pipeline-gpt-3-5-turbo-16k-0613"]["locations"]]

model_response_number_index_1 = [(QUESTION_ID_CONVERSION[location], model_res_number) for model_res_number, location in zip(pc_survey_1_index["pipeline-gpt-3-5-turbo-16k-0613"]["model_response_number"],pc_survey_1_index["pipeline-gpt-3-5-turbo-16k-0613"]["locations"])]
model_response_number_index_2 = [(QUESTION_ID_CONVERSION[location], model_res_number) for model_res_number, location in zip(pc_survey_2_index["pipeline-gpt-3-5-turbo-16k-0613"]["model_response_number"],pc_survey_2_index["pipeline-gpt-3-5-turbo-16k-0613"]["locations"])]
model_response_number_index_3 = [(QUESTION_ID_CONVERSION[location], model_res_number) for model_res_number, location in zip(pc_survey_3_index["pipeline-gpt-3-5-turbo-16k-0613"]["model_response_number"],pc_survey_3_index["pipeline-gpt-3-5-turbo-16k-0613"]["locations"])]

# Create dictionaries
# model_response_dict_1 = dict(model_response_number_index_1)
model_response_dict_2 = dict(model_response_number_index_2)
# model_response_dict_3 = dict(model_response_number_index_3)

# Filter the DataFrame to only include rows where the 'question_number' column matches any name in the list
# pipeline_gpt35_df_3 = pc_queries_1_complete[pc_queries_1_complete['question_number'].isin(pipeline_match_1)]
pipeline_gpt35_df_2 = pc_queries_2_complete[pc_queries_2_complete['question_number'].isin(pipeline_match_2)]
# pipeline_gpt35_df_3 = pc_queries_3_complete[pc_queries_3_complete['question_number'].isin(pipeline_match_3)]

# Map the 'question_number' to 'model_res_number' using the dictionary
# pipeline_gpt35_df_1['model_res_number'] = pipeline_gpt35_df_1['question_number'].map(model_response_dict_1)
pipeline_gpt35_df_2['model_res_number'] = pipeline_gpt35_df_2['question_number'].map(model_response_dict_2)
# pipeline_gpt35_df_3['model_res_number'] = pipeline_gpt35_df_3['question_number'].map(model_response_dict_3)

# Update the question_number identifiers
# pipeline_gpt35_df_1['question_number'] = pipeline_gpt35_df_1['question_number']+"-survey-1"
pipeline_gpt35_df_2['question_number'] = pipeline_gpt35_df_2['question_number']+"-survey-2"
# pipeline_gpt35_df_3['question_number'] = pipeline_gpt35_df_3['question_number']+"-survey-3"

# Concatenate the three DataFrames along the rows
pipeline_df_gpt35 = pipeline_gpt35_df_2
# pipeline_df_gpt35 = pd.concat([pipeline_gpt35_df_1, pipeline_gpt35_df_2, pipeline_gpt35_df_3], axis=0, ignore_index=True)

"""Compute the model score for all the gpt-3.5 responses across the three surveys by comparing pref answer and model res number"""
# Perform the AND operation and create the final column
pipeline_df_gpt35['model_score'] = pipeline_df_gpt35['preferred_answer'] == pipeline_df_gpt35['model_res_number']

# Convert boolean values to integers
pipeline_df_gpt35['model_score'] = pipeline_df_gpt35['model_score'].astype(int)
pipeline_df_gpt35['preferred_answer'] = pipeline_df_gpt35['preferred_answer'].astype(int)
pipeline_df_gpt35['model_res_number'] = pipeline_df_gpt35['model_res_number'].astype(int)

pipeline_df_gpt35.to_csv("../../evals/human/pairwise-comparison/results/pipeline_gpt3_5_results.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pipeline_gpt35_df_2['model_res_number'] = pipeline_gpt35_df_2['question_number'].map(model_response_dict_2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pipeline_gpt35_df_2['question_number'] = pipeline_gpt35_df_2['question_number']+"-survey-2"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pipel

## (now for GPT-4 pipeline)

In [23]:
"""Get the relevant gpt-4 questions from each survey"""
# List of question numbers to match in the respective surveys
pipeline_match_1 = [QUESTION_ID_CONVERSION[location] for location in pc_survey_1_index["pipeline-gpt-4-1106-preview"]["locations"]]
pipeline_match_2 = [QUESTION_ID_CONVERSION[location] for location in pc_survey_2_index["pipeline-gpt-4-1106-preview"]["locations"]]
pipeline_match_3 = [QUESTION_ID_CONVERSION[location] for location in pc_survey_3_index["pipeline-gpt-4-1106-preview"]["locations"]]

model_response_number_index_1 = [(QUESTION_ID_CONVERSION[location], model_res_number) for model_res_number, location in zip(pc_survey_1_index["pipeline-gpt-4-1106-preview"]["model_response_number"],pc_survey_1_index["pipeline-gpt-4-1106-preview"]["locations"])]
model_response_number_index_2 = [(QUESTION_ID_CONVERSION[location], model_res_number) for model_res_number, location in zip(pc_survey_2_index["pipeline-gpt-4-1106-preview"]["model_response_number"],pc_survey_2_index["pipeline-gpt-4-1106-preview"]["locations"])]
model_response_number_index_3 = [(QUESTION_ID_CONVERSION[location], model_res_number) for model_res_number, location in zip(pc_survey_3_index["pipeline-gpt-4-1106-preview"]["model_response_number"],pc_survey_3_index["pipeline-gpt-4-1106-preview"]["locations"])]

# Create dictionaries
# model_response_dict_1 = dict(model_response_number_index_1)
model_response_dict_2 = dict(model_response_number_index_2)
# model_response_dict_3 = dict(model_response_number_index_3)

# Filter the DataFrame to only include rows where the 'question_number' column matches any name in the list
# pipeline_gpt4_df_3 = pc_queries_1_complete[pc_queries_1_complete['question_number'].isin(pipeline_match_1)]
pipeline_gpt4_df_2 = pc_queries_2_complete[pc_queries_2_complete['question_number'].isin(pipeline_match_2)]
# pipeline_gpt4_df_3 = pc_queries_3_complete[pc_queries_3_complete['question_number'].isin(pipeline_match_3)]

# Map the 'question_number' to 'model_res_number' using the dictionary
# pipeline_gpt4_df_1['model_res_number'] = pipeline_gpt4_df_1['question_number'].map(model_response_dict_1)
pipeline_gpt4_df_2['model_res_number'] = pipeline_gpt4_df_2['question_number'].map(model_response_dict_2)
# pipeline_gpt4_df_3['model_res_number'] = pipeline_gpt4_df_3['question_number'].map(model_response_dict_3)

# Update the question_number identifiers
# pipeline_gpt4_df_1['question_number'] = pipeline_gpt4_df_1['question_number']+"-survey-1"
pipeline_gpt4_df_2['question_number'] = pipeline_gpt4_df_2['question_number']+"-survey-2"
# pipeline_gpt4_df_3['question_number'] = pipeline_gpt4_df_3['question_number']+"-survey-3"

# Concatenate the three DataFrames along the rows
pipeline_df_gpt4 = pipeline_gpt4_df_2
# pipeline_df_gpt4 = pd.concat([pipeline_gpt4_df_1, pipeline_gpt4_df_2, pipeline_gpt4_df_3], axis=0, ignore_index=True)

"""Compute the model score for all the gpt-3.5 responses across the three surveys by comparing pref answer and model res number"""
# Perform the AND operation and create the final column
pipeline_df_gpt4['model_score'] = pipeline_df_gpt4['preferred_answer'] == pipeline_df_gpt4['model_res_number']

# Convert boolean values to integers
pipeline_df_gpt4['model_score'] = pipeline_df_gpt4['model_score'].astype(int)
pipeline_df_gpt4['preferred_answer'] = pipeline_df_gpt4['preferred_answer'].astype(int)
pipeline_df_gpt4['model_res_number'] = pipeline_df_gpt4['model_res_number'].astype(int)

pipeline_df_gpt4.to_csv("../../evals/human/pairwise-comparison/results/pipeline_gpt4_results.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pipeline_gpt4_df_2['model_res_number'] = pipeline_gpt4_df_2['question_number'].map(model_response_dict_2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pipeline_gpt4_df_2['question_number'] = pipeline_gpt4_df_2['question_number']+"-survey-2"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pipeline_

## (now for Llama3 pipeline)

In [24]:
"""Get the relevant llama3 questions from each survey"""
# List of question numbers to match in the respective surveys
pipeline_match_1 = [QUESTION_ID_CONVERSION[location] for location in pc_survey_1_index["pipeline-llama-3-70B-instruct"]["locations"]]
pipeline_match_2 = [QUESTION_ID_CONVERSION[location] for location in pc_survey_2_index["pipeline-llama-3-70B-instruct"]["locations"]]
pipeline_match_3 = [QUESTION_ID_CONVERSION[location] for location in pc_survey_3_index["pipeline-llama-3-70B-instruct"]["locations"]]

model_response_number_index_1 = [(QUESTION_ID_CONVERSION[location], model_res_number) for model_res_number, location in zip(pc_survey_1_index["pipeline-llama-3-70B-instruct"]["model_response_number"],pc_survey_1_index["pipeline-llama-3-70B-instruct"]["locations"])]
model_response_number_index_2 = [(QUESTION_ID_CONVERSION[location], model_res_number) for model_res_number, location in zip(pc_survey_2_index["pipeline-llama-3-70B-instruct"]["model_response_number"],pc_survey_2_index["pipeline-llama-3-70B-instruct"]["locations"])]
model_response_number_index_3 = [(QUESTION_ID_CONVERSION[location], model_res_number) for model_res_number, location in zip(pc_survey_3_index["pipeline-llama-3-70B-instruct"]["model_response_number"],pc_survey_3_index["pipeline-llama-3-70B-instruct"]["locations"])]

# Create dictionaries
# model_response_dict_1 = dict(model_response_number_index_1)
model_response_dict_2 = dict(model_response_number_index_2)
# model_response_dict_3 = dict(model_response_number_index_3)

# Filter the DataFrame to only include rows where the 'question_number' column matches any name in the list
# pipeline_llama3_df_3 = pc_queries_1_complete[pc_queries_1_complete['question_number'].isin(pipeline_match_1)]
pipeline_llama3_df_2 = pc_queries_2_complete[pc_queries_2_complete['question_number'].isin(pipeline_match_2)]
# pipeline_llama3_df_3 = pc_queries_3_complete[pc_queries_3_complete['question_number'].isin(pipeline_match_3)]

# Map the 'question_number' to 'model_res_number' using the dictionary
# pipeline_llama3_df_1['model_res_number'] = pipeline_llama3_df_1['question_number'].map(model_response_dict_1)
pipeline_llama3_df_2['model_res_number'] = pipeline_llama3_df_2['question_number'].map(model_response_dict_2)
# pipeline_llama3_df_3['model_res_number'] = pipeline_llama3_df_3['question_number'].map(model_response_dict_3)

# Update the question_number identifiers
# pipeline_llama3_df_1['question_number'] = pipeline_llama3_df_1['question_number']+"-survey-1"
pipeline_llama3_df_2['question_number'] = pipeline_llama3_df_2['question_number']+"-survey-2"
# pipeline_llama3_df_3['question_number'] = pipeline_llama3_df_3['question_number']+"-survey-3"

# Concatenate the three DataFrames along the rows
pipeline_df_llama3 = pipeline_llama3_df_2
# pipeline_df_llama3 = pd.concat([pipeline_llama3_df_1, pipeline_llama3_df_2, pipeline_llama3_df_3], axis=0, ignore_index=True)

"""Compute the model score for all the gpt-3.5 responses across the three surveys by comparing pref answer and model res number"""
# Perform the AND operation and create the final column
pipeline_df_llama3['model_score'] = pipeline_df_llama3['preferred_answer'] == pipeline_df_llama3['model_res_number']

# Convert boolean values to integers
pipeline_df_llama3['model_score'] = pipeline_df_llama3['model_score'].astype(int)
pipeline_df_llama3['preferred_answer'] = pipeline_df_llama3['preferred_answer'].astype(int)
pipeline_df_llama3['model_res_number'] = pipeline_df_llama3['model_res_number'].astype(int)

pipeline_df_llama3.to_csv("../../evals/human/pairwise-comparison/results/pipeline_llama3_results.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pipeline_llama3_df_2['model_res_number'] = pipeline_llama3_df_2['question_number'].map(model_response_dict_2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pipeline_llama3_df_2['question_number'] = pipeline_llama3_df_2['question_number']+"-survey-2"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  p

## Get values of interest from the data

In [5]:
pipeline_gpt35_results = pd.read_csv("../../evals/human/pairwise-comparison/results/pipeline_gpt3_5_results.csv")
pipeline_gpt4_results = pd.read_csv("../../evals/human/pairwise-comparison/results/pipeline_gpt4_results.csv")
pipeline_llama3_results = pd.read_csv("../../evals/human/pairwise-comparison/results/pipeline_llama3_results.csv")

pipeline_gpt35_average = pipeline_gpt35_results.groupby('source_file')[['model_score']].mean()
pipeline_gpt4_average = pipeline_gpt4_results.groupby('source_file')[['model_score']].mean()
pipeline_llama3_average = pipeline_llama3_results.groupby('source_file')[['model_score']].mean()

results_df = pd.concat([pipeline_gpt35_average, pipeline_gpt4_average, pipeline_llama3_average], axis=0, ignore_index=True)
results_df["human_score"] = 1.0 - results_df["model_score"]

# Specify the new column order
new_column_order = ['human_score', 'model_score']

# Reorder the DataFrame columns
results_df = results_df[new_column_order]

print("Averages (0=gpt3.5, 1=gpt4, 2=llama3)")
print(results_df)


Averages (0=gpt3.5, 1=gpt4, 2=llama3)
   human_score  model_score
0     0.400000     0.600000
1     0.333333     0.666667
2     0.250000     0.750000


## Compute krippendorffs alpha

In [8]:
import krippendorff
import numpy as np

# Prepare data for the Krippendorf calculation
df1 = pd.read_csv("../../evals/human/pairwise-comparison/results/pipeline_gpt3_5_results.csv")
df2 = pd.read_csv("../../evals/human/pairwise-comparison/results/pipeline_gpt4_results.csv")
df3 = pd.read_csv("../../evals/human/pairwise-comparison/results/pipeline_llama3_results.csv")

df = pd.concat([df1, df2, df3], axis=0, ignore_index=True)

# Create three pivot tables, one for each metric (the aggfunc is only used to deal with duplicate question_number-respondent pairs)
pivot_preferred_answer = df.pivot_table(index='respondent', columns='question_number', values='preferred_answer', aggfunc='mean')

# Replace missing values with np.nan
pivot_preferred_answer = pivot_preferred_answer.applymap(lambda x: np.nan if pd.isnull(x) else x)

# Convert pivot tables to numpy arrays
array_preferred_answer = pivot_preferred_answer.to_numpy()


verbose = True
if verbose:
    # Print the arrays to check
    print("Preferred Answer Array:")
    print(array_preferred_answer)


Preferred Answer Array:
[[1. 1. 0. 0. 1. 1. 0. 1. 1. 0. 0. 1.]
 [0. 1. 0. 0. 1. 1. 0. 0. 1. 1. 0. 1.]
 [1. 0. 0. 0. 1. 1. 0. 1. 1. 0. 1. 1.]]


  pivot_preferred_answer = pivot_preferred_answer.applymap(lambda x: np.nan if pd.isnull(x) else x)


In [12]:
# Calculate Krippendorff's alpha for each question
alpha_preferred_answer = krippendorff.alpha(reliability_data=array_preferred_answer, level_of_measurement="ordinal")

print(alpha_preferred_answer)

0.453125
