# Combine Data.ipynb

Merge all datasets together before conducting analysis.

In [1]:
import pandas as pd


In [2]:
# Concatenate cw_graded.xlsx and gsm8k_graded.xlsx
cw_grading = pd.read_excel('cw_graded.xlsx')
gsm8k_grading = pd.read_excel('gsm8k_graded.xlsx')

stacked_df = pd.concat([cw_grading, gsm8k_grading])
stacked_df


Unnamed: 0,model_task_method,conversation_number,coherence_1_incoherent_10_very_coherent,task_constraints_followed_0_not_followed_1_followed,ease_of_review_1_easy_10_hard,correct
0,td3_cw_direct_prompting_responses,1,1.0,1.0,,
1,td3_cw_direct_prompting_responses,2,7.0,0.0,,
2,td3_cw_direct_prompting_responses,3,1.0,1.0,,
3,td3_cw_direct_prompting_responses,4,10.0,1.0,,
4,td3_cw_direct_prompting_responses,5,4.0,1.0,,
...,...,...,...,...,...,...
1595,gpt4_gsm8k_manual_cot_responses,96,,,,1.0
1596,gpt4_gsm8k_manual_cot_responses,97,,,,1.0
1597,gpt4_gsm8k_manual_cot_responses,98,,,,1.0
1598,gpt4_gsm8k_manual_cot_responses,99,,,,1.0


### Adding fine-tuned GPT-3.5 predictions

In [3]:
# Predictions from fine-tuned GPT-3.5
# Merge on "predictions_based_on_first_10.csv", "items_to_predict_using_last_10.csv"
# Limit columns to model_task_method, conversation_number, Prediction

first_10 = pd.read_csv('predictions_based_on_first_10.csv')[['model_task_method', 'conversation_number', 'Prediction']]
# Rename column Prediction to Prediction_Based_On_First_10
first_10.rename(columns={'Prediction': 'Prediction_Based_On_First_10'}, inplace=True)

last_10 = pd.read_csv('predictions_based_on_last_10.csv')[['model_task_method', 'conversation_number', 'Prediction']]
# Rename column Prediction to Prediction_Based_On_Last_10
last_10.rename(columns={'Prediction': 'Prediction_Based_On_Last_10'}, inplace=True)

print(first_10)
print(last_10)


                      model_task_method  conversation_number  \
0     td3_cw_direct_prompting_responses                   11   
1     td3_cw_direct_prompting_responses                   12   
2     td3_cw_direct_prompting_responses                   13   
3     td3_cw_direct_prompting_responses                   14   
4     td3_cw_direct_prompting_responses                   15   
...                                 ...                  ...   
1435       gpt4_cw_manual_cot_responses                   96   
1436       gpt4_cw_manual_cot_responses                   97   
1437       gpt4_cw_manual_cot_responses                   98   
1438       gpt4_cw_manual_cot_responses                   99   
1439       gpt4_cw_manual_cot_responses                  100   

     Prediction_Based_On_First_10  
0                  {"Score": "7"}  
1                  {"Score": "7"}  
2                  {"Score": "3"}  
3                  {"Score": "8"}  
4                  {"Score": "3"}  
...            

In [4]:
# Merge on model_task_method and conversation_number
stacked_df_w_pred = stacked_df.merge(first_10, on=['model_task_method', 'conversation_number'], how = 'left')
stacked_df_w_pred = stacked_df_w_pred.merge(last_10, on=['model_task_method', 'conversation_number'], how = 'left')


In [5]:
# Remove {"Score": "
# Remove "}
stacked_df_w_pred['Prediction_Based_On_First_10'] = stacked_df_w_pred['Prediction_Based_On_First_10'].str.replace('{"Score": "', '')
stacked_df_w_pred['Prediction_Based_On_First_10'] = stacked_df_w_pred['Prediction_Based_On_First_10'].str.replace('"}', '')
stacked_df_w_pred['Prediction_Based_On_Last_10'] = stacked_df_w_pred['Prediction_Based_On_Last_10'].str.replace('{"Score": "', '')
stacked_df_w_pred['Prediction_Based_On_Last_10'] = stacked_df_w_pred['Prediction_Based_On_Last_10'].str.replace('"}', '')

# Convert score to float
stacked_df_w_pred['Prediction_Based_On_First_10'] = stacked_df_w_pred['Prediction_Based_On_First_10'].astype(float)
stacked_df_w_pred['Prediction_Based_On_Last_10'] = stacked_df_w_pred['Prediction_Based_On_Last_10'].astype(float)

stacked_df_w_pred


Unnamed: 0,model_task_method,conversation_number,coherence_1_incoherent_10_very_coherent,task_constraints_followed_0_not_followed_1_followed,ease_of_review_1_easy_10_hard,correct,Prediction_Based_On_First_10,Prediction_Based_On_Last_10
0,td3_cw_direct_prompting_responses,1,1.0,1.0,,,,1.0
1,td3_cw_direct_prompting_responses,2,7.0,0.0,,,,7.0
2,td3_cw_direct_prompting_responses,3,1.0,1.0,,,,1.0
3,td3_cw_direct_prompting_responses,4,10.0,1.0,,,,7.0
4,td3_cw_direct_prompting_responses,5,4.0,1.0,,,,1.0
...,...,...,...,...,...,...,...,...
3195,gpt4_gsm8k_manual_cot_responses,96,,,,1.0,,
3196,gpt4_gsm8k_manual_cot_responses,97,,,,1.0,,
3197,gpt4_gsm8k_manual_cot_responses,98,,,,1.0,,
3198,gpt4_gsm8k_manual_cot_responses,99,,,,1.0,,


In [6]:
# Create column "Aggregated_Prediction" as the mean of the two predictions
# If one is missing, use the other
stacked_df_w_pred['Aggregated_Prediction'] = stacked_df_w_pred[['Prediction_Based_On_First_10', 'Prediction_Based_On_Last_10']].mean(axis=1)
stacked_df_w_pred['Aggregated_Prediction'] = stacked_df_w_pred['Aggregated_Prediction'].fillna(stacked_df_w_pred['Prediction_Based_On_First_10'])
stacked_df_w_pred['Aggregated_Prediction'] = stacked_df_w_pred['Aggregated_Prediction'].fillna(stacked_df_w_pred['Prediction_Based_On_Last_10'])


#### Finetuning with longer prompts

In [7]:
# Predictions from fine-tuned GPT-3.5
# Merge on "predictions_based_on_first_10.csv", "items_to_predict_using_last_10.csv"
# Limit columns to model_task_method, conversation_number, Prediction

first_10_LP = pd.read_csv('predictions_based_on_first_10_LP.csv')[['model_task_method', 'conversation_number', 'Prediction', 'response']]
# Rename column Prediction to Prediction_Based_On_First_10
first_10_LP.rename(columns={'Prediction': 'Prediction_Based_On_First_10_LP'}, inplace=True)
# Rename column response
first_10_LP.rename(columns={'response': 'response_Based_On_First_10_LP'}, inplace=True)

last_10_LP = pd.read_csv('predictions_based_on_last_10_LP.csv')[['model_task_method', 'conversation_number', 'Prediction', 'response']]
# Rename column Prediction to Prediction_Based_On_Last_10
last_10_LP.rename(columns={'Prediction': 'Prediction_Based_On_Last_10_LP'}, inplace=True)
# Rename column response
last_10_LP.rename(columns={'response': 'response_Based_On_Last_10_LP'}, inplace=True)

print(first_10_LP)
print(last_10_LP)


                      model_task_method  conversation_number  \
0     td3_cw_direct_prompting_responses                   11   
1     td3_cw_direct_prompting_responses                   12   
2     td3_cw_direct_prompting_responses                   13   
3     td3_cw_direct_prompting_responses                   14   
4     td3_cw_direct_prompting_responses                   15   
...                                 ...                  ...   
1435       gpt4_cw_manual_cot_responses                   96   
1436       gpt4_cw_manual_cot_responses                   97   
1437       gpt4_cw_manual_cot_responses                   98   
1438       gpt4_cw_manual_cot_responses                   99   
1439       gpt4_cw_manual_cot_responses                  100   

     Prediction_Based_On_First_10_LP  \
0                     {"Score": "7"}   
1                     {"Score": "7"}   
2                     {"Score": "3"}   
3                     {"Score": "8"}   
4                     {"Score":

In [8]:
# Merge on model_task_method and conversation_number
stacked_df_w_pred = stacked_df_w_pred.merge(first_10_LP, on=['model_task_method', 'conversation_number'], how = 'left')
stacked_df_w_pred = stacked_df_w_pred.merge(last_10_LP, on=['model_task_method', 'conversation_number'], how = 'left')


In [9]:
# Create response column based on response_Based_On_First_10_LP and response_Based_On_Last_10_LP
# Use response_Based_On_First_10_LP if available, otherwise use response_Based_On_Last_10_LP
stacked_df_w_pred['response_LP'] = stacked_df_w_pred['response_Based_On_First_10_LP'].fillna(stacked_df_w_pred['response_Based_On_Last_10_LP'])


In [10]:
# Remove {"Score": "
# Remove "}
stacked_df_w_pred['Prediction_Based_On_First_10_LP'] = stacked_df_w_pred['Prediction_Based_On_First_10_LP'].str.replace('{"Score": "', '')
stacked_df_w_pred['Prediction_Based_On_First_10_LP'] = stacked_df_w_pred['Prediction_Based_On_First_10_LP'].str.replace('"}', '')
stacked_df_w_pred['Prediction_Based_On_Last_10_LP'] = stacked_df_w_pred['Prediction_Based_On_Last_10_LP'].str.replace('{"Score": "', '')
stacked_df_w_pred['Prediction_Based_On_Last_10_LP'] = stacked_df_w_pred['Prediction_Based_On_Last_10_LP'].str.replace('"}', '')

# Convert score to float
stacked_df_w_pred['Prediction_Based_On_First_10_LP'] = stacked_df_w_pred['Prediction_Based_On_First_10_LP'].astype(float)
stacked_df_w_pred['Prediction_Based_On_Last_10_LP'] = stacked_df_w_pred['Prediction_Based_On_Last_10_LP'].astype(float)

stacked_df_w_pred


Unnamed: 0,model_task_method,conversation_number,coherence_1_incoherent_10_very_coherent,task_constraints_followed_0_not_followed_1_followed,ease_of_review_1_easy_10_hard,correct,Prediction_Based_On_First_10,Prediction_Based_On_Last_10,Aggregated_Prediction,Prediction_Based_On_First_10_LP,response_Based_On_First_10_LP,Prediction_Based_On_Last_10_LP,response_Based_On_Last_10_LP,response_LP
0,td3_cw_direct_prompting_responses,1,1.0,1.0,,,,1.0,1.0,,,1.0,Learning to do a handstand is a fun activity f...,Learning to do a handstand is a fun activity f...
1,td3_cw_direct_prompting_responses,2,7.0,0.0,,,,7.0,7.0,,,6.0,The hawk was used to hunting what he needed fo...,The hawk was used to hunting what he needed fo...
2,td3_cw_direct_prompting_responses,3,1.0,1.0,,,,1.0,1.0,,,2.0,I love the smell of roasting almonds in the ki...,I love the smell of roasting almonds in the ki...
3,td3_cw_direct_prompting_responses,4,10.0,1.0,,,,7.0,7.0,,,8.0,Ralph's bedroom was routinely filled up with s...,Ralph's bedroom was routinely filled up with s...
4,td3_cw_direct_prompting_responses,5,4.0,1.0,,,,1.0,1.0,,,2.0,Joe had a unique way of dealing with the hospi...,Joe had a unique way of dealing with the hospi...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3195,gpt4_gsm8k_manual_cot_responses,96,,,,1.0,,,,,,,,
3196,gpt4_gsm8k_manual_cot_responses,97,,,,1.0,,,,,,,,
3197,gpt4_gsm8k_manual_cot_responses,98,,,,1.0,,,,,,,,
3198,gpt4_gsm8k_manual_cot_responses,99,,,,1.0,,,,,,,,


In [11]:
# Create column "Aggregated_Prediction" as the mean of the two predictions
# If one is missing, use the other
stacked_df_w_pred['Aggregated_Prediction_LP'] = stacked_df_w_pred[['Prediction_Based_On_First_10_LP', 'Prediction_Based_On_Last_10_LP']].mean(axis=1)
stacked_df_w_pred['Aggregated_Prediction_LP'] = stacked_df_w_pred['Aggregated_Prediction_LP'].fillna(stacked_df_w_pred['Prediction_Based_On_First_10_LP'])
stacked_df_w_pred['Aggregated_Prediction_LP'] = stacked_df_w_pred['Aggregated_Prediction_LP'].fillna(stacked_df_w_pred['Prediction_Based_On_Last_10_LP'])


### More finetuned predictions

First-last split

In [12]:
# Predictions from fine-tuned GPT-3.5
# Limit columns to model_task_method, conversation_number, Prediction

first_50_LP = pd.read_csv('predictions_based_on_first_50_LP.csv')[['model_task_method', 'conversation_number', 'Prediction', 'response']]
# Rename column Prediction to Prediction_Based_On_First_10
first_50_LP.rename(columns={'Prediction': 'Prediction_Based_On_First_50_LP'}, inplace=True)
# Rename column response
first_50_LP.rename(columns={'response': 'response_Based_On_First_50_LP'}, inplace=True)

last_50_LP = pd.read_csv('predictions_based_on_last_50_LP.csv')[['model_task_method', 'conversation_number', 'Prediction', 'response']]
# Rename column Prediction to Prediction_Based_On_Last_10
last_50_LP.rename(columns={'Prediction': 'Prediction_Based_On_Last_50_LP'}, inplace=True)
# Rename column response
last_50_LP.rename(columns={'response': 'response_Based_On_Last_50_LP'}, inplace=True)

# Merge on model_task_method and conversation_number
stacked_df_w_pred = stacked_df_w_pred.merge(first_50_LP, on=['model_task_method', 'conversation_number'], how = 'left')
stacked_df_w_pred = stacked_df_w_pred.merge(last_50_LP, on=['model_task_method', 'conversation_number'], how = 'left')

# Create response column based on response_Based_On_First_10_LP and response_Based_On_Last_10_LP
# Use response_Based_On_First_10_LP if available, otherwise use response_Based_On_Last_10_LP
# stacked_df_w_pred['response_LP'] = stacked_df_w_pred['response_Based_On_First_50_LP'].fillna(stacked_df_w_pred['response_Based_On_Last_50_LP'])

# Remove {"Score": "
# Remove "}
stacked_df_w_pred['Prediction_Based_On_First_50_LP'] = stacked_df_w_pred['Prediction_Based_On_First_50_LP'].str.replace('{"Score": "', '')
stacked_df_w_pred['Prediction_Based_On_First_50_LP'] = stacked_df_w_pred['Prediction_Based_On_First_50_LP'].str.replace('"}', '')
stacked_df_w_pred['Prediction_Based_On_Last_50_LP'] = stacked_df_w_pred['Prediction_Based_On_Last_50_LP'].str.replace('{"Score": "', '')
stacked_df_w_pred['Prediction_Based_On_Last_50_LP'] = stacked_df_w_pred['Prediction_Based_On_Last_50_LP'].str.replace('"}', '')

# Convert score to float
stacked_df_w_pred['Prediction_Based_On_First_50_LP'] = stacked_df_w_pred['Prediction_Based_On_First_50_LP'].astype(float)
stacked_df_w_pred['Prediction_Based_On_Last_50_LP'] = stacked_df_w_pred['Prediction_Based_On_Last_50_LP'].astype(float)

# Create column "Aggregated_Prediction" as the mean of the two predictions
# If one is missing, use the other
stacked_df_w_pred['Aggregated_Prediction_50_LP'] = stacked_df_w_pred[['Prediction_Based_On_First_50_LP', 'Prediction_Based_On_Last_50_LP']].mean(axis=1)
stacked_df_w_pred['Aggregated_Prediction_50_LP'] = stacked_df_w_pred['Aggregated_Prediction_50_LP'].fillna(stacked_df_w_pred['Prediction_Based_On_First_50_LP'])
stacked_df_w_pred['Aggregated_Prediction_50_LP'] = stacked_df_w_pred['Aggregated_Prediction_50_LP'].fillna(stacked_df_w_pred['Prediction_Based_On_Last_50_LP'])


In [13]:
print(stacked_df_w_pred.columns)


Index(['model_task_method', 'conversation_number',
       'coherence_1_incoherent_10_very_coherent',
       'task_constraints_followed_0_not_followed_1_followed',
       'ease_of_review_1_easy_10_hard', 'correct',
       'Prediction_Based_On_First_10', 'Prediction_Based_On_Last_10',
       'Aggregated_Prediction', 'Prediction_Based_On_First_10_LP',
       'response_Based_On_First_10_LP', 'Prediction_Based_On_Last_10_LP',
       'response_Based_On_Last_10_LP', 'response_LP',
       'Aggregated_Prediction_LP', 'Prediction_Based_On_First_50_LP',
       'response_Based_On_First_50_LP', 'Prediction_Based_On_Last_50_LP',
       'response_Based_On_Last_50_LP', 'Aggregated_Prediction_50_LP'],
      dtype='object')


Random split

In [14]:
random_50_LP_1 = pd.read_csv('predictions_based_on_random_50_LP_1.csv')[['model_task_method', 'conversation_number', 'Prediction', 'response']]
# Rename column Prediction to Prediction_Based_On_random_50_LP_1
random_50_LP_1.rename(columns={'Prediction': 'Prediction_Based_On_random_50_LP_1'}, inplace=True)
# Rename column response
random_50_LP_1.rename(columns={'response': 'response_Based_On_random_50_LP_1'}, inplace=True)

random_50_LP_2 = pd.read_csv('predictions_based_on_random_50_LP_2.csv')[['model_task_method', 'conversation_number', 'Prediction', 'response']]
# Rename column Prediction to Prediction_Based_On_random_50_LP_2
random_50_LP_2.rename(columns={'Prediction': 'Prediction_Based_On_random_50_LP_2'}, inplace=True)
# Rename column response
random_50_LP_2.rename(columns={'response': 'response_Based_On_random_50_LP_2'}, inplace=True)

# Merge on model_task_method and conversation_number
stacked_df_w_pred = stacked_df_w_pred.merge(random_50_LP_1, on=['model_task_method', 'conversation_number'], how = 'left')
stacked_df_w_pred = stacked_df_w_pred.merge(random_50_LP_2, on=['model_task_method', 'conversation_number'], how = 'left')

# Create response column based on response_Based_On_First_10_LP and response_Based_On_Last_10_LP
# Use response_Based_On_First_10_LP if available, otherwise use response_Based_On_Last_10_LP
# stacked_df_w_pred['response_LP'] = stacked_df_w_pred['response_Based_On_First_50_LP'].fillna(stacked_df_w_pred['response_Based_On_Last_50_LP'])

# Remove {"Score": "
# Remove "}
stacked_df_w_pred['Prediction_Based_On_random_50_LP_1'] = stacked_df_w_pred['Prediction_Based_On_random_50_LP_1'].str.replace('{"Score": "', '')
stacked_df_w_pred['Prediction_Based_On_random_50_LP_1'] = stacked_df_w_pred['Prediction_Based_On_random_50_LP_1'].str.replace('"}', '')
stacked_df_w_pred['Prediction_Based_On_random_50_LP_2'] = stacked_df_w_pred['Prediction_Based_On_random_50_LP_2'].str.replace('{"Score": "', '')
stacked_df_w_pred['Prediction_Based_On_random_50_LP_2'] = stacked_df_w_pred['Prediction_Based_On_random_50_LP_2'].str.replace('"}', '')

# Convert score to float
stacked_df_w_pred['Prediction_Based_On_random_50_LP_1'] = stacked_df_w_pred['Prediction_Based_On_random_50_LP_1'].astype(float)
stacked_df_w_pred['Prediction_Based_On_random_50_LP_2'] = stacked_df_w_pred['Prediction_Based_On_random_50_LP_2'].astype(float)

# Create column "Aggregated_Prediction" as the mean of the two predictions
# If one is missing, use the other
stacked_df_w_pred['Aggregated_Prediction_random_50_LP'] = stacked_df_w_pred[['Prediction_Based_On_random_50_LP_1', 'Prediction_Based_On_random_50_LP_2']].mean(axis=1)
stacked_df_w_pred['Aggregated_Prediction_random_50_LP'] = stacked_df_w_pred['Aggregated_Prediction_random_50_LP'].fillna(stacked_df_w_pred['Prediction_Based_On_random_50_LP_1'])
stacked_df_w_pred['Aggregated_Prediction_random_50_LP'] = stacked_df_w_pred['Aggregated_Prediction_random_50_LP'].fillna(stacked_df_w_pred['Prediction_Based_On_random_50_LP_2'])


### Back to more metrics

In [15]:
# Automatic_Metrics.xlsx
Automatic_Metrics = pd.read_excel('Automatic_Metrics.xlsx')

Automatic_Metrics


Unnamed: 0,model_task_method,conversation_number,conversation_length,input_length,output_length,conversation_cost,gsm8k_question_index,gsm8k_answer,gsm8k_length_vs_provided,length_vs_direct_prompting,...,num_linebreaks_prompts,num_sentences_prompts,num_step_i_prompts,num_1_dot_etc_prompts,sentence_length_prompts,fres_prompts,num_linebreaks_provided,num_sentences_provided,num_step_i_provided,num_1_dot_etc_provided
0,td3_gsm8k_direct_prompting_responses,1,82,69,12,0.00164,0.0,Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eg...,0.689076,1.000000,...,1,5,0,0,,,2.0,3.0,0.0,0.0
1,td3_gsm8k_direct_prompting_responses,2,36,30,5,0.00072,1.0,It takes 2/2=<<2/2=1>>1 bolt of white fiber\nS...,0.514286,1.000000,...,1,3,0,0,,,2.0,1.0,0.0,0.0
2,td3_gsm8k_direct_prompting_responses,3,104,53,50,0.00208,2.0,The cost of the house and repairs came out to ...,0.608187,1.000000,...,1,5,0,0,,,4.0,1.0,0.0,2.0
3,td3_gsm8k_direct_prompting_responses,4,39,36,2,0.00078,3.0,He sprints 3*3=<<3*3=9>>9 times\nSo he runs 9*...,0.573529,1.000000,...,1,4,0,0,,,2.0,1.0,0.0,0.0
4,td3_gsm8k_direct_prompting_responses,5,115,109,5,0.00230,4.0,"If each chicken eats 3 cups of feed per day, t...",0.598958,1.000000,...,1,6,0,0,,,2.0,3.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3195,gpt4_cw_manual_cot_responses,96,949,704,245,0.03582,,,,2.711429,...,0,35,0,12,17.971429,73.78,,,,
3196,gpt4_cw_manual_cot_responses,97,1012,714,298,0.03930,,,,2.899713,...,0,35,0,12,18.228571,73.58,,,,
3197,gpt4_cw_manual_cot_responses,98,979,711,268,0.03741,,,,2.734637,...,0,35,0,12,18.200000,73.58,,,,
3198,gpt4_cw_manual_cot_responses,99,1023,703,320,0.04029,,,,3.177019,...,0,35,0,12,17.971429,73.78,,,,


In [16]:
# Merge on model_task_method, conversation_number
merged_df = pd.merge(stacked_df_w_pred, Automatic_Metrics, on=['model_task_method', 'conversation_number'], how='left')

merged_df


Unnamed: 0,model_task_method,conversation_number,coherence_1_incoherent_10_very_coherent,task_constraints_followed_0_not_followed_1_followed,ease_of_review_1_easy_10_hard,correct,Prediction_Based_On_First_10,Prediction_Based_On_Last_10,Aggregated_Prediction,Prediction_Based_On_First_10_LP,...,num_linebreaks_prompts,num_sentences_prompts,num_step_i_prompts,num_1_dot_etc_prompts,sentence_length_prompts,fres_prompts,num_linebreaks_provided,num_sentences_provided,num_step_i_provided,num_1_dot_etc_provided
0,td3_cw_direct_prompting_responses,1,1.0,1.0,,,,1.0,1.0,,...,0,5,0,2,10.0,94.35,,,,
1,td3_cw_direct_prompting_responses,2,7.0,0.0,,,,7.0,7.0,,...,0,5,0,2,11.4,86.20,,,,
2,td3_cw_direct_prompting_responses,3,1.0,1.0,,,,1.0,1.0,,...,0,5,0,2,12.2,74.69,,,,
3,td3_cw_direct_prompting_responses,4,10.0,1.0,,,,7.0,7.0,,...,0,5,0,2,9.0,69.99,,,,
4,td3_cw_direct_prompting_responses,5,4.0,1.0,,,,1.0,1.0,,...,0,5,0,2,12.0,74.90,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3195,gpt4_gsm8k_manual_cot_responses,96,,,,1.0,,,,,...,0,45,0,20,,,4.0,5.0,0.0,1.0
3196,gpt4_gsm8k_manual_cot_responses,97,,,,1.0,,,,,...,0,46,0,20,,,2.0,3.0,0.0,0.0
3197,gpt4_gsm8k_manual_cot_responses,98,,,,1.0,,,,,...,0,47,0,20,,,3.0,4.0,0.0,0.0
3198,gpt4_gsm8k_manual_cot_responses,99,,,,1.0,,,,,...,0,47,0,20,,,2.0,3.0,0.0,0.0


### CW Compliance

In [17]:
# Add Creative_Writing_Compliance.xlsx
Creative_Writing_Compliance = pd.read_excel('Creative_Writing_Compliance.xlsx')

merged_df = pd.merge(merged_df, Creative_Writing_Compliance, on=['model_task_method', 'conversation_number'], how='left')
merged_df


Unnamed: 0,model_task_method,conversation_number,coherence_1_incoherent_10_very_coherent,task_constraints_followed_0_not_followed_1_followed,ease_of_review_1_easy_10_hard,correct,Prediction_Based_On_First_10,Prediction_Based_On_Last_10,Aggregated_Prediction,Prediction_Based_On_First_10_LP,...,num_sentences_prompts,num_step_i_prompts,num_1_dot_etc_prompts,sentence_length_prompts,fres_prompts,num_linebreaks_provided,num_sentences_provided,num_step_i_provided,num_1_dot_etc_provided,compliance
0,td3_cw_direct_prompting_responses,1,1.0,1.0,,,,1.0,1.0,,...,5,0,2,10.0,94.35,,,,,1.0
1,td3_cw_direct_prompting_responses,2,7.0,0.0,,,,7.0,7.0,,...,5,0,2,11.4,86.20,,,,,0.0
2,td3_cw_direct_prompting_responses,3,1.0,1.0,,,,1.0,1.0,,...,5,0,2,12.2,74.69,,,,,1.0
3,td3_cw_direct_prompting_responses,4,10.0,1.0,,,,7.0,7.0,,...,5,0,2,9.0,69.99,,,,,1.0
4,td3_cw_direct_prompting_responses,5,4.0,1.0,,,,1.0,1.0,,...,5,0,2,12.0,74.90,,,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3195,gpt4_gsm8k_manual_cot_responses,96,,,,1.0,,,,,...,45,0,20,,,4.0,5.0,0.0,1.0,
3196,gpt4_gsm8k_manual_cot_responses,97,,,,1.0,,,,,...,46,0,20,,,2.0,3.0,0.0,0.0,
3197,gpt4_gsm8k_manual_cot_responses,98,,,,1.0,,,,,...,47,0,20,,,3.0,4.0,0.0,0.0,
3198,gpt4_gsm8k_manual_cot_responses,99,,,,1.0,,,,,...,47,0,20,,,2.0,3.0,0.0,0.0,


In [18]:
# Create coherence_1_incoherent_10_very_coherent_compliance_adjusted
# This is coherence_1_incoherent_10_very_coherent but if the conversation is not compliant, the score is 1
merged_df['coherence_1_incoherent_10_very_coherent_compliance_adjusted'] = merged_df['coherence_1_incoherent_10_very_coherent']
merged_df.loc[merged_df['compliance'] == 0, 'coherence_1_incoherent_10_very_coherent_compliance_adjusted'] = 1


## Adding Columns

In [19]:
# Get model, task, and method from model_task_method
# Model - first item when split on underscores
# Task - second item when split on underscores
# method - all remaining items when split on underscores, removing the last item
merged_df['model'] = merged_df['model_task_method'].str.split('_').str[0]
merged_df['task'] = merged_df['model_task_method'].str.split('_').str[1]
merged_df['method'] = merged_df['model_task_method'].str.split('_').str[2:].str[:-1].str.join('_')

# Print value counts of model, task, and method
print(merged_df['model'].value_counts())
print(merged_df['task'].value_counts())
print(merged_df['method'].value_counts())


model
td3     1600
gpt4    1600
Name: count, dtype: int64
task
cw       1600
gsm8k    1600
Name: count, dtype: int64
method
direct_prompting     400
zero_shot_cot        400
ape_zero_shot_cot    400
least_to_most        400
manual_few_shot      400
manual_cot           400
tree_of_thought      400
self_refine          400
Name: count, dtype: int64


In [20]:
# Column for combined accuracy_quality - correct_or_incorrect if task = "gsm8k", creative_writing_score if task = "cw"
merged_df['accuracy_quality'] = merged_df.apply(lambda row: row['correct'] if row['task'] == 'gsm8k' else row['coherence_1_incoherent_10_very_coherent'], axis=1)

# Also create accuracy_quality_compliance_adjusted
merged_df['accuracy_quality_compliance_adjusted'] = merged_df.apply(lambda row: row['correct'] if row['task'] == 'gsm8k' else row['coherence_1_incoherent_10_very_coherent_compliance_adjusted'], axis=1)


## Combined Output

In [21]:
# Output Combined_Data.xlsx
merged_df.to_excel('Combined_Data.xlsx', index=False)


In [24]:
print(merged_df.columns)


Index(['model_task_method', 'conversation_number',
       'coherence_1_incoherent_10_very_coherent',
       'task_constraints_followed_0_not_followed_1_followed',
       'ease_of_review_1_easy_10_hard', 'correct',
       'Prediction_Based_On_First_10', 'Prediction_Based_On_Last_10',
       'Aggregated_Prediction', 'Prediction_Based_On_First_10_LP',
       'response_Based_On_First_10_LP', 'Prediction_Based_On_Last_10_LP',
       'response_Based_On_Last_10_LP', 'response_LP',
       'Aggregated_Prediction_LP', 'Prediction_Based_On_First_50_LP',
       'response_Based_On_First_50_LP', 'Prediction_Based_On_Last_50_LP',
       'response_Based_On_Last_50_LP', 'Aggregated_Prediction_50_LP',
       'Prediction_Based_On_random_50_LP_1',
       'response_Based_On_random_50_LP_1',
       'Prediction_Based_On_random_50_LP_2',
       'response_Based_On_random_50_LP_2',
       'Aggregated_Prediction_random_50_LP', 'conversation_length',
       'input_length', 'output_length', 'conversation_cost',
  

## Direct Prompting Comparison

In [22]:
# Split dataset
combined_data = pd.read_excel('Combined_Data.xlsx')
no_direct_prompting_data = combined_data[combined_data['method'] != 'direct_prompting']
direct_prompting_data = combined_data[combined_data['method'] == 'direct_prompting']

# Add prefix dp_ to columns in direct_prompting_data
direct_prompting_data = direct_prompting_data.add_prefix('dp_')

# Left join datasets on model = dp_model, task = dp_task
direct_prompting_comparison = no_direct_prompting_data.merge(direct_prompting_data, left_on=['model', 'task', 'conversation_number'], right_on=['dp_model', 'dp_task', 'dp_conversation_number'], how='left')

direct_prompting_comparison


Unnamed: 0,model_task_method,conversation_number,coherence_1_incoherent_10_very_coherent,task_constraints_followed_0_not_followed_1_followed,ease_of_review_1_easy_10_hard,correct,Prediction_Based_On_First_10,Prediction_Based_On_Last_10,Aggregated_Prediction,Prediction_Based_On_First_10_LP,...,dp_num_sentences_provided,dp_num_step_i_provided,dp_num_1_dot_etc_provided,dp_compliance,dp_coherence_1_incoherent_10_very_coherent_compliance_adjusted,dp_model,dp_task,dp_method,dp_accuracy_quality,dp_accuracy_quality_compliance_adjusted
0,td3_cw_zero_shot_cot_responses,1,2.0,1.0,2.0,,,1.0,1.0,,...,,,,1.0,1.0,td3,cw,direct_prompting,1,1
1,td3_cw_zero_shot_cot_responses,2,8.0,1.0,3.0,,,3.0,3.0,,...,,,,0.0,1.0,td3,cw,direct_prompting,7,1
2,td3_cw_zero_shot_cot_responses,3,8.0,1.0,2.0,,,1.0,1.0,,...,,,,1.0,1.0,td3,cw,direct_prompting,1,1
3,td3_cw_zero_shot_cot_responses,4,9.0,0.0,1.0,,,8.0,8.0,,...,,,,1.0,10.0,td3,cw,direct_prompting,10,10
4,td3_cw_zero_shot_cot_responses,5,1.0,1.0,6.0,,,1.0,1.0,,...,,,,1.0,4.0,td3,cw,direct_prompting,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2795,gpt4_gsm8k_manual_cot_responses,96,,,,1.0,,,,,...,5.0,0.0,1.0,,,gpt4,gsm8k,direct_prompting,1,1
2796,gpt4_gsm8k_manual_cot_responses,97,,,,1.0,,,,,...,3.0,0.0,0.0,,,gpt4,gsm8k,direct_prompting,0,0
2797,gpt4_gsm8k_manual_cot_responses,98,,,,1.0,,,,,...,4.0,0.0,0.0,,,gpt4,gsm8k,direct_prompting,0,0
2798,gpt4_gsm8k_manual_cot_responses,99,,,,1.0,,,,,...,3.0,0.0,0.0,,,gpt4,gsm8k,direct_prompting,1,1


In [23]:
# Save direct_prompting_comparison to Excel
direct_prompting_comparison.to_excel('direct_prompting_comparison.xlsx', index=False)
