# Prepare results for statistical testing

In [1]:
import pandas as pd
from plot_utils import load_results, get_dataframes

In [2]:
def _is_whole(values, atol=1e-4):
    """Return a boolean Series marking entries within ``atol`` of an integer.

    NaN entries compare as False, i.e. they are never considered whole.
    """
    return (values - values.round()).abs() <= atol


def _to_counts(proportions, n_trials, label, atol=1e-4):
    """Convert proportions of ``n_trials`` trials into integer counts.

    The product ``proportions * n_trials`` is rounded to the nearest integer
    so that floating-point noise (e.g. 160.00000000000003) neither trips the
    validation nor gets truncated by the integer cast.

    Raises
    ------
    ValueError
        If any product is further than ``atol`` from a whole number
        (this includes NaN).
    """
    raw = proportions * n_trials
    if not _is_whole(raw, atol).all():
        raise ValueError(
            f"'{label}' * {n_trials} does not give whole trial counts")
    return raw.round().astype(int)


def process_df(df, task='category', aggregated=False):
    """Turn accuracies into success/failure counts for statistical testing.

    Parameters
    ----------
    df : pandas.DataFrame
        With ``aggregated=True`` only the 'Accuracy' column is read.
        With ``aggregated=False`` the columns 'Step', 'Seed', 'Rotation',
        'Curriculum', 'TrainA' and 'TrainB' are read, plus 'Metalearned',
        'TestA' and 'TestB' for the category task or 'Test' for the grid
        task. The input is not modified.
    task : {'category', 'grid'}
        Selects the trial counts and grouping columns.
    aggregated : bool
        True: 'Accuracy' already covers 10 episodes; 'Successes' and
        'Failures' columns are added to every row.
        False: rows are per step; only rows at the maximum 'Step' are kept
        and the train counts are summed per group.

    Returns
    -------
    pandas.DataFrame
        A new dataframe. As before, the final ``reset_index()`` keeps the
        previous index as an extra leading column.

    Raises
    ------
    ValueError
        If ``task`` is unknown, or if an accuracy does not correspond to a
        whole number of trials.
    """
    if task not in ('category', 'grid'):
        raise ValueError(f"Unknown task: {task}")

    # Work on a copy so the caller's dataframe is left untouched
    df = df.copy()

    if aggregated:
        # category: 32 trials per episode, 10 episodes
        # grid:     16 trials per episode, 10 episodes
        total_trials = 320 if task == 'category' else 160
        df['Successes'] = _to_counts(df['Accuracy'], total_trials, 'Accuracy')
        df['Failures'] = total_trials - df['Successes']
        return df.reset_index()

    if task == 'category':
        total_trials = 16  # 16 trials per group per episode
        # The test columns do not enter the totals; they are only validated
        for col in ('TestA', 'TestB'):
            _to_counts(df[col], total_trials, col)
        successes = (_to_counts(df['TrainA'], total_trials, 'TrainA')
                     + _to_counts(df['TrainB'], total_trials, 'TrainB'))
        train_trials = 2 * total_trials
        group_by = ['Metalearned', 'Seed', 'Rotation', 'Curriculum']
    else:
        # The test column does not enter the totals; it is only validated
        _to_counts(df['Test'], 16, 'Test')
        # Train groups have 5 and 4 trials. Normally A has 5 and B has 4;
        # rows where that assumption does not yield whole counts are treated
        # as having the group sizes flipped.
        total_trialsA = 5
        total_trialsB = 4
        unflipped = (_is_whole(df['TrainA'] * total_trialsA)
                     & _is_whole(df['TrainB'] * total_trialsB))
        five_trial = df['TrainA'].where(unflipped, df['TrainB'])
        four_trial = df['TrainB'].where(unflipped, df['TrainA'])
        successes = (
            _to_counts(five_trial, total_trialsA, 'TrainA/TrainB (5 trials)')
            + _to_counts(four_trial, total_trialsB, 'TrainA/TrainB (4 trials)')
        )
        train_trials = total_trialsA + total_trialsB
        group_by = ['Seed', 'Rotation', 'Curriculum']

    df['Successes'] = successes
    df['Failures'] = train_trials - successes

    # Take the last step and sum the train counts within each group
    last_step = df[df['Step'] == df['Step'].max()]
    out = (last_step.groupby(group_by)[['Successes', 'Failures']]
           .sum()
           .reset_index())
    out['Accuracy'] = out['Successes'] / (out['Successes'] + out['Failures'])

    if task == 'grid':
        out['Rotation'] = out['Rotation'].replace('Unrotated', 'Rule-like')

    return out.reset_index()

### Category-learning task

IWL only

In [3]:
# Result files of the IWL-only category runs, one per seed (1-10)
path = '../../results/cat_iwl_only'
seeds = list(range(1, 11))
fns = [f'{path}_seed{seed}' for seed in seeds]

results = load_results(seeds, fns)
dfs = get_dataframes(results, n_groups=2)

# dfs[4] is the finetune dataframe (accuracies after finetuning);
# turn its accuracies into success/failure counts
iwl_df = process_df(dfs[4], task='category', aggregated=True)

# Discard the loss and per-group accuracy columns, renumber the rows and
# flag every row as not metalearned
iwl_df = (
    iwl_df
    .drop(columns=['loss', 'TrainA', 'TestA', 'TrainB', 'TestB'])
    .reset_index(drop=True)
    .assign(Metalearned=False)
)

# Keep only the columns that get exported, in a fixed order
iwl_df = iwl_df[['Metalearned', 'Rotation', 'Curriculum', 'Seed',
                 'Accuracy', 'Successes', 'Failures']]

IWL + ICL

In [4]:
# Result files of the IWL + ICL category runs, one per seed (1-10)
path = '../../results/cat_iwl_icl_unrotated_blocked'
seeds = list(range(1, 11))
fns = [f'{path}_seed{seed}' for seed in seeds]

results = load_results(seeds, fns)
dfs = get_dataframes(results, n_groups=2, thresh=0.9)
few_shot_df = dfs[3]
finetune_df = dfs[4]

# ICL-only table: all few-shot rows, flagged as metalearned
few_shot_df = process_df(few_shot_df, task='category', aggregated=True)
icl_only_df = few_shot_df.assign(Metalearned=True)
icl_only_df = icl_only_df[['Metalearned', 'Rotation', 'Curriculum', 'Seed',
                           'Accuracy', 'Successes', 'Failures']]

# From the few-shot results keep just the Rule-like rows
few_shot_df = few_shot_df.loc[few_shot_df['Rotation'] == 'Rule-like']

# From the finetune results keep just the Rotated rows, after dropping the
# loss and per-group accuracy columns and renumbering the rows
finetune_df = process_df(finetune_df, task='category', aggregated=True)
finetune_df = (
    finetune_df
    .drop(columns=['loss', 'TrainA', 'TestA', 'TrainB', 'TestB'])
    .reset_index(drop=True)
)
finetune_df = finetune_df.loc[finetune_df['Rotation'] == 'Rotated']

# Stack both parts into the ICL table and flag it as metalearned
icl_df = pd.concat([few_shot_df, finetune_df], ignore_index=True)
icl_df = icl_df.assign(Metalearned=True)
icl_df = icl_df[['Metalearned', 'Rotation', 'Curriculum', 'Seed',
                 'Accuracy', 'Successes', 'Failures']]


Concatenate and save

In [5]:
# Stack the IWL-only and IWL + ICL category tables into one dataframe
category_df = pd.concat((iwl_df, icl_df), ignore_index=True)

# Export the combined table and the ICL-only table as CSV (no index column)
category_df.to_csv('category_icl_iwl.csv', index=False)
icl_only_df.to_csv('category_icl_only.csv', index=False)

### Compositional task

IWL only

In [8]:
# Result files of the IWL-only grid runs, one per seed (1-10)
path = '../../results/grid_iwl_only'
seeds = list(range(1, 11))
fns = [f'{path}_seed{seed}' for seed in seeds]

results = load_results(seeds, fns)
dfs = get_dataframes(results, n_groups=2, task='grid')

# Convert the per-step accuracies in dfs[0] into last-step success/failure
# counts and flag every row as not metalearned
iwl_df = process_df(dfs[0], task='grid', aggregated=False)
iwl_df = iwl_df.assign(Metalearned=False)

# Keep only the columns that get exported, in a fixed order
iwl_df = iwl_df[['Metalearned', 'Rotation', 'Curriculum', 'Seed',
                 'Accuracy', 'Successes', 'Failures']]

IWL + ICL

In [9]:
# Result files of the IWL + ICL grid runs, one per seed (1-10)
path = '../../results/grid_iwl_icl_unrotated_blocked'
seeds = list(range(1, 11))
fns = [f'{path}_seed{seed}' for seed in seeds]

results = load_results(seeds, fns)
dfs = get_dataframes(results, n_groups=2, thresh=0.9, task='grid')
df = dfs[0]
few_shot_df = dfs[3]

# ICL-only table: all few-shot rows, flagged as metalearned
few_shot_df = process_df(few_shot_df, task='grid', aggregated=True)
few_shot_df = few_shot_df.assign(Metalearned=True)
icl_only_df = few_shot_df[['Metalearned', 'Rotation', 'Curriculum',
                           'Seed', 'Accuracy', 'Successes', 'Failures']].copy()

# From the few-shot results keep just the Rule-like rows
few_shot_df = few_shot_df.loc[few_shot_df['Rotation'] == 'Rule-like']

# From the per-step results keep just the Rotated rows of the last step
finetune_df = process_df(df, task='grid', aggregated=False)
finetune_df = finetune_df.loc[finetune_df['Rotation'] == 'Rotated']

# Stack both parts into the ICL table and flag it as metalearned
icl_df = pd.concat([few_shot_df, finetune_df], ignore_index=True)
icl_df = icl_df.assign(Metalearned=True)
icl_df = icl_df[['Metalearned', 'Rotation', 'Curriculum', 'Seed',
                 'Accuracy', 'Successes', 'Failures']]


Removed seeds {1, 5, 7, 8, 10} due to threshold 0.9


Concatenate and save

In [10]:
# Stack the IWL-only and IWL + ICL grid tables into one dataframe
grid_df = pd.concat((iwl_df, icl_df), ignore_index=True)

# Export the combined table and the ICL-only table as CSV (no index column)
grid_df.to_csv('grid_icl_iwl.csv', index=False)
icl_only_df.to_csv('grid_icl_only.csv', index=False)

### LLM results

In [13]:
# Load the LLM results table from CSV
llm_results = pd.read_csv('llm_results.csv')

In [19]:
# Bare expression: the notebook renders the loaded table as the cell output
llm_results

Unnamed: 0,Model,Rotation,Condition,Split,Episode,Accuracy,Successes,Failures
0,GPT,Rule-like,Aligned,test,0,1.0000,16,0
1,GPT,Rule-like,Aligned,test,1,0.8750,14,2
2,GPT,Rule-like,Aligned,test,2,1.0000,16,0
3,GPT,Rule-like,Aligned,test,3,0.8750,14,2
4,GPT,Rule-like,Aligned,test,4,0.9375,15,1
...,...,...,...,...,...,...,...,...
287,Llama2,Rotated,Interleaved,test,episode_10,0.1875,3,13
288,Llama2,Rotated,Interleaved,test,episode_36,0.1875,3,13
289,Llama2,Rotated,Interleaved,test,episode_22,0.3125,5,11
290,Llama2,Rotated,Interleaved,test,episode_45,0.3125,5,11


In [None]:
# Recompute success/failure counts from accuracy (16 test trials per row)
total_trials = 16
llm_results['Successes'] = total_trials * llm_results['Accuracy']
# Every accuracy must correspond to a whole number of trials
assert (llm_results['Successes'] % 1 == 0).all()
llm_results['Successes'] = llm_results['Successes'].astype(int)
llm_results['Failures'] = total_trials - llm_results['Successes']


# Use the column name 'Curriculum' instead of 'Condition'
llm_results = llm_results.rename(columns={'Condition': 'Curriculum'})

# Collapse to one row per model, rotation and curriculum: counts are
# summed, accuracy is averaged
group_by = ['Model', 'Rotation', 'Curriculum']
agg_dict = dict(Successes='sum', Failures='sum', Accuracy='mean')
llm_results = llm_results.groupby(group_by).agg(agg_dict).reset_index()


In [22]:
# Write the aggregated LLM results to CSV without the index column
llm_results.to_csv('llm_results_aggregated.csv', index=False)