# Result logs cleaning

In [6]:
import os
import re
import pandas as pd
import numpy as np
import json

## Initial Cleaning

In [7]:
def get_start_idx(lines, substring):
    '''
    Obtain the beginning of the cleaning
    '''
    return [line_idx for line_idx, line in enumerate(lines) if substring in line]

### Clean logs of attention head mask

In [46]:
LOGS_PATH = './logs'

def get_sliced_logs(lines):
    '''
    Slice the entire logs of multiple experiemnts into seperate experiements.
    '''
    
    # Step 1: Find the starting point.
    starter_ides = get_start_idx(lines, 'attention_head_mask')
    res = pd.DataFrame({'starter_ides': starter_ides})
    
    # Step 2: Base on all the start point, find the range
    res['ender_ides'] = res.apply(lambda x: x.shift(-1))
    res.iloc[-1, -1] = len(lines) # Upper bound of last row is the length!
    res['ender_ides'] = res['ender_ides'].astype('int')

    # Step 3: Obtain the corresponding lines base on the ranges
    sliced_lines = []
    for index, row in res.iterrows():
        starter_idx = row['starter_ides']
        ender_idx = row['ender_ides']
        sliced_lines.append(lines[starter_idx:ender_idx])
    return sliced_lines
        
def clean_log(logs):
    '''
    Clean the log of 1 experiment
    '''
    def clean_one_line(log_line):
        '''
        Clean 1 line of 1 log
        Examples
        --------
        >>> clean_one_line("00:31:00-INFO:   Batch size = 8")
        ('Batch size', '8')
        '''
        if '-INFO:   ' in log_line:
            result = log_line.split('-INFO:   ')

            if result:
                variable, value = result[1].split(' = ')
                value = re.findall(r'[-+]?(?:\d*\.\d+|\d+)', value)
                if value:
                    value = value[0]
                else:
                    value = None
                return variable, value
    variables = []
    values = []
    for log_line in logs:
        result = clean_one_line(log_line)
        if result:
            variable, value = result
            variables.append(variable)
            values.append(value)
    return variables, values

def get_experiment_result(task):
    with open(f'{LOGS_PATH}/head_pruning/{task}.txt') as f:
        lines = f.readlines()
    dfs = []
    for log in get_sliced_logs(lines):
        experiment = log[0].split(' ')[0]
        parameters = eval(log[1].replace('\n', ''))
        variables, values = clean_log(log[2:])
        # if task == 'rte':
        #     print(task, parameters, variables, values)
        df = pd.DataFrame({'task': task.lower(),
                           'experiments': experiment,
                           'drop_head_at_layer': int(parameters[0]),
                           'drop_head': int(parameters[1]),
                           'variables': variables,
                           'values': values})
        dfs.append(df)
    result = pd.concat(dfs, axis=0, ignore_index=True)
    result.columns.name = None
    result.loc[result['variables'].isin(['acc', 'eval_accuracy']), 'variables'] = 'accuracy'
    return result

def get_baseline_result(task):
    with open(f'{LOGS_PATH}/head_pruning/{task}.txt') as f:
        lines = f.readlines()
    header_start = min(get_start_idx(lines, 'Running evaluation'))
    header_end = min(get_start_idx(lines, 'attention_head_mask'))
    variables, values = clean_log(lines[header_start:header_end])

    result = pd.DataFrame({'task': task,
                       'variables': variables,
                       'values': values})
    result.loc[result['variables'].isin(['acc', 'eval_accuracy']), 'variables'] = 'accuracy'
    return result

In [47]:
experiment_results = []
baseline_results = []
for task in [task.replace('.txt', '') for task in os.listdir(LOGS_PATH + '/head_pruning') if '.ipynb_checkpoints' not in task]:
    experiment_results.append(get_experiment_result(task))
    baseline_results.append(get_baseline_result(task))
    
pd.concat(experiment_results, axis=0).to_csv('logs_cleaned/head_pruning_experiment_results.csv', index=False)
pd.concat(experiment_results, axis=0).to_pickle('logs_cleaned/head_pruning_experiment_results.pickle')
pd.concat(baseline_results, axis=0).to_csv('logs_cleaned/baseline_results.csv', index=False)
pd.concat(baseline_results, axis=0).to_pickle('logs_cleaned/baseline_results.pickle')

### Clean logs of layer pruning

In [333]:
def get_logs_2(lines):
    logs = []
    start_ides = get_start_idx(lines, 'EXPERIMENT')
    for log_idx, strat_idx in enumerate(start_ides):
        if log_idx != len(start_ides) - 1:
            end_idx = start_ides[log_idx+1]
            log = lines[strat_idx:end_idx]
        else:
            log = lines[strat_idx:]
        logs.append(log)
    return logs

def get_experiment_result_2(task):
    with open(f'{LOGS_PATH}/layer_drop/{task}.txt') as f:
        lines = f.readlines()
    dfs = []
    for log in get_logs_2(lines):
        experiment = 'Remove Layers'
        parameter = log[0].split(' remove layers ')[1].replace('\n', '')
        variables, values = clean_log(log[3:])
        df = pd.DataFrame({'task': task.lower(),
                           'experiments': experiment,
                           'parameter': parameter,
                           'variables': variables,
                           'values': values})
        dfs.append(df)
    result = pd.concat(dfs, axis=0, ignore_index=True)
    result.columns.name = None
    result.loc[result['variables'].isin(['acc', 'eval_accuracy']), 'variables'] = 'accuracy'
    return result

experiment_results_2 = []
for task in [task.replace('.txt', '') for task in os.listdir(LOGS_PATH + '/layer_drop') if '.ipynb_checkpoints' not in task]:
    experiment_results_2.append(get_experiment_result_2(task))
pd.concat(experiment_results_2, axis=0).to_csv('logs_cleaned/layer_drop_results.csv', index=False)
pd.concat(experiment_results_2, axis=0).to_pickle('logs_cleaned/layer_drop_results.pickle')

## Result re-formatting for heads pruning

[GLUE](https://openreview.net/pdf?id=rJ4km2R5t7https://openreview.net/pdf?id=rJ4km2R5t7)

<img width = "50%" src="https://cdn.mathpix.com/snip/images/pS3Kb2-_3rym-Zd4LhhdPZkqIs7-K1cMmMekf7QQ2HE.original.fullsize.png" />

- Note that at [BERT](https://arxiv.org/abs/1810.04805https://arxiv.org/abs/1810.04805), F1 scores are reported for QQP and MRPC.

    <img width = "50%" src="https://cdn.mathpix.com/snip/images/TyBsRFSkPxAnklR4GijMblC8w8kcwXuTcAIVCqfaPdA.original.fullsize.png" />

### Core scores of every experiments

In [334]:
benchmark_mapper = pd.DataFrame(
    {'task': ['sst-2', 'rte', 'mrpc', 'wnli', 'sts-b', 'cola'],
     'benchmark': ['accuracy', 'accuracy', 'F-1 score', 'accuracy', 'spearmanr', "Matthew's correlation"]}
)

In [335]:
head_prune = pd.read_pickle('logs_cleaned/head_pruning_experiment_results.pickle')

In [336]:
head_prune_core_benchmark = head_prune.merge(benchmark_mapper, how='inner', on='task') \
    .query('variables == benchmark') \
    .drop(columns=['experiments', 'variables'])
head_prune_core_benchmark.to_csv('logs_cleaned/head_prune_core_benchmark.csv', index=False)

### GLUE scores

#### Baseline

In [358]:
baseline = pd.read_pickle('logs_cleaned/baseline_results.pickle')
baseline_core_benchmark = baseline.merge(benchmark_mapper, how='inner', 
               left_on=['task', 'variables'],
               right_on=['task', 'benchmark']) \
    .drop(columns=['variables']) \
    .rename(columns={'values': 'baseline'})
baseline_core_benchmark['baseline'] = baseline_core_benchmark['baseline'].astype('double')
baseline_core_benchmark.to_csv('logs_cleaned/baseline_core_benchmark.csv', index=False)

#### Experiment

In [359]:
head_prune_core_benchmark['values'] = head_prune_core_benchmark['values'].astype('double')

In [360]:
# Average
res = head_prune_core_benchmark \
    .merge(baseline_core_benchmark, how='left', on=['task', 'benchmark']) \
    .rename(columns={'values':'scores'}) \
    .assign(score_diff = lambda df: (df.scores - df.baseline) / df.baseline) \
    .groupby(["drop_head_at_layer", "drop_head"], as_index=False) \
    .agg(avg_glue = ('score_diff', 'mean')) 

res = res.pivot_table(index=['drop_head_at_layer'],
                values=['avg_glue'],
                columns=['drop_head'])

res.applymap(lambda row: str(round(row* 100, 2)) + '%')

Unnamed: 0_level_0,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue
drop_head,1,2,3,4,5,6,7,8,9,10,11,12
drop_head_at_layer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,-0.54%,-0.65%,-0.59%,-0.91%,-0.9%,-1.46%,-1.0%,-0.71%,-0.82%,-0.92%,-0.98%,-0.28%
2,-0.59%,-0.28%,-0.73%,-0.79%,-0.58%,-0.41%,-0.92%,-1.56%,-1.45%,-1.42%,-1.2%,-1.9%
3,-2.05%,-2.15%,-2.32%,-2.45%,-2.52%,-2.95%,-6.26%,-6.25%,-5.7%,-5.35%,-7.8%,-7.05%
4,-9.64%,-9.82%,-9.38%,-10.02%,-10.03%,-10.29%,-9.92%,-9.81%,-9.29%,-8.23%,-9.25%,-9.14%
5,-8.68%,-9.31%,-9.82%,-15.19%,-14.57%,-15.2%,-15.13%,-15.32%,-15.64%,-17.31%,-17.03%,-17.53%
6,-18.6%,-17.98%,-18.56%,-18.25%,-18.76%,-18.45%,-18.5%,-18.66%,-18.95%,-18.72%,-18.47%,-18.09%
7,-17.5%,-17.52%,-18.29%,-18.25%,-17.89%,-17.65%,-18.1%,-17.98%,-16.83%,-17.02%,-16.93%,-17.2%
8,-17.91%,-17.7%,-18.15%,-18.27%,-18.63%,-17.95%,-17.83%,-18.23%,-17.02%,-17.57%,-17.23%,-16.99%
9,-16.98%,-16.62%,-17.29%,-17.27%,-17.51%,-17.8%,-17.46%,-16.78%,-14.25%,-13.69%,-14.08%,-13.76%
10,-13.59%,-14.22%,-13.71%,-13.4%,-13.0%,-12.14%,-12.96%,-12.97%,-16.3%,-16.0%,-14.99%,-15.39%


In [361]:
# By task
def get_task_result(task, df = head_prune_core_benchmark):
    
    task_df = df[df['task'] == task]
    res = head_prune_core_benchmark \
        .merge(baseline_core_benchmark, how='left', on=['task', 'benchmark']) \
        .rename(columns={'values':'scores'}) \
        .assign(score_diff = lambda df: (df.scores - df.baseline) / df.baseline) \
        .groupby(["drop_head_at_layer", "drop_head"], as_index=False) \
        .agg(avg_glue = ('score_diff', 'mean')) 

    res = res.pivot_table(index=['drop_head_at_layer'],
                    values=['avg_glue'],
                    columns=['drop_head'])

    res = res.applymap(lambda row: str(round(row* 100, 2)) + '%')
    
    return res

In [362]:
get_task_result('mrpc')

Unnamed: 0_level_0,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue
drop_head,1,2,3,4,5,6,7,8,9,10,11,12
drop_head_at_layer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,-0.54%,-0.65%,-0.59%,-0.91%,-0.9%,-1.46%,-1.0%,-0.71%,-0.82%,-0.92%,-0.98%,-0.28%
2,-0.59%,-0.28%,-0.73%,-0.79%,-0.58%,-0.41%,-0.92%,-1.56%,-1.45%,-1.42%,-1.2%,-1.9%
3,-2.05%,-2.15%,-2.32%,-2.45%,-2.52%,-2.95%,-6.26%,-6.25%,-5.7%,-5.35%,-7.8%,-7.05%
4,-9.64%,-9.82%,-9.38%,-10.02%,-10.03%,-10.29%,-9.92%,-9.81%,-9.29%,-8.23%,-9.25%,-9.14%
5,-8.68%,-9.31%,-9.82%,-15.19%,-14.57%,-15.2%,-15.13%,-15.32%,-15.64%,-17.31%,-17.03%,-17.53%
6,-18.6%,-17.98%,-18.56%,-18.25%,-18.76%,-18.45%,-18.5%,-18.66%,-18.95%,-18.72%,-18.47%,-18.09%
7,-17.5%,-17.52%,-18.29%,-18.25%,-17.89%,-17.65%,-18.1%,-17.98%,-16.83%,-17.02%,-16.93%,-17.2%
8,-17.91%,-17.7%,-18.15%,-18.27%,-18.63%,-17.95%,-17.83%,-18.23%,-17.02%,-17.57%,-17.23%,-16.99%
9,-16.98%,-16.62%,-17.29%,-17.27%,-17.51%,-17.8%,-17.46%,-16.78%,-14.25%,-13.69%,-14.08%,-13.76%
10,-13.59%,-14.22%,-13.71%,-13.4%,-13.0%,-12.14%,-12.96%,-12.97%,-16.3%,-16.0%,-14.99%,-15.39%


In [342]:
get_task_result('sst-2')

Unnamed: 0_level_0,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue
drop_head,1,2,3,4,5,6,7,8,9,10,11,12
drop_head_at_layer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,-0.54%,-0.65%,-0.59%,-0.91%,-0.9%,-1.46%,-1.0%,-0.71%,-0.82%,-0.92%,-0.98%,-0.28%
2,-0.59%,-0.28%,-0.73%,-0.79%,-0.58%,-0.41%,-0.92%,-1.56%,-1.45%,-1.42%,-1.2%,-1.9%
3,-2.05%,-2.15%,-2.32%,-2.45%,-2.52%,-2.95%,-6.26%,-6.25%,-5.7%,-5.35%,-7.8%,-7.05%
4,-9.64%,-9.82%,-9.38%,-10.02%,-10.03%,-10.29%,-9.92%,-9.81%,-9.29%,-8.23%,-9.25%,-9.14%
5,-8.68%,-9.31%,-9.82%,-15.19%,-14.57%,-15.2%,-15.13%,-15.32%,-15.64%,-17.31%,-17.03%,-17.53%
6,-18.6%,-17.98%,-18.56%,-18.25%,-18.76%,-18.45%,-18.5%,-18.66%,-18.95%,-18.72%,-18.47%,-18.09%
7,-17.5%,-17.52%,-18.29%,-18.25%,-17.89%,-17.65%,-18.1%,-17.98%,-16.83%,-17.02%,-16.93%,-17.2%
8,-17.91%,-17.7%,-18.15%,-18.27%,-18.63%,-17.95%,-17.83%,-18.23%,-17.02%,-17.57%,-17.23%,-16.99%
9,-16.98%,-16.62%,-17.29%,-17.27%,-17.51%,-17.8%,-17.46%,-16.78%,-14.25%,-13.69%,-14.08%,-13.76%
10,-13.59%,-14.22%,-13.71%,-13.4%,-13.0%,-12.14%,-12.96%,-12.97%,-16.3%,-16.0%,-14.99%,-15.39%


In [343]:
get_task_result('cola')

Unnamed: 0_level_0,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue
drop_head,1,2,3,4,5,6,7,8,9,10,11,12
drop_head_at_layer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,-0.54%,-0.65%,-0.59%,-0.91%,-0.9%,-1.46%,-1.0%,-0.71%,-0.82%,-0.92%,-0.98%,-0.28%
2,-0.59%,-0.28%,-0.73%,-0.79%,-0.58%,-0.41%,-0.92%,-1.56%,-1.45%,-1.42%,-1.2%,-1.9%
3,-2.05%,-2.15%,-2.32%,-2.45%,-2.52%,-2.95%,-6.26%,-6.25%,-5.7%,-5.35%,-7.8%,-7.05%
4,-9.64%,-9.82%,-9.38%,-10.02%,-10.03%,-10.29%,-9.92%,-9.81%,-9.29%,-8.23%,-9.25%,-9.14%
5,-8.68%,-9.31%,-9.82%,-15.19%,-14.57%,-15.2%,-15.13%,-15.32%,-15.64%,-17.31%,-17.03%,-17.53%
6,-18.6%,-17.98%,-18.56%,-18.25%,-18.76%,-18.45%,-18.5%,-18.66%,-18.95%,-18.72%,-18.47%,-18.09%
7,-17.5%,-17.52%,-18.29%,-18.25%,-17.89%,-17.65%,-18.1%,-17.98%,-16.83%,-17.02%,-16.93%,-17.2%
8,-17.91%,-17.7%,-18.15%,-18.27%,-18.63%,-17.95%,-17.83%,-18.23%,-17.02%,-17.57%,-17.23%,-16.99%
9,-16.98%,-16.62%,-17.29%,-17.27%,-17.51%,-17.8%,-17.46%,-16.78%,-14.25%,-13.69%,-14.08%,-13.76%
10,-13.59%,-14.22%,-13.71%,-13.4%,-13.0%,-12.14%,-12.96%,-12.97%,-16.3%,-16.0%,-14.99%,-15.39%


In [344]:
get_task_result('sst-2')

Unnamed: 0_level_0,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue
drop_head,1,2,3,4,5,6,7,8,9,10,11,12
drop_head_at_layer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,-0.54%,-0.65%,-0.59%,-0.91%,-0.9%,-1.46%,-1.0%,-0.71%,-0.82%,-0.92%,-0.98%,-0.28%
2,-0.59%,-0.28%,-0.73%,-0.79%,-0.58%,-0.41%,-0.92%,-1.56%,-1.45%,-1.42%,-1.2%,-1.9%
3,-2.05%,-2.15%,-2.32%,-2.45%,-2.52%,-2.95%,-6.26%,-6.25%,-5.7%,-5.35%,-7.8%,-7.05%
4,-9.64%,-9.82%,-9.38%,-10.02%,-10.03%,-10.29%,-9.92%,-9.81%,-9.29%,-8.23%,-9.25%,-9.14%
5,-8.68%,-9.31%,-9.82%,-15.19%,-14.57%,-15.2%,-15.13%,-15.32%,-15.64%,-17.31%,-17.03%,-17.53%
6,-18.6%,-17.98%,-18.56%,-18.25%,-18.76%,-18.45%,-18.5%,-18.66%,-18.95%,-18.72%,-18.47%,-18.09%
7,-17.5%,-17.52%,-18.29%,-18.25%,-17.89%,-17.65%,-18.1%,-17.98%,-16.83%,-17.02%,-16.93%,-17.2%
8,-17.91%,-17.7%,-18.15%,-18.27%,-18.63%,-17.95%,-17.83%,-18.23%,-17.02%,-17.57%,-17.23%,-16.99%
9,-16.98%,-16.62%,-17.29%,-17.27%,-17.51%,-17.8%,-17.46%,-16.78%,-14.25%,-13.69%,-14.08%,-13.76%
10,-13.59%,-14.22%,-13.71%,-13.4%,-13.0%,-12.14%,-12.96%,-12.97%,-16.3%,-16.0%,-14.99%,-15.39%


In [345]:
get_task_result('sts-b')

Unnamed: 0_level_0,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue
drop_head,1,2,3,4,5,6,7,8,9,10,11,12
drop_head_at_layer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,-0.54%,-0.65%,-0.59%,-0.91%,-0.9%,-1.46%,-1.0%,-0.71%,-0.82%,-0.92%,-0.98%,-0.28%
2,-0.59%,-0.28%,-0.73%,-0.79%,-0.58%,-0.41%,-0.92%,-1.56%,-1.45%,-1.42%,-1.2%,-1.9%
3,-2.05%,-2.15%,-2.32%,-2.45%,-2.52%,-2.95%,-6.26%,-6.25%,-5.7%,-5.35%,-7.8%,-7.05%
4,-9.64%,-9.82%,-9.38%,-10.02%,-10.03%,-10.29%,-9.92%,-9.81%,-9.29%,-8.23%,-9.25%,-9.14%
5,-8.68%,-9.31%,-9.82%,-15.19%,-14.57%,-15.2%,-15.13%,-15.32%,-15.64%,-17.31%,-17.03%,-17.53%
6,-18.6%,-17.98%,-18.56%,-18.25%,-18.76%,-18.45%,-18.5%,-18.66%,-18.95%,-18.72%,-18.47%,-18.09%
7,-17.5%,-17.52%,-18.29%,-18.25%,-17.89%,-17.65%,-18.1%,-17.98%,-16.83%,-17.02%,-16.93%,-17.2%
8,-17.91%,-17.7%,-18.15%,-18.27%,-18.63%,-17.95%,-17.83%,-18.23%,-17.02%,-17.57%,-17.23%,-16.99%
9,-16.98%,-16.62%,-17.29%,-17.27%,-17.51%,-17.8%,-17.46%,-16.78%,-14.25%,-13.69%,-14.08%,-13.76%
10,-13.59%,-14.22%,-13.71%,-13.4%,-13.0%,-12.14%,-12.96%,-12.97%,-16.3%,-16.0%,-14.99%,-15.39%


In [346]:
get_task_result('rte')

Unnamed: 0_level_0,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue
drop_head,1,2,3,4,5,6,7,8,9,10,11,12
drop_head_at_layer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,-0.54%,-0.65%,-0.59%,-0.91%,-0.9%,-1.46%,-1.0%,-0.71%,-0.82%,-0.92%,-0.98%,-0.28%
2,-0.59%,-0.28%,-0.73%,-0.79%,-0.58%,-0.41%,-0.92%,-1.56%,-1.45%,-1.42%,-1.2%,-1.9%
3,-2.05%,-2.15%,-2.32%,-2.45%,-2.52%,-2.95%,-6.26%,-6.25%,-5.7%,-5.35%,-7.8%,-7.05%
4,-9.64%,-9.82%,-9.38%,-10.02%,-10.03%,-10.29%,-9.92%,-9.81%,-9.29%,-8.23%,-9.25%,-9.14%
5,-8.68%,-9.31%,-9.82%,-15.19%,-14.57%,-15.2%,-15.13%,-15.32%,-15.64%,-17.31%,-17.03%,-17.53%
6,-18.6%,-17.98%,-18.56%,-18.25%,-18.76%,-18.45%,-18.5%,-18.66%,-18.95%,-18.72%,-18.47%,-18.09%
7,-17.5%,-17.52%,-18.29%,-18.25%,-17.89%,-17.65%,-18.1%,-17.98%,-16.83%,-17.02%,-16.93%,-17.2%
8,-17.91%,-17.7%,-18.15%,-18.27%,-18.63%,-17.95%,-17.83%,-18.23%,-17.02%,-17.57%,-17.23%,-16.99%
9,-16.98%,-16.62%,-17.29%,-17.27%,-17.51%,-17.8%,-17.46%,-16.78%,-14.25%,-13.69%,-14.08%,-13.76%
10,-13.59%,-14.22%,-13.71%,-13.4%,-13.0%,-12.14%,-12.96%,-12.97%,-16.3%,-16.0%,-14.99%,-15.39%


In [347]:
# By task
head_prune_core_benchmark.query("task == 'sst-2'") \
    .merge(baseline, how='left', on=['task', 'benchmark']) \
    .rename(columns={'values':'scores'}) \
    .assign(score_diff = lambda df: df.scores - df.baseline) \
    .groupby(["drop_head_at_layer", "drop_head"], as_index=False) \
    .agg(avg_glue = ('score_diff', 'mean'))\
    .pivot_table(index=['drop_head_at_layer'],
                 values=['avg_glue'],
                 columns=['drop_head'])

Unnamed: 0_level_0,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue
drop_head,1,2,3,4,5,6,7,8,9,10,11,12
drop_head_at_layer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,0.001147,0.0,0.001147,-0.001147,0.001147,0.0,-0.001147,-0.001147,-0.002294,-0.001147,0.00344,0.001147
2,0.002294,0.002294,0.001147,0.001147,0.0,0.0,0.002294,0.001147,0.0,0.001147,0.002294,0.0
3,-0.00344,0.0,0.0,0.0,0.0,-0.001147,0.001147,0.0,0.0,0.001147,0.0,0.001147
4,0.002294,0.001147,0.001147,0.002294,0.001147,0.001147,0.002294,0.002294,0.001147,-0.002294,0.002294,0.002294
5,0.002294,0.001147,0.0,0.001147,0.0,0.001147,0.002294,0.00344,0.0,0.001147,0.001147,0.0
6,-0.002294,0.0,0.0,-0.00344,-0.00344,-0.002294,-0.00344,-0.001147,0.0,-0.005734,-0.004587,0.001147
7,-0.001147,0.001147,0.0,0.001147,0.0,-0.001147,0.002294,0.002294,0.0,0.0,0.001147,0.002294
8,-0.002294,-0.00344,0.0,-0.001147,0.001147,0.0,0.0,0.002294,0.001147,0.002294,0.0,0.001147
9,0.0,-0.00344,0.0,0.0,0.001147,0.001147,0.002294,0.0,-0.001147,-0.004587,-0.002294,0.001147
10,0.0,0.0,0.001147,0.001147,0.002294,-0.001147,0.001147,0.0,0.001147,0.001147,-0.001147,0.001147


## Results re-formmating for layer droping

### Core scores of every experiments

In [348]:
benchmark_mapper = pd.DataFrame(
    {'task': ['sst-2', 'rte', 'mrpc', 'wnli', 'sts-b', 'cola'],
     'benchmark': ['accuracy', 'accuracy', 'F-1 score', 'accuracy', 'spearmanr', "Matthew's correlation"]}
)
layer_drop = pd.read_pickle('logs_cleaned/layer_drop_results.pickle')

In [350]:
# Clean the mixed parameters column
layer_drop['strategy'] = layer_drop['parameter'].apply(lambda x: re.search(r'\(([^()]+)\)', x).group(1))
layer_drop['n_layer_drop'] = layer_drop['strategy'].apply(lambda x: re.search(r'\d', x).group()).astype('int')
layer_drop['strategy'] = layer_drop['strategy'].apply(lambda x: re.sub(r' \d', '', x).replace('drop bottom', 'bottom drop').title())
layer_drop['layer_drop'] = layer_drop['parameter'].apply(lambda x: re.search(r'([^\(]+)', x).group(1))
layer_drop = layer_drop.drop(columns = 'parameter')

In [351]:
layer_drop_core_benchmark = layer_drop.merge(benchmark_mapper, how='inner', on='task') \
    .query('variables == benchmark') \
    .drop(columns=['experiments', 'variables'])

In [352]:
layer_drop_core_benchmark.groupby('task').size()

task
cola     9
mrpc     9
rte      9
sst-2    9
sts-b    9
wnli     9
dtype: int64

In [353]:
layer_drop_core_benchmark = layer_drop.merge(benchmark_mapper, how='inner', on='task') \
    .query('variables == benchmark') \
    .drop(columns=['experiments', 'variables'])

In [354]:
layer_drop_core_benchmark.head()

Unnamed: 0,task,values,strategy,n_layer_drop,layer_drop,benchmark
5,sst-2,0.9231651376146788,Top Drop,2,1011,accuracy
15,sst-2,0.911697247706422,Top Drop,4,891011,accuracy
25,sst-2,0.904816513761468,Top Drop,6,67891011,accuracy
35,sst-2,0.9220183486238532,Bottom Drop,2,1,accuracy
45,sst-2,0.9059633027522936,Bottom Drop,4,123,accuracy


In [355]:
layer_drop_core_benchmark.to_csv('logs_cleaned/layer_drop_core_benchmark.csv', index=False)