In [1]:
import json
import numpy as np
import pandas as pd
import glob

In [2]:
# Reference DistilBERT scores, copied from
# https://medium.com/huggingface/distilbert-8cf3380435b5
# Entries are listed in the same order as `columns`; a task that reports
# two numbers is stored as a tuple.
distillbert = [42.5, (81.6, 81.1), (82.4, 88.3), 85.5, (90.6, 87.7), 60.0, 92.7, (84.5, 85.0), 55.6]
columns = ['cola', 'mnli', 'mrpc', 'qnli', 'qqp', 'rte', 'sst2', 'stsb', 'wnli']

# Macro average: collapse each tuple to its mean so that every task
# contributes exactly one number, then average over the tasks.
distillbert_macro = [
    np.mean(score) if isinstance(score, tuple) else score
    for score in distillbert
]

print("Distill bert macro average", np.mean(distillbert_macro))

Distill bert macro average 75.21111111111111


In [3]:
# tf-transformers joint loss GLUE
# Load one `eval_<task>.json` file per task. MNLI additionally has an
# `eval_mnli_mismatched.json` file, which becomes its own row directly
# after the 'mnli' row.
all_data = []
row_labels = []
for col in columns:
    suffixes = ['', '_mismatched'] if col == 'mnli' else ['']
    for suffix in suffixes:
        file_name = 'eval_{}{}.json'.format(col, suffix)
        # Use a context manager so the file handle is closed right away
        # instead of being left open until garbage collection.
        with open(file_name) as fp:
            all_data.append(json.load(fp))
        # Build the row label next to the data so that the index can
        # never drift out of step with the files actually loaded.
        row_labels.append(col + suffix)

df = pd.DataFrame(all_data, index=row_labels)

In [4]:
# Reduce every task row of `df` to a single number per column so the
# tasks can be averaged afterwards.
def _acc_f1_mean(metrics):
    """Return the mean of the 'acc' and 'f1' entries of one metrics dict."""
    return (metrics['acc'] + metrics['f1']) / 2.0


def _pearson_spearman_mean(metrics):
    """Return the mean of the first 'pearson' and first 'spearman' values."""
    return (metrics['pearson'][0] + metrics['spearman'][0]) / 2.0


df_macro = df.copy()
# MNLI: average the 'mnli' and 'mnli_mismatched' rows column by column
df_macro.loc['mnli'] = df.loc[['mnli', 'mnli_mismatched']].mean(axis=0)
# MRPC and QQP: each cell is a dict holding 'acc' and 'f1'
for task in ('mrpc', 'qqp'):
    df_macro.loc[task] = df.loc[task].map(_acc_f1_mean)
# STS-B: each cell is a dict holding 'pearson' and 'spearman' sequences
df_macro.loc['stsb'] = df.loc['stsb'].map(_pearson_spearman_mean)
# The mismatched row has been folded into 'mnli', so drop it
df_macro = df_macro.drop(index=['mnli_mismatched'])

# Label the columns by position. The count is taken from the data rather
# than a fixed 12, so the cell keeps working if the number of columns
# changes. (Presumably one column per model layer -- confirm.)
df_macro.columns = ['layer_{}'.format(i) for i in range(df_macro.shape[1])]
# After the transpose: one row per layer_<i>, one column per task
df_macro = df_macro.transpose()


In [5]:
# Distillbert results as a one-row frame: index 'distillbert', one column per task
# NOTE(review): the DistilBERT reference values are written on a 0-100 scale
# elsewhere in this notebook, while the layer rows shown in the outputs are
# on a 0-1 scale -- confirm the scales match before comparing rows.
df_distillbert = pd.DataFrame(
    [distillbert_macro],
    columns=columns,
    index=['distillbert'],
)

In [6]:
# Stack the per-layer rows on top of the DistilBERT row, then add the
# GLUE score as the plain mean over all columns of each row.
df_final = pd.concat([df_macro, df_distillbert], axis=0).assign(
    glue_score=lambda frame: frame.mean(axis=1)
)

In [7]:
# Display the combined score table (bare last expression -> rich output)
df_final

Unnamed: 0,cola,mnli,mrpc,qnli,qqp,rte,sst2,stsb,wnli,glue_score
layer_0,0.0,0.588027,0.747751,0.60864,0.721775,0.512635,0.802752,0.056938,0.577465,0.512887
layer_1,0.0,0.733188,0.78921,0.824272,0.83256,0.592058,0.87156,0.819157,0.535211,0.666357
layer_2,0.272488,0.77803,0.814339,0.860516,0.863139,0.613718,0.885321,0.862362,0.422535,0.70805
layer_3,0.383146,0.808468,0.841794,0.876991,0.876136,0.620939,0.905963,0.875921,0.352113,0.72683
layer_4,0.462619,0.826284,0.863821,0.893282,0.883802,0.649819,0.918578,0.887245,0.380282,0.751748
layer_5,0.50701,0.836718,0.882761,0.901153,0.887166,0.646209,0.925459,0.893838,0.366197,0.760724
layer_6,0.512397,0.842825,0.874924,0.905913,0.889191,0.66065,0.928899,0.896986,0.408451,0.768915
layer_7,0.526382,0.845573,0.886949,0.910672,0.89046,0.689531,0.930046,0.897835,0.450704,0.780906
layer_8,0.529669,0.848525,0.883263,0.910489,0.890699,0.700361,0.928899,0.899427,0.380282,0.774624
layer_9,0.535036,0.848881,0.87545,0.911038,0.890596,0.703971,0.925459,0.901022,0.338028,0.769942


In [8]:
# WNLI is the task where the scores differ the most.
# Even for BERT-base it is 43.4.

# But DistilBERT scores 55.6, which makes a big
# difference when taking the average.

# WNLI has only 635+ train examples and 71 dev examples.

# That might be the reason for this bias.
# But look at layer_0: it gives ~57% accuracy.

# Anyway, let's evaluate without WNLI.
# Exclude WNLI by name rather than by position, so the result does not
# silently change if 'wnli' stops being the last entry of `columns`.
columns_without_wnli = [col for col in columns if col != 'wnli']
# Mean over the remaining task columns only (existing score columns are ignored)
df_final['glue_score_without_wnli'] = df_final[columns_without_wnli].mean(axis=1)

In [9]:
# Display the table again, now with the glue_score_without_wnli column
df_final

Unnamed: 0,cola,mnli,mrpc,qnli,qqp,rte,sst2,stsb,wnli,glue_score,glue_score_without_wnli
layer_0,0.0,0.588027,0.747751,0.60864,0.721775,0.512635,0.802752,0.056938,0.577465,0.512887,0.504815
layer_1,0.0,0.733188,0.78921,0.824272,0.83256,0.592058,0.87156,0.819157,0.535211,0.666357,0.682751
layer_2,0.272488,0.77803,0.814339,0.860516,0.863139,0.613718,0.885321,0.862362,0.422535,0.70805,0.743739
layer_3,0.383146,0.808468,0.841794,0.876991,0.876136,0.620939,0.905963,0.875921,0.352113,0.72683,0.77367
layer_4,0.462619,0.826284,0.863821,0.893282,0.883802,0.649819,0.918578,0.887245,0.380282,0.751748,0.798181
layer_5,0.50701,0.836718,0.882761,0.901153,0.887166,0.646209,0.925459,0.893838,0.366197,0.760724,0.810039
layer_6,0.512397,0.842825,0.874924,0.905913,0.889191,0.66065,0.928899,0.896986,0.408451,0.768915,0.813973
layer_7,0.526382,0.845573,0.886949,0.910672,0.89046,0.689531,0.930046,0.897835,0.450704,0.780906,0.822181
layer_8,0.529669,0.848525,0.883263,0.910489,0.890699,0.700361,0.928899,0.899427,0.380282,0.774624,0.823916
layer_9,0.535036,0.848881,0.87545,0.911038,0.890596,0.703971,0.925459,0.901022,0.338028,0.769942,0.823932


In [12]:
# Show only the WNLI-free GLUE score; double brackets keep it a DataFrame
df_final[['glue_score_without_wnli']]

Unnamed: 0,glue_score_without_wnli
layer_0,0.504815
layer_1,0.682751
layer_2,0.743739
layer_3,0.77367
layer_4,0.798181
layer_5,0.810039
layer_6,0.813973
layer_7,0.822181
layer_8,0.823916
layer_9,0.823932
