In [1]:
import pandas as pd

## Basic comparison

In [22]:
# Read the final evaluation set; the CSV's first column becomes the row index.
results = pd.read_csv(
    "../data/final_eval_set.csv",
    index_col=0,
)

In [23]:
# Model by model, check the accuracy (match rate): the share of rows for each
# value of `svm_match`, rounded to 3 decimals.
# `.pipe` hands the whole Series of group sizes to the function in one call,
# so dividing by the total is explicit rather than relying on how
# `Series.transform` dispatches a callable.
results.groupby('svm_match').size().pipe(
    lambda counts: (counts / counts.sum()).round(3)
)

svm_match
False    0.609
True     0.391
dtype: float64

In [24]:
# Share of rows for each value of `log_match`, rounded to 3 decimals.
# `.pipe` passes the whole Series of group sizes to the function at once,
# so the division by the total does not depend on `Series.transform`'s
# handling of callables.
results.groupby('log_match').size().pipe(
    lambda counts: (counts / counts.sum()).round(3)
)

log_match
False    0.6
True     0.4
dtype: float64

In [25]:
# Share of rows for each value of `bert_match`, rounded to 3 decimals.
# `.pipe` passes the whole Series of group sizes to the function at once,
# so the division by the total does not depend on `Series.transform`'s
# handling of callables.
results.groupby('bert_match').size().pipe(
    lambda counts: (counts / counts.sum()).round(3)
)

bert_match
False    0.599
True     0.401
dtype: float64

## Accuracy by category

In [36]:
# Model by model, check the accuracy by true category
# (0 = good, 1 = controversial, 2 = bad).
# https://stackoverflow.com/questions/23377108/pandas-percentage-of-total-with-groupby
# Each (true, bert_match) count is divided by the total count of its `true`
# label. `transform('sum')` returns those per-label totals aligned to the
# original index, so the result keeps the two-level (true, bert_match) index.
# This avoids `groupby(...).apply(...)`, whose treatment of group keys in the
# result index differs between pandas versions.
results.groupby(['true', 'bert_match']).size().pipe(
    lambda counts: counts / counts.groupby(level=0).transform('sum')
)

true  bert_match
0     False         0.561386
      True          0.438614
1     False         0.553089
      True          0.446911
2     False         0.683863
      True          0.316137
dtype: float64

In [37]:
# Share of `svm_match` False/True within each `true` label.
# Each (true, svm_match) count is divided by the total count of its `true`
# label; `transform('sum')` returns those totals aligned to the original
# index, so the result keeps the two-level (true, svm_match) index.
# This avoids `groupby(...).apply(...)`, whose treatment of group keys in the
# result index differs between pandas versions.
results.groupby(['true', 'svm_match']).size().pipe(
    lambda counts: counts / counts.groupby(level=0).transform('sum')
)

true  svm_match
0     False        0.591337
      True         0.408663
1     False        0.675973
      True         0.324027
2     False        0.554034
      True         0.445966
dtype: float64

In [38]:
# Share of `log_match` False/True within each `true` label.
# Each (true, log_match) count is divided by the total count of its `true`
# label; `transform('sum')` returns those totals aligned to the original
# index, so the result keeps the two-level (true, log_match) index.
# This avoids `groupby(...).apply(...)`, whose treatment of group keys in the
# result index differs between pandas versions.
results.groupby(['true', 'log_match']).size().pipe(
    lambda counts: counts / counts.groupby(level=0).transform('sum')
)

true  log_match
0     False        0.584406
      True         0.415594
1     False        0.657666
      True         0.342334
2     False        0.552812
      True         0.447188
dtype: float64

## Compare relative performance

In [26]:
# Check the actual label distribution: it is relatively even, with label 1
# (controversial) slightly more frequent than the other two.
# `.pipe` passes the whole Series of group sizes to the function at once,
# so the division by the total does not depend on `Series.transform`'s
# handling of callables.
results.groupby('true').size().pipe(
    lambda counts: (counts / counts.sum()).round(3)
)

true
0    0.323
1    0.350
2    0.327
dtype: float64

In [27]:
# Model by model, look at the cases where a model "wins", i.e. that model is
# correct while the other two are wrong.

# Rows where `bert_match` is True and both `svm_match` and `log_match` are False.
only_bert_correct = (
    results['svm_match'].eq(False)
    & results['log_match'].eq(False)
    & results['bert_match'].eq(True)
)
bert_wins = results.loc[only_bert_correct]

In [32]:
# Number of rows in `bert_wins`.
bert_wins.shape[0]

1995

In [28]:
# Distribution of `true` labels among the rows in `bert_wins`, rounded to
# 3 decimals. `.pipe` passes the whole Series of group sizes to the function
# at once, so the division by the total does not depend on
# `Series.transform`'s handling of callables.
bert_wins.groupby('true').size().pipe(
    lambda counts: (counts / counts.sum()).round(3)
)

true
0    0.314
1    0.512
2    0.174
dtype: float64

In [29]:
# Rows where `svm_match` is True and both `log_match` and `bert_match` are False.
only_svm_correct = (
    results['svm_match'].eq(True)
    & results['log_match'].eq(False)
    & results['bert_match'].eq(False)
)
svm_wins = results.loc[only_svm_correct]

# Distribution of `true` labels among those rows, rounded to 3 decimals.
# `.pipe` passes the whole Series of group sizes to the function at once,
# so the division by the total does not depend on `Series.transform`'s
# handling of callables.
svm_wins.groupby('true').size().pipe(
    lambda counts: (counts / counts.sum()).round(3)
)

true
0    0.288
1    0.322
2    0.390
dtype: float64

In [39]:
# Number of rows in `svm_wins`.
svm_wins.shape[0]

621

In [40]:
# Rows where `log_match` is True and both `svm_match` and `bert_match` are False.
only_log_correct = (
    results['svm_match'].eq(False)
    & results['log_match'].eq(True)
    & results['bert_match'].eq(False)
)
log_wins = results.loc[only_log_correct]

# Distribution of `true` labels among those rows, rounded to 3 decimals.
# `.pipe` passes the whole Series of group sizes to the function at once,
# so the division by the total does not depend on `Series.transform`'s
# handling of callables.
log_wins.groupby('true').size().pipe(
    lambda counts: (counts / counts.sum()).round(3)
)

true
0    0.254
1    0.356
2    0.389
dtype: float64

In [41]:
# Number of rows in `log_wins`.
log_wins.shape[0]

606

In [44]:
# Rows where all three match flags (`svm_match`, `log_match`, `bert_match`) are True.
all_three_correct = (
    results['svm_match'].eq(True)
    & results['log_match'].eq(True)
    & results['bert_match'].eq(True)
)
all_agree_win = results.loc[all_three_correct]

In [46]:
# Distribution of `true` labels among the rows in `all_agree_win`, rounded to
# 3 decimals. `.pipe` passes the whole Series of group sizes to the function
# at once, so the division by the total does not depend on
# `Series.transform`'s handling of callables.
all_agree_win.groupby('true').size().pipe(
    lambda counts: (counts / counts.sum()).round(3)
)

true
0    0.400
1    0.257
2    0.343
dtype: float64

In [47]:
# Rows where all three match flags (`svm_match`, `log_match`, `bert_match`) are False.
all_three_wrong = (
    results['svm_match'].eq(False)
    & results['log_match'].eq(False)
    & results['bert_match'].eq(False)
)
all_agree_lose = results.loc[all_three_wrong]

In [49]:
# Distribution of `true` labels among the rows in `all_agree_lose`, rounded
# to 3 decimals. `.pipe` passes the whole Series of group sizes to the
# function at once, so the division by the total does not depend on
# `Series.transform`'s handling of callables.
all_agree_lose.groupby('true').size().pipe(
    lambda counts: (counts / counts.sum()).round(3)
)

true
0    0.319
1    0.332
2    0.349
dtype: float64