In [1]:
import pandas as pd
from krippendorff import alpha
from sklearn.metrics import precision_score, recall_score

In [2]:
# Load the comparison table: one row per annual report, with count columns
# joepLI/joepDF/joepAL and scriptLI/scriptDF/scriptAL.
# NOTE(review): the joep* columns look like the manual (gold-standard) coding
# and script* the automated coding — confirm against the data owner.
df = pd.read_csv(filepath_or_buffer='../../data/raw/test_vs_goldstandard.csv')
df

Unnamed: 0.1,Unnamed: 0,joepLI,joepDF,joepAL,scriptLI,scriptDF,scriptAL
0,Achmea 2011,0,0,0,0,0,0
1,Aegon 2014,0,2,0,0,3,0
2,ASML 2018,2,5,0,3,9,0
3,BallastNedam 2010,0,1,0,0,0,0
4,BAM 2009,0,1,1,0,0,1
5,Brunel 2004,0,0,0,0,0,0
6,DSM 2004,0,0,0,0,0,0
7,E&Y 2015,3,3,2,3,6,2
8,Essent 2011,0,6,0,0,5,0
9,FrieslandCampina 2010,0,0,0,0,0,0


## Krippendorff's alpha

In [3]:
# Krippendorff's alpha per perspective. Each pair of columns is transposed so
# that every row holds one coder (joep / script) and every column one report.
for perspective in ('LI', 'DF', 'AL'):
    ratings = df[[f'joep{perspective}', f'script{perspective}']].T
    print(f"{perspective}: {alpha(ratings)}")

LI: 0.9513888888888888
DF: 0.8518998272884283
AL: 0.7012195121951219


## Precision and Recall
Hier berekenen we precision en recall als binaire maat. We willen weten:
- van alle verslagen die het perspectief bevatten (>0), hoeveel verslagen vindt het script terug (recall)?
- van alle verslagen die het script aanmerkt als het perspectief bevattend, hoe vaak klopt dit (precision)?

In [4]:
# Binarise the counts: 1 when a report contains the perspective (> 0), else 0.
# The score columns are selected explicitly rather than dropping "Unnamed: 0",
# because a later cell adds the non-numeric 'company' column to `df`, which
# would make an element-wise `> 0` fail if this cell is re-run afterwards.
# The vectorised comparison also replaces DataFrame.applymap, which is
# deprecated in recent pandas releases.
score_columns = ['joepLI', 'joepDF', 'joepAL', 'scriptLI', 'scriptDF', 'scriptAL']
df2 = df[score_columns].gt(0).astype(int)
df2

Unnamed: 0,joepLI,joepDF,joepAL,scriptLI,scriptDF,scriptAL
0,0,0,0,0,0,0
1,0,1,0,0,1,0
2,1,1,0,1,1,0
3,0,1,0,0,0,0
4,0,1,1,0,0,1
5,0,0,0,0,0,0
6,0,0,0,0,0,0
7,1,1,1,1,1,1
8,0,1,0,0,1,0
9,0,0,0,0,0,0


In [5]:
# Binary precision / recall per perspective: the joep* column is passed as the
# reference labels (first argument) and the script* column as the predictions.
for perspective in ('LI', 'DF', 'AL'):
    y_true = df2[f'joep{perspective}']
    y_pred = df2[f'script{perspective}']
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    print(f"{perspective}: precision = {precision}, recall = {recall}")


LI: precision = 1.0, recall = 0.8888888888888888
DF: precision = 1.0, recall = 0.8571428571428571
AL: precision = 1.0, recall = 0.42857142857142855


## With revised search strings

In [45]:
# Load the automated coding results: one row per (country, year, company) with
# the market / moral / innovation perspective counts.
dfall = pd.read_csv(filepath_or_buffer='../../data/intermediate/automatedcoding.csv')
dfall

Unnamed: 0,country,year,company,marketperspective,moralperspective,innovationperspective
0,us,1999,lowe's,0,0,0
1,us,1999,fedex,0,0,0
2,us,1999,southwestairlines,0,0,0
3,us,1999,fluor,0,0,0
4,us,1999,capitalonefinancial,0,0,0
...,...,...,...,...,...,...
2293,nl,2005,sligro,0,0,0
2294,nl,2005,aalberts,0,0,0
2295,nl,2005,rabobank,0,0,0
2296,nl,2005,kpmg,0,2,0


In [46]:
# Harmonise the company spelling in the automated-coding table so it can be
# matched against the key derived from the gold-standard labels below.
dfall = dfall.replace({'e&y-': 'e&y'})

# The 'Unnamed: 0' labels are whitespace-separated: the first token is used as
# the (lower-cased) company key and the second token as the integer year.
df['company'] = (
    df['Unnamed: 0']
    .str.split()
    .str[0]
    .str.lower()
    .replace({'tata': 'tatasteel'})
)
df['year'] = df['Unnamed: 0'].str.split().str[1].astype('int64')

In [47]:
# Attach the re-run automated counts to the manual (joep*) counts by
# (company, year), then rename the perspective columns to the script* names
# used in the rest of the notebook.
dfnew = (
    df[['company', 'year', 'joepLI', 'joepDF', 'joepAL']]
    .merge(dfall, on=['company', 'year'], how='left')
    .rename(columns={
        'marketperspective': 'scriptAL',
        'innovationperspective': 'scriptLI',
        'moralperspective': 'scriptDF',
    })
)

# A left merge fails silently: duplicate keys in `dfall` would add rows (and
# break the row-wise comparison with `df` further down), while unmatched
# reports would get NaN counts that the later `> 0` binarisation scores as 0.
assert len(dfnew) == len(df), "merge changed the number of reports; duplicate (company, year) keys in dfall?"
assert dfnew[['scriptLI', 'scriptDF', 'scriptAL']].notna().all().all(), "some reports have no match in dfall"

In [51]:
# Show the merged frame (joep* counts next to the re-run script* counts).
dfnew

Unnamed: 0,company,year,joepLI,joepDF,joepAL,country,scriptAL,scriptDF,scriptLI
0,achmea,2011,0,0,0,nl,0,0,0
1,aegon,2014,0,2,0,nl,0,3,0
2,asml,2018,2,5,0,nl,0,9,3
3,ballastnedam,2010,0,1,0,nl,0,0,0
4,bam,2009,0,1,1,nl,1,0,0
5,brunel,2004,0,0,0,nl,0,0,0
6,dsm,2004,0,0,0,nl,0,0,0
7,e&y,2015,3,3,2,nl,2,6,3
8,essent,2011,0,6,0,nl,0,5,0
9,frieslandcampina,2010,0,0,0,nl,0,0,0


In [49]:
# Krippendorff's alpha per perspective on the merged frame. Each column pair
# is transposed so that rows are the two coders and columns are the reports.
for perspective in ('LI', 'DF', 'AL'):
    ratings = dfnew[[f'joep{perspective}', f'script{perspective}']].T
    print(f"{perspective}: {alpha(ratings)}")

LI: 0.930298719772404
DF: 0.837841726618705
AL: 0.4842105263157894


In [56]:
# Binarise the counts (1 when > 0, else 0) after removing the key / metadata
# columns. The vectorised comparison replaces DataFrame.applymap, which is
# deprecated in recent pandas releases.
df4 = dfnew.drop(columns=['company', 'year', 'country']).gt(0).astype(int)

# Binary precision / recall per perspective: the joep* column is passed as the
# reference labels (first argument) and the script* column as the predictions.
for perspective in ('LI', 'DF', 'AL'):
    y_true = df4[f'joep{perspective}']
    y_pred = df4[f'script{perspective}']
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    print(f"{perspective}: precision = {precision}, recall = {recall}")


LI: precision = 1.0, recall = 0.8888888888888888
DF: precision = 0.9230769230769231, recall = 0.8571428571428571
AL: precision = 0.8333333333333334, recall = 0.7142857142857143


In [62]:
# Put the new and the original results side by side, aligned on the row index.
# Columns of `df` whose names clash with `dfnew` receive the 'OLD' suffix.
check = dfnew.merge(
    df,
    how='left',
    left_index=True,
    right_index=True,
    suffixes=('', 'OLD'),
)
# Per-report change in the LI count (new minus old).
check['scriptLI'].sub(check['scriptLIOLD'])

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    1
20    0
21    0
22    0
23    0
24    0
dtype: int64

In [63]:
# Per-report change in the DF count (new minus old).
check['scriptDF'].sub(check['scriptDFOLD'])

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    1
17    0
18    0
19    0
20    0
21    1
22    0
23    0
24    0
dtype: int64

In [64]:
# Per-report change in the AL count (new minus old).
check['scriptAL'].sub(check['scriptALOLD'])

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    5
17    0
18    0
19    1
20    0
21    0
22    1
23    1
24    0
dtype: int64