In [48]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix, precision_score
from scipy.stats import chi2_contingency

In [3]:
df_preds_green = pd.read_csv("data/model_splits/green_split/green_as_train/green_set_01/predictions_green_set_01_context.csv")
df_preds_nongreen = pd.read_csv("data/model_splits/green_split/green_as_train/green_set_02/predictions_green_set_02_context.csv")

## Predictions from the set_01 model: Trained on Green Party Manifestos:

In [7]:
df_preds_green.shape

(178827, 9)

In [4]:
df_preds_green.head(2)

Unnamed: 0,q_sentence,q_sentence_nr,manifesto_id,main_codes,label,original_index,input_ids,attention_mask,preds
0,The year 2014 is a crucial year in the history...,1,181210_201405,601,0,192247,"[0, 133, 76, 777, 16, 10, 4096, 76, 11, 5, 750...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
1,It is crucial because it marks exactly 20 year...,2,181210_201405,202,0,192248,"[0, 243, 16, 4096, 142, 24, 4863, 2230, 291, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0


In [10]:
# Ratio of code 501 in the True labels:
df_preds_green[df_preds_green["label"] == 1].shape[0] / df_preds_green.shape[0]

0.03909924116604316

In [11]:
# Ratio of code 501 in the Predicted labels:
df_preds_green[df_preds_green["preds"] == 1].shape[0] / df_preds_green.shape[0]

0.05042303455294782

In [60]:
#                    Pred. Neg:   Pred. Pos
#    Real Neg:       True Neg --- False Pos
#    Real Pos:       False Neg --- True Pos
confusion_matrix(df_preds_green["label"], df_preds_green["preds"])

array([[167869,   3966],
       [  1941,   5051]], dtype=int64)

In [61]:
f1_score(df_preds_green["label"], df_preds_green["preds"], average='binary') # weighted f1 is way better, but not really fair here I think

0.6310200512211881

In [62]:
accuracy_score(df_preds_green["label"], df_preds_green["preds"])

0.966968075290644

In [67]:
precision_score(df_preds_green["label"], df_preds_green["preds"], average='binary')

0.5601641344127759

In [66]:
recall_score(df_preds_green["label"], df_preds_green["preds"], average='binary')

0.7223970251716247

In [23]:
# set up contingency table
contingency_table = pd.DataFrame({#"Group": ["# 501 codes", "# non-501 codes"],
                                  "Model": [df_preds_green[df_preds_green["preds"] == 1].shape[0],
                                            df_preds_green[df_preds_green["preds"] == 0].shape[0]],
                                  "Coders": [df_preds_green[df_preds_green["label"] == 1].shape[0],
                                             df_preds_green[df_preds_green["label"] == 0].shape[0]]})


contingency_table

Unnamed: 0,Model,Coders
0,9017,6992
1,169810,171835


In [24]:
chi2_contingency(contingency_table)

(267.88279964075474,
 3.283601566547837e-60,
 1,
 array([[  8004.5,   8004.5],
        [170822.5, 170822.5]]))

Same thing, but only for the 20% of non-Green codes that were also in the test set for set_02:

In [25]:
df = df_preds_green[df_preds_green["manifesto_id"].isin(df_preds_nongreen["manifesto_id"])]

In [26]:
df.shape

(35605, 9)

In [27]:
# Ratio of code 501 in the True labels:
df[df["label"] == 1].shape[0] / df.shape[0]

0.04367364134250808

In [28]:
# Ratio of code 501 in the Predicted labels:
df[df["preds"] == 1].shape[0] / df.shape[0]

0.0517062210363713

In [29]:
# set up contingency table
contingency_table = pd.DataFrame({#"Group": ["# 501 codes", "# non-501 codes"],
                                  "Model": [df[df["preds"] == 1].shape[0],
                                            df[df["preds"] == 0].shape[0]],
                                  "Coders": [df[df["label"] == 1].shape[0],
                                             df[df["label"] == 0].shape[0]]})


contingency_table

Unnamed: 0,Model,Coders
0,1841,1555
1,33764,34050


In [30]:
chi2_contingency(contingency_table)

(25.115606047137252,
 5.399411570687179e-07,
 1,
 array([[ 1698.,  1698.],
        [33907., 33907.]]))

Having a look at the model performance on the validation data (so on Green-Party manifestos, similar to what it was trained on)

In [31]:
df = pd.read_csv("data/model_splits/green_split/green_as_train/green_set_01/predictions_green_set_01_context_validation.csv")


In [43]:
df.head(2)

Unnamed: 0,q_sentence,q_sentence_nr,manifesto_id,main_codes,label,original_index,input_ids,attention_mask,preds
0,Section 1: A Vibrant Green Economy,1,53110_200705,-1,0,35029,"[0, 43480, 112, 35, 83, 468, 11804, 927, 1628,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
1,Energy,2,53110_200705,-1,0,35030,"[0, 30189, 2, 2, 43480, 112, 35, 83, 468, 1180...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0


In [32]:
df.shape

(2297, 9)

In [39]:
#                    Pred. Neg:   Pred. Pos
#    Real Neg:       True Neg --- False Pos
#    Real Pos:       False Neg --- True Pos
confusion_matrix(df["label"], df["preds"])

array([[1887,  152],
       [  59,  199]], dtype=int64)

In [None]:
#What real code do the false positives have??

In [40]:
# Ratio of code 501 in the True labels:
df[df["label"] == 1].shape[0] / df.shape[0]

0.11232041793643883

In [41]:
# Ratio of code 501 in the predicted labels:
df[df["preds"] == 1].shape[0] / df.shape[0]

0.15280801044841097

In [51]:
f1_score(df["label"], df["preds"], average='binary') # weighted f1 is way better, but not really fair here I think

0.6535303776683086

In [47]:
accuracy_score(df["label"], df["preds"])

0.9081410535481063

In [49]:
precision_score(df["label"], df["preds"])

0.5669515669515669

In [52]:
recall_score(df["label"], df["preds"], average="binary")

0.7713178294573644

## Predictions from the set_02: Model trained on non-Green Party manifestos

In [8]:
df_preds_nongreen.shape

(35605, 9)

In [5]:
df_preds_nongreen.head(2)

Unnamed: 0,q_sentence,q_sentence_nr,manifesto_id,main_codes,label,original_index,input_ids,attention_mask,preds
0,A. WHERE DOES THE ECONOMIC FREEDOM FI...,1,181210_201905,0,0,194825,"[0, 250, 4, 29919, 1437, 1437, 30540, 1437, 14...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
1,The Economic Freedom Fighters (EFF) is a revo...,2,181210_201905,415,0,194826,"[0, 133, 4713, 7978, 29423, 36, 28991, 43, 143...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0


In [14]:
# Ratio of code 501 in the True labels:
df_preds_nongreen[df_preds_nongreen["label"] == 1].shape[0] / df_preds_nongreen.shape[0]

0.04367364134250808

In [15]:
# Ratio of code 501 in the Predicted labels:
df_preds_nongreen[df_preds_nongreen["preds"] == 1].shape[0] / df_preds_nongreen.shape[0]

0.03808453868838646

In [59]:
#                    Pred. Neg:   Pred. Pos
#    Real Neg:       True Neg --- False Pos
#    Real Pos:       False Neg --- True Pos
confusion_matrix(df_preds_nongreen["label"], df_preds_nongreen["preds"])

array([[33727,   323],
       [  522,  1033]], dtype=int64)

In [53]:
f1_score(df_preds_nongreen["label"], df_preds_nongreen["preds"], average='binary')

0.7097217451047749

In [55]:
accuracy_score(df_preds_nongreen["label"], df_preds_nongreen["preds"])

0.9762673781772223

In [57]:
precision_score(df_preds_nongreen["label"], df_preds_nongreen["preds"], average='binary')

0.7617994100294986

In [58]:
recall_score(df_preds_nongreen["label"], df_preds_nongreen["preds"], average='binary')

0.6643086816720257

In [21]:
# set up contingency table
contingency_table = pd.DataFrame({#"Group": ["# 501 codes", "# non-501 codes"],
                                  "Model": [df_preds_nongreen[df_preds_nongreen["preds"] == 1].shape[0],
                                            df_preds_nongreen[df_preds_nongreen["preds"] == 0].shape[0]],
                                  "Coders": [df_preds_nongreen[df_preds_nongreen["label"] == 1].shape[0],
                                             df_preds_nongreen[df_preds_nongreen["label"] == 0].shape[0]]})


contingency_table

Unnamed: 0,Model,Coders
0,1356,1555
1,34249,34050


In [22]:
chi2_contingency(contingency_table)

(14.041542404812466,
 0.00017881621962339972,
 1,
 array([[ 1455.5,  1455.5],
        [34149.5, 34149.5]]))