In [8]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_fscore_support

In [18]:
def calc_metrics(dataset, labels, header):
    """Append precision/recall/F1 percentages for ``dataset`` to ``header``.

    Parameters
    ----------
    dataset : DataFrame-like
        Must contain every column named in ``labels`` plus a matching
        ``<label>_pred`` column holding the predictions.
    labels : list of str
        Names of the ground-truth columns.
    header : list
        Row prefix (e.g. ``['Glove']``). It is extended IN PLACE and the
        same list object is returned.

    Returns
    -------
    list
        ``header`` followed by precision, recall and F1 for each class (in
        the order ``precision_recall_fscore_support`` reports them) and then
        the macro-averaged precision, recall and F1, all multiplied by 100.
    """
    truth = dataset[labels]
    predicted = dataset[[name + '_pred' for name in labels]]

    # Per-class scores: [precisions, recalls, f1s, supports].
    per_class = [
        list(scores)
        for scores in precision_recall_fscore_support(truth, predicted)
    ]

    # Macro averages; the support slot is overwritten with the total support
    # so the row has the same four fields as the per-class rows.
    macro = np.array(
        precision_recall_fscore_support(truth, predicted, average='macro')
    )
    macro[3] = np.sum(per_class[3])

    # One row per class plus a final macro row; columns are P, R, F1, support.
    table = np.append(np.array(per_class).T, macro.reshape(1, 4), axis=0)

    # Drop the support column, scale to percentages and flatten row by row.
    percentages = (table[:, :3] * 100).flatten().tolist()

    header.extend(percentages)
    return header

# Embeddings

In [31]:
# Home-relative location (pandas expands '~'), matching the '~/data/...' form
# this notebook already uses when it writes its summary CSVs, so the cell is
# not tied to one user's absolute home directory.
path = '~/data/toxicity/offens_eval/results/predictions_embeddings.csv'

# keep_default_na=False stops pandas from turning strings such as 'NA' or ''
# into NaN, so label and id text is read verbatim.
ds = pd.read_csv(path, keep_default_na=False)

# NOTE(review): the recorded output of this cell lists 'Unnamed: 8' ..
# 'Unnamed: 14' columns and an object-typed 'id', which suggests some rows
# carry extra fields -- confirm the file is well-formed.
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39719 entries, 0 to 39718
Data columns (total 15 columns):
subtask_a         39719 non-null object
subtask_a_pred    39719 non-null object
run_id            39719 non-null object
timestamp         39719 non-null int64
model             39719 non-null object
set               39719 non-null object
kfold             39719 non-null int64
id                39719 non-null object
Unnamed: 8        39719 non-null object
Unnamed: 9        39719 non-null object
Unnamed: 10       39719 non-null object
Unnamed: 11       39719 non-null object
Unnamed: 12       39719 non-null object
Unnamed: 13       39719 non-null object
Unnamed: 14       39719 non-null object
dtypes: int64(2), object(13)
memory usage: 4.5+ MB


In [32]:
# Number of predictions per (model, run, split) -- a quick sanity check that
# every run contributed the expected number of rows.
ds.groupby(['model', 'run_id', 'set'])[['subtask_a_pred']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,subtask_a_pred
model,run_id,set,Unnamed: 3_level_1
cnn,66adfa44-e68d-4fbe-a2b3-926124913125,cv,10592
cnn,66adfa44-e68d-4fbe-a2b3-926124913125,test,2648
cnn-glove,e5bf8023-0472-4399-990b-a234050bebb5,cv,10591
cnn-glove,e5bf8023-0472-4399-990b-a234050bebb5,test,2648
cnn-w2v,ec81747c-ce8e-4a0e-8a5a-60c4f5b552d0,cv,10592
cnn-w2v,ec81747c-ce8e-4a0e-8a5a-60c4f5b552d0,test,2648


In [36]:
# Per-class report for the 'cnn-glove' rows of the test split.
is_glove_test = (ds['model'] == 'cnn-glove') & (ds['set'] == 'test')
results = ds[is_glove_test]
print(classification_report(results['subtask_a'], results['subtask_a_pred']))

              precision    recall  f1-score   support

         NOT       0.81      0.84      0.83      1759
         OFF       0.66      0.60      0.63       889

   micro avg       0.76      0.76      0.76      2648
   macro avg       0.73      0.72      0.73      2648
weighted avg       0.76      0.76      0.76      2648



In [34]:
# Build one table row per embedding variant, scored on the test split.
# Each row is [display name, metric values...] as returned by calc_metrics.
benchmarking = []

labels = ['subtask_a']

# (value in the 'model' column, name shown in the results table)
embedding_runs = [
    ('cnn', 'Random'),
    ('cnn-w2v', 'Word2Vec'),
    ('cnn-glove', 'Glove'),
    # Disabled variants, kept for reference:
    # ('lstm', 'LSTM'),
    # ('bilstm', 'BI-LSTM'),
    # ('fasttext', 'FastText'),
]

for model_key, row_name in embedding_runs:
    results = ds[(ds.model == model_key) & (ds.set == 'test')]
    m = calc_metrics(results, labels, [row_name])
    benchmarking.append(m)

In [35]:
# Column labels for the rows collected in `benchmarking`: the model name,
# then precision/recall/F1 for each class, then the macro averages.
# The recall label was previously 'NOT-R ' (trailing space), which leaked into
# the CSV header and made dsstat['NOT-R'] fail; it is now 'NOT-R'.
# NOTE(review): the NOT-before-OFF order assumes calc_metrics reports the
# classes in that order -- confirm against its output.
c = ['model', 'NOT-P', 'NOT-R', 'NOT-F1', 'OFF-P', 'OFF-R', 'OFF-F1', 'P', 'R', 'F1']
dsstat = pd.DataFrame(benchmarking, columns=c)

# Persist the summary table (without the row index), then display it.
path = '~/data/toxicity/offens_eval/results/models_embeddings.csv'
dsstat.to_csv(path, index=False)
dsstat

Unnamed: 0,model,NOT-P,NOT-R,NOT-F1,OFF-P,OFF-R,OFF-F1,P,R,F1
0,Random,78.43342,85.389426,81.763745,64.938608,53.543307,58.692972,71.686014,69.466366,70.228358
1,Word2Vec,79.63374,81.580443,80.595338,61.702128,58.71766,60.172911,70.667934,70.149052,70.384125
2,Glove,80.717002,84.479818,82.555556,66.171004,60.067492,62.971698,73.444003,72.273655,72.763627


# Models

In [15]:
# Home-relative location (pandas expands '~'), matching the '~/data/...' form
# this notebook already uses when it writes its summary CSVs, so the cell is
# not tied to one user's absolute home directory.
path = '~/data/toxicity/offens_eval/results/predictions_taska.csv'

# keep_default_na=False stops pandas from turning strings such as 'NA' or ''
# into NaN, so label text is read verbatim.
ds = pd.read_csv(path, keep_default_na=False)
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66200 entries, 0 to 66199
Data columns (total 8 columns):
subtask_a         66200 non-null object
subtask_a_pred    66200 non-null object
run_id            66200 non-null object
timestamp         66200 non-null float64
model             66200 non-null object
set               66200 non-null object
kfold             66200 non-null int64
id                66200 non-null int64
dtypes: float64(1), int64(2), object(5)
memory usage: 4.0+ MB


In [16]:
# Distinct values present in the ground-truth and predicted label columns.
(ds['subtask_a'].unique(), ds['subtask_a_pred'].unique())

(array(['NOT', 'OFF'], dtype=object), array(['NOT', 'OFF'], dtype=object))

In [17]:
# Number of predictions per (model, run, split) -- a quick sanity check that
# every run contributed the expected number of rows.
ds.groupby(['model', 'run_id', 'set'])[['subtask_a_pred']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,subtask_a_pred
model,run_id,set,Unnamed: 3_level_1
bilstm,597e34df-b3d9-4ac7-bd68-3c8dec7d59a4,cv,10592
bilstm,597e34df-b3d9-4ac7-bd68-3c8dec7d59a4,test,2648
cnn,f97a003e-abfd-488d-85bb-84c769b95a3e,cv,10592
cnn,f97a003e-abfd-488d-85bb-84c769b95a3e,test,2648
fasttext,ee861b93-3059-478f-8b83-03cabe096680,cv,10592
fasttext,ee861b93-3059-478f-8b83-03cabe096680,test,2648
lr,407c2a6d-e60a-481e-84e7-6b62ffa97f07,cv,10592
lr,407c2a6d-e60a-481e-84e7-6b62ffa97f07,test,2648
lstm,0dbcf6a9-d4bf-47a9-898e-7ac60b36c139,cv,10592
lstm,0dbcf6a9-d4bf-47a9-898e-7ac60b36c139,test,2648


In [19]:
# Per-class report for the 'cnn' rows of the test split.
is_cnn_test = (ds['model'] == 'cnn') & (ds['set'] == 'test')
results = ds[is_cnn_test]
print(classification_report(results['subtask_a'], results['subtask_a_pred']))

              precision    recall  f1-score   support

         NOT       0.81      0.81      0.81      1759
         OFF       0.62      0.63      0.63       889

   micro avg       0.75      0.75      0.75      2648
   macro avg       0.72      0.72      0.72      2648
weighted avg       0.75      0.75      0.75      2648



In [20]:
# Build one table row per model, scored on the test split.
# Each row is [display name, metric values...] as returned by calc_metrics.
benchmarking = []

labels = ['subtask_a']

# (value in the 'model' column, name shown in the results table)
model_runs = [
    ('lr', 'LR'),
    ('cnn', 'CNN'),
    ('lstm', 'LSTM'),
    ('bilstm', 'BI-LSTM'),
    ('fasttext', 'FastText'),
]

for model_key, row_name in model_runs:
    results = ds[(ds.model == model_key) & (ds.set == 'test')]
    m = calc_metrics(results, labels, [row_name])
    benchmarking.append(m)


#benchmarking

In [21]:
# Column labels for the rows collected in `benchmarking`: the model name,
# then precision/recall/F1 for each class, then the macro averages.
# The recall label was previously 'NOT-R ' (trailing space), which leaked into
# the CSV header and made dsstat['NOT-R'] fail; it is now 'NOT-R'.
# NOTE(review): the NOT-before-OFF order assumes calc_metrics reports the
# classes in that order -- confirm against its output.
c = ['model', 'NOT-P', 'NOT-R', 'NOT-F1', 'OFF-P', 'OFF-R', 'OFF-F1', 'P', 'R', 'F1']
dsstat = pd.DataFrame(benchmarking, columns=c)

# Persist the summary table (without the row index), then display it.
path = '~/data/toxicity/offens_eval/results/models_performance.csv'
dsstat.to_csv(path, index=False)
dsstat

Unnamed: 0,model,NOT-P,NOT-R,NOT-F1,OFF-P,OFF-R,OFF-F1,P,R,F1
0,LR,81.801471,75.895395,78.737835,58.267717,66.591676,62.152231,70.034594,71.243536,70.445033
1,CNN,81.332567,80.500284,80.914286,62.183021,63.44207,62.806236,71.757794,71.971177,71.860261
2,LSTM,77.728036,78.965321,78.341794,57.026713,55.230596,56.114286,67.377374,67.097959,67.22804
3,BI-LSTM,80.683091,81.921546,81.297602,63.109049,61.192351,62.135922,71.89607,71.556949,71.716762
4,FastText,77.871148,79.022172,78.442438,57.242178,55.568054,56.392694,67.556663,67.295113,67.417566
