In [1]:
%matplotlib inline
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_fscore_support

In [162]:
ds = pd.read_csv('../results/predictions_lr_full_es.csv')
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4168 entries, 0 to 4167
Data columns (total 7 columns):
choose_one_category         4168 non-null int64
choose_one_category_pred    4168 non-null int64
kfold                       4168 non-null int64
set                         4168 non-null object
id                          4168 non-null int64
timestamp                   4168 non-null float64
run_id                      4168 non-null object
dtypes: float64(1), int64(4), object(2)
memory usage: 228.0+ KB


In [163]:
ds.groupby(['run_id','set']).size()

run_id                                set 
639dd352-1ced-4ada-9994-0cd1295205e5  cv      3334
                                      test     834
dtype: int64

In [164]:
ds.head()

Unnamed: 0,choose_one_category,choose_one_category_pred,kfold,set,id,timestamp,run_id
0,7,7,0,cv,721708581129588736,1552016000.0,639dd352-1ced-4ada-9994-0cd1295205e5
1,7,7,0,cv,721600361182572544,1552016000.0,639dd352-1ced-4ada-9994-0cd1295205e5
2,7,8,0,cv,721602071112196097,1552016000.0,639dd352-1ced-4ada-9994-0cd1295205e5
3,6,6,0,cv,721595440051789824,1552016000.0,639dd352-1ced-4ada-9994-0cd1295205e5
4,6,6,0,cv,721601491472146432,1552016000.0,639dd352-1ced-4ada-9994-0cd1295205e5


In [165]:
def show_results(results_ds, set_filter='cv', labels=None):
    results = results_ds[(results_ds.set==set_filter) & (results_ds.kfold==0)]
    print(results.shape)
    y_true = results[labels]
    print(y_true.shape)
    pred_labels=[l+'_pred' for l in labels]
    y_pred = results[pred_labels]
    print(y_pred.shape)
    print(classification_report(y_true, y_pred))

In [166]:
#labels =['affected', 'missing','evacuation', 'infrastructure', 'need', 'offer', 'precaution','emotional', 'other']
labels =['choose_one_category']
show_results(ds, labels=labels)

(334, 7)
(334, 1)
(334, 1)
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.00      0.00      0.00         1
           2       0.83      0.31      0.45        16
           3       0.75      0.38      0.50         8
           4       0.96      0.96      0.96        55
           5       1.00      1.00      1.00         4
           6       0.66      0.99      0.79       127
           7       0.69      0.52      0.59        67
           8       0.92      0.43      0.59        51

   micro avg       0.74      0.74      0.74       334
   macro avg       0.65      0.51      0.54       334
weighted avg       0.76      0.74      0.72       334



  'precision', 'predicted', average, warn_for)


In [154]:
import numpy as np

def calc_metrics(dataset_or_path, model_name, lang, labels,set_name='cv', kfold=1):
    if isinstance(dataset_or_path, str):
        ds = pd.read_csv(dataset_or_path)
        #print(ds.info())
    else:
        ds = dataset_or_path
        
    ds = ds[ds.set==set_name]
        
#     folds = ds[ds.set==set_name].groupby('kfold')
#     stats = []
    
#     for name, group in folds:
#         ds = group
        
    pred_labels=[l+'_pred' for l in labels]
    y_true = ds[labels]
    y_pred = ds[pred_labels]
    metrics = precision_recall_fscore_support(y_true, y_pred)
    #print(metrics)
    metrics = [list(m) for m in metrics]

    avg = precision_recall_fscore_support(y_true, y_pred, average='macro')
    avg = np.array(avg)
    #print(avg)
    avg[3] = np.sum(metrics[3])

    m = np.array(metrics)
    m2 = np.append(m.T,  np.array(avg).reshape(1,4), axis=0)
    m2=m2[:,:3]
    stat=m2.flatten()
    stat=stat.reshape(-1,1).T * 100
    lstat=stat.flatten().tolist()
    lstat.insert(0, model_name)
    lstat.insert(1, set_name)
    lstat.insert(2, lang)
    #lstat.insert(2, name)
    #stats.append(lstat)
    # calculate p-value
#     p_val = np.random.rand()*0.01
#     lstat.append(p_val)
    
    return lstat



# binary

In [159]:
benchmarks = []
labels =['crisis_related']

ds_path = '../results/predictions_lr_es.csv'
m=calc_metrics(ds_path, 'LR','es',  labels,'test')
benchmarks.append(m)

ds_path = '../results/predictions_lr_en.csv'
m=calc_metrics(ds_path, 'LR', 'en',  labels,'test')
benchmarks.append(m)

ds_path = '../results/predictions_cnn_es.csv'
m=calc_metrics(ds_path, 'CNN', 'es',  labels,'test')
benchmarks.append(m)

ds_path = '../results/predictions_cnn_en.csv'
m=calc_metrics(ds_path, 'CNN', 'en',  labels,'test')
benchmarks.append(m)

ds_path = '../results/predictions_lstm_en.csv'
m=calc_metrics(ds_path, 'LSTM', 'es',  labels,'test')
benchmarks.append(m)

ds_path = '../results/predictions_lstm_en.csv'
m=calc_metrics(ds_path, 'LSTM', 'en',  labels,'test')
benchmarks.append(m)


ds_path = '../results/predictions_lr_esen.csv'
m=calc_metrics(ds_path, 'LR', 'esen',  labels,'test')
benchmarks.append(m)

ds_path = '../results/predictions_lr_enes.csv'
m=calc_metrics(ds_path, 'LR', 'enes',  labels,'test')
benchmarks.append(m)


ds_path = '../results/predictions_cnn_esen.csv'
m=calc_metrics(ds_path, 'CNN', 'esen',  labels,'test')
benchmarks.append(m)

ds_path = '../results/predictions_cnn_enes.csv'
m=calc_metrics(ds_path, 'CNN', 'enes',  labels,'test')
benchmarks.append(m)

ds_path = '../results/predictions_lstm_esen.csv'
m=calc_metrics(ds_path, 'LSTM', 'esen',  labels,'test')
benchmarks.append(m)

ds_path = '../results/predictions_lstm_enes.csv'
m=calc_metrics(ds_path, 'LSTM', 'enes',  labels,'test')
benchmarks.append(m)

ds_path = '../results/predictions_lstm_enes.csv'
m=calc_metrics(ds_path, 'LSTM-glove', 'enes',  labels,'test')
benchmarks.append(m)

ds_path = '../results/predictions_lstm_se_es.csv'
m=calc_metrics(ds_path, 'LSTM-stacked', 'es',  labels,'test')
benchmarks.append(m)

ds_path = '../results/predictions_lstm_se_en.csv'
m=calc_metrics(ds_path, 'LSTM-stacked', 'en',  labels,'test')
benchmarks.append(m)

ds_path = '../results/predictions_lstm_se_esen.csv'
m=calc_metrics(ds_path, 'LSTM-stacked', 'esen',  labels,'test')
benchmarks.append(m)

ds_path = '../results/predictions_lstm_se_enes.csv'
m=calc_metrics(ds_path, 'LSTM-stacked', 'enes',  labels,'test')
benchmarks.append(m)

benchmarks = np.array(benchmarks)
benchmarks.shape


(17, 12)

In [160]:
c = ['model','set','lang', 'cr-p', 'cr-r', 'cr-f1','ncr-p', 'ncr-R', 'ncr-f1','p', 'r', 'f1']
dsstat=pd.DataFrame(benchmarks, columns=c)
#dsstat['p'] = dsstat.p.apply(lambda x: '*' if x <0.01 else '-' )
#dsstat.loc[dsstat.set=='test','p']=''
dsstat

Unnamed: 0,model,set,lang,cr-p,cr-r,cr-f1,ncr-p,ncr-R,ncr-f1,p,r,f1
0,LR,test,es,81.23393316195373,86.57534246575342,83.81962864721486,89.13525498891353,84.63157894736842,86.82505399568036,85.18459407543362,85.60346070656092,85.32234132144761
1,LR,test,en,90.81885856079404,94.81865284974094,92.77566539923954,95.41284403669724,91.83222958057397,93.58830146231722,93.11585129874564,93.32544121515744,93.18198343077836
2,CNN,test,es,79.95110024449878,89.58904109589041,84.49612403100775,91.1832946635731,82.73684210526315,86.75496688741721,85.56719745403593,86.16294160057679,85.62554545921247
3,CNN,test,en,90.3846153846154,97.40932642487049,93.76558603491272,97.63593380614655,91.16997792494482,94.29223744292236,94.01027459538098,94.28965217490764,94.02891173891754
4,LSTM,test,es,93.75,93.26424870466322,93.5064935064935,94.28571428571428,94.70198675496688,94.49339207048456,94.01785714285714,93.98311772981504,93.99994278848904
5,LSTM,test,en,93.75,93.26424870466322,93.5064935064935,94.28571428571428,94.70198675496688,94.49339207048456,94.01785714285714,93.98311772981504,93.99994278848904
6,LR,test,esen,73.70336669699728,83.24768756423433,78.18532818532819,83.67551326990485,74.2996887505558,78.70937352802639,78.68943998345105,78.77368815739507,78.44735085667729
7,LR,test,enes,44.6011673151751,98.49624060150376,61.39939738868429,68.18181818181817,2.5662959794696323,4.94641384995878,56.39149274849664,50.5312682904867,33.17290561932153
8,CNN,test,esen,80.23191606847045,74.66598150051388,77.34894862922545,79.32046979865773,84.0818141396176,81.63177206993308,79.77619293356409,79.37389782006574,79.49036034957926
9,CNN,test,enes,44.39048081593006,98.17400644468314,61.1371237458194,58.53658536585365,2.0530367835757057,3.9669421487603302,51.46353309089186,50.11352161412942,32.55203294728987


In [56]:
score = dsstat.f1.values[0]
permutation_scores = np.array(dsstat.f1.values[1:])
pvalue = (score - dsstat.f1.mean()) / dsstat.f1.std()
pvalue, dsstat.f1.mean(), dsstat.f1.std()


(0.20123328669044416, 83.10620241900796, 2.724219359115447)

In [57]:
from scipy import stats

In [68]:
rvs = stats.norm.rvs(loc=5, scale=10, size=(10))
rvs.mean()

11.233306784637401

In [75]:
stats.ttest_1samp(dsstat.f1.values,popmean=dsstat.f1.values[3])

Ttest_1sampResult(statistic=-4.495796197000257, pvalue=0.0014979761702092747)

In [161]:
dsstat[['model','set','lang','p', 'r', 'f1']].to_csv('../results/results_binary.csv', index=False)

In [119]:
benchmarks = []
labels =['crisis_related']

ds_path = '../results/binary_lr_roots.csv'
m=calc_metrics(ds_path, 'LR', 'test', labels=labels)
m[1] = 'roots'
benchmarks.append(m)

ds_path = '../results/binary_cnn_roots.csv'
m=calc_metrics(ds_path, 'CNN', 'test', labels=labels)
m[1] = 'roots'
benchmarks.append(m)

ds_path = '../results/binary_lstm_roots.csv'
m=calc_metrics(ds_path, 'LSTM', 'test', labels=labels)
m[1] = 'roots'
benchmarks.append(m)

ds_path = '../results/binary_lr_replies.csv'
m=calc_metrics(ds_path, 'LR', 'test', labels=labels)
m[1] = 'replies'
benchmarks.append(m)

ds_path = '../results/binary_cnn_replies.csv'
m=calc_metrics(ds_path, 'CNN', 'test', labels=labels)
m[1] = 'replies'
benchmarks.append(m)

ds_path = '../results/binary_lstm_replies.csv'
m=calc_metrics(ds_path, 'LSTM', 'test', labels=labels)
m[1] = 'replies'
benchmarks.append(m)

ds_path = '../results/binary_lr.csv'
m=calc_metrics(ds_path, 'LR', 'test', labels=labels)
m[1] = 'all'
benchmarks.append(m)

ds_path = '../results/binary_cnn.csv'
m=calc_metrics(ds_path, 'CNN', 'test', labels=labels)
m[1] = 'all'
benchmarks.append(m)

ds_path = '../results/binary_lstm.csv'
m=calc_metrics(ds_path, 'LSTM', 'test', labels=labels)
m[1] = 'all'
benchmarks.append(m)

In [120]:
c = ['model','set', 'NC-P', 'NC-R', 'NC-F1','C-P', 'C-R', 'C-F1','Precision', 'Recall', 'F1', 'p']
dsstat=pd.DataFrame(benchmarks, columns=c)
dsstat['p'] = dsstat.p.apply(lambda x: '*' if x <0.01 else '-' )
dsstat[['model','set', 'F1']]

Unnamed: 0,model,set,F1
0,LR,roots,85.142031
1,CNN,roots,84.660562
2,LSTM,roots,87.628866
3,LR,replies,72.197845
4,CNN,replies,65.568669
5,LSTM,replies,72.788488
6,LR,all,76.09277
7,CNN,all,77.353691
8,LSTM,all,74.003129


In [126]:
output=pd.pivot_table(dsstat, values='F1', index=['set'],   columns=['model'], aggfunc=np.sum).reset_index()
output.to_csv('../results/tables/results_clf_binary.csv', index=False)
output

model,set,CNN,LR,LSTM
0,all,77.353691,76.09277,74.003129
1,replies,65.568669,72.197845,72.788488
2,roots,84.660562,85.142031,87.628866


# multilabel

In [190]:
benchmarks = []

labels =['informative', 'expressive_positive', 'sarcasm',
       'people_deaths', 'expressive_negative', 'response_other']

ds_path = '../results/binary_lr_ml6.csv'
m=calc_metrics(ds_path, 'LR', 'test', labels=labels)
benchmarks.append(m)

ds_path = '../results/binary_cnn_ml6.csv'
m=calc_metrics(ds_path, 'CNN', 'test', labels=labels)
benchmarks.append(m)

ds_path = '../results/binary_lstm_ml6.csv'
m=calc_metrics(ds_path, 'LSTM', 'test', labels=labels)
benchmarks.append(m)

c= get_columns(labels)
dsstat=pd.DataFrame(benchmarks, columns=c)
dsstat=dsstat[['model','set', 'F1']]
dsstat['set'] = 6

benchmarks = []
labels ='informative,expressive_positive,sarcasm,people_deaths,expressive_negative,response_other,thanks,request_info'.split(',')

ds_path = '../results/binary_lr_ml8.csv'
m=calc_metrics(ds_path, 'LR', 'test', labels=labels)
benchmarks.append(m)

ds_path = '../results/binary_cnn_ml8.csv'
m=calc_metrics(ds_path, 'CNN', 'test', labels=labels)
benchmarks.append(m)

ds_path = '../results/binary_lstm_ml8.csv'
m=calc_metrics(ds_path, 'LSTM', 'test', labels=labels)
benchmarks.append(m)

c= get_columns(labels)
tmp =pd.DataFrame(benchmarks, columns=c)[['model','set', 'F1']]
tmp['set']=8
dsstat= dsstat.append(tmp)


benchmarks = []
labels ='informative,expressive_positive,sarcasm,people_deaths,expressive_negative,response_other,thanks,request_info,suggest_action,complain'.split(',')

ds_path = '../results/binary_lr_ml10.csv'
m=calc_metrics(ds_path, 'LR', 'test', labels=labels)
benchmarks.append(m)

ds_path = '../results/binary_cnn_ml10.csv'
m=calc_metrics(ds_path, 'CNN', 'test', labels=labels)
benchmarks.append(m)

ds_path = '../results/binary_lstm_ml10.csv'
m=calc_metrics(ds_path, 'LSTM', 'test', labels=labels)
benchmarks.append(m)

c= get_columns(labels)
tmp =pd.DataFrame(benchmarks, columns=c)[['model','set', 'F1']]
tmp['set']=10
dsstat= dsstat.append(tmp)



benchmarks = []
labels ='people_deaths,people_wounded,people_missing,people_other,infra_buildings,infra_roads,infra_houses,infra_business,infra_other,request_info,request_goods,request_services,request_other,offer_info,offer_goods,offer_services,offer_other,informative,update,expressive_positive,expressive_negative,complain,suggest_action,promise,sarcasm,yes_no_question,wh_question,open_question,yes_answer,no_answer,response_ack,response_other,opening_greeting,closing_greeting,thanks,apology,other_subcat'.split(',')

ds_path = '../results/binary_lr_mln.csv'
m=calc_metrics(ds_path, 'LR', 'test', labels=labels)
benchmarks.append(m)

ds_path = '../results/binary_cnn_mln.csv'
m=calc_metrics(ds_path, 'CNN', 'test', labels=labels)
benchmarks.append(m)

ds_path = '../results/binary_lstm_mln.csv'
m=calc_metrics(ds_path, 'LSTM', 'test', labels=labels)
benchmarks.append(m)

c= get_columns(labels)
tmp =pd.DataFrame(benchmarks, columns=c)[['model','set', 'F1']]
tmp['set']=37
dsstat= dsstat.append(tmp)


               
dsstat            


  'precision', 'predicted', average, warn_for)


6
8
10
37


  'recall', 'true', average, warn_for)


Unnamed: 0,model,set,F1
0,LR,6,56.238757
1,CNN,6,48.331387
2,LSTM,6,46.743806
0,LR,8,55.204413
1,CNN,8,48.53124
2,LSTM,8,40.072941
0,LR,10,52.704603
1,CNN,10,45.333556
2,LSTM,10,38.287301
0,LR,37,45.567184


In [191]:
def get_columns(labels):
    c = ['model','set']
    print(len(labels))
    for l in labels:
        c.append(l+'-P')
        c.append(l+'-R')
        c.append(l+'-F1')
    c.extend(['Precision', 'Recall', 'F1', 'p'])
    return c

In [193]:
output=pd.pivot_table(dsstat, values='F1', index=['set'],   columns=['model'], aggfunc=np.sum).reset_index()
output.sort_values('set',ascending=True, inplace=True)
output.to_csv('../results/tables/results_clf_multilabel.csv', index=False)
output

model,set,CNN,LR,LSTM
0,6,48.331387,56.238757,46.743806
1,8,48.53124,55.204413,40.072941
2,10,45.333556,52.704603,38.287301
3,37,39.795707,45.567184,8.988014


# cross-lingual analysis

In [4]:
data_dir='/Users/johnny/data/crisis/ecuador_earthquake_2016/'
path = os.path.join(data_dir, 'tweets_geocodes.csv')
geo_tweets = pd.read_csv(path)
geo_tweets.shape

(152269, 2)

In [27]:
path = os.path.join(data_dir, 'es', 'conversations.csv')
conv = pd.read_csv(path,parse_dates=['timestamp'])
geo_conv = conv.merge(geo_tweets, on='id', how='left')
geo_conv.head()

(94776, 13)

In [29]:
from datetime import datetime, timedelta

In [30]:
dt_init = datetime(2016,4,16,18,58)

In [36]:
geo_conv['elapsed'] = geo_conv.timestamp.apply(lambda x: (x - dt_init).seconds / 3600)


In [37]:
geo_conv.elapsed.describe()

count    94765.000000
mean        14.002531
std          5.729669
min          0.000278
25%          9.810000
50%         14.912778
75%         18.968611
max         23.998889
Name: elapsed, dtype: float64

In [38]:
path = os.path.join(data_dir, 'en', 'conversations.csv')
conv = pd.read_csv(path,parse_dates=['timestamp'])
geo_conv = conv.merge(geo_tweets, on='id', how='left')
geo_conv['elapsed'] = geo_conv.timestamp.apply(lambda x: (x - dt_init).seconds / 3600)
geo_conv.head()

Unnamed: 0,id,screen_name,latitude,longitude,lang,in_reply_to_status_id,text,timestamp,conversation_id,conversation_deep,num_replies,num_users,url,geocountry,elapsed
0,951919738501738497,TrumpsRockin,,,en,9.516475e+17,@robrousseau @realDonaldTrump You are welcome ...,2018-01-12 20:52:06,951647402599026689,3,356,310,https://www.twitter.com/TrumpsRockin/status/95...,,1.901667
1,951647511688728582,robrousseau,,,en,9.516474e+17,@realDonaldTrump President Dementia,2018-01-12 02:50:22,951647402599026689,2,356,310,https://www.twitter.com/robrousseau/status/951...,,7.872778
2,951647402599026689,realDonaldTrump,,,en,,More great news as a result of historical Tax ...,2018-01-12 02:49:56,951647402599026689,1,356,310,https://www.twitter.com/realDonaldTrump/status...,USA,7.865556
3,1017960717486522369,bernardo712,,,en,9.51933e+17,@HOCKEYRINK12 @realDonaldTrump How's that goin...,2018-07-14 02:35:22,951647402599026689,4,356,310,https://www.twitter.com/bernardo712/status/101...,USA,7.622778
4,951933027357200384,HOCKEYRINK12,,,en,9.519324e+17,@realDonaldTrump Wall Street Has Best Start to...,2018-01-12 21:44:54,951647402599026689,3,356,310,https://www.twitter.com/HOCKEYRINK12/status/95...,,2.781667


In [39]:
geo_conv.elapsed.describe()

count    39518.000000
mean        13.415872
std          5.504273
min          0.001111
25%         10.338611
50%         13.346111
75%         17.736528
max         23.999722
Name: elapsed, dtype: float64