In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import accuracy_score, classification_report
import pickle
import random

SEED = 7

In [8]:
full_data = pd.read_csv('articles_meta.csv',sep=';')

In [9]:
full_data.head()

Unnamed: 0,present,zero,a,an,the,raw_NP,Start_idx,Sent_start_idx,Initial,ML_L1,Ann,lm_a,lm_an,lm_the,lm_zero
0,0.625264,0.374736,0.109307,1.1e-05,0.890682,The pie-chart,0,0,the,the,the,-63.203705,-63.866825,-62.679638,-61.982605
1,0.915166,0.084834,4.3e-05,0.002106,0.997851,the ages,29,0,the,the,the,-66.313194,-67.218285,-62.679638,-63.78474
2,0.888019,0.111981,0.047525,6e-06,0.952469,the population,41,0,the,the,the,-64.24226,-66.0422,-62.679638,-62.111004
3,0.006016,0.993984,7e-06,0.046961,0.953033,Italy,64,0,zero,zero,zero,-67.61931,-67.98742,-65.99466,-62.679638
4,0.031115,0.968885,0.394534,0.002161,0.603305,Yemen,74,0,zero,zero,zero,-65.831055,-66.85119,-63.79864,-62.679638


In [10]:
probs_ratio = []
probs_delta = []
init_probs = []
corr_probs = []
lm_choice = []
for i in range(full_data.shape[0]):
    row = full_data.iloc[i]
    init_prob = row['lm_'+row['Initial']]
    corr_prob = row['lm_'+row['ML_L1']]
    init_probs.append(init_prob)
    corr_probs.append(corr_prob)
    probs_ratio.append(init_prob / corr_prob)
    probs_delta.append(init_prob - corr_prob)
full_data['init_prob'] = init_probs
full_data['corr_prob'] = corr_probs
full_data['probs_ratio'] = probs_ratio
full_data['probs_delta'] = probs_delta
full_data['LM'] = full_data[['lm_a','lm_an','lm_the','lm_zero']].idxmax(1)
full_data['LM'] = full_data['LM'].apply(lambda x: x.split('_')[1])

In [11]:
full_data.head()

Unnamed: 0,present,zero,a,an,the,raw_NP,Start_idx,Sent_start_idx,Initial,ML_L1,Ann,lm_a,lm_an,lm_the,lm_zero,init_prob,corr_prob,probs_ratio,probs_delta,LM
0,0.625264,0.374736,0.109307,1.1e-05,0.890682,The pie-chart,0,0,the,the,the,-63.203705,-63.866825,-62.679638,-61.982605,-62.679638,-62.679638,1.0,0.0,zero
1,0.915166,0.084834,4.3e-05,0.002106,0.997851,the ages,29,0,the,the,the,-66.313194,-67.218285,-62.679638,-63.78474,-62.679638,-62.679638,1.0,0.0,the
2,0.888019,0.111981,0.047525,6e-06,0.952469,the population,41,0,the,the,the,-64.24226,-66.0422,-62.679638,-62.111004,-62.679638,-62.679638,1.0,0.0,zero
3,0.006016,0.993984,7e-06,0.046961,0.953033,Italy,64,0,zero,zero,zero,-67.61931,-67.98742,-65.99466,-62.679638,-62.679638,-62.679638,1.0,0.0,zero
4,0.031115,0.968885,0.394534,0.002161,0.603305,Yemen,74,0,zero,zero,zero,-65.831055,-66.85119,-63.79864,-62.679638,-62.679638,-62.679638,1.0,0.0,zero


In [66]:
data = full_data.loc[(full_data['Initial'] != full_data['ML_L1']) | (full_data['Initial'] != full_data['LM']) | \
                     (full_data['Initial'] != full_data['Ann']),:]
data.head()

Unnamed: 0,present,zero,a,an,the,raw_NP,Start_idx,Sent_start_idx,Initial,ML_L1,Ann,lm_a,lm_an,lm_the,lm_zero,init_prob,corr_prob,probs_ratio,probs_delta,LM
0,0.625264,0.374736,0.109307,1.1e-05,0.890682,The pie-chart,0,0,the,the,the,-63.203705,-63.866825,-62.679638,-61.982605,-62.679638,-62.679638,1.0,0.0,zero
2,0.888019,0.111981,0.047525,6e-06,0.952469,the population,41,0,the,the,the,-64.24226,-66.0422,-62.679638,-62.111004,-62.679638,-62.679638,1.0,0.0,zero
8,0.256608,0.743392,0.017234,3e-06,0.982763,the precast data,13,221,the,zero,the,-80.810936,-81.13423,-79.80842,-79.36763,-79.80842,-79.36763,1.005554,-0.44079,zero
11,0.881925,0.118075,0.892104,0.014736,0.09316,reduction,70,221,zero,a,a,-76.58358,-81.37569,-79.20161,-79.80842,-79.80842,-76.58358,1.042109,-3.22484,a
13,0.695115,0.304885,0.08544,1e-05,0.91455,pie-chart,9,336,zero,the,the,-110.21181,-112.14194,-111.037766,-110.448265,-110.448265,-111.037766,0.994691,0.589501,a


In [67]:
target = data['Ann']
X = data.drop(['Ann','raw_NP','Start_idx','Sent_start_idx'],axis=1)

In [68]:
target_vect = CountVectorizer(token_pattern='.+')
target_vect.fit(['a','an','the','zero'])
X_sparse = hstack((X.drop(['Initial','ML_L1','LM'],axis=1).to_sparse(),
                   target_vect.transform(X['Initial']),
                   target_vect.transform(X['LM']),
                   target_vect.transform(X['ML_L1'])))

In [69]:
X_sparse.shape

(40000, 25)

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X_sparse, target, test_size=0.33, random_state=SEED)

In [72]:
xgb = XGBClassifier(n_estimators=500,max_depth=10,n_jobs=3,seed=SEED)
xgb.fit(X_train,y_train)
x_pred = xgb.predict(X_test)
print(accuracy_score(y_test,x_pred))
print(classification_report(y_test,x_pred))

0.8570454545454546
             precision    recall  f1-score   support

          a       0.86      0.79      0.82      1768
         an       0.87      0.76      0.81       357
        the       0.88      0.84      0.86      4940
       zero       0.84      0.90      0.87      6135

avg / total       0.86      0.86      0.86     13200



  if diff:


In [71]:
logit = LogisticRegression(random_state=SEED)
logit.fit(X_train,y_train)
l_pred = logit.predict(X_test)
print(accuracy_score(y_test,l_pred))
print(classification_report(y_test,l_pred))

0.8236363636363636
             precision    recall  f1-score   support

          a       0.85      0.75      0.80      1768
         an       0.86      0.74      0.80       357
        the       0.89      0.74      0.81      4940
       zero       0.78      0.91      0.84      6135

avg / total       0.83      0.82      0.82     13200



In [73]:
test_data = data.loc[y_test.index]

In [74]:
x_pred_prob = xgb.predict_proba(X_test)
options = ['a','an','the','zero']
final_preds = []
for l1_prob,lm_prob,meta_prob in zip(test_data[['a','an','the','zero','present']].values,
                                     test_data[['lm_a','lm_an','lm_the','lm_zero']].values,
                                    x_pred_prob):
    #print(l1_prob[:-1],meta_prob,lm_prob)
    l1_prob[:3] *= l1_prob[-1]
    lm_prob /= sum(lm_prob)
    final_preds.append(options[np.argmax(np.average((l1_prob[:-1],meta_prob,lm_prob),axis=0))])

In [75]:
test_data['Preds'] = x_pred
test_data['Final_preds'] = final_preds
corrected = test_data.loc[(y_test != test_data['Initial']),['Initial','ML_L1','LM','Preds','Final_preds','Ann']]
not_corrected = test_data.loc[(y_test == test_data['Initial']),['Initial','ML_L1','LM','Preds','Final_preds','Ann']]

In [78]:
corrected.shape

(2338, 6)

In [80]:
print(accuracy_score(corrected['Ann'],corrected['Final_preds']))
print(classification_report(corrected['Ann'],corrected['Final_preds']))

0.4773310521813516
             precision    recall  f1-score   support

          a       0.59      0.39      0.47       489
         an       0.47      0.28      0.35        96
        the       0.69      0.58      0.63      1330
       zero       0.16      0.31      0.21       423

avg / total       0.56      0.48      0.51      2338



In [62]:
corrected[(corrected['Preds'] == corrected['Ann']) & (corrected['Ann'] != corrected['ML_L1']) &
          (corrected['Ann'] != corrected['LM'])].shape

(19, 6)

# Train and save the model

In [34]:
xgb = XGBClassifier(n_estimators=500,max_depth=10,seed=SEED,n_jobs=3)
xgb.fit(X_sparse,target)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=10, min_child_weight=1, missing=None, n_estimators=500,
       n_jobs=3, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=7, silent=True,
       subsample=1)

In [35]:
with open('../models/article_choice_vectorizer.pickle','wb') as f:
    pickle.dump(target_vect,f)

with open('../models/article_metaclassifier_xgboost.pickle','wb') as f:
    pickle.dump(xgb,f)   

In [2]:
with open('../models/article_metaclassifier_xgboost.pickle','rb') as f:
    xgb = pickle.load(f)

In [4]:
import xgboost

In [43]:
feature_names = list(X.drop(['Initial','ML_L1','LM'],axis=1).columns) + [x+'_init' for x in ['a','an','the','zero']] + \
                [x+'_lm' for x in ['a','an','the','zero']] + [x+'_ml' for x in ['a','an','the','zero']]
for name,imp in sorted(zip(feature_names,xgb.feature_importances_),key=lambda x: x[1],reverse=True):
    print(name,imp)

present 0.15547104
an 0.14435138
a 0.122264005
probs_delta 0.11086422
probs_ratio 0.08576665
lm_a 0.08327873
the 0.07097209
lm_an 0.06041269
lm_the 0.047175456
lm_zero 0.03832056
corr_prob 0.023184154
init_prob 0.018260548
the_init 0.007634675
zero_init 0.0071124025
a_init 0.0057687378
the_lm 0.0049520936
a_lm 0.0038363293
zero_lm 0.0038173376
an_init 0.0026968254
an_lm 0.0016475325
the_ml 0.0016332887
a_ml 0.00043206185
an_ml 0.00014243797
zero 4.747932e-06
zero_ml 0.0
