In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import accuracy_score, classification_report
import pickle

SEED = 7



In [2]:
full_data = pd.read_csv('articles_meta.csv',sep=';',header=None)
full_data.columns = ['present','zero','a','an','the','raw_NP','Start_idx','Sent_start_idx','Initial','ML_L1','Ann']

In [3]:
with open('../lm_preds.json','r',encoding='utf-8') as f:
    probs = json.loads(f.read())

In [4]:
full_data = pd.concat((full_data,pd.DataFrame(probs,columns=['lm_a','lm_an','lm_the','lm_zero'])),axis=1)
full_data.head()

Unnamed: 0,present,zero,a,an,the,raw_NP,Start_idx,Sent_start_idx,Initial,ML_L1,Ann,lm_a,lm_an,lm_the,lm_zero
0,0.561481,0.438519,0.159896,0.000604,0.8395,diagram,6,0,zero,the,zero,-54.438694,-56.489075,-53.800022,-54.50099
1,0.931808,0.068192,0.466231,0.000759,0.533009,the proportion,24,0,the,the,the,-54.752853,-58.973408,-54.50099,-54.94257
2,0.922656,0.077344,0.053483,0.000401,0.946116,population,42,0,zero,the,the,-55.030296,-59.175804,-51.809734,-54.50099
3,0.005603,0.994397,0.00375,0.000106,0.996144,Japan,73,0,zero,zero,zero,-57.616066,-59.286823,-57.695892,-54.50099
4,0.00901,0.99099,0.229016,0.000554,0.77043,Sweden,80,0,zero,zero,zero,-61.82524,-63.025627,-60.32564,-54.50099


In [5]:
probs_ratio = []
probs_delta = []
init_probs = []
corr_probs = []
lm_choice = []
for i in range(full_data.shape[0]):
    row = full_data.iloc[i]
    init_prob = row['lm_'+row['Initial']]
    corr_prob = row['lm_'+row['ML_L1']]
    init_probs.append(init_prob)
    corr_probs.append(corr_prob)
    probs_ratio.append(init_prob / corr_prob)
    probs_delta.append(init_prob - corr_prob)
    lm_choice.append(np.argmax(row[['lm_a','lm_an','lm_the','lm_zero']]).split('_')[1])
full_data['init_prob'] = init_probs
full_data['corr_prob'] = corr_probs
full_data['probs_ratio'] = probs_ratio
full_data['probs_delta'] = probs_delta
full_data['LM'] = lm_choice

In [6]:
full_data.head()

Unnamed: 0,present,zero,a,an,the,raw_NP,Start_idx,Sent_start_idx,Initial,ML_L1,Ann,lm_a,lm_an,lm_the,lm_zero,init_prob,corr_prob,probs_ratio,probs_delta,LM
0,0.561481,0.438519,0.159896,0.000604,0.8395,diagram,6,0,zero,the,zero,-54.438694,-56.489075,-53.800022,-54.50099,-54.50099,-53.800022,1.013029,-0.700968,the
1,0.931808,0.068192,0.466231,0.000759,0.533009,the proportion,24,0,the,the,the,-54.752853,-58.973408,-54.50099,-54.94257,-54.50099,-54.50099,1.0,0.0,the
2,0.922656,0.077344,0.053483,0.000401,0.946116,population,42,0,zero,the,the,-55.030296,-59.175804,-51.809734,-54.50099,-54.50099,-51.809734,1.051945,-2.691256,the
3,0.005603,0.994397,0.00375,0.000106,0.996144,Japan,73,0,zero,zero,zero,-57.616066,-59.286823,-57.695892,-54.50099,-54.50099,-54.50099,1.0,0.0,zero
4,0.00901,0.99099,0.229016,0.000554,0.77043,Sweden,80,0,zero,zero,zero,-61.82524,-63.025627,-60.32564,-54.50099,-54.50099,-54.50099,1.0,0.0,zero


In [7]:
data = full_data.loc[(full_data['Initial'] != full_data['ML_L1']) | (full_data['Initial'] != full_data['LM']),:]
data.head()

Unnamed: 0,present,zero,a,an,the,raw_NP,Start_idx,Sent_start_idx,Initial,ML_L1,Ann,lm_a,lm_an,lm_the,lm_zero,init_prob,corr_prob,probs_ratio,probs_delta,LM
0,0.561481,0.438519,0.159896,0.000604,0.8395,diagram,6,0,zero,the,zero,-54.438694,-56.489075,-53.800022,-54.50099,-54.50099,-53.800022,1.013029,-0.700968,the
2,0.922656,0.077344,0.053483,0.000401,0.946116,population,42,0,zero,the,the,-55.030296,-59.175804,-51.809734,-54.50099,-54.50099,-51.809734,1.051945,-2.691256,the
6,0.221908,0.778092,0.001581,4e-06,0.998415,the countries,10,129,the,zero,the,-106.542984,-107.16292,-101.66037,-103.46805,-101.66037,-103.46805,0.982529,1.80768,the
7,0.479154,0.520846,0.293543,0.000239,0.706218,the proportion,25,129,the,zero,the,-104.42146,-107.19301,-101.66037,-104.30438,-101.66037,-104.30438,0.974651,2.64401,the
10,0.929846,0.070154,0.013446,0.153751,0.832803,an exception,97,129,an,the,an,-107.13663,-101.66037,-104.450485,-104.91199,-101.66037,-104.450485,0.973288,2.790115,an


In [8]:
data.shape

(11213, 20)

In [9]:
target = data['Ann']
X = data.drop(['Ann','raw_NP','Start_idx','Sent_start_idx'],axis=1)

In [10]:
target_vect = CountVectorizer(token_pattern='.+')
target_vect.fit(['a','an','the','zero'])
X_sparse = hstack((X.drop(['Initial','ML_L1','LM','lm_a','lm_an','lm_the','lm_zero'],axis=1).to_sparse(),
                   target_vect.transform(X['Initial']),
                   target_vect.transform(X['LM']),
                   target_vect.transform(X['ML_L1'])))

In [11]:
X_sparse.shape

(11213, 21)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_sparse, target, test_size=0.33, random_state=SEED)

In [80]:
xgb = XGBClassifier(seed=SEED)
xgb.fit(X_train,y_train)
x_pred = xgb.predict(X_test)
print(accuracy_score(y_test,x_pred))
print(classification_report(y_test,x_pred))

0.819508241016
             precision    recall  f1-score   support

          a       0.84      0.70      0.77       467
         an       0.87      0.57      0.69        82
        the       0.88      0.78      0.83      1458
       zero       0.77      0.90      0.83      1694

avg / total       0.83      0.82      0.82      3701



In [47]:
test_data = data.loc[y_test.index]
test_data['Preds'] = x_pred
not_same = test_data.loc[(y_test != test_data['ML_L1']) & (y_test != test_data['Initial']),['Initial','ML_L1','Preds','Ann']]

In [48]:
not_same[not_same['Preds'] == not_same['Ann']]

Unnamed: 0,Initial,ML_L1,Preds,Ann
945,the,zero,a,a
16330,a,the,an,an


In [49]:
test_data.loc[(x_pred != test_data['ML_L1']) & (x_pred != test_data['Initial']),['Initial','ML_L1','Preds','Ann']]

Unnamed: 0,Initial,ML_L1,Preds,Ann
22210,zero,a,the,a
22863,the,zero,a,the
945,the,zero,a,a
5021,zero,a,the,zero
16330,a,the,an,an


In [38]:
forest = RandomForestClassifier(n_estimators=300,random_state=SEED)
forest.fit(X_train,y_train)
f_pred = forest.predict(X_test)
print(accuracy_score(y_test,f_pred))
print(classification_report(y_test,f_pred))

0.784954407295
             precision    recall  f1-score   support

          a       0.80      0.70      0.75       359
         an       0.82      0.66      0.73        77
        the       0.82      0.76      0.79       996
       zero       0.75      0.84      0.79      1200

avg / total       0.79      0.78      0.78      2632



In [37]:
logit = LogisticRegression(random_state=SEED)
logit.fit(X_train,y_train)
l_pred = logit.predict(X_test)
print(accuracy_score(y_test,l_pred))
print(classification_report(y_test,l_pred))

0.790273556231
             precision    recall  f1-score   support

          a       0.82      0.68      0.75       359
         an       0.80      0.64      0.71        77
        the       0.87      0.73      0.79       996
       zero       0.74      0.88      0.80      1200

avg / total       0.80      0.79      0.79      2632



In [34]:
feats = ['present','zero','ml_a','ml_an','ml_the',
         'init_prob','corr_prob','probs_ratio','probs_delta',
        'init_a','init_an','init_the','init_zero','corr_a','lm_a','lm_an','lm_the','lm_zero','corr_an','corr_the','corr_zero']
imps = forest.feature_importances_
for f,i in sorted(zip(feats,imps),key=lambda x: x[1],reverse=True):
    print(f,'-',i)

init_the - 0.118495433265
init_zero - 0.0985645221145
init_a - 0.0823167659037
zero - 0.0805626130461
present - 0.0790257154328
probs_delta - 0.0710716376968
probs_ratio - 0.0697842861776
ml_a - 0.0692898836246
ml_the - 0.0649227469684
ml_an - 0.0641604327396
corr_prob - 0.0518712162127
init_prob - 0.051343238892
init_an - 0.0197780372385
corr_the - 0.0163153147678
corr_zero - 0.0156236370571
lm_an - 0.0125670240678
corr_a - 0.0121701481888
lm_the - 0.0115679073338
lm_zero - 0.00464117517574
lm_a - 0.00435974889176
corr_an - 0.00156851520452


In [68]:
L1_preds = X.loc[y_test.index,'ML_L1']
print(accuracy_score(y_test,L1_preds))
print(classification_report(y_test,L1_preds))

0.374223182924
             precision    recall  f1-score   support

          a       0.28      0.35      0.31       467
         an       0.20      0.22      0.21        82
        the       0.37      0.45      0.41      1458
       zero       0.43      0.33      0.37      1694

avg / total       0.38      0.37      0.37      3701



In [69]:
L1_preds = X.loc[y_test.index,'LM']
print(accuracy_score(y_test,L1_preds))
print(classification_report(y_test,L1_preds))

0.41664415023
             precision    recall  f1-score   support

          a       0.35      0.37      0.36       467
         an       0.37      0.49      0.42        82
        the       0.41      0.39      0.40      1458
       zero       0.44      0.45      0.45      1694

avg / total       0.42      0.42      0.42      3701



In [52]:
LM_preds = []
for row in X.loc[y_test.index,['init_prob','corr_prob','Initial','ML_L1']].itertuples():
    if row.init_prob > row.corr_prob:
        LM_preds.append(row.Initial)
    else:
        LM_preds.append(row.ML_L1)
print(accuracy_score(y_test,LM_preds))
print(classification_report(y_test,LM_preds))

0.565349544073
             precision    recall  f1-score   support

          a       0.54      0.48      0.51       359
         an       0.57      0.47      0.51        77
        the       0.53      0.48      0.51       996
       zero       0.59      0.67      0.63      1200

avg / total       0.56      0.57      0.56      2632



In [68]:
sum(L1_preds == X.loc[y_test.index,'Initial'])

9360

In [19]:
len(x_pred),sum(x_pred == X.loc[y_test.index,'Initial'])

(2632, 2546)

# Train and save the model

In [13]:
xgb = XGBClassifier(n_estimators=500,max_depth=10,seed=SEED)
xgb.fit(X_sparse,target)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=500, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=7, silent=True, subsample=1)

In [14]:
with open('../../models/article_choice_vectorizer.pickle','wb') as f:
    pickle.dump(target_vect,f)

with open('../../models/article_metaclassifier_xgboost.pickle','wb') as f:
    pickle.dump(xgb,f)   