In [59]:
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import accuracy_score, classification_report
import pickle

# Random seed passed as random_state to train_test_split and to both
# RandomForestClassifier instances in this notebook.
SEED = 7

In [5]:
# The metadata file is semicolon-separated and has no header row, so the
# column labels are supplied here and attached after loading.
META_COLUMNS = [
    'present', 'zero', 'a', 'an', 'the',
    'raw_NP', 'Start_idx', 'Sent_start_idx',
    'Initial', 'ML_L1', 'Ann',
]
data = pd.read_csv('articles_meta.csv', header=None, sep=';')
data.columns = META_COLUMNS

In [9]:
# Show the first rows to check that the assigned column labels line up with the values.
data.head()

Unnamed: 0,present,zero,a,an,the,raw_NP,Start_idx,Sent_start_idx,Initial,ML_L1,Ann
0,0.561481,0.438519,0.159896,0.000604,0.8395,diagram,6,0,zero,the,zero
1,0.931808,0.068192,0.466231,0.000759,0.533009,the proportion,24,0,the,the,the
2,0.922656,0.077344,0.053483,0.000401,0.946116,population,42,0,zero,the,the
3,0.005603,0.994397,0.00375,0.000106,0.996144,Japan,73,0,zero,zero,zero
4,0.00901,0.99099,0.229016,0.000554,0.77043,Sweden,80,0,zero,zero,zero


In [23]:
# Load the language-model scores; the cells below index the result by integer
# position, so the JSON document is expected to be an array.
with open('../lm_preds.json', mode='r', encoding='utf-8') as lm_file:
    probs = json.load(lm_file)

In [26]:
# Turn the flat LM score list into four per-row feature columns on `data`.
#
# `probs` is consumed as consecutive pairs: the element at each even position
# goes to init_prob, the following element to corr_prob. Judging by the names,
# these are the LM scores of the initial and the corrected article choice
# (TODO confirm against the script that wrote lm_preds.json).

# Fail fast with a readable message instead of an IndexError on the last pair.
if len(probs) % 2 != 0:
    raise ValueError(
        'probs must contain (initial, corrected) pairs, '
        'but has odd length %d' % len(probs))

init_prob = probs[0::2]
# A None second element falls back to the first one, which makes
# probs_ratio == 1 and probs_delta == 0 for that row.
corr_prob = [first if second is None else second
             for first, second in zip(init_prob, probs[1::2])]

# Check alignment before touching `data`, so a mismatch leaves the frame unmodified.
if len(init_prob) != len(data):
    raise ValueError(
        'got %d probability pairs for %d rows of data'
        % (len(init_prob), len(data)))

data['init_prob'] = init_prob
data['corr_prob'] = corr_prob
# NOTE(review): a corr_prob of 0 produces inf/NaN in the ratio; not guarded here.
data['probs_ratio'] = data['init_prob'] / data['corr_prob']
data['probs_delta'] = data['init_prob'] - data['corr_prob']

In [28]:
# 'Ann' is the prediction target; it and the three columns below are kept
# out of the feature frame X.
NON_FEATURE_COLUMNS = ['Ann', 'raw_NP', 'Start_idx', 'Sent_start_idx']
X = data.drop(NON_FEATURE_COLUMNS, axis=1)
target = data['Ann']

In [55]:
# Encode the two categorical columns with a vectorizer whose vocabulary is
# fixed to the four article labels. token_pattern='.+' makes each label
# string a single token, so every row gets exactly one non-zero indicator
# per categorical column.
target_vect = CountVectorizer(token_pattern='.+')
target_vect.fit(['a','an','the','zero'])

# Numeric columns of X (everything except the two categorical ones).
# DataFrame.to_sparse() was removed in pandas 1.0; csr_matrix over the
# underlying array gives the same values and works on old and new pandas.
numeric_feats = csr_matrix(X.drop(['Initial','ML_L1'],axis=1).values.astype(float))

# Final column layout: numeric columns, then encoded 'Initial', then encoded 'ML_L1'.
X_sparse = hstack((numeric_feats,
                   target_vect.transform(X['Initial']),
                   target_vect.transform(X['ML_L1'])))

In [56]:
# (rows, columns) of the assembled feature matrix.
X_sparse.shape

(36196, 17)

In [57]:
# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split identical across reruns.
X_train, X_test, y_train, y_test = train_test_split(
    X_sparse,
    target,
    random_state=SEED,
    test_size=0.33,
)

In [58]:
# Fit a 100-tree random forest on the training split, then print accuracy
# and the per-class report for the held-out split.
forest = RandomForestClassifier(random_state=SEED, n_estimators=100).fit(X_train, y_train)
f_pred = forest.predict(X_test)
for report in (accuracy_score(y_test, f_pred), classification_report(y_test, f_pred)):
    print(report)

0.929845123483
             precision    recall  f1-score   support

          a       0.89      0.77      0.82       728
         an       0.90      0.61      0.73       124
        the       0.92      0.83      0.88      2609
       zero       0.93      0.98      0.96      8484

avg / total       0.93      0.93      0.93     11945



In [53]:
# Print the forest's feature importances, largest first.
#
# The names are hand-written and must follow the column order of the matrix
# the forest was fitted on: the numeric columns, then the encoded 'Initial'
# column (init_*), then the encoded 'ML_L1' column (corr_*). The a/an/the/zero
# order within each group assumes the vectorizer sorts its vocabulary
# alphabetically - verify with target_vect.vocabulary_ if in doubt.
feats = ['present','zero','ml_a','ml_an','ml_the','init_prob','corr_prob','probs_ratio','probs_delta',
        'init_a','init_an','init_the','init_zero','corr_a','corr_an','corr_the','corr_zero']
imps = forest.feature_importances_

# zip() stops at the shorter input, so a stale model or an outdated name list
# would silently drop features from the listing; refuse to print instead.
if len(feats) != len(imps):
    raise ValueError(
        'feature name list has %d entries but the model reports %d importances; '
        'refit the model or update feats' % (len(feats), len(imps)))

for name, importance in sorted(zip(feats, imps), key=lambda pair: pair[1], reverse=True):
    print(name,'-',importance)

ml_an - 0.189059680368
ml_a - 0.13254321983
zero - 0.108217848917
probs_delta - 0.106238986089
ml_the - 0.10232936279
probs_ratio - 0.0973938222737
present - 0.093272680513
init_prob - 0.0362135389829
corr_prob - 0.0354184590302
init_a - 0.0286558170355
init_an - 0.0264137493171
init_zero - 0.0233374925419
init_the - 0.0209053423114


In [39]:
# Baseline: score the raw 'ML_L1' column against the labels on the same
# held-out rows the forest was evaluated on.
L1_preds = X['ML_L1'].loc[y_test.index]
l1_accuracy = accuracy_score(y_test, L1_preds)
l1_report = classification_report(y_test, L1_preds)
print(l1_accuracy)
print(l1_report)

0.799162829636
             precision    recall  f1-score   support

          a       0.50      0.59      0.54       728
         an       0.43      0.36      0.39       124
        the       0.61      0.65      0.63      2609
       zero       0.90      0.87      0.88      8484

avg / total       0.81      0.80      0.80     11945



In [40]:
# Baseline driven by the LM scores alone: keep 'Initial' when init_prob is
# strictly greater than corr_prob, otherwise take 'ML_L1'.
held_out = X.loc[y_test.index, ['init_prob', 'corr_prob', 'Initial', 'ML_L1']]
LM_preds = [
    initial if p_init > p_corr else corrected
    for p_init, p_corr, initial, corrected in zip(
        held_out['init_prob'], held_out['corr_prob'],
        held_out['Initial'], held_out['ML_L1'])
]
print(accuracy_score(y_test, LM_preds))
print(classification_report(y_test, LM_preds))

0.886479698619
             precision    recall  f1-score   support

          a       0.76      0.67      0.71       728
         an       0.69      0.55      0.61       124
        the       0.81      0.75      0.78      2609
       zero       0.92      0.95      0.93      8484

avg / total       0.88      0.89      0.88     11945



In [68]:
# Number of held-out rows where 'ML_L1' equals 'Initial'.
(L1_preds == X['Initial'].loc[y_test.index]).sum()

9360

In [66]:
# Number of held-out rows where the label equals 'Initial'.
(y_test == X['Initial'].loc[y_test.index]).sum()

11090

# Train the final model on the full dataset and save it

In [60]:
# Refit the same forest configuration on every row (no hold-out). This
# rebinds `forest`, replacing the model evaluated earlier.
forest = RandomForestClassifier(random_state=SEED, n_estimators=100).fit(X_sparse, target)
forest

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=7, verbose=0, warm_start=False)

In [63]:
# Persist the fitted vectorizer and the forest, one pickle file each.
artifacts = (
    ('../../models/article_choice_vectorizer.pickle', target_vect),
    ('../../models/article_metaclassifier_forest.pickle', forest),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, 'wb') as out_file:
        pickle.dump(artifact, out_file)