In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import accuracy_score, classification_report
import pickle

SEED = 7



In [14]:
with open('../../prepositions.txt','r',encoding='utf-8-sig') as f:
    full_options = f.read().split()
    full_options.append('zero')

In [15]:
full_data = pd.read_csv('prepositions_meta.csv',sep=';')

In [16]:
options = full_data.columns[2:-6]
options

Index(['about', 'along', 'among', 'around', 'as', 'at', 'beside', 'between',
       'by', 'down', 'during', 'except', 'for', 'from', 'in', 'inside', 'into',
       'of', 'off', 'on', 'onto', 'outside', 'over', 'through', 'towards',
       'under', 'until', 'up', 'upon', 'with', 'within', 'without'],
      dtype='object')

In [17]:
with open('../lm_preds_prepositions.json','r',encoding='utf-8') as f:
    probs = json.loads(f.read())

In [73]:
full_data['Ann'].apply(str.lower)

TypeError: descriptor 'lower' requires a 'str' object but received a 'float'

In [18]:
full_data = pd.concat((full_data,pd.DataFrame(probs,columns=['lm_'+x for x in full_options])),axis=1)
full_data = full_data.loc[pd.notnull(full_data['Ann']),:]
full_data.head()

Unnamed: 0,present,zero,about,along,among,around,as,at,beside,between,...,lm_towards,lm_under,lm_underneath,lm_until,lm_up,lm_upon,lm_with,lm_within,lm_without,lm_zero
0,0.05718,0.94282,0.000258,0.000388,0.001148,0.000244,0.115932,0.001321,0.000146,0.094416,...,-68.39262,-68.06999,-69.15108,-68.36011,-68.402176,-68.67732,-67.67051,-68.250374,-68.53549,-63.410843
1,0.000579,0.999421,0.029673,6.2e-05,0.000706,0.002724,0.213891,0.000282,8.4e-05,8.3e-05,...,-68.010284,-68.274506,-69.505936,-68.81487,-68.28006,-68.59552,-65.34287,-67.17113,-66.9102,-63.410843
2,0.998452,0.001548,0.001611,5.4e-05,0.02483,0.000388,0.001244,0.000699,6.4e-05,0.002067,...,-66.46163,-66.773155,-68.983604,-66.54698,-66.74667,-67.08113,-64.77977,-66.98881,-66.408394,-63.86743
3,0.537116,0.462884,0.000147,0.000286,5.4e-05,0.001057,0.000129,0.003057,0.000206,0.001394,...,-66.89788,-65.724556,-67.87192,-66.41504,-66.49582,-66.86648,-65.36411,-66.37533,-66.48254,-63.07747
4,0.989848,0.010152,2.8e-05,6.7e-05,6.7e-05,0.000131,0.000572,0.001795,8.6e-05,0.000201,...,-64.68084,-66.43234,-67.88293,-65.57243,-65.80447,-65.92004,-64.33803,-64.73782,-66.707306,-62.766747


In [19]:
probs_ratio = []
probs_delta = []
init_probs = []
corr_probs = []
lm_choice = []
for i in range(full_data.shape[0]):
    row = full_data.iloc[i]
    init_prob = row['lm_'+row['Initial']]
    corr_prob = row['lm_'+row['ML_L1']]
    init_probs.append(init_prob)
    corr_probs.append(corr_prob)
    probs_ratio.append(init_prob / corr_prob)
    probs_delta.append(init_prob - corr_prob)
    lm_choice.append(np.argmax(row[['lm_'+x for x in full_options]]).split('_')[1])
full_data['init_prob'] = init_probs
full_data['corr_prob'] = corr_probs
full_data['probs_ratio'] = probs_ratio
full_data['probs_delta'] = probs_delta
full_data['LM'] = lm_choice

In [20]:
full_data.head()

Unnamed: 0,present,zero,about,along,among,around,as,at,beside,between,...,lm_upon,lm_with,lm_within,lm_without,lm_zero,init_prob,corr_prob,probs_ratio,probs_delta,LM
0,0.05718,0.94282,0.000258,0.000388,0.001148,0.000244,0.115932,0.001321,0.000146,0.094416,...,-68.67732,-67.67051,-68.250374,-68.53549,-63.410843,-63.410843,-63.410843,1.0,0.0,zero
1,0.000579,0.999421,0.029673,6.2e-05,0.000706,0.002724,0.213891,0.000282,8.4e-05,8.3e-05,...,-68.59552,-65.34287,-67.17113,-66.9102,-63.410843,-63.410843,-63.410843,1.0,0.0,zero
2,0.998452,0.001548,0.001611,5.4e-05,0.02483,0.000388,0.001244,0.000699,6.4e-05,0.002067,...,-67.08113,-64.77977,-66.98881,-66.408394,-63.86743,-63.410843,-63.410843,1.0,0.0,of
3,0.537116,0.462884,0.000147,0.000286,5.4e-05,0.001057,0.000129,0.003057,0.000206,0.001394,...,-66.86648,-65.36411,-66.37533,-66.48254,-63.07747,-63.410843,-64.749596,0.979324,1.338753,zero
4,0.989848,0.010152,2.8e-05,6.7e-05,6.7e-05,0.000131,0.000572,0.001795,8.6e-05,0.000201,...,-65.92004,-64.33803,-64.73782,-66.707306,-62.766747,-63.410843,-63.410843,1.0,0.0,zero


In [21]:
data = full_data.loc[(full_data['Initial'] != full_data['ML_L1']) | (full_data['Initial'] != full_data['LM']),:]
data.head()

Unnamed: 0,present,zero,about,along,among,around,as,at,beside,between,...,lm_upon,lm_with,lm_within,lm_without,lm_zero,init_prob,corr_prob,probs_ratio,probs_delta,LM
3,0.537116,0.462884,0.000147,0.000286,5.4e-05,0.001057,0.000129,0.003057,0.000206,0.001394,...,-66.86648,-65.36411,-66.37533,-66.48254,-63.07747,-63.410843,-64.749596,0.979324,1.338753,zero
4,0.989848,0.010152,2.8e-05,6.7e-05,6.7e-05,0.000131,0.000572,0.001795,8.6e-05,0.000201,...,-65.92004,-64.33803,-64.73782,-66.707306,-62.766747,-63.410843,-63.410843,1.0,0.0,zero
10,0.052714,0.947286,7.2e-05,0.000189,0.00013,0.000877,0.000852,0.002919,0.000344,0.004622,...,-96.33144,-94.82908,-95.840294,-95.9475,-92.54243,-92.87582,-92.54243,1.003603,-0.33339,zero
14,0.056559,0.943441,3.8e-05,0.000351,0.010075,8.7e-05,0.053072,0.000106,0.000707,0.025298,...,-91.90727,-88.55996,-91.65863,-91.04385,-87.28125,-86.6646,-87.28125,0.992935,0.61665,in
17,0.993923,0.006077,6.6e-05,0.000467,0.000142,8.2e-05,0.001537,0.002713,0.000179,3.8e-05,...,-91.15803,-88.89931,-90.12658,-91.34325,-86.46425,-86.6646,-86.6646,1.0,0.0,zero


In [22]:
data.shape

(9383, 82)

In [76]:
data = data.loc[pd.notnull(data['Ann']),:]
data['Ann'] = data['Ann'].apply(str.lower)
target = data['Ann']
X = data.drop(['Ann','raw_NP','Start_idx','Sent_start_idx'],axis=1)

In [61]:
target_vect = CountVectorizer(token_pattern='.+')
target_vect.fit(full_options)
X_sparse = hstack((X.drop(['Initial','ML_L1','LM']+['lm_'+x for x in full_options],axis=1).to_sparse(),
                   target_vect.transform(X['Initial']),
                   target_vect.transform(X['LM']),
                   target_vect.transform(X['ML_L1'])))

In [62]:
X_sparse

<9378x149 sparse matrix of type '<class 'numpy.float64'>'
	with 381702 stored elements in COOrdinate format>

In [78]:
X_train, X_test, y_train, y_test = train_test_split(X_sparse, target, test_size=0.33, random_state=SEED)

In [79]:
xgb = XGBClassifier(seed=SEED)
xgb.fit(X_train,y_train)
x_pred = xgb.predict(X_test)
print(accuracy_score(y_test,x_pred))
print(classification_report(y_test,x_pred))

0.929886914378
             precision    recall  f1-score   support

      about       0.92      1.00      0.96       119
      among       0.88      0.92      0.90        24
     around       1.00      1.00      1.00         6
         as       0.98      1.00      0.99        61
         at       0.79      0.80      0.80        96
    besides       0.00      0.00      0.00         1
    between       0.96      0.87      0.91        30
         by       0.90      0.87      0.88        75
     during       1.00      0.86      0.93        36
     except       1.00      0.20      0.33         5
        for       0.90      0.96      0.93       276
       from       0.92      0.97      0.95       109
         in       0.94      0.94      0.94       868
       into       1.00      0.91      0.95        11
         of       0.96      0.95      0.96       616
         on       0.83      0.87      0.85       168
       over       1.00      0.95      0.97        20
    through       0.82      0.

  'precision', 'predicted', average, warn_for)


In [82]:
test_data = data.loc[y_test.index]
test_data['Preds'] = x_pred
not_same = test_data.loc[(y_test != test_data['ML_L1']) & (y_test != test_data['Initial']),['Initial','ML_L1','Preds','Ann']]

In [83]:
not_same[not_same['Preds'] == not_same['Ann']]

Unnamed: 0,Initial,ML_L1,Preds,Ann
18580,at,over,by,by


In [84]:
test_data.loc[(x_pred != test_data['ML_L1']) & (x_pred != test_data['Initial']),['Initial','ML_L1','Preds','Ann']]

Unnamed: 0,Initial,ML_L1,Preds,Ann
18580,at,over,by,by
29635,in,over,at,in
12215,except,of,in,except
13466,except,with,in,except
26502,for,of,in,for


In [38]:
forest = RandomForestClassifier(n_estimators=300,random_state=SEED)
forest.fit(X_train,y_train)
f_pred = forest.predict(X_test)
print(accuracy_score(y_test,f_pred))
print(classification_report(y_test,f_pred))

0.784954407295
             precision    recall  f1-score   support

          a       0.80      0.70      0.75       359
         an       0.82      0.66      0.73        77
        the       0.82      0.76      0.79       996
       zero       0.75      0.84      0.79      1200

avg / total       0.79      0.78      0.78      2632



In [85]:
logit = LogisticRegression(random_state=SEED)
logit.fit(X_train,y_train)
l_pred = logit.predict(X_test)
print(accuracy_score(y_test,l_pred))
print(classification_report(y_test,l_pred))

0.929563812601
             precision    recall  f1-score   support

      about       0.92      1.00      0.96       119
      among       0.88      0.92      0.90        24
     around       1.00      1.00      1.00         6
         as       0.98      1.00      0.99        61
         at       0.80      0.77      0.78        96
    besides       1.00      1.00      1.00         1
    between       0.96      0.87      0.91        30
         by       0.89      0.85      0.87        75
     during       1.00      0.86      0.93        36
     except       0.00      0.00      0.00         5
        for       0.90      0.96      0.93       276
       from       0.91      0.98      0.94       109
         in       0.94      0.95      0.94       868
       into       1.00      0.91      0.95        11
         of       0.96      0.95      0.96       616
         on       0.83      0.86      0.84       168
       over       1.00      0.95      0.97        20
    through       0.82      0.

  'precision', 'predicted', average, warn_for)


In [34]:
feats = ['present','zero','ml_a','ml_an','ml_the',
         'init_prob','corr_prob','probs_ratio','probs_delta',
        'init_a','init_an','init_the','init_zero','corr_a','lm_a','lm_an','lm_the','lm_zero','corr_an','corr_the','corr_zero']
imps = forest.feature_importances_
for f,i in sorted(zip(feats,imps),key=lambda x: x[1],reverse=True):
    print(f,'-',i)

init_the - 0.118495433265
init_zero - 0.0985645221145
init_a - 0.0823167659037
zero - 0.0805626130461
present - 0.0790257154328
probs_delta - 0.0710716376968
probs_ratio - 0.0697842861776
ml_a - 0.0692898836246
ml_the - 0.0649227469684
ml_an - 0.0641604327396
corr_prob - 0.0518712162127
init_prob - 0.051343238892
init_an - 0.0197780372385
corr_the - 0.0163153147678
corr_zero - 0.0156236370571
lm_an - 0.0125670240678
corr_a - 0.0121701481888
lm_the - 0.0115679073338
lm_zero - 0.00464117517574
lm_a - 0.00435974889176
corr_an - 0.00156851520452


In [68]:
L1_preds = X.loc[y_test.index,'ML_L1']
print(accuracy_score(y_test,L1_preds))
print(classification_report(y_test,L1_preds))

0.374223182924
             precision    recall  f1-score   support

          a       0.28      0.35      0.31       467
         an       0.20      0.22      0.21        82
        the       0.37      0.45      0.41      1458
       zero       0.43      0.33      0.37      1694

avg / total       0.38      0.37      0.37      3701



In [69]:
L1_preds = X.loc[y_test.index,'LM']
print(accuracy_score(y_test,L1_preds))
print(classification_report(y_test,L1_preds))

0.41664415023
             precision    recall  f1-score   support

          a       0.35      0.37      0.36       467
         an       0.37      0.49      0.42        82
        the       0.41      0.39      0.40      1458
       zero       0.44      0.45      0.45      1694

avg / total       0.42      0.42      0.42      3701



In [52]:
LM_preds = []
for row in X.loc[y_test.index,['init_prob','corr_prob','Initial','ML_L1']].itertuples():
    if row.init_prob > row.corr_prob:
        LM_preds.append(row.Initial)
    else:
        LM_preds.append(row.ML_L1)
print(accuracy_score(y_test,LM_preds))
print(classification_report(y_test,LM_preds))

0.565349544073
             precision    recall  f1-score   support

          a       0.54      0.48      0.51       359
         an       0.57      0.47      0.51        77
        the       0.53      0.48      0.51       996
       zero       0.59      0.67      0.63      1200

avg / total       0.56      0.57      0.56      2632



In [68]:
sum(L1_preds == X.loc[y_test.index,'Initial'])

9360

In [19]:
len(x_pred),sum(x_pred == X.loc[y_test.index,'Initial'])

(2632, 2546)

# Train and save the model

In [89]:
xgb = XGBClassifier(n_estimators=500,max_depth=10,seed=SEED)
xgb.fit(X_sparse,target)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=500, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=7, silent=True, subsample=1)

In [90]:
with open('../../models/preposition_choice_vectorizer.pickle','wb') as f:
    pickle.dump(target_vect,f)

with open('../../models/preposition_metaclassifier_xgboost.pickle','wb') as f:
    pickle.dump(xgb,f)   