In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import accuracy_score, classification_report
import pickle
import random

SEED = 7

In [2]:
with open('../prepositions.txt','r',encoding='utf-8-sig') as f:
    full_options = f.read().split()
    full_options.append('zero')

In [3]:
full_data = pd.read_csv('prepositions_meta.csv',sep=';',encoding='utf-8-sig')

In [4]:
full_data.drop(['hitherto','k442','na','opera','ta','too'],axis=1,inplace=True)

In [5]:
with open('../lm_preds_prepositions.json','r',encoding='utf-8') as f:
    probs = json.loads(f.read())

FileNotFoundError: [Errno 2] No such file or directory: '../lm_preds_prepositions.json'

In [6]:
#full_data = pd.concat((full_data,pd.DataFrame(probs,columns=['lm_'+x for x in full_options])),axis=1)
full_data = full_data.loc[pd.notnull(full_data['Ann']),:]
full_data.head()

Unnamed: 0,present,zero,about,among,as,at,between,by,during,for,...,lm_for,lm_from,lm_in,lm_into,lm_of,lm_on,lm_over,lm_to,lm_with,lm_zero
0,0.014882,0.9851183,0.021415,5.2e-05,0.126387,0.004439,8.2e-05,0.007522,0.002527,0.35627,...,-33.16922,-33.546665,-32.24034,-33.797226,-33.011402,-32.670826,-33.878326,-32.714996,-33.169025,-30.106878
1,1.0,1.122606e-07,0.000154,0.008797,0.002671,0.125364,0.003533,0.000589,4.8e-05,0.005333,...,-29.39925,-30.085175,-29.474987,-30.328856,-29.61325,-29.882133,-32.0092,-30.204685,-30.430601,-28.89728
2,0.000255,0.9997448,0.023409,0.001858,0.179208,0.007757,4.1e-05,0.009895,1.9e-05,0.084803,...,-33.102764,-35.083813,-32.90931,-35.41747,-34.301186,-34.139107,-33.785168,-33.646137,-34.023697,-30.106878
3,0.003864,0.9961361,0.003481,0.000654,0.776202,0.035943,0.000267,0.002419,0.004328,0.011206,...,-34.267464,-34.743233,-33.945534,-35.873238,-34.973675,-34.661076,-35.534275,-35.129456,-33.85262,-31.27317
4,0.999961,3.919356e-05,0.039227,7e-06,0.002289,0.001446,1.4e-05,0.003828,0.000415,0.045707,...,-33.030487,-34.685753,-33.085075,-36.02312,-31.27317,-34.841442,-35.72839,-34.39954,-32.722183,-32.975845


In [7]:
full_data.rename(columns={'lm_﻿about':'lm_about'}, inplace=True)
full_data.columns

Index(['present', 'zero', 'about', 'among', 'as', 'at', 'between', 'by',
       'during', 'for', 'from', 'in', 'into', 'of', 'on', 'over', 'to', 'with',
       'raw_NP', 'Start_idx', 'Sent_start_idx', 'Initial', 'ML_L1', 'Ann',
       'lm_about', 'lm_among', 'lm_as', 'lm_at', 'lm_between', 'lm_by',
       'lm_during', 'lm_for', 'lm_from', 'lm_in', 'lm_into', 'lm_of', 'lm_on',
       'lm_over', 'lm_to', 'lm_with', 'lm_zero'],
      dtype='object')

In [8]:
probs_ratio = []
probs_delta = []
init_probs = []
corr_probs = []
lm_choice = []
for i in range(full_data.shape[0]):
    row = full_data.iloc[i]
    init_prob = row['lm_'+row['Initial']]
    corr_prob = row['lm_'+row['ML_L1']]
    init_probs.append(init_prob)
    corr_probs.append(corr_prob)
    probs_ratio.append(init_prob / corr_prob)
    probs_delta.append(init_prob - corr_prob)
full_data['init_prob'] = init_probs
full_data['corr_prob'] = corr_probs
full_data['probs_ratio'] = probs_ratio
full_data['probs_delta'] = probs_delta
full_data['LM'] = full_data[['lm_'+x for x in full_options]].idxmax(1)
full_data['LM'] = full_data['LM'].apply(lambda x: x.split('_')[1])

In [9]:
full_data.head()

Unnamed: 0,present,zero,about,among,as,at,between,by,during,for,...,lm_on,lm_over,lm_to,lm_with,lm_zero,init_prob,corr_prob,probs_ratio,probs_delta,LM
0,0.014882,0.9851183,0.021415,5.2e-05,0.126387,0.004439,8.2e-05,0.007522,0.002527,0.35627,...,-32.670826,-33.878326,-32.714996,-33.169025,-30.106878,-30.106878,-30.106878,1.0,0.0,zero
1,1.0,1.122606e-07,0.000154,0.008797,0.002671,0.125364,0.003533,0.000589,4.8e-05,0.005333,...,-29.882133,-32.0092,-30.204685,-30.430601,-28.89728,-30.106878,-29.61325,1.016669,-0.493628,zero
2,0.000255,0.9997448,0.023409,0.001858,0.179208,0.007757,4.1e-05,0.009895,1.9e-05,0.084803,...,-34.139107,-33.785168,-33.646137,-34.023697,-30.106878,-30.106878,-30.106878,1.0,0.0,zero
3,0.003864,0.9961361,0.003481,0.000654,0.776202,0.035943,0.000267,0.002419,0.004328,0.011206,...,-34.661076,-35.534275,-35.129456,-33.85262,-31.27317,-31.27317,-31.27317,1.0,0.0,zero
4,0.999961,3.919356e-05,0.039227,7e-06,0.002289,0.001446,1.4e-05,0.003828,0.000415,0.045707,...,-34.841442,-35.72839,-34.39954,-32.722183,-32.975845,-31.27317,-31.27317,1.0,0.0,of


In [10]:
data = full_data.loc[(full_data['Initial'] != full_data['ML_L1']) | (full_data['Initial'] != full_data['LM']),:]
data.head()

Unnamed: 0,present,zero,about,among,as,at,between,by,during,for,...,lm_on,lm_over,lm_to,lm_with,lm_zero,init_prob,corr_prob,probs_ratio,probs_delta,LM
1,1.0,1.122606e-07,0.000154,0.008797,0.002671,0.125364,0.003533,0.000589,4.8e-05,0.005333,...,-29.882133,-32.0092,-30.204685,-30.430601,-28.89728,-30.106878,-29.61325,1.016669,-0.493628,zero
8,0.529766,0.4702339,0.044846,0.024327,0.018907,0.007044,0.002463,0.133144,0.001312,0.32664,...,-35.853508,-37.054478,-35.880882,-35.886555,-33.212723,-35.538296,-35.538296,1.0,0.0,zero
10,1.0,2.031407e-07,0.019044,0.029955,0.01679,0.005628,0.000302,0.142329,0.000458,0.098731,...,-35.932354,-37.301533,-35.684933,-34.92664,-35.7173,-35.538296,-34.92664,1.017513,-0.611656,with
22,1.0,9.03297e-08,0.001325,0.00274,0.009714,0.000945,0.412205,0.000606,2.1e-05,0.0131,...,-32.532894,-32.76442,-32.216457,-31.615618,-29.754082,-29.692871,-31.615618,0.939184,1.922747,between
28,1.0,9.144474e-09,0.015174,0.000318,0.000455,0.000447,0.002068,0.014984,0.000453,0.748496,...,-66.182304,-67.28685,-63.869583,-64.66651,-64.341675,-64.42835,-64.42835,1.0,0.0,to


In [32]:
random.seed(SEED)

corr_idx = data.loc[data['Initial'] != data['Ann']].index
notcorr_idx = random.sample(data.loc[data['Initial'] == data['Ann']].index.tolist(),len(corr_idx)*2)
data = data.loc[corr_idx.tolist()+notcorr_idx,:]

In [33]:
data.shape

(4149, 46)

In [34]:
data = data.loc[pd.notnull(data['Ann']),:]
target = data['Ann']
X = data.drop(['Ann','raw_NP','Start_idx','Sent_start_idx'],axis=1)

In [35]:
target_vect = CountVectorizer(token_pattern='.+')
target_vect.fit(full_options)
X_sparse = hstack((X.drop(['Initial','ML_L1','LM'],axis=1).to_sparse(),
                   target_vect.transform(X['Initial']),
                   target_vect.transform(X['LM']),
                   target_vect.transform(X['ML_L1'])))

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X_sparse, target, test_size=0.33, random_state=SEED)

In [None]:
xgb = XGBClassifier(seed=SEED,n_jobs=3)
xgb.fit(X_train,y_train)
x_pred = xgb.predict(X_test)
print(accuracy_score(y_test,x_pred))
print(classification_report(y_test,x_pred))

In [16]:
logit = LogisticRegression(random_state=SEED)
logit.fit(X_train,y_train)
l_pred = logit.predict(X_test)
print(accuracy_score(y_test,l_pred))
print(classification_report(y_test,l_pred))

0.9353693181818182
             precision    recall  f1-score   support

      about       0.96      0.97      0.96       299
      among       0.97      0.88      0.92        80
         as       0.99      1.00      0.99       240
         at       0.78      0.75      0.77       182
    between       0.91      0.93      0.92        57
         by       0.93      0.92      0.92       285
     during       0.99      0.87      0.93        86
        for       0.94      0.93      0.93       648
       from       0.94      0.96      0.95       272
         in       0.94      0.95      0.94      1535
       into       0.83      0.92      0.87        37
         of       0.95      0.97      0.96      1262
         on       0.78      0.84      0.81       313
       over       0.98      0.89      0.93        61
         to       0.92      0.88      0.90       485
       with       0.96      0.95      0.95       367
       zero       0.97      0.97      0.97       831

avg / total       0.94   

In [17]:
test_data = data.loc[y_test.index]

In [23]:
x_pred_prob = logit.predict_proba(X_test)
final_preds = []
for l1_prob,lm_prob,meta_prob in zip(test_data[full_options+['present']].values,
                                     test_data[['lm_'+x for x in full_options]].values,
                                    x_pred_prob):
    #print(l1_prob[:-1],meta_prob,lm_prob)
    l1_prob[:3] *= l1_prob[-1]
    lm_prob /= sum(lm_prob)
    final_preds.append(full_options[np.argmax(np.average((l1_prob[:-1],meta_prob,lm_prob),axis=0))])

In [24]:
test_data['Preds'] = l_pred
test_data['Final_preds'] = final_preds
corrected = test_data.loc[(y_test != test_data['Initial']),['Initial','ML_L1','LM','Preds','Final_preds','Ann']]
not_corrected = test_data.loc[(y_test == test_data['Initial']),['Initial','ML_L1','LM','Preds','Final_preds','Ann']]

In [25]:
not_corrected.shape

(6592, 6)

In [31]:
print(accuracy_score(corrected['Ann'],corrected['Preds']))
print(classification_report(corrected['Ann'],corrected['Preds']))

0.029017857142857144
             precision    recall  f1-score   support

      about       0.00      0.00      0.00        10
      among       0.00      0.00      0.00        10
         as       0.00      0.00      0.00         1
         at       0.18      0.14      0.16        50
    between       0.00      0.00      0.00         4
         by       0.05      0.04      0.04        24
     during       0.00      0.00      0.00        11
        for       0.00      0.00      0.00        44
       from       0.00      0.00      0.00        10
         in       0.04      0.05      0.05        83
       into       0.00      0.00      0.00         3
         of       0.00      0.00      0.00        43
         on       0.01      0.02      0.02        45
       over       0.00      0.00      0.00         7
         to       0.00      0.00      0.00        56
       with       0.00      0.00      0.00        19
       zero       0.00      0.00      0.00        28

avg / total       0.03 

In [97]:
corrected[(corrected['Preds'] == corrected['Ann']) & (corrected['Ann'] != corrected['ML_L1']) &
          (corrected['Ann'] != corrected['LM'])].shape

(3, 6)

In [106]:
forest = RandomForestClassifier(n_estimators=300,random_state=SEED)
forest.fit(X_train,y_train)
f_pred = forest.predict(X_test)
print(accuracy_score(y_test,f_pred))
print(classification_report(y_test,f_pred))

0.5284738041
             precision    recall  f1-score   support

      about       0.38      0.36      0.37        14
      among       0.00      0.00      0.00         5
         as       0.62      1.00      0.77         5
         at       0.54      0.44      0.48        32
    between       0.75      0.60      0.67         5
         by       0.50      0.33      0.40         9
     during       0.00      0.00      0.00         6
        for       0.49      0.50      0.49        42
       from       0.69      0.56      0.62        16
         in       0.47      0.79      0.59        99
       into       0.00      0.00      0.00         2
         of       0.58      0.61      0.59        67
         on       0.55      0.36      0.44        33
       over       0.00      0.00      0.00         3
         to       0.40      0.23      0.29        43
       with       0.71      0.50      0.59        20
       zero       0.72      0.55      0.63        38

avg / total       0.52      0.5

  'precision', 'predicted', average, warn_for)


In [82]:
feats = ['present','zero','ml_a','ml_an','ml_the',
         'init_prob','corr_prob','probs_ratio','probs_delta',
        'init_a','init_an','init_the','init_zero','corr_a','lm_a','lm_an','lm_the','lm_zero','corr_an','corr_the','corr_zero']
imps = forest.feature_importances_
for f,i in sorted(zip(feats,imps),key=lambda x: x[1],reverse=True):
    print(f,'-',i)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [68]:
L1_preds = X.loc[y_test.index,'ML_L1']
print(accuracy_score(y_test,L1_preds))
print(classification_report(y_test,L1_preds))

0.374223182924
             precision    recall  f1-score   support

          a       0.28      0.35      0.31       467
         an       0.20      0.22      0.21        82
        the       0.37      0.45      0.41      1458
       zero       0.43      0.33      0.37      1694

avg / total       0.38      0.37      0.37      3701



In [69]:
L1_preds = X.loc[y_test.index,'LM']
print(accuracy_score(y_test,L1_preds))
print(classification_report(y_test,L1_preds))

0.41664415023
             precision    recall  f1-score   support

          a       0.35      0.37      0.36       467
         an       0.37      0.49      0.42        82
        the       0.41      0.39      0.40      1458
       zero       0.44      0.45      0.45      1694

avg / total       0.42      0.42      0.42      3701



In [52]:
LM_preds = []
for row in X.loc[y_test.index,['init_prob','corr_prob','Initial','ML_L1']].itertuples():
    if row.init_prob > row.corr_prob:
        LM_preds.append(row.Initial)
    else:
        LM_preds.append(row.ML_L1)
print(accuracy_score(y_test,LM_preds))
print(classification_report(y_test,LM_preds))

0.565349544073
             precision    recall  f1-score   support

          a       0.54      0.48      0.51       359
         an       0.57      0.47      0.51        77
        the       0.53      0.48      0.51       996
       zero       0.59      0.67      0.63      1200

avg / total       0.56      0.57      0.56      2632



In [68]:
sum(L1_preds == X.loc[y_test.index,'Initial'])

9360

In [19]:
len(x_pred),sum(x_pred == X.loc[y_test.index,'Initial'])

(2632, 2546)

# Train and save the model

In [38]:
xgb = LogisticRegression(random_state=SEED)
xgb.fit(X_sparse,target)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=7, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [37]:
X_sparse.shape

(4149, 90)

In [39]:
with open('../models/preposition_choice_vectorizer.pickle','wb') as f:
    pickle.dump(target_vect,f)

with open('../models/preposition_metaclassifier_logistic.pickle','wb') as f:
    pickle.dump(xgb,f)   