In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import hstack, csr_matrix, lil_matrix
from sklearn.metrics import accuracy_score, classification_report
from copy import deepcopy
from string import punctuation
import pickle

# Single random seed shared by the train/test split and the estimators below.
SEED = 42
# ASCII punctuation extended with typographic quotes, an em dash and
# whitespace characters.
punct = set(punctuation).union({'‘','’','—',' ','\t','\n'})

In [2]:
# Load the preposition table (semicolon-separated, UTF-8 with BOM).
# keep_default_na=False stops pandas from turning its default NA markers
# (including empty cells) into NaN, so text columns stay plain strings.
data = pd.read_csv(
    '../preposition_table/shrinked_prepositions.csv',
    sep=';',
    encoding='utf-8-sig',
    keep_default_na=False,
)
data.head()

Unnamed: 0,NP,POS_tags,Head,Head_countability,Head_POS,hypernyms,higher_hypernyms,HHead,HHead_POS,HHead_rel,...,post_2,post_3,post_4,post_5,post_1_POS,post_2_POS,post_3_POS,post_4_POS,post_5_POS,Preposition
0,is aids,DT NN,aids,both,NN,immunodeficiency infectious_disease,disorder disease,,,,...,,,,,.,,,,,zero
1,aids,NN,aids,both,NN,immunodeficiency infectious_disease,disorder disease,condition,NN,nsubj,...,acquired,immune,deficiency,syndrome,-LRB-,VBN,NNP,NNP,NNP,zero
2,immune deficiency syndrome,NNP NNP NNP,syndrome,C,NNP,complex,concept,aids,NN,appos,...,is,a,condition,caused,-RRB-,VBZ,DT,NN,VBN,zero
3,a condition,DT NN,condition,C,NN,state,abstraction,be,VBZ,cop,...,by,a,virus,called,VBN,IN,DT,NN,VBN,zero
4,a virus,DT NN,virus,C,NN,infectious_agent microorganism,causal_agent living_thing,cause,VBN,obl,...,hiv,(,human,immuno,VBN,NNP,-LRB-,NNP,NNP,by


In [3]:
# Show the (rows, columns) dimensions of the loaded table.
data.shape

(5000000, 31)

In [7]:
# Pre-fitted vectorizers stored under ../models, listed in the order in which
# they are unpacked below.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# produced by this project.
vectorizer_paths = (
    '../models/one_word_vectorizer.pickle',
    '../models/pos_vectorizer.pickle',
    '../models/noun_hypernym_vectorizer.pickle',
    '../models/noun_higher_hypernym_vectorizer.pickle',
    '../models/countability_vectorizer.pickle',
    '../models/deprel_vectorizer.pickle',
)

loaded_vectorizers = []
for vectorizer_path in vectorizer_paths:
    with open(vectorizer_path,'rb') as f:
        loaded_vectorizers.append(pickle.load(f))

(onewordvect, pos_vect, hyp_vect,
 hhyp_vect, count_vect, deprel_vect) = loaded_vectorizers

In [4]:
# Persist the 'Preposition' column exactly as it is currently held in `data`;
# the model-fitting cells later read this file back as `target`.
with open('target.pickle', mode='wb') as target_file:
    pickle.dump(data['Preposition'], target_file)

In [5]:
# Binary presence labels: 'zero' stays 'zero', every other preposition
# becomes 'present'.
# Series.where returns a NEW Series (same index and name), so the
# 'Preposition' column of `data` is left untouched. The previous masked
# assignment on `data['Preposition']` could write through to the DataFrame
# (chained assignment), destroying the original labels on re-run.
target = data['Preposition'].where(data['Preposition'] == 'zero', 'present')

In [6]:
# Persist the binarised labels currently bound to `target`; the model-fitting
# cells later read this file back as `binary_target`.
with open('binary_target.pickle', mode='wb') as binary_target_file:
    pickle.dump(target, binary_target_file)

In [8]:
# Column names of the five-token context windows before / after the phrase.
context_positions = range(1,6)
prev_word_cols = ['prev_'+str(i) for i in context_positions]
post_word_cols = ['post_'+str(i) for i in context_positions]
prev_pos_cols = ['prev_'+str(i)+'_POS' for i in context_positions]
post_pos_cols = ['post_'+str(i)+'_POS' for i in context_positions]

# Bag of words over the whole noun phrase; this vectorizer is fitted here.
np_vect = CountVectorizer(token_pattern = '\\b\\w+\\b')
npm = np_vect.fit_transform(data['NP'])

# POS-tag features, all encoded with the same pre-fitted POS vectorizer.
pos = pos_vect.transform(data['POS_tags'])
head_pos = pos_vect.transform(data['Head_POS'])
hhead_pos = pos_vect.transform(data['HHead_POS'])
prevs_pos = hstack([pos_vect.transform(data[col]) for col in prev_pos_cols])
posts_pos = hstack([pos_vect.transform(data[col]) for col in post_pos_cols])

# Countability of the head noun.
countability = count_vect.transform(data['Head_countability'])

# Hypernyms and higher hypernyms of the head noun.
hyp = hyp_vect.transform(data['hypernyms'])
hhyp = hhyp_vect.transform(data['higher_hypernyms'])

# Dependency relation stored in the 'HHead_rel' column.
deprel = deprel_vect.transform(data['HHead_rel'])

# 'HHead' values; the '.+' token pattern takes each non-empty value as one
# token instead of splitting it into words. This vectorizer is fitted here.
hhead_vect = CountVectorizer(token_pattern='.+')
hhead = hhead_vect.fit_transform(data['HHead'])

# Word-level features: the head itself plus the surrounding context words,
# all encoded with the same pre-fitted single-word vectorizer.
head = onewordvect.transform(data['Head'])
prevs = hstack([onewordvect.transform(data[col]) for col in prev_word_cols])
posts = hstack([onewordvect.transform(data[col]) for col in post_word_cols])

In [9]:
# Cache every feature matrix in its own pickle so the expensive vectorisation
# does not have to be repeated after a kernel restart.
feature_dumps = [
    ('npm.pickle', npm),
    ('pos.pickle', pos),
    ('head.pickle', head),
    ('countability.pickle', countability),
    ('head_pos.pickle', head_pos),
    ('hyp.pickle', hyp),
    ('hhyp.pickle', hhyp),
    ('deprel.pickle', deprel),
    ('hhead.pickle', hhead),
    ('hhead_pos.pickle', hhead_pos),
    ('prevs.pickle', prevs),
    ('posts.pickle', posts),
    ('prevs_pos.pickle', prevs_pos),
    ('posts_pos.pickle', posts_pos),
]

for dump_path, feature_matrix in feature_dumps:
    with open(dump_path,'wb') as f:
        pickle.dump(feature_matrix,f)

In [10]:
# Save the two vectorizers that were fitted in this notebook next to the
# pre-fitted ones under ../models.
for model_path, fitted_vectorizer in (
    ('../models/hhead_vectorizer.pickle', hhead_vect),
    ('../models/extended_np_vectorizer.pickle', np_vect),
):
    with open(model_path,'wb') as f:
        pickle.dump(fitted_vectorizer,f)

# INTERMEDIATE STAGE

In [2]:
# Reload the cached feature matrices (written by the dump cell above) so the
# notebook can resume from here without re-running the vectorisation.
feature_paths = (
    'npm.pickle',
    'pos.pickle',
    'head.pickle',
    'countability.pickle',
    'head_pos.pickle',
    'hyp.pickle',
    'hhyp.pickle',
    'hhead.pickle',
    'hhead_pos.pickle',
    'deprel.pickle',
    'prevs.pickle',
    'posts.pickle',
    'prevs_pos.pickle',
    'posts_pos.pickle',
)

loaded_features = []
for feature_path in feature_paths:
    with open(feature_path,'rb') as f:
        loaded_features.append(pickle.load(f))

# Unpack in exactly the order of `feature_paths`.
(npm, pos, head, countability, head_pos, hyp, hhyp,
 hhead, hhead_pos, deprel, prevs, posts, prevs_pos, posts_pos) = loaded_features

In [3]:
# Concatenate all feature blocks side by side; the block order fixes the
# column layout of the final matrix.
feature_blocks = (npm, pos, head, countability, head_pos, hyp, hhyp, hhead, hhead_pos, deprel,
                  prevs, prevs_pos, posts, posts_pos)
data_sparse = hstack(feature_blocks).tocsr()

# Keep only the columns holding at least one non-zero value.
# TODO: need to remember what cols were omitted
_, nonzero_col_idx = data_sparse.nonzero()
nonzero_columns = np.unique(nonzero_col_idx)
data_sparse = data_sparse[:, nonzero_columns]

In [4]:
# Persist the assembled matrix together with the indices of the columns that
# were kept when the matrix was reduced to its non-zero columns.
for out_path, payload in (
    ('sparse_data.pickle', data_sparse),
    ('../models/preposition_nonzero_columns.pickle', nonzero_columns),
):
    with open(out_path,'wb') as f:
        pickle.dump(payload,f)

In [5]:
# Show the (rows, columns) dimensions of the assembled feature matrix.
data_sparse.shape

(5000000, 1165020)

# Preposition presence classifier & preposition type classifier

In [65]:
# Reload both label series from the pickles written earlier, so this cell
# does not depend on whatever `target` is bound to in the running kernel:
#   * `target` holds the labels saved to target.pickle; the cells below index
#     it to train and score the preposition-type model.
#   * `binary_target` holds the zero/present labels; the split is made on it
#     and stratified by it, so y_train / y_test are binary.
with open('target.pickle','rb') as f:
    target = pickle.load(f)

with open('binary_target.pickle','rb') as f:
    binary_target = pickle.load(f)

X_train, X_test, y_train, y_test = train_test_split(data_sparse, binary_target, test_size=0.33,
                                                    random_state=SEED, stratify=binary_target)

In [59]:
# First-stage model, trained on the training split against y_train.
# Hyper-parameters are the library defaults apart from the fixed seed.
logit = LogisticRegression(random_state=SEED)
logit.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [60]:
# Score the first-stage model on the held-out split.
pred_l = logit.predict(X_test)
first_stage_accuracy = accuracy_score(y_test, pred_l)
first_stage_report = classification_report(y_test, pred_l)
print(first_stage_accuracy)
print(first_stage_report)

0.948183041723
             precision    recall  f1-score   support

    present       0.96      0.92      0.94      6942
       zero       0.94      0.97      0.96      9404

avg / total       0.95      0.95      0.95     16346



In [61]:
# Second-stage model: trained only on the training rows whose y_train label
# is 'present'. Rows of the sparse matrix are selected by position, labels
# are looked up in `target` through the matching pandas index.
is_present_train = (y_train == 'present')
present_train_rows = np.where(is_present_train)[0]
present_train_labels = target[y_train[is_present_train].index]

logit_pres = LogisticRegression(random_state=SEED)
logit_pres.fit(X_train[present_train_rows,:], present_train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [62]:
# Run the second-stage model only on the test rows that the first stage
# predicted as 'present', and score it against the labels in `target`.
predicted_present = (pred_l == 'present')
pred_l_pres = logit_pres.predict(X_test[np.where(predicted_present)[0],:])

true_labels_for_present = target[y_test[predicted_present].index]
print(accuracy_score(true_labels_for_present, pred_l_pres))
print(classification_report(true_labels_for_present, pred_l_pres))

0.564988730278
             precision    recall  f1-score   support

      about       0.46      0.14      0.21        81
      among       0.00      0.00      0.00        15
         as       0.62      0.56      0.58       257
         at       0.47      0.33      0.39       226
    between       0.33      0.03      0.06        31
         by       0.46      0.54      0.50       283
     during       0.00      0.00      0.00        27
        for       0.46      0.38      0.42       580
       from       0.52      0.20      0.29       228
         in       0.52      0.62      0.57      1190
       into       0.39      0.16      0.23        73
         of       0.67      0.91      0.77      2058
         on       0.51      0.37      0.43       366
       over       0.38      0.11      0.17        55
         to       0.44      0.47      0.45       558
       with       0.43      0.29      0.34       347
       zero       0.00      0.00      0.00       280

avg / total       0.52      0

  'precision', 'predicted', average, warn_for)


In [63]:
# Replace every first-stage 'present' prediction with the second-stage label,
# turning pred_l into the combined two-stage prediction.
# NOTE: pred_l is modified in place; re-run the prediction cells above before
# running this cell a second time.
present_slots = (pred_l == 'present')
pred_l[present_slots] = pred_l_pres

In [64]:
# Score the combined two-stage prediction against the labels in `target`
# for the test rows.
true_test_labels = target[y_test.index]
print(accuracy_score(true_test_labels, pred_l))
print(classification_report(true_test_labels, pred_l))

0.788205065459
             precision    recall  f1-score   support

      about       0.46      0.12      0.19        89
      among       0.00      0.00      0.00        17
         as       0.62      0.43      0.51       330
         at       0.47      0.28      0.36       260
    between       0.33      0.03      0.06        31
         by       0.46      0.47      0.47       322
     during       0.00      0.00      0.00        31
        for       0.46      0.36      0.40       613
       from       0.52      0.18      0.27       246
         in       0.52      0.55      0.54      1325
       into       0.39      0.16      0.23        74
         of       0.67      0.89      0.76      2099
         on       0.51      0.34      0.41       405
       over       0.38      0.08      0.13        74
         to       0.44      0.41      0.43       636
       with       0.43      0.25      0.32       390
       zero       0.94      0.97      0.96      9404

avg / total       0.77      0

  'precision', 'predicted', average, warn_for)


# Fit and save models

In [2]:
# Reload the feature matrix and both label series from the pickles written
# earlier in the notebook, in the order they are unpacked below.
training_inputs = []
for input_path in ('sparse_data.pickle', 'target.pickle', 'binary_target.pickle'):
    with open(input_path,'rb') as f:
        training_inputs.append(pickle.load(f))

data_sparse, target, binary_target = training_inputs

In [None]:
# Final first-stage model: fitted on the full feature matrix (no hold-out)
# against the binary labels.
logit = LogisticRegression(random_state=SEED)
logit.fit(X=data_sparse, y=binary_target)

In [4]:
# Final second-stage model: fitted on every row labelled 'present' in
# `binary_target`, with the corresponding labels taken from `target`.
is_present = (binary_target == 'present')
present_rows = np.where(is_present)[0]
present_labels = target[binary_target[is_present].index]

logit_pres = LogisticRegression(random_state=SEED)
logit_pres.fit(data_sparse[present_rows,:], present_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [5]:
# Save both fitted models under ../models.
for model_path, fitted_model in (
    ('../models/preposition_logit_binary.pickle', logit),
    ('../models/preposition_logit_type.pickle', logit_pres),
):
    with open(model_path,'wb') as f:
        pickle.dump(fitted_model,f)

# Just in case: list of estimators that support predict_proba

In [27]:
# `all_estimators` is exposed as `sklearn.utils.all_estimators`; the
# `sklearn.utils.testing` module it used to be imported from was deprecated
# and later removed. Fall back to the old location on old releases only.
try:
    from sklearn.utils import all_estimators
except ImportError:
    from sklearn.utils.testing import all_estimators

# (name, class) pairs for every estimator scikit-learn can discover.
estimators = all_estimators()

# Print the name of each estimator class that exposes predict_proba.
for name, class_ in estimators:
    if hasattr(class_, 'predict_proba'):
        print(name)



AdaBoostClassifier
BaggingClassifier
BayesianGaussianMixture
BernoulliNB
CalibratedClassifierCV
DPGMM
DecisionTreeClassifier
ExtraTreeClassifier
ExtraTreesClassifier
GMM
GaussianMixture
GaussianNB
GaussianProcessClassifier
GradientBoostingClassifier
KNeighborsClassifier
LDA
LabelPropagation
LabelSpreading
LinearDiscriminantAnalysis
LogisticRegression
LogisticRegressionCV
MLPClassifier
MultinomialNB
NuSVC
QDA
QuadraticDiscriminantAnalysis
RandomForestClassifier
SGDClassifier
SVC
VBGMM
_BinaryGaussianProcessClassifierLaplace
_ConstantPredictor
_DPGMMBase
_GMMBase
_LDA
_QDA
