In [1]:
import os
import pickle
from copy import deepcopy
from string import punctuation

import numpy as np
import pandas as pd
from gensim.models import Word2Vec, KeyedVectors
from scipy.sparse import hstack, csr_matrix, lil_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# Single random seed shared by the split and every estimator below.
SEED = 42
# ASCII punctuation plus typographic quotes, the em dash and whitespace characters.
punct = set(punctuation).union({'‘', '’', '—', ' ', '\t', '\n'})



In [2]:
# Location of the pre-trained word2vec binary. The original author's path is
# kept as the default so existing runs behave exactly as before; set the
# WORD2VEC_PATH environment variable to run the notebook on another machine.
WORD2VEC_PATH = os.environ.get(
    "WORD2VEC_PATH",
    "C:/Users/PC1/Desktop/python/деплом/deplom/constructions/GoogleNews-vectors-negative300.bin.gz",
)

# binary=True: the file is read in the word2vec binary format.
model = KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

In [3]:
# Read the semicolon-separated feature table ('utf-8-sig' strips a leading BOM)
# and replace every missing cell with an empty string in one chained expression.
data = pd.read_csv('prepositions.csv', delimiter=';', encoding='utf-8-sig').fillna('')
data.head()

Unnamed: 0,Sentence,raw_NP,NP,Start_idx,Sent_start_idx,POS_tags,Head,Head_countability,Head_POS,hypernyms,...,post_2,post_3,post_4,post_5,post_1_POS,post_2_POS,post_3_POS,post_4_POS,post_5_POS,Preposition
0,ageism,ageism,ageism,0,0,NN,ageism,,NN,discrimination,...,,,,,,,,,,zero
1,the foundation of age discrimination,the foundation,the foundation,0,8,DT NN,foundation,both,NN,relation,...,age,discrimination,,,IN,NN,NN,,,zero
2,the foundation of age discrimination,of age discrimination,age discrimination,15,8,NN NN,discrimination,U,NN,social_control,...,,,,,,,,,,of
3,steve scrutton,steve scrutton,steve scrutton,0,45,NNP NNP,scrutton,,NNP,,...,,,,,,,,,,zero
4,Steve Scrutton is a social work manager in Nor...,Steve Scrutton,steve scrutton,0,60,NNP NNP,scrutton,,NNP,,...,a,social,work,manager,VBZ,DT,JJ,NN,NN,zero


In [4]:
# Display the column labels of the loaded frame.
data.columns

Index(['Sentence', 'raw_NP', 'NP', 'Start_idx', 'Sent_start_idx', 'POS_tags',
       'Head', 'Head_countability', 'Head_POS', 'hypernyms',
       'higher_hypernyms', 'HHead', 'HHead_POS', 'HHead_rel', 'prev_5',
       'prev_4', 'prev_3', 'prev_2', 'prev_1', 'prev_5_POS', 'prev_4_POS',
       'prev_3_POS', 'prev_2_POS', 'prev_1_POS', 'post_1', 'post_2', 'post_3',
       'post_4', 'post_5', 'post_1_POS', 'post_2_POS', 'post_3_POS',
       'post_4_POS', 'post_5_POS', 'Preposition'],
      dtype='object')

In [5]:
# Width of each embedding row (the original code hard-coded 300 in both loops).
EMBEDDING_DIM = 300


def embed_words(words, keyed_vectors, dim=EMBEDDING_DIM):
    """Build a (len(words), dim) sparse matrix of word embeddings.

    Row i holds ``keyed_vectors[words[i]]`` when the word is present in
    ``keyed_vectors``; rows for unknown words stay all-zero.

    The rows are written into a dense float64 array and converted to a
    ``lil_matrix`` once at the end, instead of assigning dense rows into a
    sparse matrix one at a time. The result has the same type, dtype and
    values as the original per-row assignment.

    Parameters
    ----------
    words : iterable of str with a length (e.g. a DataFrame column)
    keyed_vectors : object supporting ``word in kv`` and ``kv[word]``
    dim : int, number of columns of the result

    Returns
    -------
    scipy.sparse.lil_matrix of shape (len(words), dim), dtype float64
    """
    dense = np.zeros((len(words), dim))
    for row, word in enumerate(words):
        if word in keyed_vectors:
            dense[row, :] = keyed_vectors[word]
    return lil_matrix(dense)


# Embeddings for the Head column and for the HHead column.
all_vectors = embed_words(data['Head'], model)
all_vectors_hhead = embed_words(data['HHead'], model)

In [6]:
# Display the repr of the Head embedding matrix (shape, dtype, stored-element count).
all_vectors

<50249x300 sparse matrix of type '<class 'numpy.float64'>'
	with 14090400 stored elements in LInked List format>

In [7]:
# Display the repr of the HHead embedding matrix (shape, dtype, stored-element count).
all_vectors_hhead

<50249x300 sparse matrix of type '<class 'numpy.float64'>'
	with 12167100 stored elements in LInked List format>

In [8]:
# Drop this name's reference to the loaded word vectors so they can be
# garbage-collected. Rebinding to None rather than an empty list is deliberate:
# `word in []` is always False, so re-running the embedding cell afterwards
# would silently produce all-zero matrices, whereas `word in None` raises
# TypeError and makes the stale state obvious.
model = None

In [16]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: unpickling executes arbitrary code; only load files you trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Previously fitted vectorizers, one per feature family.
onewordvect = _load_pickle('../models/one_word_vectorizer.pickle')
pos_vect = _load_pickle('../models/pos_vectorizer.pickle')
hyp_vect = _load_pickle('../models/noun_hypernym_vectorizer.pickle')
hhyp_vect = _load_pickle('../models/noun_higher_hypernym_vectorizer.pickle')
count_vect = _load_pickle('../models/countability_vectorizer.pickle')

In [27]:
# One relation label per line of the text file.
with open('../ud_relations.txt', 'r', encoding='utf-8') as relations_file:
    raw_relations = relations_file.read()
relations = raw_relations.split('\n')

# token_pattern='.+' treats each whole (single-line) string as one token,
# so every relation label becomes exactly one vocabulary entry.
deprel_vect = CountVectorizer(token_pattern='.+')
deprel_vect.fit(relations)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='.+', tokenizer=None,
        vocabulary=None)

In [11]:
# Split the label column off the feature frame.
# The guard makes the cell safe to re-run: on a second execution the column
# is already gone, so the previously extracted `target` is kept instead of
# the cell failing with a KeyError.
if 'Preposition' in data.columns:
    target = data['Preposition']
    data = data.drop('Preposition', axis=1)

In [12]:
# Index labels of the rows whose label is an actual preposition.
pres_idx = target[target != 'zero'].index
# Two-class label: keep 'zero' as is, collapse every other value to 'present'.
# Series.where returns a new Series, so `target` itself is left untouched.
binary_target = target.where(target == 'zero', 'present')

In [18]:
def _stack_context(vectorizer, side, suffix=''):
    """Transform the five context columns ``<side>_1<suffix>`` .. ``<side>_5<suffix>``
    of ``data`` with ``vectorizer`` and stack the results horizontally."""
    column_names = [side + '_' + str(offset) + suffix for offset in range(1, 6)]
    return hstack([vectorizer.transform(data[name]) for name in column_names])


# Bag of words over the noun phrase; this vectorizer is fitted here.
np_vect = CountVectorizer(token_pattern = '\\b\\w+\\b')
npm = np_vect.fit_transform(data['NP'])

# Part-of-speech features: the NP itself, its head, the HHead column,
# and the five-token windows before and after.
pos = pos_vect.transform(data['POS_tags'])
head_pos = pos_vect.transform(data['Head_POS'])
hhead_pos = pos_vect.transform(data['HHead_POS'])
prevs_pos = _stack_context(pos_vect, 'prev', '_POS')
posts_pos = _stack_context(pos_vect, 'post', '_POS')

countability = count_vect.transform(data['Head_countability'])

hyp = hyp_vect.transform(data['hypernyms'])
hhyp = hhyp_vect.transform(data['higher_hypernyms'])

deprel = deprel_vect.transform(data['HHead_rel'])

# HHead as a single token per row; this vectorizer is fitted here as well.
hhead_vect = CountVectorizer(token_pattern='.+')
hhead = hhead_vect.fit_transform(data['HHead'])

# Lexical features: the head word and the surrounding five-token windows.
head = onewordvect.transform(data['Head'])
prevs = _stack_context(onewordvect, 'prev')
posts = _stack_context(onewordvect, 'post')

In [29]:
# Column order of the final design matrix; the models saved at the end of the
# notebook are fitted on exactly this layout.
feature_blocks = [
    npm, pos, head, countability, head_pos, hyp, hhyp, all_vectors,
    hhead, hhead_pos, deprel, all_vectors_hhead,
    prevs, prevs_pos, posts, posts_pos,
]
data_sparse = hstack(feature_blocks).tocsr()
#nonzero_columns = np.unique(data_sparse.nonzero()[1]) # TODO: need to remember what cols were omitted
#data_sparse = data_sparse[:,nonzero_columns]

In [30]:
# Display (rows, columns) of the stacked feature matrix.
data_sparse.shape

(50249, 1379745)

In [31]:
# Display the repr of the Head embedding matrix again.
all_vectors

<50249x300 sparse matrix of type '<class 'numpy.float64'>'
	with 14090400 stored elements in LInked List format>

# Presence classifier & preposition-type classifier

In [32]:
# Hold out 33% of the rows, stratified on the binary present/zero label.
X_train, X_test, y_train, y_test = train_test_split(
    data_sparse,
    binary_target,
    test_size=0.33,
    random_state=SEED,
    stratify=binary_target,
)

In [33]:
# First-stage model: binary 'present' vs 'zero' classifier fitted on the training split.
logit = LogisticRegression(random_state=SEED)
logit.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [34]:
# First-stage predictions on the held-out rows, then accuracy and the
# per-class report, printed in that order.
pred_l = logit.predict(X_test)
for metric in (accuracy_score, classification_report):
    print(metric(y_test, pred_l))

0.948561780136
             precision    recall  f1-score   support

    present       0.95      0.92      0.93      6528
       zero       0.95      0.97      0.96     10055

avg / total       0.95      0.95      0.95     16583



In [35]:
# Second-stage model: trained only on training rows whose gold binary label is
# 'present', with the concrete preposition as the target.
present_rows = np.where(y_train == 'present')[0]                 # positions inside X_train
present_labels = target[y_train[y_train == 'present'].index]     # matching labels, looked up by index

logit_pres = LogisticRegression(random_state=SEED)
logit_pres.fit(X_train[present_rows, :], present_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [36]:
# Run the second-stage model only on the test rows that the first stage
# predicted as 'present', and score it against their gold labels.
predicted_present = pred_l == 'present'
pred_l_pres = logit_pres.predict(X_test[np.where(predicted_present)[0], :])

gold_for_present = target[y_test[predicted_present].index]
print(accuracy_score(gold_for_present, pred_l_pres))
print(classification_report(gold_for_present, pred_l_pres))

0.588057805304
             precision    recall  f1-score   support

      about       0.40      0.16      0.23        89
      along       0.00      0.00      0.00         1
      among       0.00      0.00      0.00        11
     around       0.00      0.00      0.00         6
         as       0.65      0.56      0.60       248
         at       0.51      0.44      0.47       224
    between       0.86      0.18      0.30        33
         by       0.48      0.50      0.49       286
       down       0.00      0.00      0.00         5
     during       0.00      0.00      0.00        24
     except       0.00      0.00      0.00         1
        for       0.47      0.46      0.46       591
       from       0.42      0.27      0.33       222
         in       0.53      0.66      0.59      1167
     inside       0.00      0.00      0.00         1
       into       0.40      0.24      0.30        84
         of       0.71      0.90      0.79      2087
        off       0.00      0.

  'precision', 'predicted', average, warn_for)


In [37]:
# Replace every 'present' placeholder in the first-stage predictions with the
# label predicted for that row by the second-stage model.
# NOTE(review): this mutates pred_l in place, so the cell is not re-runnable on
# its own — after one execution no 'present' entries remain. Recompute pred_l
# and pred_l_pres before running it again.
pred_l[pred_l == 'present'] = pred_l_pres

In [38]:
# Score the combined two-stage predictions against the gold labels of the test rows.
gold_test = target[y_test.index]
print(accuracy_score(gold_test, pred_l))
print(classification_report(gold_test, pred_l))

0.810890671169
             precision    recall  f1-score   support

      about       0.40      0.13      0.20       105
      along       0.00      0.00      0.00         2
      among       0.00      0.00      0.00        16
     around       0.00      0.00      0.00         9
         as       0.65      0.44      0.52       320
         at       0.51      0.38      0.44       257
    between       0.86      0.17      0.29        35
         by       0.48      0.45      0.46       318
       down       0.00      0.00      0.00         5
     during       0.00      0.00      0.00        27
     except       0.00      0.00      0.00         1
        for       0.47      0.43      0.44       641
       from       0.42      0.24      0.31       243
         in       0.53      0.60      0.56      1298
     inside       0.00      0.00      0.00         1
       into       0.40      0.23      0.29        87
         of       0.71      0.88      0.79      2128
        off       0.00      0.

  'precision', 'predicted', average, warn_for)


# Fit and save models

In [39]:
# Refit the binary present/zero model on the full dataset (no held-out split);
# this is the instance that is pickled further down.
logit = LogisticRegression(random_state=SEED)
logit.fit(data_sparse, binary_target)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [40]:
# Refit the second-stage model on every row of the full dataset whose binary
# label is 'present', with the concrete preposition as the target.
present_rows = np.where(binary_target == 'present')[0]
present_labels = target[binary_target[binary_target == 'present'].index]

logit_pres = LogisticRegression(random_state=SEED)
logit_pres.fit(data_sparse[present_rows, :], present_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [41]:
# Persist the vectorizers fitted in this notebook and both classifiers.
# Written in the listed order, one pickle file per object.
artifacts = [
    ('../models/hhead_vectorizer.pickle', hhead_vect),
    ('../models/deprel_vectorizer.pickle', deprel_vect),
    ('../models/extended_np_vectorizer.pickle', np_vect),
    ('../models/preposition_logit_binary.pickle', logit),
    ('../models/preposition_logit_type.pickle', logit_pres),
]

for artifact_path, artifact in artifacts:
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

# Just in case - list of classifiers that support predict_proba

In [27]:
# `all_estimators` is importable from sklearn.utils in current scikit-learn,
# where the old sklearn.utils.testing module no longer exists. Try the current
# location first and fall back to the legacy one for old installations.
try:
    from sklearn.utils import all_estimators
except ImportError:
    from sklearn.utils.testing import all_estimators

# (name, class) pairs for every estimator scikit-learn can discover.
estimators = all_estimators()

# Print the name of each estimator class that defines predict_proba.
for name, class_ in estimators:
    if hasattr(class_, 'predict_proba'):
        print(name)



AdaBoostClassifier
BaggingClassifier
BayesianGaussianMixture
BernoulliNB
CalibratedClassifierCV
DPGMM
DecisionTreeClassifier
ExtraTreeClassifier
ExtraTreesClassifier
GMM
GaussianMixture
GaussianNB
GaussianProcessClassifier
GradientBoostingClassifier
KNeighborsClassifier
LDA
LabelPropagation
LabelSpreading
LinearDiscriminantAnalysis
LogisticRegression
LogisticRegressionCV
MLPClassifier
MultinomialNB
NuSVC
QDA
QuadraticDiscriminantAnalysis
RandomForestClassifier
SGDClassifier
SVC
VBGMM
_BinaryGaussianProcessClassifierLaplace
_ConstantPredictor
_DPGMMBase
_GMMBase
_LDA
_QDA
