In [11]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import hstack, csr_matrix, lil_matrix
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import TruncatedSVD
from copy import deepcopy
from string import punctuation
from gensim.models import Word2Vec

# Seed handed to the random_state arguments used further down.
SEED = 42
# ASCII punctuation plus curly quotes, the em dash and whitespace characters;
# these are appended to the vocabularies the vectorizers are fitted on.
punct = set(punctuation).union({'‘', '’', '—', ' ', '\t', '\n'})

In [None]:
# Load the pre-trained GoogleNews word2vec vectors (binary word2vec format).
# `Word2Vec.load_word2vec_format` was deprecated in gensim 1.0 in favour of
# `KeyedVectors.load_word2vec_format`; prefer the latter and fall back to the
# old entry point only when the installed gensim does not expose KeyedVectors.
# The returned object supports the `word in model` / `model[word]` access
# used when the embedding matrix is built.
# NOTE(review): the path below is machine-specific; adjust before running elsewhere.
try:
    from gensim.models import KeyedVectors as _w2v_loader
except ImportError:
    _w2v_loader = Word2Vec
model = _w2v_loader.load_word2vec_format(
    "C:/Users/Андрей/Desktop/DM/constructions/GoogleNews-vectors-negative300.bin.gz",
    binary=True)

In [2]:
# Read the semicolon-separated table of noun phrases and their context features.
# keep_default_na=False: by default pandas parses literal cell values such as
# "null", "nan", "n/a" or "NA" as missing, and the fillna('') below would then
# silently erase those words from the token columns. With the default NA list
# disabled, empty cells in text columns are read as '' directly.
data = pd.read_csv('articles.csv', delimiter=';', encoding='utf-8-sig',
                   keep_default_na=False)
# Kept as a safety net so every downstream vectorizer receives strings.
data = data.fillna('')
data.head()

Unnamed: 0,NP,POS_tags,Head,Head_countability,NP_first_letter,Head_POS,hypernyms,higher_hypernyms,hhead,hhead_POS,deprel,prevprev,prev,post,postpost,prevprev_POS,prev_POS,post_POS,postpost_POS,Article
0,ageism,NN,ageism,,a,NN,discrimination,group_action,,,,,,,,,,,,zero
1,foundation,NN,foundation,both,f,NN,relation,abstraction,,,,,,of,age,,,IN,NN,the
2,age discrimination,NN NN,discrimination,U,a,NN,social_control,act event,foundation,NN,nmod,foundation,of,,,NN,IN,,,zero
3,steve scrutton,NNP NNP,scrutton,,s,NNP,,,steve,VB,obj,,,,,,,,,zero
4,steve scrutton,NNP NNP,scrutton,,s,NNP,,,steve,NNP,flat,,,is,a,,,VBZ,DT,zero


In [12]:
# One 300-dimensional embedding row per NP head, held in a LIL matrix so rows
# can be assigned one at a time.
n_rows = data.shape[0]
all_vectors = lil_matrix((n_rows, 300))
for row_idx, head_word in enumerate(data['Head']):
    if head_word not in model:
        # Heads missing from the embedding vocabulary keep an all-zero row.
        continue
    all_vectors[row_idx, :] = model[head_word]

In [14]:
# Rebind `model` to an empty list, dropping this notebook's reference to the
# loaded embeddings (presumably to free memory - confirm).
# NOTE(review): after this cell `word in model` is always False, so re-running
# the embedding-matrix cell would silently produce all-zero vectors.
model = list()

0

In [15]:
def _read_lines(path):
    """Return the content of a UTF-8 text file split on newline characters."""
    with open(path, 'r', encoding='utf-8') as handle:
        return handle.read().split('\n')


# Vocabulary of single-word contexts: each whole string is one token.
unique_words = _read_lines('../unique_words.txt')
onewordvect = CountVectorizer(token_pattern='.+')
onewordvect.fit(unique_words + list(punct))

# Vocabulary of POS tags: tokens are the space-separated pieces of the string.
unique_pos = _read_lines('../Penn_POS_tagset.txt')
pos_vect = CountVectorizer(token_pattern='(?:^| )(.+?)(?= |$)')
pos_vect.fit(unique_pos + list(punct))

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?:^| )(.+?)(?= |$)',
        tokenizer=None, vocabulary=None)

In [16]:
# Separate the label column from the features.
# NOTE(review): not idempotent - a second run raises KeyError because
# 'Article' has already been dropped from `data`.
target = data.loc[:, 'Article']
data = data.drop(labels='Article', axis=1)

In [17]:
# Rows whose label is an overt article (a / an / the).
has_article = target.isin(['a', 'an', 'the'])
pres_idx = target[has_article].index

# Binary view of the labels: every overt article collapses to 'present',
# all other labels are left as they are. `target` itself is not modified.
binary_target = target.copy()
binary_target[has_article] = 'present'

In [18]:
# Bag of words over the whole noun phrase.
np_vect = CountVectorizer(token_pattern = '\\b\\w+\\b')
npm = np_vect.fit_transform(data['NP'])

# POS-tag counts for the NP, its head and the four surrounding tokens,
# all encoded with the shared, pre-fitted `pos_vect`.
pos = pos_vect.transform(data['POS_tags'])
head_pos = pos_vect.transform(data['Head_POS'])
#hhead_pos = pos_vect.transform(data['hhead_POS'])
prevprev_pos = pos_vect.transform(data['prevprev_POS'])
prev_pos = pos_vect.transform(data['prev_POS'])
post_pos = pos_vect.transform(data['post_POS'])
postpost_pos = pos_vect.transform(data['postpost_POS'])

# One-hot countability of the head (first level dropped).
# Built directly as a scipy CSR matrix: DataFrame.to_sparse() was deprecated in
# pandas 0.25 and removed in 1.0, and the result is only consumed by hstack.
# The cast to int64 gives the same dtype on every pandas version
# (get_dummies returns uint8 or bool depending on the release).
countability = csr_matrix(
    pd.get_dummies(data['Head_countability'], drop_first=True).values.astype(np.int64))

# First letter of the NP; the whole field is a single token.
letter_vect = CountVectorizer(token_pattern='.+')
first_letter = letter_vect.fit_transform(data['NP_first_letter'])

# Hypernym and higher-hypernym bags of words (default tokenization).
hyp_vect = CountVectorizer()
hyp = hyp_vect.fit_transform(data['hypernyms'])

hhyp_vect = CountVectorizer()
hhyp = hhyp_vect.fit_transform(data['higher_hypernyms'])

#deprel_vect = CountVectorizer()
#deprel = deprel_vect.fit_transform(data['deprel'])

#hhead_vect = CountVectorizer(token_pattern='.+')
#hhead = hhead_vect.fit_transform(data['hhead'])

# Word identity of the head and of the four surrounding tokens, encoded with
# the shared, pre-fitted `onewordvect`.
head = onewordvect.transform(data['Head'])
prevprev = onewordvect.transform(data['prevprev'])
prev = onewordvect.transform(data['prev'])
post = onewordvect.transform(data['post'])
postpost = onewordvect.transform(data['postpost'])

In [38]:
# Column-wise concatenation of all feature blocks; the order of this tuple
# fixes the column layout of the design matrix.
feature_blocks = (
    npm, pos, head, countability, first_letter, head_pos, hyp, hhyp, all_vectors,
    prevprev, prev, post, postpost, prevprev_pos, prev_pos, post_pos, postpost_pos,
)
stacked = hstack(feature_blocks)
data_sparse = stacked.tocsr()

# Keep only the columns that hold at least one non-zero entry.
# TODO: need to remember what cols were omitted
populated_cols = data_sparse.nonzero()[1]
nonzero_columns = np.unique(populated_cols)
data_sparse = data_sparse[:, nonzero_columns]

# Two-stage model: article-presence classifier, then a/an/the classifier

In [39]:
# Hold out 33% of the rows, stratified on the binary labels so both parts
# keep the same class proportions.
split_parts = train_test_split(
    data_sparse,
    binary_target,
    test_size=0.33,
    stratify=binary_target,
    random_state=SEED,
)
X_train, X_test, y_train, y_test = split_parts

In [40]:
# Stage 1: binary classifier trained on the binary labels in `y_train`.
# The solver is pinned to 'liblinear', the value shown in the recorded repr of
# this cell; scikit-learn 0.22 changed the default to 'lbfgs', so relying on
# the default would train a different model on newer installs.
logit = LogisticRegression(random_state=SEED, solver='liblinear')
logit.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [41]:
# Stage-1 predictions on the held-out rows, scored against the binary labels.
pred_l = logit.predict(X_test)
binary_accuracy = accuracy_score(y_test, pred_l)
binary_report = classification_report(y_test, pred_l)
print(binary_accuracy)
print(binary_report)

0.854628052243
             precision    recall  f1-score   support

    present       0.80      0.75      0.78      5271
       zero       0.88      0.90      0.89     10578

avg / total       0.85      0.85      0.85     15849



In [42]:
# Stage 2: multinomial model fitted only on the training rows whose binary
# label is 'present'; its targets are the original labels from `target`,
# looked up through the index of those rows.
logit_pres = LogisticRegression(
    random_state=SEED,
    multi_class='multinomial',
    solver='lbfgs',
)
is_present_train = y_train == 'present'
present_rows = np.where(is_present_train)[0]
present_labels = target[y_train[is_present_train].index]
logit_pres.fit(X_train[present_rows, :], present_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=42, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [43]:
# Run stage 2 only on the test rows that stage 1 flagged as 'present', and
# score it against the original labels of exactly those rows.
flagged_present = pred_l == 'present'
pred_l_pres = logit_pres.predict(X_test[np.flatnonzero(flagged_present), :])
true_articles_flagged = target[y_test[flagged_present].index]
print(accuracy_score(true_articles_flagged, pred_l_pres))
print(classification_report(true_articles_flagged, pred_l_pres))

0.664593781344
             precision    recall  f1-score   support

          a       0.60      0.66      0.63       892
         an       0.60      0.48      0.53       186
        the       0.68      0.91      0.78      2898
       zero       0.00      0.00      0.00      1009

avg / total       0.53      0.66      0.59      4985



  'precision', 'predicted', average, warn_for)


In [44]:
# Overwrite each 'present' prediction with the matching stage-2 prediction.
# NOTE(review): this mutates `pred_l` in place, so cells that select on
# pred_l == 'present' need `pred_l` recomputed before they are re-run.
present_positions = np.flatnonzero(pred_l == 'present')
pred_l[present_positions] = pred_l_pres

In [45]:
# Score the combined two-stage predictions against the original labels of the
# held-out rows.
true_articles_test = target[y_test.index]
print(accuracy_score(true_articles_test, pred_l))
print(classification_report(true_articles_test, pred_l))

0.812795759985
             precision    recall  f1-score   support

          a       0.60      0.53      0.56      1111
         an       0.60      0.39      0.47       228
        the       0.68      0.67      0.68      3932
       zero       0.88      0.90      0.89     10578

avg / total       0.81      0.81      0.81     15849



# Just in case: list of estimators that support `predict_proba`

In [27]:
# `all_estimators` lives in sklearn.utils since scikit-learn 0.22, and the old
# sklearn.utils.testing module was removed in 0.24. Try the current location
# first and fall back to the legacy one for older installs.
try:
    from sklearn.utils import all_estimators
except ImportError:
    from sklearn.utils.testing import all_estimators

# (name, class) pairs as returned by all_estimators().
estimators = all_estimators()

# Print the name of every estimator class that exposes predict_proba.
for name, class_ in estimators:
    if hasattr(class_, 'predict_proba'):
        print(name)



AdaBoostClassifier
BaggingClassifier
BayesianGaussianMixture
BernoulliNB
CalibratedClassifierCV
DPGMM
DecisionTreeClassifier
ExtraTreeClassifier
ExtraTreesClassifier
GMM
GaussianMixture
GaussianNB
GaussianProcessClassifier
GradientBoostingClassifier
KNeighborsClassifier
LDA
LabelPropagation
LabelSpreading
LinearDiscriminantAnalysis
LogisticRegression
LogisticRegressionCV
MLPClassifier
MultinomialNB
NuSVC
QDA
QuadraticDiscriminantAnalysis
RandomForestClassifier
SGDClassifier
SVC
VBGMM
_BinaryGaussianProcessClassifierLaplace
_ConstantPredictor
_DPGMMBase
_GMMBase
_LDA
_QDA
