In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import hstack, csr_matrix, lil_matrix
from sklearn.metrics import accuracy_score, classification_report
from copy import deepcopy
from string import punctuation
import pickle

SEED = 42
punct = set(punctuation) | {'‘','’','—',' ','\t','\n'}

# OMG 10G THAT REMAIN IN MEM

In [2]:
data = pd.read_csv('../article_table/articles.csv',delimiter=';',encoding='utf-8-sig',keep_default_na=False)
data.drop(['Sentence','raw_NP','Start_idx','Sent_start_idx'],axis=1,inplace=True)
data.head()

Unnamed: 0,NP,POS_tags,Head,Head_countability,NP_first_letter,Head_POS,hypernyms,higher_hypernyms,prev_2,prev_1,prev_2_POS,prev_1_POS,post_1,post_2,post_1_POS,post_2_POS,Article
0,factsheet,NN,factsheet,,f,NN,,,,,,,what,is,WDT,VBZ,zero
1,aids,NNP,aids,both,a,NNP,infectious_disease immunodeficiency,disorder disease,what,is,WDT,VBZ,?,,.,,zero
2,aids,NN,aids,both,a,NN,infectious_disease immunodeficiency,disorder disease,,,,,(,acquired,NN,VBN,zero
3,immune deficiency syndrome,NNP NNP NN,syndrome,C,i,NN,complex,concept,(,acquired,NN,VBN,),is,NN,VBZ,zero
4,condition,NN,condition,C,c,NN,state,abstraction,),is,NN,VBZ,caused,by,VBN,IN,a


In [3]:
with open('../unique_words.txt','r',encoding='utf-8') as f:
    unique_words = f.read().split('\n')

onewordvect = CountVectorizer(token_pattern='.+')
onewordvect.fit(unique_words+list(punct))

with open('../Penn_POS_tagset.txt','r',encoding='utf-8') as f:
    unique_pos = f.read().split('\n')

pos_vect = CountVectorizer(token_pattern='(?:^| )(.+?)(?= |$)')
pos_vect.fit(unique_pos+list(punct))

count_vect = CountVectorizer(token_pattern='.+')
count_vect.fit(['C','U','both','proper'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='.+', tokenizer=None,
        vocabulary=None)

In [4]:
target = data['Article']
data.drop('Article',axis=1,inplace=True)

In [5]:
pres_idx = target[(target == 'a') | (target == 'an') | (target == 'the')].index
binary_target = deepcopy(target)
binary_target[(binary_target == 'a') | (binary_target == 'an') | (binary_target == 'the')] = 'present'

In [8]:
with open('target.pickle','wb') as f:
    pickle.dump(target,f)

with open('binary_target.pickle','wb') as f:
    pickle.dump(binary_target,f)  

In [9]:
np_vect = CountVectorizer(token_pattern = '\\b\\w+\\b')
npm = np_vect.fit_transform(data['NP'])

'''
pos = pos_vect.transform(data['POS_tags'])
head_pos = pos_vect.transform(data['Head_POS'])
#hhead_pos = pos_vect.transform(data['hhead_POS'])
prevs_pos = hstack([pos_vect.transform(data['prev_'+str(i)+'_POS']) for i in range(1,3)])
posts_pos = hstack([pos_vect.transform(data['post_'+str(i)+'_POS']) for i in range(1,3)])


countability = count_vect.transform(data['Head_countability'])
'''

letter_vect = CountVectorizer(token_pattern='.+')
first_letter = letter_vect.fit_transform(data['NP_first_letter'])

hyp_vect = CountVectorizer()
hyp = hyp_vect.fit_transform(data['hypernyms'])

hhyp_vect = CountVectorizer()
hhyp = hhyp_vect.fit_transform(data['higher_hypernyms'])

'''

#deprel_vect = CountVectorizer()
#deprel = deprel_vect.fit_transform(data['deprel'])

#hhead_vect = CountVectorizer(token_pattern='.+')
#hhead = hhead_vect.fit_transform(data['hhead'])

head = onewordvect.transform(data['Head'])
prevs = hstack([onewordvect.transform(data['prev_'+str(i)]) for i in range(1,3)])
posts = hstack([onewordvect.transform(data['post_'+str(i)]) for i in range(1,3)])
'''

"\n\n#deprel_vect = CountVectorizer()\n#deprel = deprel_vect.fit_transform(data['deprel'])\n\n#hhead_vect = CountVectorizer(token_pattern='.+')\n#hhead = hhead_vect.fit_transform(data['hhead'])\n\nhead = onewordvect.transform(data['Head'])\nprevs = hstack([onewordvect.transform(data['prev_'+str(i)]) for i in range(1,3)])\nposts = hstack([onewordvect.transform(data['post_'+str(i)]) for i in range(1,3)])\n"

In [10]:
with open('../models/one_word_vectorizer.pickle','wb') as f:
    pickle.dump(onewordvect,f)
    
with open('../models/pos_vectorizer.pickle','wb') as f:
    pickle.dump(pos_vect,f)

with open('../models/letter_vectorizer.pickle','wb') as f:
    pickle.dump(letter_vect,f)

with open('../models/np_vectorizer.pickle','wb') as f:
    pickle.dump(np_vect,f)
    
with open('../models/noun_hypernym_vectorizer.pickle','wb') as f:
    pickle.dump(hyp_vect,f)
    
with open('../models/noun_higher_hypernym_vectorizer.pickle','wb') as f:
    pickle.dump(hhyp_vect,f)
    
with open('../models/countability_vectorizer.pickle','wb') as f:
    pickle.dump(count_vect,f)

In [11]:
with open('npm.pickle','wb') as f:
    pickle.dump(npm,f)
    
with open('pos.pickle','wb') as f:
    pickle.dump(pos,f)
    
with open('head.pickle','wb') as f:
    pickle.dump(head,f)
    
with open('countability.pickle','wb') as f:
    pickle.dump(countability,f)
    
with open('first_letter.pickle','wb') as f:
    pickle.dump(first_letter,f)
    
with open('head_pos.pickle','wb') as f:
    pickle.dump(head_pos,f)
    
with open('hyp.pickle','wb') as f:
    pickle.dump(hyp,f)
    
with open('hhyp.pickle','wb') as f:
    pickle.dump(hhyp,f)
    
with open('prevs.pickle','wb') as f:
    pickle.dump(prevs,f)
    
with open('posts.pickle','wb') as f:
    pickle.dump(posts,f)

with open('prevs_pos.pickle','wb') as f:
    pickle.dump(prevs_pos,f)

with open('posts_pos.pickle','wb') as f:
    pickle.dump(posts_pos,f)
    
with open('head_col.pickle','wb') as f:
    pickle.dump(head_col,f)

# INTERMEDIATE STAGE

In [3]:
with open('npm.pickle','rb') as f:
    npm = pickle.load(f)
    
with open('pos.pickle','rb') as f:
    pos = pickle.load(f)
    
with open('head.pickle','rb') as f:
    head = pickle.load(f)
    
with open('countability.pickle','rb') as f:
    countability = pickle.load(f)
    
with open('first_letter.pickle','rb') as f:
    first_letter = pickle.load(f)
    
with open('head_pos.pickle','rb') as f:
    head_pos = pickle.load(f)
    
with open('hyp.pickle','rb') as f:
    hyp = pickle.load(f)
    
with open('hhyp.pickle','rb') as f:
    hhyp = pickle.load(f)
    
with open('prevs.pickle','rb') as f:
    prevs = pickle.load(f)
    
with open('posts.pickle','rb') as f:
    posts = pickle.load(f)

with open('prevs_pos.pickle','rb') as f:
    prevs_pos = pickle.load(f)

with open('posts_pos.pickle','rb') as f:
    posts_pos = pickle.load(f)

In [6]:
data_sparse = hstack((npm,pos,head,countability,first_letter,head_pos,hyp,hhyp,
                      prevs,posts,prevs_pos,posts_pos)).tocsr()
nonzero_columns = np.unique(data_sparse.nonzero()[1]) # TODO: need to remember what cols were omitted
data_sparse = data_sparse[:,nonzero_columns]

In [7]:
data_sparse.shape

(21408958, 1319378)

In [8]:
with open('sparse_data.pickle','wb') as f:
    pickle.dump(data_sparse,f)

with open('nonzero_columns.pickle','wb') as f:
    pickle.dump(nonzero_columns,f)

# presence classifier & a-an-the classifier

In [None]:
with open('sparse_data.pickle','rb') as f:
    data_sparse = pickle.load(f)

with open('target.pickle','rb') as f:
    target = pickle.load(f)

with open('binary_target.pickle','rb') as f:
    binary_target = pickle.load(f)  

In [67]:
X_train, X_test, y_train, y_test = train_test_split(data_sparse, binary_target, test_size=0.33, 
                                                    random_state=SEED,stratify=binary_target)

In [81]:
%%time
logit = LogisticRegression(random_state=SEED)
logit.fit(X_train,y_train)

Wall time: 22.4 s


In [69]:
pred_l = logit.predict(X_test)
print(accuracy_score(y_test,pred_l))
print(classification_report(y_test,pred_l))

0.853807811218
             precision    recall  f1-score   support

    present       0.80      0.75      0.77      5271
       zero       0.88      0.90      0.89     10578

avg / total       0.85      0.85      0.85     15849



In [82]:
%%time
logit_pres = LogisticRegression(random_state=SEED,multi_class='multinomial',solver='lbfgs')
logit_pres.fit(X_train[np.where(y_train == 'present')[0],:],target[y_train[y_train == 'present'].index])

Wall time: 10 s


In [71]:
pred_l_pres = logit_pres.predict(X_test[np.where(pred_l == 'present')[0],:])
print(accuracy_score(target[y_test[pred_l == 'present'].index],pred_l_pres))
print(classification_report(target[y_test[pred_l == 'present'].index],pred_l_pres))

0.66291683407
             precision    recall  f1-score   support

          a       0.60      0.65      0.62       887
         an       0.58      0.46      0.51       184
        the       0.68      0.91      0.78      2895
       zero       0.00      0.00      0.00      1012

avg / total       0.52      0.66      0.58      4978



  'precision', 'predicted', average, warn_for)


In [72]:
pred_l[pred_l == 'present'] = pred_l_pres

In [73]:
print(accuracy_score(target[y_test.index],pred_l))
print(classification_report(target[y_test.index],pred_l))

0.81178623257
             precision    recall  f1-score   support

          a       0.60      0.52      0.56      1111
         an       0.58      0.37      0.45       228
        the       0.68      0.67      0.68      3932
       zero       0.88      0.90      0.89     10578

avg / total       0.81      0.81      0.81     15849



# Fit and save models

In [3]:
with open('sparse_data.pickle','rb') as f:
    data_sparse = pickle.load(f)
    
with open('target.pickle','rb') as f:
    target = pickle.load(f)

with open('binary_target.pickle','rb') as f:
    binary_target = pickle.load(f)  

In [3]:
logit = LogisticRegression(random_state=SEED)
logit.fit(data_sparse, binary_target)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [4]:
logit_pres = LogisticRegression(random_state=SEED,multi_class='multinomial',solver='lbfgs',n_jobs=3)
logit_pres.fit(data_sparse[np.where(binary_target == 'present')[0],:],target[binary_target[binary_target == 'present'].index])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=3, penalty='l2', random_state=42, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [5]:
#with open('../models/article_logit_binary.pickle','wb') as f:
#    pickle.dump(logit,f)
    
with open('../models/article_logit_type.pickle','wb') as f:
    pickle.dump(logit_pres,f)

# Just in case - list of classifiers that support predict_proba

In [27]:
from sklearn.utils.testing import all_estimators

estimators = all_estimators()

for name, class_ in estimators:
    if hasattr(class_, 'predict_proba'):
        print(name)



AdaBoostClassifier
BaggingClassifier
BayesianGaussianMixture
BernoulliNB
CalibratedClassifierCV
DPGMM
DecisionTreeClassifier
ExtraTreeClassifier
ExtraTreesClassifier
GMM
GaussianMixture
GaussianNB
GaussianProcessClassifier
GradientBoostingClassifier
KNeighborsClassifier
LDA
LabelPropagation
LabelSpreading
LinearDiscriminantAnalysis
LogisticRegression
LogisticRegressionCV
MLPClassifier
MultinomialNB
NuSVC
QDA
QuadraticDiscriminantAnalysis
RandomForestClassifier
SGDClassifier
SVC
VBGMM
_BinaryGaussianProcessClassifierLaplace
_ConstantPredictor
_DPGMMBase
_GMMBase
_LDA
_QDA
