In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [2]:
train_df=pd.read_csv('train.csv')

In [3]:
train_df.head()

Unnamed: 0,id,Doc_ID,Sent_ID,Word,tag
0,1,1,1,Obesity,O
1,2,1,1,in,O
2,3,1,1,Low-,O
3,4,1,1,and,O
4,5,1,1,Middle-Income,O


In [10]:
# Replace missing words with a single space so every row carries a string token.
train_df['Word']=train_df['Word'].fillna(' ')

In [24]:
# Feature table: every column except the row id, the label and the document id.
# NOTE(review): this appears to leave `Sent_ID` in the feature dicts next to `Word`,
# so the model also sees the sentence number as a numeric feature — confirm that is intended.
X = train_df.drop(['id','tag','Doc_ID'], axis=1)
# Sparse output keeps the very wide word vocabulary manageable in memory.
v = DictVectorizer(sparse=True)
# One dict per row; `v` is fitted here and reused later to encode the test rows.
X = v.fit_transform(X.to_dict('records'))

In [25]:
y = train_df.tag.values

In [26]:
# Sorted list of the distinct tag labels; handed to partial_fit further down.
classes = np.unique(y).tolist()
# Hold out 20% of the rows for evaluation, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
X_train.shape, y_train.shape

((3635066, 184507), (3635066,))

In [27]:
# Multinomial naive Bayes with light additive smoothing (alpha=0.01).
nb = MultinomialNB(alpha=0.01)
# The full label list is passed explicitly because partial_fit is used instead of fit.
nb.partial_fit(X_train, y_train, classes)

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [28]:
# Labels to report on: every class except the dominant 'O' (outside) tag.
# Filtering by value does not depend on 'O' happening to sort last in `classes`,
# which the previous `pop()` silently relied on.
new_classes = [label for label in classes if label != 'O']
new_classes

['B-indications', 'I-indications']

In [29]:
print(classification_report(y_pred=nb.predict(X_test), y_true=y_test, labels=new_classes))

               precision    recall  f1-score   support

B-indications       0.64      0.45      0.53     10443
I-indications       0.61      0.37      0.46      8999

    micro avg       0.63      0.42      0.50     19442
    macro avg       0.63      0.41      0.50     19442
 weighted avg       0.63      0.42      0.50     19442



In [39]:
test_df=pd.read_csv('test.csv')

In [40]:
test_ids=test_df.id.values

In [41]:
test_df.head()

Unnamed: 0,id,Doc_ID,Sent_ID,Word
0,4543834,30001,191283,CCCVA
1,4543835,30001,191283,","
2,4543836,30001,191283,MANOVA
3,4543837,30001,191283,","
4,4543838,30001,191283,my


In [42]:
# Drop the identifier columns that were also removed from the training features.
# `columns=` already names the axis, so the redundant `axis=1` is gone, and
# errors='ignore' lets this cell be re-run after the columns have been dropped.
test_df = test_df.drop(columns=['id', 'Doc_ID'], errors='ignore')

In [43]:
test_df.head()

Unnamed: 0,Sent_ID,Word
0,191283,CCCVA
1,191283,","
2,191283,MANOVA
3,191283,","
4,191283,my


In [36]:
# Same missing-word handling as the training frame: fill NaN words with a single space.
test_df['Word']=test_df['Word'].fillna(' ')

In [45]:
# Encode the test rows with the vectorizer that was fitted on the training rows.
test_transform=v.transform(test_df.to_dict('records'))

In [46]:
test_preds=nb.predict(test_transform)

In [47]:
sub=pd.DataFrame()

In [170]:
# Submission frame: original row ids (saved before `id` was dropped from test_df),
# the sentence id, and the predicted tag for each token.
sub['id']=test_ids
sub['Sent_ID']=test_df['Sent_ID']
sub['tag']=test_preds
sub.head()

Unnamed: 0,id,Sent_ID,tag
0,4543834,191283,O
1,4543835,191283,O
2,4543836,191283,O
3,4543837,191283,O
4,4543838,191283,O


In [49]:
sub.tag.value_counts()

O                2961633
B-indications      17827
I-indications      15003
Name: tag, dtype: int64

In [50]:
sub.to_csv('submission-multi-nb.csv',index=False)

In [53]:
!apt-get install zip

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  zip
0 upgraded, 1 newly installed, 0 to remove and 13 not upgraded.
Need to get 158 kB of archives.
After this operation, 587 kB of additional disk space will be used.
Get:1 http://us-east1.gce.archive.ubuntu.com/ubuntu xenial/main amd64 zip amd64 3.0-11 [158 kB]
Fetched 158 kB in 0s (5,533 kB/s)
Selecting previously unselected package zip.
(Reading database ... 76972 files and directories currently installed.)
Preparing to unpack .../archives/zip_3.0-11_amd64.deb ...
Unpacking zip (3.0-11) ...
Processing triggers for man-db (2.7.5-1) ...
Setting up zip (3.0-11) ...


In [54]:
!zip -r submission-multi-nb.zip submission-multi-nb.csv

  adding: submission-multi-nb.csv (deflated 73%)


### 10-fold cross-validation (CV)

In [59]:
from sklearn.model_selection import StratifiedKFold
import time

In [56]:
# Number of cross-validation folds.
fold_n = 10
cv = fold_n   
# Stratified, shuffled splitter with a fixed seed so the folds are repeatable.
folds = StratifiedKFold(n_splits=fold_n, random_state=10, shuffle=True)

In [92]:
# Test-set predictions of each fold's model, keyed by fold index.
y_pred_nb = {}
features = ['Sent_ID','Word']
# Out-of-fold prediction for every training row, defaulting to 'O'.
# dtype=object matters: an array built only from 'O' strings gets a fixed
# one-character string dtype, and assigning 'B-indications' / 'I-indications'
# into it is silently truncated to 'B' / 'I'.
oof = np.full(len(train_df), 'O', dtype=object)

In [95]:
# Train one model per fold. Each fold contributes its predictions for the
# held-out training rows (into `oof`) and for the whole test set (into `y_pred_nb`).
# The loop index has its own name so it no longer overwrites the `fold_n`
# fold-count constant with the last fold number.
for fold_idx, (train_index, valid_index) in enumerate(folds.split(X, y)):
    print('Fold', fold_idx, 'started at', time.ctime())
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    # Fresh estimator per fold so nothing learned on one fold leaks into the next.
    nb = MultinomialNB(alpha=0.01)
    nb.partial_fit(X_train, y_train, classes)
    oof[valid_index] = nb.predict(X_valid)
    y_pred_nb[fold_idx] = nb.predict(test_transform)

Fold 0 started at Sat Mar 23 09:31:43 2019
Fold 1 started at Sat Mar 23 09:31:48 2019
Fold 2 started at Sat Mar 23 09:31:52 2019
Fold 3 started at Sat Mar 23 09:31:57 2019
Fold 4 started at Sat Mar 23 09:32:02 2019
Fold 5 started at Sat Mar 23 09:32:07 2019
Fold 6 started at Sat Mar 23 09:32:12 2019
Fold 7 started at Sat Mar 23 09:32:17 2019
Fold 8 started at Sat Mar 23 09:32:22 2019
Fold 9 started at Sat Mar 23 09:32:27 2019


In [73]:
cv_test_preds=pd.DataFrame(y_pred_nb)

In [76]:
# Encode the labels as integers so the folds can be combined numerically.
# The ordering O(0) < I-indications(1) < B-indications(2) sets the priority used by the max below.
cv_test_preds=cv_test_preds.replace('O',0)
cv_test_preds=cv_test_preds.replace('I-indications',1)
cv_test_preds=cv_test_preds.replace('B-indications',2)

In [77]:
# Row-wise maximum over the fold columns: this is not a majority vote — a token gets
# an entity tag if ANY fold predicted one, and B-indications wins over I-indications.
cv_test_preds['final']=cv_test_preds.max(axis=1)

In [78]:
cv_test_preds['final'].value_counts()

0    2957477
2      20130
1      16856
Name: final, dtype: int64

In [79]:
# Map the integer codes of the combined column back to the original tag strings.
cv_test_preds['final']=cv_test_preds['final'].replace(0,'O')
cv_test_preds['final']=cv_test_preds['final'].replace(1,'I-indications')
cv_test_preds['final']=cv_test_preds['final'].replace(2,'B-indications')

In [80]:
sub['tag']=cv_test_preds['final'].values

In [81]:
sub.to_csv('submission-multi-nb_cv.csv',index=False)

In [82]:
!zip -r submission-multi-nb_cv.zip submission-multi-nb_cv.csv

  adding: submission-multi-nb_cv.csv (deflated 73%)


### Eval metric

In [96]:
import ner_f1

In [99]:
# Reference frame passed as the first argument to ner_f1.calculate_score: id, sentence id and true tag per training row.
ideal=pd.DataFrame()
ideal['id']=train_df['id']
ideal['Sent_ID']=train_df['Sent_ID']
ideal['tag']=y

In [100]:
# Prediction frame with the same id / Sent_ID columns, but the tag column holds
# the out-of-fold predictions collected during cross-validation.
pred=pd.DataFrame()
pred['id']=train_df['id']
pred['Sent_ID']=train_df['Sent_ID']
pred['tag']=oof

In [101]:
ner_f1.calculate_score(ideal,pred)

0.3009418906209923

### CRF

In [103]:
!pip3 install sklearn_crfsuite

Collecting sklearn_crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3 (from sklearn_crfsuite)
  Downloading https://files.pythonhosted.org/packages/4d/2c/274b89d009bb019038feaf96abb263d7cbff069a0771841013cf1832156e/python_crfsuite-0.9.6-cp35-cp35m-manylinux1_x86_64.whl (739kB)
[K    100% |████████████████████████████████| 747kB 871kB/s ta 0:00:011
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.6 sklearn-crfsuite-0.3.6
[33mYou are using pip version 8.1.1, however version 19.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [104]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from collections import Counter

In [109]:
import spacy
# Load spaCy's small English pipeline; `nlp` is used below to POS-tag individual words.
# (The former `pos_tag=[]` placeholder was dead code: the name is rebound to a dict
# before its first use.)
nlp = spacy.load('en_core_web_sm')

In [110]:
from tqdm import tqdm

In [114]:
unique_words=train_df['Word'].unique()

In [117]:
len(unique_words)

184506

In [1]:
# Tag every distinct training word once and cache the result, so the tag can be
# merged back onto the token rows. Each word is tagged on its own, without sentence context.
pos_tag = {}
for word in tqdm(unique_words):
    doc = nlp(word)
    # If the string is split into several tokens, the last token's tag is kept
    # (unchanged from the previous loop). Should a string yield no tokens, store ''
    # rather than omitting the word, so the later left-merge cannot introduce a
    # NaN `pos` that would break the string slicing in word2features.
    pos_tag[word] = doc[-1].tag_ if len(doc) else ''

In [120]:
train_words=list(pos_tag.keys())
train_pos=list(pos_tag.values())

In [121]:
pos_df=pd.DataFrame()
pos_df['Word']=train_words
pos_df['pos']=train_pos

In [122]:
# Attach the per-word POS tag to every training row; a left join keeps all rows of train_df.
train_df_pos=train_df.merge(pos_df,on='Word',how='left')

In [124]:
class SentenceGetter(object):
    """Group a token-level frame into sentences of ``(word, pos, tag)`` triples.

    ``data`` must provide the columns ``Sent_ID``, ``Word``, ``pos`` and ``tag``.
    After construction, ``sentences`` is a list with one entry per ``Sent_ID``
    group, each entry being the list of triples for that sentence.
    """

    def __init__(self, data):
        # 1-based position of the sentence that get_next() will return next.
        self.n_sent = 1
        self.data = data
        self.empty = False
        # Turn one sentence's rows into a list of (word, pos, tag) triples.
        agg_func = lambda s: list(zip(s['Word'].values.tolist(),
                                      s['pos'].values.tolist(),
                                      s['tag'].values.tolist()))
        # One list of triples per Sent_ID, in the group order groupby yields.
        self.grouped = self.data.groupby('Sent_ID').apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the next sentence, or ``None`` once all have been returned.

        Sentences are handed out by position in ``self.sentences``. The earlier
        implementation looked up the label ``'Sentence: N'``, which never matches
        the ``Sent_ID`` keys, and hid the resulting KeyError behind a bare
        ``except`` — so it always returned ``None``.
        """
        position = self.n_sent - 1
        if not 0 <= position < len(self.sentences):
            return None
        self.n_sent += 1
        return self.sentences[position]
# Group the POS-annotated training rows into sentences (lists of (word, pos, tag) triples).
getter = SentenceGetter(train_df_pos)
sentences = getter.sentences

In [142]:
def word2features(sent, i):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    ``sent`` is a sequence of tuples whose first two items are the word and its
    POS tag. The result describes the token itself (lower-cased form, suffixes,
    casing/digit flags, POS tag and its two-character prefix) plus the same kind
    of features for the previous and next token. A missing neighbour is marked
    with ``'BOS'`` (no previous token) or ``'EOS'`` (no next token) instead.
    """
    def context(offset, prefix):
        # Features of the token `offset` positions away, keys carrying `prefix`.
        neighbour, neighbour_pos = sent[i + offset][0], sent[i + offset][1]
        return {
            prefix + 'word.lower()': neighbour.lower(),
            prefix + 'word.istitle()': neighbour.istitle(),
            prefix + 'word.isupper()': neighbour.isupper(),
            prefix + 'postag': neighbour_pos,
            prefix + 'postag[:2]': neighbour_pos[:2],
        }

    token, token_pos = sent[i][0], sent[i][1]

    feats = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': token_pos,
        'postag[:2]': token_pos[:2],
    }

    # Left context, or the beginning-of-sentence marker.
    feats.update(context(-1, '-1:') if i > 0 else {'BOS': True})
    # Right context, or the end-of-sentence marker.
    feats.update(context(1, '+1:') if i < len(sent) - 1 else {'EOS': True})
    return feats

In [143]:
def sent2features(sent):
    """Return one feature dict per token of ``sent``, in sentence order."""
    return [word2features(sent, position) for position, _ in enumerate(sent)]

In [144]:
def sent2labels(sent):
    """Return the tag of every ``(word, pos, tag)`` triple in ``sent``, in order."""
    labels = []
    for _word, _pos, tag in sent:
        labels.append(tag)
    return labels

In [145]:
def sent2tokens(sent):
    """Return the word of every ``(word, pos, tag)`` triple in ``sent``, in order."""
    words = []
    for word, _pos, _tag in sent:
        words.append(word)
    return words

In [146]:
# Sentence-level CRF inputs: per sentence, a list of feature dicts and the matching list of tags.
# Note: this rebinds `X` and `y`, which previously held the token-level matrix and
# tag array used by the naive Bayes cells above.
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

In [148]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [225]:
# Linear-chain CRF trained with L-BFGS on the 80% training split of the sentences.
# NOTE(review): the estimator repr saved under this cell reports c1=0.1, c2=0.1 and
# max_iterations=1000, which does not match the arguments below — the recorded output
# looks stale; re-run to confirm which settings produced the submitted predictions.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.2,
    c2=0.0,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=1000,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [192]:
unique_words_test=test_df['Word'].unique()

In [2]:
# Tag every distinct test word once and cache the result, so the tag can be
# merged back onto the token rows. Each word is tagged on its own, without sentence context.
pos_tag_test = {}
for word in tqdm(unique_words_test):
    doc = nlp(word)
    # If the string is split into several tokens, the last token's tag is kept
    # (unchanged from the previous loop). Should a string yield no tokens, store ''
    # rather than omitting the word, so the later left-merge cannot introduce a
    # NaN `pos` that would break the string slicing in word2features.
    pos_tag_test[word] = doc[-1].tag_ if len(doc) else ''

In [194]:
test_words=list(pos_tag_test.keys())
test_pos=list(pos_tag_test.values())

In [195]:
pos_df_test=pd.DataFrame()
pos_df_test['Word']=test_words
pos_df_test['pos']=test_pos

In [196]:
test_df_pos=test_df.merge(pos_df_test,on='Word',how='left')

In [197]:
# Placeholder label: SentenceGetter reads a `tag` column, which the test frame does not have.
test_df_pos['tag']='O'

In [199]:
test_getter = SentenceGetter(test_df_pos)
test_sentences = test_getter.sentences

In [200]:
# Feature dicts for every test sentence, built the same way as for training.
test_df_X = [sent2features(s) for s in test_sentences]
# These "labels" are only the 'O' placeholders assigned to test_df_pos['tag'].
test_df_y = [sent2labels(s) for s in test_sentences]

In [227]:
y_test_pred = crf.predict(test_df_X)

In [228]:
len(y_test_pred)

125840

In [229]:
y_test_pred_unpack=[val for sub_li in y_test_pred for val in sub_li]

In [230]:
len(y_test_pred_unpack)

2994463

In [231]:
len(test_df)

2994463

In [232]:
# The flattened CRF predictions follow the order in which SentenceGetter's groupby
# emitted the sentences (ascending Sent_ID, original row order inside a sentence),
# which is not necessarily the row order of the test file. A stable sort of the rows
# by Sent_ID reproduces that order, so it is used to put each tag back on its row.
# When the file is already ordered by Sent_ID this is the identity mapping.
row_order = np.argsort(test_df['Sent_ID'].values, kind='mergesort')
aligned_tags = np.empty(len(test_df), dtype=object)
aligned_tags[row_order] = y_test_pred_unpack
sub['tag'] = aligned_tags

In [233]:
sub.head()

Unnamed: 0,id,Sent_ID,tag
0,4543834,191283,O
1,4543835,191283,O
2,4543836,191283,O
3,4543837,191283,O
4,4543838,191283,O


In [234]:
sub.to_csv('final_submission.csv',index=False)

In [252]:
import pickle

# Persist the CRF inputs (training features, training labels, test features) to disk.
artifacts = (
    ('train_x.pkl', X),
    ('train_y.pkl', y),
    ('test.pkl', test_df_X),
)
for filename, payload in artifacts:
    with open(filename, 'wb') as f:
        pickle.dump(payload, f)