In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
import time
import pickle 
import gc
import ner_f1
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from collections import Counter
import spacy
from tqdm import tqdm
from sklearn.metrics import make_scorer
import scipy.stats
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Read the training tokens and the test tokens from CSV files in the working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Replace missing values in the 'Word' column with a single space, for both frames.
for frame in (train_df, test_df):
    frame['Word'] = frame['Word'].fillna(' ')

In [10]:
# unique_words=train_df['Word'].unique()

In [6]:
# nlp = spacy.load('en_core_web_sm')

In [24]:
# pos_tag={}
# for ind in tqdm(range(len(unique_words))):
#     doc=nlp(unique_words[ind])
#     for token in doc:
#         pos_tag[unique_words[ind]]=token.tag_

In [13]:
# train_words=list(pos_tag.keys())
# train_pos=list(pos_tag.values())

In [14]:
# pos_df=pd.DataFrame()
# pos_df['Word']=train_words
# pos_df['pos']=train_pos

In [15]:
# train_df_pos=train_df.merge(pos_df,on='Word',how='left')

In [4]:
class SentenceGetter(object):
    """Group a token-level DataFrame into per-sentence lists of (word, tag) pairs.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'Sent_ID', 'Word' and 'tag'.

    Attributes
    ----------
    data : the DataFrame passed in.
    grouped : result of the groupby/apply, one list of (word, tag) tuples per 'Sent_ID' value.
    sentences : list of those per-sentence lists, in the order the groupby yields them.
    n_sent : counter used by get_next(); starts at 1.
    empty : always False here; kept for compatibility with existing callers.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # Pair each word with its tag, keeping the row order inside the group.
            return list(zip(s['Word'].values.tolist(), s['tag'].values.tolist()))

        # Only the two columns the aggregation reads are handed to apply(); the
        # grouping column itself is not needed inside agg_func.
        self.grouped = self.data.groupby('Sent_ID')[['Word', 'tag']].apply(agg_func)
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the sentence stored under the key 'Sentence: <n_sent>' and advance the counter.

        Returns None (without advancing) when no such key exists in ``self.grouped``.
        Only lookup failures are treated as "no more sentences"; any other error
        propagates instead of being silently swallowed.
        """
        try:
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
        except (KeyError, IndexError):
            return None
        self.n_sent += 1
        return s
# Build the per-sentence lists of (word, tag) pairs for the training data.
getter = SentenceGetter(train_df)
train_sentences = getter.sentences

In [5]:
# Token count of the longest training sentence (reported for information).
maxlen = max(map(len, train_sentences))
print ('Maximum sequence length:', maxlen)

Maximum sequence length: 3899


In [6]:
# Vocabulary: every distinct training token, with the padding token "ENDPAD" placed last.
words = list(set(train_df["Word"].values)) + ["ENDPAD"]

In [7]:
# Vocabulary size (the "ENDPAD" entry is counted); the bare name displays it as the cell output.
n_words = len(words)
n_words

184507

In [8]:
# Distinct tag labels found in the training data.
# NOTE(review): the order comes from iterating a set of strings, which is not guaranteed to be
# the same in another kernel session — confirm before relying on fixed tag indices elsewhere.
tags = list(set(train_df["tag"].values))

In [9]:
# Lookup tables from a token / tag label to its integer position in `words` / `tags`.
word2idx = dict(zip(words, range(len(words))))
tag2idx = dict(zip(tags, range(len(tags))))

In [10]:
# Display the tag -> index mapping.
tag2idx

{'B-indications': 2, 'I-indications': 0, 'O': 1}

In [11]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Convert every training sentence into the list of vocabulary indices of its words.
X = [
    [word2idx[word] for word, _tag in sentence]
    for sentence in train_sentences
]

In [12]:
# Bring every sentence to a fixed length of 140 indices. Shorter sentences are filled at the
# end with n_words - 1, which is the position of "ENDPAD" in the vocabulary.
X = pad_sequences(
    sequences=X,
    maxlen=140,
    padding="post",
    value=n_words - 1,
)

In [13]:
# Convert every training sentence into the list of tag indices of its tokens.
y = [
    [tag2idx[tag] for _word, tag in sentence]
    for sentence in train_sentences
]

In [14]:
# Pad the tag sequences to the same fixed length of 140; padded positions get the index of "O".
y = pad_sequences(
    sequences=y,
    maxlen=140,
    padding="post",
    value=tag2idx["O"],
)

In [15]:
# Inspect the padded tag-index sequence stored at position 7.
y[7]

array([1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

In [16]:
# Number of distinct tag labels; the bare name displays it as the cell output.
n_tags = len(tags)
n_tags

3

In [17]:
from tensorflow.keras.utils import to_categorical
# One-hot encode each padded tag sequence (n_tags classes) and rebind `y` to the resulting list.
y_onehot = []
for tag_ids in y:
    y_onehot.append(to_categorical(tag_ids, num_classes=n_tags))
y = y_onehot

In [18]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the sentences. A fixed random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
import tensorflow
from tensorflow.keras.models import Model,Sequential 
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional

In [23]:
# Sequence tagger: embedding -> dropout (twice) -> bidirectional LSTM -> per-position softmax over the tags.
# NOTE(review): two consecutive Dropout(0.9) layers discard most of the embedding signal — confirm this is intended.
model = Sequential([
    Embedding(input_dim=n_words, output_dim=140, input_length=140),
    Dropout(rate=0.9),
    Dropout(rate=0.9),
    Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)),
    TimeDistributed(Dense(n_tags, activation="softmax")),
])

In [24]:
# Categorical cross-entropy matches the one-hot encoded tag targets; accuracy is tracked as a metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [25]:
# Train for 3 epochs with validation_split=0.2; y_train is converted to a single array
# before being passed to fit().
model.fit(
    X_train,
    np.array(y_train),
    epochs=3,
    batch_size=512,
    validation_split=0.2,
    verbose=1,
)

Train on 122420 samples, validate on 30605 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f03b9ce87f0>

In [69]:
# unique_words_test=test_df['Word'].unique()

In [26]:
# pos_tag_test={}
# for ind in tqdm(range(len(unique_words_test))):
#     doc=nlp(unique_words_test[ind])
#     for token in doc:
#         pos_tag_test[unique_words_test[ind]]=token.tag_

In [27]:
# test_words=list(pos_tag_test.keys())
# test_pos=list(pos_tag_test.values())

In [28]:
# pos_df_test=pd.DataFrame()
# pos_df_test['Word']=test_words
# pos_df_test['pos']=test_pos

In [73]:
# test_df_pos=test_df.merge(pos_df_test,on='Word',how='left')

In [26]:
# Add a placeholder 'tag' column to the test frame; SentenceGetter reads a 'tag' column.
test_df['tag']='O'

In [27]:
# Build the per-sentence lists of (word, tag) pairs for the test data.
test_getter = SentenceGetter(test_df)
test_sentences = test_getter.sentences

In [28]:
# Convert test sentences to vocabulary indices; words missing from word2idx fall back to index 0.
# NOTE(review): index 0 is whatever word happens to be first in `words`, not a dedicated
# unknown-word token — consider adding one to the vocabulary.
test_X = [
    [word2idx.get(word, 0) for word, _tag in sentence]
    for sentence in test_sentences
]

In [29]:
# Pad/truncate test sentences to 140 indices. truncating="post" keeps the FIRST 140 tokens of an
# over-long sentence, so prediction i lines up with token i when predictions are mapped back to
# the original rows (the library default, "pre", would keep the last 140 tokens instead).
test_X = pad_sequences(maxlen=140, sequences=test_X, padding="post", truncating="post", value=n_words - 1)

In [30]:
# Score the test sentences and keep, for every position, the index of the most probable tag.
test_preds = model.predict(test_X, batch_size=512).argmax(axis=-1)

In [31]:
# Original (unpadded) token count of every test sentence.
test_len = list(map(len, test_sentences))

In [32]:
# Cut each prediction row back to the sentence's real length. Sentences longer than the number
# of predicted positions get the remaining tokens labelled with the index of "O", looked up
# from tag2idx rather than hard-coded, because the tag -> index assignment can differ between runs.
# NOTE(review): this lines up only if the model input kept the leading tokens of each
# sentence — verify the truncation setting of the test padding step.
o_idx = tag2idx["O"]
test_preds_mod = []
for ind in tqdm(range(len(test_sentences))):
    n_tokens = test_len[ind]
    sentence_preds = list(test_preds[ind][:n_tokens])
    # No-op for sentences that fit; fills the tail for over-long ones.
    sentence_preds.extend([o_idx] * (n_tokens - len(sentence_preds)))
    test_preds_mod.append(sentence_preds)

100%|██████████| 125840/125840 [00:00<00:00, 248055.83it/s]


In [33]:
# Flatten the per-sentence predictions into one token-level list, sentence after sentence.
y_test_pred_unpack = []
for sentence_preds in test_preds_mod:
    y_test_pred_unpack.extend(sentence_preds)

In [34]:
# Assemble the submission frame: row id, sentence id and predicted tag index for every test token.
sub = pd.DataFrame({
    'id': test_df.id.values,
    'Sent_ID': test_df['Sent_ID'],
    'tag': y_test_pred_unpack,
})
sub.head()

Unnamed: 0,id,Sent_ID,tag
0,4543834,191283,1
1,4543835,191283,1
2,4543836,191283,1
3,4543837,191283,1
4,4543838,191283,1


In [35]:
# Show how many test tokens were assigned to each predicted class index.
sub['tag'].value_counts()

1    2992945
0       1503
2         15
Name: tag, dtype: int64

In [37]:
# Translate predicted class indices back to tag strings using the inverse of tag2idx, so the
# labels stay correct even if the tag -> index assignment differs between runs.
idx2tag = {idx: tag for tag, idx in tag2idx.items()}
sub['tag'] = sub['tag'].replace(idx2tag)

In [38]:
# Write the submission to lstm.csv without the DataFrame index column.
sub.to_csv('lstm.csv',index=False)

In [39]:
# Shell command: compress the submission file into lstm.zip.
!zip -r lstm.zip lstm.csv

  adding: lstm.csv (deflated 89%)
