In [1]:
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import LSTM, Dense, TimeDistributed, Embedding, Bidirectional
from keras.models import Model, Input
from keras_contrib.layers import CRF
from keras.callbacks import ModelCheckpoint


In [2]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn.metrics import f1_score
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
from keras.preprocessing.text import text_to_word_sequence
import pickle

import warnings
warnings.filterwarnings("ignore")



In [3]:
df = pd.read_csv('ner_dataset.csv', encoding = "ISO-8859-1")
df = df.fillna(method = 'ffill')


In [4]:
# This is a class te get sentence. The each sentence will be list of tuples with its tag and pos.
class sentence(object):
    def __init__(self, df):
        self.n_sent = 1
        self.df = df
        self.empty = False
        agg = lambda s : [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(),
                                                       s['POS'].values.tolist(),
                                                       s['Tag'].values.tolist())]
        self.grouped = self.df.groupby("Sentence #").apply(agg)
        self.sentences = [s for s in self.grouped]
        
    def get_text(self):
        try:
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
            self.n_sent +=1
            return s
        except:
            return None
            

In [5]:
#Displaying one full sentence
getter = sentence(df)
sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentences]
sentences[0]

'Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .'

In [6]:
#sentence with its pos and tag.
sent = getter.get_text()
print(sent)

[('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops', 'NNS', 'O'), ('from', 'IN', 'O'), ('that', 'DT', 'O'), ('country', 'NN', 'O'), ('.', '.', 'O')]


In [7]:
sentences = getter.sentences

In [8]:
# Number of data points passed in each iteration
batch_size = 64 
# Passes through entire dataset
epochs = 8
# Maximum length of review
max_len = 75 
# Dimension of embedding vector
embedding = 40 

In [9]:
words = list(df['Word'].unique())
tags = list(df['Tag'].unique())

word_to_index = { w:i+2 for i,w in enumerate(words)}
word_to_index["PAD"] = 1
word_to_index["UNK"] = 1

tag_to_index = {t:i+1 for i,t in enumerate(tags)}
# necssarily force PAD as Zero index.
tag_to_index["PAD"] = 0

idx2word = {i:w for w,i in word_to_index.items()}
idx2tag = {i:t for t,i in tag_to_index.items()}


In [10]:
sentences[0]

X = [ [word_to_index[w] for w,p,t in s] for s in sentences]
X = pad_sequences(sequences=X, maxlen=max_len, padding='post', value=word_to_index['PAD'])

In [11]:
y = [ [ tag_to_index[t] for w,p,t in s] for s in sentences ]
y = pad_sequences(sequences=y, maxlen=max_len, padding='post', value=tag_to_index['PAD'])

In [12]:
y

array([[1, 1, 1, ..., 0, 0, 0],
       [3, 1, 1, ..., 0, 0, 0],
       [1, 1, 8, ..., 0, 0, 0],
       ...,
       [1, 2, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 6, 7, ..., 0, 0, 0]], dtype=int32)

In [13]:
num_tag = df['Tag'].nunique()
y = [ to_categorical(i, num_classes=num_tag+1) for i in y ]


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15)

In [15]:
import numpy as np
print("Size of training input data : ", X_train.shape)
print("Size of training output data : ", np.array(y_train).shape)
print("Size of testing input data : ", X_test.shape)
print("Size of testing output data : ", np.array(y_test).shape)

Size of training input data :  (40765, 75)
Size of training output data :  (40765, 75, 18)
Size of testing input data :  (7194, 75)
Size of testing output data :  (7194, 75, 18)


In [16]:
# Let's check the first sentence before and after processing.
print('*****Before Processing first sentence : *****\n', ' '.join([w[0] for w in sentences[0]]))
print('*****After Processing first sentence : *****\n ', X[0])

*****Before Processing first sentence : *****
 Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .
*****After Processing first sentence : *****
  [ 2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 11 17  3 18 19 20 21 22 23
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1]


In [17]:
# First label before and after processing.
print('*****Before Processing first sentence : *****\n', ' '.join([w[2] for w in sentences[0]]))
print('*****After Processing first sentence : *****\n ', y[0])

*****Before Processing first sentence : *****
 O O O O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O
*****After Processing first sentence : *****
  [[0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


### Bidirectional LSTM-CRF

In [18]:
print(len(words))
print(max_len)
print(embedding)

35178
75
40


In [23]:
35180*40

1407200

In [27]:
# import tensorflow as tf_1
# inputs_1 = tf_1.keras.Input(shape=(10, 128, 128, 3))
# conv_2d_layer = tf_1.keras.layers.Conv2D(64, (3, 3))
# outputs_1 = tf_1.keras.layers.TimeDistributed(conv_2d_layer)(inputs_1)
# outputs_1.shape
# mdl_1 = Model(inputs_1,outputs_1)
# mdl_1.compile(optimizer='rmsprop', loss='categorical_crossentropy')
# mdl_1.summary()
# # TensorShape([None, 10, 126, 126, 64])

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 10, 128, 128, 3)] 0         
_________________________________________________________________
time_distributed_3 (TimeDist (None, 10, 126, 126, 64)  1792      
Total params: 1,792
Trainable params: 1,792
Non-trainable params: 0
_________________________________________________________________


In [35]:
# input is the inpupt tensor initializer
input = Input(shape=(max_len,))

# input to Embedding layer is OHE of vocabulary. Output
# num_words + PAD + UNK;
# 1407200 => (35178+2)*40
model = Embedding(input_dim=len(words) + 2, output_dim=embedding, input_length=max_len, mask_zero=True, embeddings_initializer=None)(input)

'''
stateful: Boolean (default False). If True, the last state for each sample at index i in a batch will be used as initial state for the sample of index i in the following batch.
'''

"""
50 hidden units -> 
Refer Wikipedia for equations ->
LSTM Left-to-Right ->
50*40 * 4(Wi,Wf, Wo, Wc) = 8,000
50*50(Ui,Uf,Uo,Uc) = 10,000
50 (bi,bo,bf,bc) -> Bias
Total L-to-R -> 18200
Total R-to-L -> 18200
Total = 2*18200 = 36400 
"""
model = Bidirectional( LSTM(50, use_bias=True, kernel_initializer='glorot_uniform', recurrent_initializer='orthogonal', bias_initializer='zeros', return_sequences=True) )(model)

""" 
Hidden unit dimension = 50(left) + 50(right) = 100.
TimeDistr preserves history of 50 hidden units. Thus, 50*100 = approx 5000 params
"""
model = TimeDistributed(Dense(50, activation='relu'))(model)
crf = CRF(num_tag+1)
out = crf(model)


In [36]:
model = Model(input, out)

model.compile(optimizer='rmsprop', loss=crf.loss_function, metrics=[crf.accuracy])
print(model.summary())




Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         [(None, 75)]              0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 75, 40)            1407200   
_________________________________________________________________
bidirectional_5 (Bidirection (None, 75, 100)           36400     
_________________________________________________________________
time_distributed_8 (TimeDist (None, 75, 50)            5050      
_________________________________________________________________
crf_3 (CRF)                  (None, 75, 18)            1278      
Total params: 1,449,928
Trainable params: 1,449,928
Non-trainable params: 0
_________________________________________________________________
None


Making Checkpoint each epoch to check and save the best model performance till last and also avoiding further validation loss drop due to overfitting.

In [38]:
checkptr = ModelCheckpoint("./model.h5", monitor='val_acc', verbose=1, save_best_only=False, save_weights_only=False, mode='auto', save_freq='epoch')

In [41]:
# history = model.fit(X_train, np.array(y_train), batch_size=batch_size, epochs=epochs, validation_split=0.1, callbacks=[checkptr])
history = model.fit(X_train, np.array(y_train), batch_size=batch_size, epochs=epochs, validation_split=0.1)


Epoch 1/8


AttributeError: in user code:

    /home/aman/anaconda3/envs/tf_gpu/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:571 train_function  *
        outputs = self.distribute_strategy.run(
    /home/aman/anaconda3/envs/tf_gpu/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:951 run  **
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /home/aman/anaconda3/envs/tf_gpu/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2290 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /home/aman/anaconda3/envs/tf_gpu/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2649 _call_for_each_replica
        return fn(*args, **kwargs)
    /home/aman/anaconda3/envs/tf_gpu/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:532 train_step  **
        loss = self.compiled_loss(
    /home/aman/anaconda3/envs/tf_gpu/lib/python3.8/site-packages/tensorflow/python/keras/engine/compile_utils.py:205 __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    /home/aman/anaconda3/envs/tf_gpu/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:143 __call__
        losses = self.call(y_true, y_pred)
    /home/aman/anaconda3/envs/tf_gpu/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:246 call
        return self.fn(y_true, y_pred, **self._fn_kwargs)
    /home/aman/anaconda3/envs/tf_gpu/lib/python3.8/site-packages/keras_contrib/losses/crf_losses.py:54 crf_loss
        crf, idx = y_pred._keras_history[:2]

    AttributeError: 'Tensor' object has no attribute '_keras_history'
