In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score, f1_score
from keras.models import Sequential
from keras.layers import Bidirectional, LSTM, Activation, Dense, Dropout, TimeDistributed, Embedding
from keras.preprocessing.sequence import pad_sequences
from keras import callbacks
from keras.utils.vis_utils import plot_model

Using TensorFlow backend.


In [3]:
model = Sequential()

In [4]:
trainfile = "./conll2003/train.txt"
validationfile = "./conll2003/valid.txt"
testfile = "./conll2003/test.txt"

In [5]:
train = pd.read_csv(trainfile,delimiter = " ")
test = pd.read_csv(testfile,delimiter= " ")
valid = pd.read_csv(validationfile,delimiter= " ")

In [6]:
test.head()

Unnamed: 0,-DOCSTART-,-X-,-X-.1,O
0,SOCCER,NN,B-NP,O
1,-,:,O,O
2,JAPAN,NNP,B-NP,B-LOC
3,GET,VB,B-VP,O
4,LUCKY,NNP,B-NP,O


In [7]:
test.shape,train.shape,valid.shape

((46665, 4), (204566, 4), (51577, 4))

In [8]:
def read_file(file):
    with open(file, 'r') as f:
        content = f.read()
    X, Y, Z = [], [], []
    sentences = content.split("\n\n")[:-1]
    for sentence in sentences:
        tokens = sentence.split("\n")
        # initialize new lists
        x, y, z = [], [],[]
        for token in tokens:
            # initialize new tuple
            tuple = token.split(" ")
            # append tuple to list
            x.append(tuple[0])
            y.append(tuple[2])
            z.append(tuple[3])

        X.append(x)
        Y.append(y)
        Z.append(z)
    return(X,Y,Z)

In [9]:
train_1,train_2,train_3 = read_file(trainfile)
test_1,test_2, test_3 = read_file(testfile)
valid_1,valid_2,valid_3 = read_file(validationfile)

In [10]:
train_2

[['-X-'],
 ['B-NP', 'B-VP', 'B-NP', 'I-NP', 'B-VP', 'I-VP', 'B-NP', 'I-NP', 'O'],
 ['B-NP', 'I-NP'],
 ['B-NP', 'I-NP'],
 ['B-NP',
  'I-NP',
  'I-NP',
  'B-VP',
  'B-PP',
  'B-NP',
  'B-NP',
  'B-VP',
  'B-PP',
  'B-NP',
  'I-NP',
  'B-PP',
  'B-NP',
  'B-VP',
  'I-VP',
  'B-NP',
  'I-NP',
  'B-SBAR',
  'B-NP',
  'B-VP',
  'B-SBAR',
  'B-NP',
  'I-NP',
  'I-NP',
  'B-VP',
  'I-VP',
  'I-VP',
  'B-PP',
  'B-NP',
  'O'],
 ['B-NP',
  'B-NP',
  'I-NP',
  'B-PP',
  'B-NP',
  'I-NP',
  'I-NP',
  'B-NP',
  'I-NP',
  'I-NP',
  'I-NP',
  'I-NP',
  'B-VP',
  'B-PP',
  'B-NP',
  'I-NP',
  'B-VP',
  'I-VP',
  'B-NP',
  'B-PP',
  'B-NP',
  'B-ADJP',
  'B-PP',
  'B-NP',
  'B-SBAR',
  'B-NP',
  'I-NP',
  'I-NP',
  'B-VP',
  'B-ADJP',
  'O'],
 ['O',
  'B-NP',
  'B-VP',
  'I-VP',
  'I-VP',
  'B-NP',
  'I-NP',
  'I-NP',
  'B-SBAR',
  'B-NP',
  'B-VP',
  'I-VP',
  'I-VP',
  'B-NP',
  'I-NP',
  'B-PP',
  'B-NP',
  'O',
  'O',
  'B-NP',
  'I-NP',
  'B-NP',
  'I-NP',
  'I-NP',
  'I-NP',
  'I-NP',
  'I-NP',
 

In [11]:
from collections import Counter

In [12]:
d = Counter(x for sublist in train_3 for x in sublist)
print(d)

Counter({'O': 170524, 'B-LOC': 7140, 'B-PER': 6600, 'B-ORG': 6321, 'I-PER': 4528, 'I-ORG': 3704, 'B-MISC': 3438, 'I-LOC': 1157, 'I-MISC': 1155})


In [13]:
d = Counter(x for sublist in train_2 for x in sublist)
print(d)

Counter({'I-NP': 66645, 'B-NP': 57387, 'O': 27646, 'B-PP': 18843, 'B-VP': 18047, 'I-VP': 8818, 'B-ADVP': 2637, 'B-SBAR': 1269, 'B-ADJP': 1165, '-X-': 946, 'B-PRT': 527, 'I-ADJP': 211, 'I-ADVP': 138, 'I-PP': 103, 'B-INTJ': 59, 'I-CONJP': 39, 'B-LST': 32, 'B-CONJP': 31, 'I-SBAR': 19, 'I-LST': 4, 'I-INTJ': 1})


In [14]:
# replacing B with I label
def datapreprocessing(list_of_BIO):
    for x in list_of_BIO:
        for n, i in enumerate(x):
            if i == 'B-LOC':
                x[n] = 'I-LOC'
            if i == 'B-MISC':
                x[n] = 'I-MISC'
            if i == 'B-ORG':
                x[n] = 'I-ORG'
            if i == 'B-PER':
                x[n] = 'I-PER'

In [15]:
datapreprocessing(train_3)
datapreprocessing(test_3)
datapreprocessing(valid_3)

In [16]:
print(Counter(x for sublist in train_3 for x in sublist))
print(Counter(x for sublist in test_3 for x in sublist))
print(Counter(x for sublist in valid_3 for x in sublist))

Counter({'O': 170524, 'I-PER': 11128, 'I-ORG': 10025, 'I-LOC': 8297, 'I-MISC': 4593})
Counter({'O': 38554, 'I-PER': 2773, 'I-ORG': 2496, 'I-LOC': 1925, 'I-MISC': 918})
Counter({'O': 42975, 'I-PER': 3149, 'I-LOC': 2094, 'I-ORG': 2092, 'I-MISC': 1268})


In [17]:
# function to one hot encode the labels
def encode(arr, num_labels):
    one_hot = []
    for z in arr:
        # create a new numpy array with same shape as labels
        temp = np.zeros(num_labels, dtype=np.int32)
        temp[z] = 1
        one_hot.append(temp)
    return one_hot

In [18]:
import itertools

In [19]:
# check for unique labels in the labels array
# and sorted for label 
train_entities = sorted(set(itertools.chain(*train_3)))
test_entities = sorted(set(itertools.chain(*test_3)))
valid_entities = sorted(set(itertools.chain(*valid_3)))

In [20]:
train_words = sorted(set(itertools.chain(*train_1)))
test_words = sorted(set(itertools.chain(*test_1)))
valid_words = sorted(set(itertools.chain(*valid_1)))

In [21]:
len(train_words)

23624

In [22]:
#create dictionaries that map the words in the vocabulary to integers. 
#Then we can convert each of our reviews into integers so they can be passed into the network.

# reserve index 0 for padding/masking
train_idx2word = dict((i+1,v) for i,v in enumerate(train_words))
train_word2idx = dict((v, i+1) for i,v in enumerate(train_words))
train_idx2entity = dict((i+1,v) for i,v in sorted(enumerate(train_entities)))
train_entity2idx = dict((v, i+1) for i,v in sorted(enumerate(train_entities)))


test_idx2word = dict((i+1,v) for i,v in enumerate(test_words))
test_word2idx = dict((v, i+1) for i,v in enumerate(test_words))
test_idx2entity = dict((i+1,v) for i,v in sorted(enumerate(test_entities)))
test_entity2idx = dict((v, i+1) for i,v in sorted(enumerate(test_entities)))

valid_idx2word = dict((i+1,v) for i,v in enumerate(valid_words))
valid_word2idx = dict((v, i+1) for i,v in enumerate(valid_words))
valid_idx2entity = dict((i+1,v) for i,v in sorted(enumerate(valid_entities)))
valid_entity2idx = dict((v, i+1) for i,v in sorted(enumerate(valid_entities)))

In [23]:
# add 1 on top of label counts
num_entities = len(train_entity2idx) + 1
num_words = len(train_word2idx) + 1
print('Training & Testing Word Counts = {0}, Entity Count = {1}'.format(num_words, num_entities))

Training & Testing Word Counts = 23625, Entity Count = 6


In [24]:
# Training Dataset
# index encoder
train_x_enc = list(map(lambda x: [train_word2idx[wx] for wx in x], train_1))
train_y_enc = list(map(lambda y: [train_entity2idx[wy] for wy in y], train_3))

test_x_enc = list(map(lambda x: [test_word2idx[wx] for wx in x], test_1))
test_y_enc = list(map(lambda y: [test_entity2idx[wy] for wy in y], test_3))

valid_x_enc = list(map(lambda x: [valid_word2idx[wx] for wx in x], valid_1))
valid_y_enc = list(map(lambda y: [valid_entity2idx[wy] for wy in y], valid_3))

In [25]:
# one-hot encoder
train_y_oh_enc = list(map(lambda y: encode(y, num_labels=num_entities), train_y_enc))

test_y_oh_enc = list(map(lambda y: encode(y, num_labels=num_entities), test_y_enc))

valid_y_oh_enc = list(map(lambda y: encode(y, num_labels=num_entities), valid_y_enc))

In [26]:
#As maximum review length too many steps for RNN. Let's truncate to 64 steps. 
#For reviews shorter than 64 steps, we'll pad with 0s.
# training dataset
train_X_all = pad_sequences(train_x_enc, 64) 
train_Y_all = pad_sequences(train_y_oh_enc, 64)
#testing dataset
test_X_all = pad_sequences(test_x_enc, 64) 
test_Y_all = pad_sequences(test_y_oh_enc, 64)
#validation dataset
valid_X_all = pad_sequences(valid_x_enc, 64) 
valid_Y_all = pad_sequences(valid_y_oh_enc, 64)

# training the model

In [27]:
# Creating Callbacks which is used in the Keras fit function
# ModelCheckpoints is used to save the model after every epoch
# EarlyStopping is used to stop training when the validation loss has not improved after 2 epochs

cbks = [callbacks.ModelCheckpoint(filepath='./checkpoint_model_bilstm_with_dropout.h5', monitor='val_loss', save_best_only=True),
            callbacks.EarlyStopping(monitor='val_loss', patience=2)]


In [28]:
# define training hyperparameters

# embedding layer size
embedding_size = 32
# num of units in LSTM cell
num_cells = 100
batch_size = 32
num_epochs = 5

# if using bi_directional LSTM layer, set to true
use_bidirectional = True


# construct the NN model
model = Sequential()

In [31]:
# embed into vector space of dimension embedding_size
# input value 0 is a special "padding" value that should be masked out
# initialize with random vectors
model.add(Embedding(len(train_word2idx)+1, embedding_size, input_length=64, mask_zero=True))


# add LSTM layer; return all sequences for the output
if use_bidirectional:
    model.add(Bidirectional(LSTM(num_cells, return_sequences=True)))
else:
    model.add(LSTM(num_cells, return_sequences=True))


model.add(Dropout(0.2))

# applies fully-connected operation at every timestep
model.add(TimeDistributed(Dense(len(train_entity2idx)+1)))
# add softmax classifer at output
model.add(Activation('softmax'))

# use categorical cross entropy loss function and adam optimizer
model.compile(optimizer='adam', loss='categorical_crossentropy')
print (model.summary())


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 64, 32)            756000    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64, 200)           106400    
_________________________________________________________________
dropout_1 (Dropout)          (None, 64, 200)           0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 64, 6)             1206      
_________________________________________________________________
activation_1 (Activation)    (None, 64, 6)             0         
Total params: 863,606
Trainable params: 863,606
Non-trainable params: 0
_______________

In [33]:
# train the model
model.fit(train_X_all, train_Y_all, batch_size=batch_size, epochs=num_epochs, validation_data=(valid_X_all,valid_Y_all),callbacks=cbks)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 14987 samples, validate on 3466 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5


<keras.callbacks.History at 0x12449268898>

In [36]:
# load the saved model
# returns a compiled model
from keras.models import load_model
model = load_model('checkpoint_model_bilstm_with_dropout.h5')

In [41]:
# run prediction on training data
Y_test_pred = model.predict_classes(valid_X_all)

In [42]:
# function to convert predicted values and actual values to appropiate format 
# to generate confusion matrix first remove the zero masked inputs and outputs
def clean(yh, pr):
    coords = [np.where(yhh > 0)[0][0] for yhh in yh]
    yh = [yhh[co:] for yhh, co in zip(yh, coords)]
    ypr = [prr[co:] for prr, co in zip(pr, coords)]
    # flatten to single array with class labels
    fyh = [c for row in yh for c in row]
    fpr = [c for row in ypr for c in row]
    return fyh, fpr

In [43]:
y_g_u, y_p_u = clean(valid_Y_all.argmax(2),Y_test_pred)

In [44]:
# prints confusion matrix for each labels
print ('\nconfusion matrix:\n')
print (confusion_matrix(y_g_u, y_p_u))


confusion matrix:

[[    0     0     0     0     0     0]
 [    5    82     1    30   463  1505]
 [    3    25     3     7   405   823]
 [    6    48     1    57   509  1471]
 [    6    69     3    76   752  2180]
 [  362  1333   155   321 24524 16166]]


In [46]:
# prints metrics for each labels
precision, recall, fscore, support  = precision_recall_fscore_support(y_g_u, y_p_u)
print('\nclass | precision,recall,fscore,support\n')
for tag, i in valid_entity2idx.items():
    print('{0} | {1:1.2f}\t{2:1.2f}\t{3:1.5f}\t{4}'.format(tag, precision[i-1], recall[i-1], fscore[i-1], support[i-1]))


class | precision,recall,fscore,support

I-LOC | 0.00	0.00	0.00000	0
I-MISC | 0.05	0.04	0.04502	2086
I-ORG | 0.02	0.00	0.00420	1266
I-PER | 0.12	0.03	0.04413	2092
O | 0.03	0.24	0.05057	3086


  'recall', 'true', average, warn_for)


In [47]:
# prints overall f1 score on wikiner dataset
# use averaging method of 'macro' to ensure equal weightage to each label
# irregadless of how big its support are

print(f1_score(y_g_u, y_p_u, average = 'macro'))

0.10688235078201085


  'recall', 'true', average, warn_for)
