In [1]:
# import necessary libraries
!pip install gensim
import warnings
warnings.filterwarnings("ignore")

import numpy as np

from matplotlib import pyplot as plt

from nltk.corpus import brown
from nltk.corpus import treebank
from nltk.corpus import conll2000

import seaborn as sns

from gensim.models import KeyedVectors

from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Dense, Input
from keras.layers import TimeDistributed
from keras.layers import LSTM, GRU, Bidirectional, SimpleRNN, RNN
from keras.models import Model
from keras.preprocessing.text import Tokenizer

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle



In [27]:
import pandas as pd
from collections import Counter
from sklearn.metrics import classification_report
import ast

In [3]:
def load_file(train_file):
    # Split train file into word and state lists
    word_ls = []
    st_ls = []
    f = open(train_file, encoding="utf8")
    for line in f:
        line_2 = str(line.strip('\n'))
        if line_2 != '':
            inter_ls = line_2.split()
            word_ls.append(inter_ls[0])
            st_ls.append(inter_ls[1])
        elif line_2 == '':
            word_ls.append(line_2)
            st_ls.append('')
    
    compiled_word_ls = []
    compiled_state_ls = []
    big_word_ls = []
    big_state_ls = []
    for i in range(len(word_ls)):
        if word_ls[i] != '':
            compiled_word_ls.append(word_ls[i])
            compiled_state_ls.append(st_ls[i])
        elif word_ls[i] == '':
            big_word_ls.append(compiled_word_ls)
            big_state_ls.append(compiled_state_ls)
            compiled_word_ls = []
            compiled_state_ls = []
    return big_word_ls, big_state_ls

In [6]:
# Retrieve Files and split them into respective lists
train_x_ls, train_y_ls = load_file('train')
test_x_ls, test_y_ls = load_file('dev.out')

In [8]:
## Count your unique words and tags
num_words = len(set([word.lower() for sentence in train_x_ls for word in sentence]))
num_tags   = len(set([word.lower() for sentence in train_y_ls for word in sentence]))
print("Total number of tagged sentences: {}".format(len(train_x_ls)))
print("Vocabulary size: {}".format(num_words))
print("Total number of tags: {}".format(num_tags))

Total number of tagged sentences: 7663
Vocabulary size: 16490
Total number of tags: 21


In [36]:
# encode X

word_tokenizer = Tokenizer()                      # instantiate tokeniser
word_tokenizer.fit_on_texts(train_x_ls)                    # fit tokeniser on data
X_train_encoded = word_tokenizer.texts_to_sequences(train_x_ls)  # use the tokeniser to encode input sequence
X_test_encoded = word_tokenizer.texts_to_sequences(test_x_ls)

In [37]:
# encode Y

tag_tokenizer = Tokenizer()
tag_tokenizer.fit_on_texts(train_y_ls)
Y_train_encoded = tag_tokenizer.texts_to_sequences(train_y_ls)
Y_test_encoded = tag_tokenizer.texts_to_sequences(test_y_ls)

In [38]:
tokenizer_dic = tag_tokenizer.get_config()

In [39]:
# Retrieve your state dictionary for mapping
state_dic = tokenizer_dic['index_word']
state_dic = ast.literal_eval(state_dic)
state_dic = {k:v.upper() for k,v in state_dic.items()}

In [40]:
state_dic

{'1': 'I-NP',
 '2': 'B-NP',
 '3': 'O',
 '4': 'B-PP',
 '5': 'B-VP',
 '6': 'I-VP',
 '7': 'B-ADVP',
 '8': 'B-SBAR',
 '9': 'B-ADJP',
 '10': 'I-ADJP',
 '11': 'B-PRT',
 '12': 'I-ADVP',
 '13': 'I-PP',
 '14': 'I-CONJP',
 '15': 'B-CONJP',
 '16': 'I-SBAR',
 '17': 'B-INTJ',
 '18': 'B-LST',
 '19': 'I-INTJ',
 '20': 'I-UCP',
 '21': 'B-UCP'}

In [41]:
# Pad each sequence to MAX_SEQ_LENGTH using KERAS' pad_sequences() function. 
# Sentences longer than MAX_SEQ_LENGTH are truncated.
# Sentences shorter than MAX_SEQ_LENGTH are padded with zeroes.

# Truncation and padding can either be 'pre' or 'post'. 
# For padding we are using 'pre' padding type, that is, add zeroes on the left side.
# For truncation, we are using 'post', that is, truncate a sentence from right side.

MAX_SEQ_LENGTH = 100  # sequences greater than 100 in length will be truncated

X_train_padded = pad_sequences(X_train_encoded, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")
X_test_padded = pad_sequences(X_test_encoded, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")

Y_train_padded = pad_sequences(Y_train_encoded, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")
Y_test_padded = pad_sequences(Y_test_encoded, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")

In [42]:
# Word2Vec
# load word2vec using the following function present in the gensim library
word2vec = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [43]:
# assign word vectors from word2vec model

EMBEDDING_SIZE  = 300  # each word in word2vec model is represented using a 300 dimensional vector
VOCABULARY_SIZE = len(word_tokenizer.word_index) + 1

# create an empty embedding matix
embedding_weights = np.zeros((VOCABULARY_SIZE, EMBEDDING_SIZE))

# create a word to index dictionary mapping
word2id = word_tokenizer.word_index

# copy vectors from word2vec model to the words present in corpus
for word, index in word2id.items():
    try:
        embedding_weights[index, :] = word2vec[word]
    except KeyError:
        pass

In [44]:
# assign padded sequences to X and Y
X, Y = X_train_padded, Y_train_padded

In [62]:
# use Keras' to_categorical function to one-hot encode Y
Y_oh = to_categorical(Y)
Y_oh.shape

(7663, 100, 22)

In [63]:
# use Keras' to_categorical function to one-hot encode Y
Y_test_oh = to_categorical(Y_test_padded, 22)
Y_test_oh.shape

(1094, 100, 22)

In [64]:
# split entire data into training and testing sets
TEST_SIZE = 0.2
X_train, X_val, Y_train, Y_val = train_test_split(X, Y_oh, test_size=TEST_SIZE, random_state=4)

In [66]:
# total number of tags
NUM_CLASSES = Y_oh.shape[2]

# create architecture


bidirect_model = Sequential()
bidirect_model.add(Embedding(input_dim     = VOCABULARY_SIZE,
                             output_dim    = EMBEDDING_SIZE,
                             input_length  = MAX_SEQ_LENGTH,
                             weights       = [embedding_weights],
                             trainable     = True
))
bidirect_model.add(Bidirectional(LSTM(64, return_sequences=True)))
bidirect_model.add(TimeDistributed(Dense(NUM_CLASSES, activation='softmax')))
bidirect_model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])
# check summary of model
bidirect_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 300)          4947300   
_________________________________________________________________
bidirectional (Bidirectional (None, 100, 128)          186880    
_________________________________________________________________
time_distributed (TimeDistri (None, 100, 22)           2838      
Total params: 5,137,018
Trainable params: 5,137,018
Non-trainable params: 0
_________________________________________________________________


In [67]:
bidirect_training = bidirect_model.fit(X_train, Y_train, batch_size=128, epochs=10, validation_data=(X_val, Y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [102]:
y_pred = bidirect_model.predict(X_test_padded)
y_pred.argmax(axis=1)

array([[ 9, 86, 66, ..., 71, 71, 71],
       [ 9, 94, 93, ..., 91, 91, 98],
       [ 9, 96, 88, ..., 91, 98, 98],
       ...,
       [ 9, 91, 86, ..., 83, 76, 83],
       [ 9, 96, 86, ..., 84, 84, 84],
       [ 9, 95, 94, ..., 97,  0, 93]])

In [94]:
Y_test_padded.argmax(axis=1)

array([69, 91, 88, ..., 83, 81, 96])

In [99]:
Y_test_oh.argmax(axis=1)

array([[ 0, 58, 55, ...,  0,  0,  0],
       [ 0, 94, 89, ...,  0,  0,  0],
       [ 0, 92, 86, ...,  0,  0,  0],
       ...,
       [ 0, 85, 75, ...,  0,  0,  0],
       [ 0, 76, 75, ...,  0,  0,  0],
       [ 0, 94, 93, ...,  0,  0,  0]])

In [97]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(Y_test_padded.argmax(axis=1), y_pred.argmax(axis=1))
matrix

array([[ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ..., 15,  0,  1],
       [ 0,  0,  0, ...,  6, 38,  1],
       [ 0,  0,  0, ...,  2,  1,  8]])

In [81]:
print(classification_report(Y_test_padded.argmax(axis=1), y_pred.argmax(axis=1)))

              precision    recall  f1-score   support

          50       0.00      0.00      0.00         0
          53       0.00      0.00      0.00         1
          54       0.00      0.00      0.00         1
          55       0.00      0.00      0.00         1
          56       0.00      0.00      0.00         2
          57       0.00      0.00      0.00         1
          58       0.00      0.00      0.00         3
          59       0.00      0.00      0.00         2
          60       0.00      0.00      0.00         5
          61       0.50      1.00      0.67         1
          62       0.00      0.00      0.00         3
          63       0.00      0.00      0.00         3
          64       0.00      0.00      0.00         4
          65       0.43      0.50      0.46         6
          66       0.12      0.20      0.15         5
          67       0.00      0.00      0.00         9
          68       0.18      0.25      0.21         8
          69       0.09    

In [85]:
loss, accuracy = bidirect_model.evaluate(X_test_padded, Y_test_oh, verbose = 1)
print("Loss: {0},\nAccuracy: {1}".format(loss, accuracy))

Loss: 0.5808255076408386,
Accuracy: 0.9035557508468628
