In [272]:
# managing imports 

import keras 
import numpy as np 
import os 
import math 
from conll_modules import CoNLLDictorizer, Token
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
import operator 

## Load GLoVe embedding vectors 

In [195]:
# loading pre trained word embeddings GloVe

glove_dir = './glove.6B/'

embeddings_index = {}

f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [196]:
embedding_words = sorted(list(embeddings_index.keys()))

## Load the CoNLL corpus 

In [197]:
column_names = ['form', 'ppos', 'pchunk', 'ner']

def load_conll2003_en():
    BASE_DIR = './data/'
    train_file = BASE_DIR + 'train.txt'
    dev_file = BASE_DIR + 'valid.txt'
    test_file = BASE_DIR + 'test.txt'
    train_sentences = open(train_file).read().strip()
    dev_sentences = open(dev_file).read().strip()
    test_sentences = open(test_file).read().strip()
    return train_sentences, dev_sentences, test_sentences, column_names

train_sentences, dev_sentences, test_sentences, column_names = load_conll2003_en()
train_sentences[:100]

'-DOCSTART- -X- -X- O\n\nEU NNP B-NP B-ORG\nrejects VBZ B-VP O\nGerman JJ B-NP B-MISC\ncall NN I-NP O\nto T'

In [198]:
conll_dict = CoNLLDictorizer(column_names, col_sep=' +')
train_dict = conll_dict.transform(train_sentences)
dev_dict = conll_dict.transform(dev_sentences)
test_dict = conll_dict.transform(test_sentences)

In [199]:
train_dict[0]

[{'form': '-DOCSTART-', 'ppos': '-X-', 'pchunk': '-X-', 'ner': 'O'}]

## Build sequences from the dictionaries 

In [200]:
def build_sequences(corpus_dict, key_x='form', key_y='ner', tolower=True):
    """
    Creates sequences from a list of dictionaries
    :param corpus_dict:
    :param key_x:
    :param key_y:
    :return:
    """
    X = []
    Y = []
    for sentence in corpus_dict:
        x = []
        y = []
        for word in sentence:
            x += [word[key_x]]
            y += [word[key_y]]
        if tolower:
            x = list(map(str.lower, x))
        X += [x]
        Y += [y]
    return X, Y

In [201]:
X_words, Y_ner = build_sequences(train_dict)
X_dev, Y_dev = build_sequences(dev_dict)
X_test, Y_test = build_sequences(test_dict)

In [202]:
print(X_words[1])
print(Y_ner[1])

['eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.']
['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']


## Counter number of unique words in vocabulary (embedding and corpus) 

In [203]:
embedding_words = list(embeddings_index.keys())
word_set = list(set([item for sublist in X_words for item in sublist]))
ner_set = list(set([item for sublist in Y_ner for item in sublist]))
total_words = set(embedding_words + word_set)

## Build index where index 0 is kept for the unknown words 

`idx_word = dict(enumerate(vocabulary_words, start = 2))` where 0 is for padding and 1 is for unknown word. 

In [204]:
rev_word_idx = dict(enumerate(total_words, start=2))
rev_ner_idx = dict(enumerate(ner_set, start=2))
word_idx = {v: k for k, v in rev_word_idx.items()}
ner_idx = {v: k for k, v in rev_ner_idx.items()}

## Cosine similarity for words _table_, _france_ and _sweden_

In [271]:
def get_n_closest_words(word, n): 
    word_embedding = embeddings_index[word]
    cosine_dict = {w: 1 - cosine(word_embedding, embeddings_index[w]) for w in embeddings_index}
    return sorted(cosine_dict.items(), key=operator.itemgetter(1))[-(n + 1):-1]

get_n_closest_words('table', 5)

[('side', 0.6433666348457336),
 ('room', 0.654369056224823),
 ('bottom', 0.6559719443321228),
 ('place', 0.658237874507904),
 ('tables', 0.8021162748336792)]

In [268]:
get_n_closest_words('sweden', 5)

[('austria', 0.7466837763786316),
 ('netherlands', 0.7468465566635132),
 ('finland', 0.7906494140625),
 ('norway', 0.8073249459266663),
 ('denmark', 0.8624401688575745)]

In [269]:
get_n_closest_words('france', 5)

[('paris', 0.7481586933135986),
 ('spain', 0.7557463049888611),
 ('britain', 0.7950528860092163),
 ('french', 0.8004377484321594),
 ('belgium', 0.8076422810554504)]

## Building embedding matrix

In [205]:
embedding_matrix = np.random.random((len(total_words) + 2, 100))
embedding_matrix.shape

(402597, 100)

In [206]:
for word in total_words: 
    if word in embeddings_index.keys(): 
        embedding_matrix[word_idx[word], :] = embeddings_index[word]

Creating the $\mathbf{X}$ and $\mathbf{Y}$ Sequences

In [207]:
X_words_idx = [list(map(lambda words: word_idx[words], words)) for words in X_words]
Y_ner_idx = [list(map(lambda ner: ner_idx[ner], ner)) for ner in Y_ner]

X_dev = [list(map(lambda words: word_idx.get(words, 1), words)) for words in X_dev]
Y_dev = [list(map(lambda ner: ner_idx.get(ner, 1), ner)) for ner in Y_dev]

X_test_unpadded = [list(map(lambda words: word_idx.get(words, 1), words)) for words in X_test]
Y_test = [list(map(lambda ner: ner_idx.get(ner, 1), ner)) for ner in Y_test]

X_words_idx[1]

[117788, 238614, 384043, 103649, 298006, 322581, 141446, 151281, 150277]

In [208]:
len(X_words_idx)

14987

## Building neural network architecture 

In [209]:
maxlen = 150

In [210]:
from keras.preprocessing.sequence import pad_sequences

X_train = pad_sequences(X_words_idx, maxlen)
Y_train = pad_sequences(Y_ner_idx, maxlen)

X_dev = pad_sequences(X_dev, maxlen)
Y_dev = pad_sequences(Y_dev, maxlen)

X_test = pad_sequences(X_test_unpadded, maxlen)
Y_test = pad_sequences(Y_test, maxlen)

In [211]:
print(X_train[1])
print(Y_train[1])
print(type(X_train))

[     0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      

In [212]:
from keras.utils import to_categorical 
Y_train = to_categorical(Y_train, num_classes = len(ner_set) + 2)
Y_dev = to_categorical(Y_dev, num_classes = len(ner_set) + 2)
Y_test = to_categorical(Y_test, num_classes = len(ner_set) + 2)

In [213]:
print(X_train.shape)
print(Y_train.shape)

(14987, 150)
(14987, 150, 11)


In [214]:
from keras import models, layers 
from keras.layers import SimpleRNN, Dense

EMBEDDING_DIM = 100
EPOCHS = 10
BATCH_SIZE = 128
NB_CLASSES = len(ner_set)

model = models.Sequential()
model.add(layers.Embedding(len(total_words) + 2,
EMBEDDING_DIM, mask_zero=True, input_length=maxlen))

model.layers[0].set_weights([embedding_matrix])

# The default is True
model.layers[0].trainable = False
model.add(SimpleRNN(100, return_sequences=True))
model.add(Dense(NB_CLASSES + 2, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
model.summary()


Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 150, 100)          40259700  
_________________________________________________________________
simple_rnn_4 (SimpleRNN)     (None, 150, 100)          20100     
_________________________________________________________________
dense_4 (Dense)              (None, 150, 11)           1111      
Total params: 40,280,911
Trainable params: 21,211
Non-trainable params: 40,259,700
_________________________________________________________________


In [215]:
model.fit(X_train, Y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_data = (X_dev, Y_dev))

Train on 14987 samples, validate on 3466 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x14a74bba8>

In [216]:
predicted_ner = model.predict(X_test, verbose = 1)



In [217]:
predicted_ner.shape

(3684, 150, 11)

In [218]:
print(X_test.shape)
print(Y_test.shape)

(3684, 150)
(3684, 150, 11)


In [219]:
model.evaluate(X_test, Y_test)



[0.016488607552197804, 0.9429134726524353]

In [220]:
def remove_padding(Y, X): 
    unpadded_pred = []
    for sent_nbr, sent_ner_predictions in enumerate(Y):
        pred = list(map(np.argmax, sent_ner_predictions))
        unpadded_pred += [pred[-len(X[sent_nbr]):]]
    return unpadded_pred

unpad_pred = remove_padding(predicted_ner, X_test_unpadded)
ground_truth = remove_padding(Y_test, X_test_unpadded)

In [221]:
def num_to_cat(Y): 
    pred = []
    for sublist in Y: 
        pred += [list(map(lambda x : rev_ner_idx[x], sublist))]
    return pred 

pred_cat = num_to_cat(unpad_pred)
ground_truth_cat = num_to_cat(ground_truth)

In [222]:
test_words, _ = build_sequences(test_dict)

In [227]:
f = open('eval-RNN.txt', 'w+')
faults = []
for s_idx, sentence in enumerate(test_words): 
    pred_sentence = ''
    predictions = pred_cat[s_idx]
    truth = ground_truth_cat[s_idx]
    for w_idx, word in enumerate(sentence):
        try: 
            pred_sentence += word + ' ' + predictions[w_idx] + ' ' + truth[w_idx] + '\n'
        except:
            pred_sentence += word + ' ' + 'O' + ' ' + 'O' + '\n' 
            faults.append([word, s_idx, w_idx])
    f.write(pred_sentence)

In [230]:
!python conlleval/conlleval.py < eval.txt

processed 46666 tokens with 5720 phrases; found: 5648 phrases; correct: 4064.
accuracy:  80.27%; (non-O)
accuracy:  94.29%; precision:  71.95%; recall:  71.05%; FB1:  71.50
              LOC: precision:  88.37%; recall:  65.72%; FB1:  75.38  1668
             MISC: precision:  53.70%; recall:  65.11%; FB1:  58.86  702
              ORG: precision:  52.44%; recall:  65.84%; FB1:  58.38  1661
              PER: precision:  82.99%; recall:  85.21%; FB1:  84.09  1617


## LSTM 

In [236]:
from keras import models, layers 
from keras.layers import LSTM, Dense, Bidirectional, Dropout 

EMBEDDING_DIM = 100
EPOCHS = 10
BATCH_SIZE = 128
NB_CLASSES = len(ner_set)

model = models.Sequential()
model.add(layers.Embedding(len(total_words) + 2,
EMBEDDING_DIM, mask_zero=True, input_length=maxlen))

model.layers[0].set_weights([embedding_matrix])

# The default is True
model.layers[0].trainable = False
model.add(Bidirectional(LSTM(100, return_sequences=True)))
model.add(Dense(NB_CLASSES + 2, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 150, 100)          40259700  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 150, 200)          160800    
_________________________________________________________________
dense_6 (Dense)              (None, 150, 11)           2211      
Total params: 40,422,711
Trainable params: 163,011
Non-trainable params: 40,259,700
_________________________________________________________________


In [237]:
model.fit(X_train, Y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_data = (X_dev, Y_dev))

Train on 14987 samples, validate on 3466 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x151466ef0>

In [238]:
predicted_ner = model.predict(X_test, verbose = 1)

unpad_pred = remove_padding(predicted_ner, X_test_unpadded)
ground_truth = remove_padding(Y_test, X_test_unpadded)

pred_cat = num_to_cat(unpad_pred)
ground_truth_cat = num_to_cat(ground_truth)

test_words, _ = build_sequences(test_dict)

f = open('eval-LSTM.txt', 'w+')
faults = []
for s_idx, sentence in enumerate(test_words): 
    pred_sentence = ''
    predictions = pred_cat[s_idx]
    truth = ground_truth_cat[s_idx]
    for w_idx, word in enumerate(sentence):
        try: 
            pred_sentence += word + ' ' + predictions[w_idx] + ' ' + truth[w_idx] + '\n'
        except:
            pred_sentence += word + ' ' + 'O' + ' ' + 'O' + '\n' 
            faults.append([word, s_idx, w_idx])
    f.write(pred_sentence)



In [239]:
!python conlleval/conlleval.py < eval-LSTM.txt

processed 46472 tokens with 5748 phrases; found: 5625 phrases; correct: 4683.
accuracy:  87.14%; (non-O)
accuracy:  96.48%; precision:  83.25%; recall:  81.47%; FB1:  82.35
              LOC: precision:  87.97%; recall:  85.21%; FB1:  86.57  1663
             MISC: precision:  69.48%; recall:  69.19%; FB1:  69.34  698
              ORG: precision:  76.90%; recall:  74.08%; FB1:  75.47  1654
              PER: precision:  90.87%; recall:  90.70%; FB1:  90.78  1610
