# LSTM for NER

Adapted from [this](https://github.com/cltl/ma-ml4nlp-labs/blob/main/code/assignment3/lstm-ner.ipynb) notebook

# Imports

In [1]:
import numpy as np
import pandas as pd

from keras import Sequential
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Bidirectional
from keras.utils import plot_model
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

import gensim.downloader as api



# Set path to data and embeddings:

In [9]:
# conll data
path_train ='/kaggle/input/ner-data/conll2003.train.conll'
path_eval = '/kaggle/input/ner-data/conll2003.test.conll'

paths = [path_train, path_eval]

# change to test if you are evaluating on test:
eval_split = 'test'

# model output path
output_path = 'lstm-out.csv'

# Data preparation

In [11]:
# connll data

def convert_data(paths):
    
    data = []
    sent_id = 1
    for path in paths:
        split = path.split('.')[-2]
        with open(path) as infile:
            lines = infile.read().split('\n')
        for n, line in enumerate(lines):
            ll = line.split('\t')
            if len(ll) > 2:
                d = dict()
                d['Sentence #'] = f'Sentence: {sent_id}'
                d['Word'] = ll[0]
                d['POS'] = ll[1]
                d['Tag'] = ll[-1]
                d['Split'] = split
                data.append(d)

            else:
                sent_id += 1
    data = pd.DataFrame(data)
    return data

data = convert_data(paths)

### Map tokens and labels to indices

In [41]:
# map tokens and labels to indices

def get_dict_map(data, token_or_tag, embedding_model=None):
    tok2idx = {}
    idx2tok = {}
    
    if token_or_tag == 'token':
        vocab = list(set(data['Word'].to_list()))
    else:
        vocab = list(set(data['Tag'].to_list()))
    
    idx2tok = {idx:tok for  idx, tok in enumerate(vocab)}
    tok2idx = {tok:idx for  idx, tok in enumerate(vocab)}   
    
    return tok2idx, idx2tok


token2idx, idx2token = get_dict_map(data, 'token')
tag2idx, idx2tag = get_dict_map(data, 'tag')
n_vocab = len(token2idx)
n_tags = len(tag2idx)
print(n_vocab)
print(n_tags)

27316
9


# Integrating embeddings

Change the path of the embedding model below to load your own GoogleNews vectors. 

In [None]:
# Load embedding model
# Change path to your path
w2v_model = api.load('word2vec-google-news-300')

# Create embedding matrix with zero vectors for oov words
emb_dim = 300
embedding_matrix = np.zeros((len(token2idx) + 1, emb_dim))
print(embedding_matrix.shape)
for word, i in token2idx.items():
    # You may have to change the following line to:
    # if word in w2v_model:
    if word in w2v_model.key_to_index:
        embedding_vector = w2v_model[word]
    else:
        embedding_vector = None
        # If you want to check OOV words:
        #print('couldnt find:', word, i)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
emb_dim = embedding_matrix.shape[1]

In [21]:
# Add index info to dataframe
data['Word_idx'] = data['Word'].map(token2idx)
data['Tag_idx'] = data['Tag'].map(tag2idx)
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag,Split,Word_idx,Tag_idx
0,Sentence: 1,EU,NNP,B-ORG,train,7450,1
1,Sentence: 1,rejects,VBZ,O,train,12056,0
2,Sentence: 1,German,JJ,B-MISC,train,1791,6
3,Sentence: 1,call,NN,O,train,7320,0
4,Sentence: 1,to,TO,O,train,21011,0


In [22]:
# Group data by sentences
# Fill na
data_fillna = data.fillna(method='ffill', axis=0)
# Groupby and collect columns
data_group = data_fillna.groupby(['Sentence #'], as_index=False)[['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx', 'Split']].agg(lambda x: list(x))
# Visualise data
data_group.head()

  data_fillna = data.fillna(method='ffill', axis=0)


Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx,Split
0,Sentence: 1,"[EU, rejects, German, call, to, boycott, Briti...","[NNP, VBZ, JJ, NN, TO, VB, JJ, NN, .]","[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]","[7450, 12056, 1791, 7320, 21011, 12400, 25419,...","[1, 0, 6, 0, 0, 0, 6, 0, 0]","[train, train, train, train, train, train, tra..."
1,Sentence: 10,"[But, Fischler, agreed, to, review, his, propo...","[CC, NNP, VBD, TO, VB, PRP$, NN, IN, DT, NNP, ...","[O, B-PER, O, O, O, O, O, O, O, B-ORG, O, O, O...","[16088, 10397, 1878, 21011, 16625, 4853, 4421,...","[0, 8, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[train, train, train, train, train, train, tra..."
2,Sentence: 100,"[The, Syrians, are, confused, ,, they, are, de...","[DT, NNPS, VBP, VBN, ,, PRP, VBP, RB, JJ, ,, C...","[O, B-MISC, O, O, O, O, O, O, O, O, O, O, O, O...","[15206, 6558, 18695, 2236, 26419, 12948, 18695...","[0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[train, train, train, train, train, train, tra..."
3,Sentence: 1000,"[The, youth, side, replied, with, 246, for, se...","[DT, NN, NN, VBD, IN, CD, IN, CD, .]","[O, O, O, O, O, O, O, O, O]","[15206, 8435, 8052, 13448, 19470, 11187, 5326,...","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[train, train, train, train, train, train, tra..."
4,Sentence: 10000,"[Men, 's, 3,000, metres, :]","[NN, POS, CD, NNS, :]","[O, O, O, O, O]","[9988, 23683, 2082, 19984, 16435]","[0, 0, 0, 0, 0]","[train, train, train, train, train]"


In [23]:
# Change eval_split from 'dev' to test to run on test data
def get_pad_train_test_val(data_group, data, eval_split='dev', n_vocab = n_vocab):

    #get max token and tag length
    n_token = len(list(set(data['Word'].to_list())))
    n_tag = len(list(set(data['Tag'].to_list())))
    print(n_token)

    #Pad tokens (X var)    
    tokens = data_group['Word_idx'].tolist()
    maxlen = max([len(s) for s in tokens])
    # value should be the number of items in the vocb?
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int64', padding='post', value= n_vocab)
    print('padding', len(pad_tokens[0]))
    # I used the code below to check the if the padded vectors are set to 0:
#     for token in pad_tokens:
#         print(token[-1])
# #         print(embedding_matrix[token[-1]])
#         break

    #Pad Tags (y var) and convert it into one hot encoding
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int64', padding='post', value= tag2idx["O"])
    n_tags = len(tag2idx)
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]
    
    train_tokens = []
    dev_tokens = []
    train_tags = []
    dev_tags = []
    for i, row in data_group.iterrows():
        if 'train' in row['Split']:
            train_tokens.append(pad_tokens[i])
            train_tags.append(pad_tags[i])
        elif eval_split in row['Split']:
            #dev_idx.append(i)
            dev_tokens.append(pad_tokens[i])
            dev_tags.append(pad_tags[i])

    print(
        'train_tokens length:', len(train_tokens),
        '\ntrain_tokens length:', len(train_tokens),
        #'\ntest_tokens length:', len(test_tokens),
        #'\ntest_tags:', len(test_tags),
        '\nval_tokens:', len(dev_tokens),
        '\nval_tags:', len(dev_tags))
 
    return np.array(train_tokens), np.array(dev_tokens),  np.array(train_tags), np.array(dev_tags)

train_tokens, dev_tokens,  train_tags, dev_tags = get_pad_train_test_val(data_group, data, eval_split= eval_split)

27316
padding 124
train_tokens length: 14041 
train_tokens length: 14041 
val_tokens: 3453 
val_tags: 3453


# Build model

In [25]:
input_dim = len(list(set(data['Word'].to_list()))) +1
output_dim = emb_dim # number of dimensions
input_length = max([len(s) for s in data_group['Word_idx'].tolist()])
n_tags = len(tag2idx)
print('input_dim: ', 
      input_dim, '\noutput_dim: ', 
      output_dim, '\ninput_length: ', 
      input_length, '\nn_tags: ', n_tags)
print('emb dim', emb_dim)

input_dim:  27317 
output_dim:  300 
input_length:  124 
n_tags:  9
emb dim 300


In [26]:
def get_bilstm_lstm_model(embedding_matrix, embedding_dim):
    
    model = Sequential()
    #token2idx
    # Add Embedding layer original, trainable
    #model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))
    print(len(token2idx))
    embedding_layer = Embedding(len(token2idx)+1 ,
                            embedding_dim,
                            weights=[embedding_matrix],
                            # make max sent length a variable
                            input_length=input_length,
                            trainable=False)
    model.add(embedding_layer)

    # Add bidirectional LSTM
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat'))

    # Add LSTM
    # Pia decided to remove this
#     model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))

    # Add timeDistributed Layer
    # Pia: replaced relu with sigmoid 
    model.add(TimeDistributed(Dense(n_tags, activation="sigmoid")))

 
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='RMSprop', metrics=['accuracy'])
    model.summary()
    
    return model

In [31]:
def train_model(X, y, model):
    loss = list()
    for i in range(20):
        # fit model for one epoch on this sequence
        hist = model.fit(X, y, batch_size=200, verbose=1, epochs=1, validation_split=0.2)
        loss.append(hist.history['loss'][0])
    return loss

# Train model 

In [32]:
results = pd.DataFrame()
embedding_dim = 300 # dimensions of the word2vec vectors
model_bilstm_lstm = get_bilstm_lstm_model(embedding_matrix, embedding_dim)
plot_model(model_bilstm_lstm)
# change to val_tokens to try out training on val set
results['with_add_lstm'] = train_model(train_tokens, train_tags, model_bilstm_lstm)

27316
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 124, 300)          8195100   
                                                                 
 bidirectional_2 (Bidirecti  (None, 124, 600)          1442400   
 onal)                                                           
                                                                 
 time_distributed_2 (TimeDi  (None, 124, 9)            5409      
 stributed)                                                      
                                                                 
Total params: 9642909 (36.78 MB)
Trainable params: 1447809 (5.52 MB)
Non-trainable params: 8195100 (31.26 MB)
_________________________________________________________________


# Evaluate

The code below evaluates your model on the development data using accuracy (which is not very indicative on this task. To get better insights, store the model output and run your own evaluation.

In [33]:
# Evaluate the model on the test data using `evaluate`
# Careful: Really high even if the model only predicts the majority class

results = model_bilstm_lstm.evaluate(dev_tokens, np.array(dev_tags), batch_size=1)

Evaluate on test data
test loss, test acc: [0.031662166118621826, 0.9916412234306335]


# Get model predictions

In [35]:
# Get predictions on development set
y_pred = model_bilstm_lstm.predict(dev_tokens)

# get dimension index with highest prob (--> label)
y_pred = np.argmax(y_pred, axis=-1)
y_dev =  np.argmax(dev_tags, axis=-1)



In [40]:
# Get predictions per token:
# map labels back to tokens

def output_to_file(dev_tokens, y_pred, output_path):
    
    with open(output_path, 'w') as outfile:
        for token,  preds in zip(dev_tokens, y_pred):
            for tok, pred in zip(token, preds):
                # igonre padding:
                if tok in idx2token:
                    tok_str = idx2token[tok]
                    outfile.write(f'{tok_str}\t{idx2tag[pred]}\n')
    
output_to_file(dev_tokens, y_pred, "lstmout.txt")

In [38]:
import pickle

with open("lstm.pkl", 'wb') as f:
        pickle.dump(model_bilstm_lstm, f)