# Task 1: NER via Bi-directional LSTM

In [1]:
from __future__ import print_function
from collections import OrderedDict

import torch
import torch.nn as nn
from torch.nn import init
from torch.autograd import Variable
from torch import autograd

import time
import _pickle as cPickle

import urllib
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 80
plt.style.use('seaborn-pastel')

import os
import sys
import codecs
import re
import numpy as np

In [2]:
parameters = OrderedDict()
parameters['train'] = "./data/train" 
parameters['dev'] = "./data/dev" 
parameters['test'] = "./data/test" 
parameters['lower'] = True 
parameters['zeros'] =  False 
parameters['word_dim'] = 100 
parameters['word_lstm_dim'] = 256 
parameters['word_bidirect'] = True
parameters['embedding_path'] = "./glove.6B.100d.txt" 
parameters['dropout'] = 0.33 
parameters['epoch'] =  20
parameters['gradient_clip']=5.0
models_path = "./models/" 

#GPU
parameters['use_gpu'] = torch.cuda.is_available() #GPU Check
use_gpu = parameters['use_gpu']

#parameters['reload'] = "./models/pre-trained-model" 
parameters['reload'] = False
#Constants
START_TAG = '<START>'
STOP_TAG = '<STOP>'

## Load data and preprocess

In [3]:
def zero_digits(s):
    return re.sub('\d', '0', s)

def load_sentences(path, zeros):
    sentences = []
    sentence = []
    for line in codecs.open(path, 'r', 'utf8'):
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        if not line:
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            word = line.split()
            assert len(word) >= 2
            sentence.append(word)
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences

In [4]:
train_sentences = load_sentences(parameters['train'], parameters['zeros'])
test_sentences = load_sentences(parameters['test'], parameters['zeros'])
dev_sentences = load_sentences(parameters['dev'], parameters['zeros'])

##### Create Mappings for Words and Tags

In [5]:
def create_dico(item_list):
    """
    Create a dictionary of items from a list of list of items.
    """
    assert type(item_list) is list
    dico = {}
    for items in item_list:
        for item in items:
            if item not in dico:
                dico[item] = 1
            else:
                dico[item] += 1
    return dico

def create_mapping(dico):
    """
    Create a mapping (item to ID / ID to item) from a dictionary.
    Items are ordered by decreasing frequency.
    """
    sorted_items = sorted(dico.items(), key=lambda x: (-x[1], x[0]))
    id_to_item = {i: v[0] for i, v in enumerate(sorted_items)}
    item_to_id = {v: k for k, v in id_to_item.items()}
    return item_to_id, id_to_item

def word_mapping(sentences, lower):
    """
    Create a dictionary and a mapping of words, sorted by frequency.
    """
    words = [[x[1].lower() if lower else x[1] for x in s] for s in sentences]
    dico = create_dico(words)
    dico['<UNK>'] = 10000000 #UNK tag for unknown words
    word_to_id, id_to_word = create_mapping(dico)
    print("Found %i unique words (%i in total)" % (
        len(dico), sum(len(x) for x in words)
    ))
    return dico, word_to_id, id_to_word

def tag_mapping(sentences):
    """
    Create a dictionary and a mapping of tags, sorted by frequency.
    """
    tags = [[word[-1] for word in s] for s in sentences]
    dico = create_dico(tags)
    dico[START_TAG] = -1
    dico[STOP_TAG] = -2
    tag_to_id, id_to_tag = create_mapping(dico)
    print("Found %i unique named entity tags" % len(dico))
    return dico, tag_to_id, id_to_tag

In [6]:
dico_words,word_to_id,id_to_word = word_mapping(train_sentences, parameters['lower'])
dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

Found 21011 unique words (204567 in total)
Found 11 unique named entity tags


## Preparing final dataset

The function prepare dataset returns a list of dictionaries ( one dictionary per each sentence )

Each of the dictionary returned by the function contains
  1. list of all words in the sentence
  2. list of word index for all words in the sentence
  3. list of lists, containing character id of each character for words in the sentence
  4. list of tag for each word in the sentence.

In [7]:
def lower_case(x, lower=False):
    if lower:
        return x.lower()  
    else:
        return x

In [8]:
def prepare_dataset(sentences, word_to_id, tag_to_id, lower=False, test=0):
    """
    Prepare the dataset. Return a list of lists of dictionaries containing:
        - word indexes
        - word char indexes
        - tag indexes
    """
    data = []
    for s in sentences:
        str_words = [w[1] for w in s]
        words = [word_to_id[lower_case(w,lower) if lower_case(w,lower) in word_to_id else '<UNK>']
                 for w in str_words]
        if test == 0:
            tags = [tag_to_id[w[-1]] for w in s]
            data.append({
                'str_words': str_words,
                'words': words,
                #'chars': chars,
                'tags': tags,
            })
        else:
            data.append({
                'str_words': str_words,
                'words': words,
            })
    return data

train_data = prepare_dataset(
    train_sentences, word_to_id, tag_to_id, parameters['lower']
)
dev_data = prepare_dataset(
    dev_sentences, word_to_id, tag_to_id, parameters['lower']
)
test_data = prepare_dataset(
    test_sentences, word_to_id, tag_to_id, parameters['lower'], test = 1
)
print("{} / {} / {} sentences in train / dev / train.".format(len(train_data), len(dev_data), len(test_data)))

14987 / 3466 / 3684 sentences in train / dev / train.


## Load Word Embeddings

Now, We will randomly embed our data.

In [9]:
word_embeds = np.random.uniform(-np.sqrt(0.06), np.sqrt(0.06), (len(word_to_id), parameters['word_dim']))

In [10]:
word_embeds

array([[ 0.15803386,  0.04817372, -0.08887219, ..., -0.23762649,
         0.20478912, -0.2153611 ],
       [-0.1243387 ,  0.07428248,  0.14206932, ..., -0.12125302,
        -0.20020828,  0.02645931],
       [-0.01706656, -0.0725474 ,  0.10472512, ...,  0.09301148,
        -0.05962866, -0.07434036],
       ...,
       [-0.02680391,  0.15265586, -0.15616517, ...,  0.1653538 ,
         0.00999025,  0.15492515],
       [-0.03792276,  0.18925285, -0.02060655, ...,  0.0600599 ,
         0.05767155, -0.13615273],
       [ 0.05666281,  0.03390512,  0.18806308, ...,  0.24323237,
         0.09536502,  0.24343882]])

## Model


In [11]:
class BiLSTM(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim,
                  pre_word_embeds=None, use_gpu=False):

        
        super(BiLSTM, self).__init__()
        
        #parameter initialization for the model
        self.use_gpu = use_gpu
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        
        ### Layer 1: Embedding :
        #Word Embedding
        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        if pre_word_embeds is not None:
            #Initializes the word embeddings with pretrained word embeddings
            self.pre_word_embeds = True
            self.word_embeds.weight = nn.Parameter(torch.FloatTensor(pre_word_embeds))
        else:
            self.pre_word_embeds = False
        
    
        #Initializing the dropout layer, with dropout specificed in parameters
        self.dropout = nn.Dropout(parameters['dropout'])
        
        ### Layer 2: Lstm :
        self.lstm = nn.LSTM(input_size=embedding_dim, 
                            hidden_size=hidden_dim // 2, 
                            num_layers=1,
                            bidirectional=True,
                            )    

        ### Layer 3: Linear + ELU
        #Linear layer maps the output of the bidirectional LSTM into linear layer.
        self.linear = nn.Linear(hidden_dim, 128)
        self.elu = nn.ELU()
        
        ### Layer 4: Classifier :
        self.hidden2tag = nn.Linear(128, self.tagset_size)
    
    
    def _get_lstm_features(self, sentence):
       
        ## Loading word embeddings
        ## sentence = [sen len, batch, embedding]
        embeds = self.word_embeds(sentence)
             
        ## We concatenate the word embeddings and the character level representation
        ## to create unified representation for each word
        if len(np.shape(embeds)) == 2:
            embeds = embeds.unsqueeze(1)

        ## Dropout on the unified embeddings
        embeds = self.dropout(embeds)

        ## Word lstm
        lstm_out, _ = self.lstm(embeds)

        ## Reshaping the outputs from the lstm layer
        linear_out = self.elu(self.linear(lstm_out)) 
        
        ## Dropout on the lstm output
        lstm_out = self.dropout(linear_out)

        ## Linear layer converts the ouput vectors to tag space
        lstm_feats = self.hidden2tag(lstm_out)

        return lstm_feats
    
    
    def forward(self, sentence):
        
        ## returns the LSTM's tag vectors
        feats = self._get_lstm_features(sentence)
        #[sentence length, batch size, output dim]
            
        return feats
    
    def neg_log_likelihood(self, sentence, tags):
        ## For evaluation
        # features is a 2D tensor, len(sentence) * self.tagset_size
        feats = self._get_lstm_features(sentence)
        #print(np.shape(feats),np.shape(tags))
        pred_tags = feats.view(-1, feats.shape[-1])
        true_tags = tags.view(-1)
        true_tags = Variable(true_tags)
        scores = nn.functional.cross_entropy(pred_tags, true_tags)
        return scores

In [12]:
#creating the model using the Class defined above
model = BiLSTM(vocab_size=len(word_to_id),
                   tag_to_ix=tag_to_id,
                   embedding_dim=parameters['word_dim'],
                   hidden_dim=parameters['word_lstm_dim'],
                   use_gpu=use_gpu,
                   pre_word_embeds=word_embeds
              )
print("Model Initialized!!!")

#Initializing the optimizer
#The best results in the paper where achived using stochastic gradient descent (SGD) 
#learning rate=0.015 and momentum=0.9 
#decay_rate=0.05 

learning_rate = 0.1 #0.015
momentum = 0.9
number_of_epochs = parameters['epoch'] 
decay_rate = 0.05
gradient_clip = parameters['gradient_clip']
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)

#variables which will used in training process
losses = [] #list to store all losses
loss = 0.0 #Loss Initializatoin
eval_every = len(train_data) # Calculate F-1 Score after this many iterations
plot_every = 2000 # Store loss after this many iterations
count = 0 #Counts the number of iterations

Model Initialized!!!


## Training 

In [13]:
#from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
import sklearn.metrics
def my_evaluating(model, datas):

    y_pred=[]
    y_true=[]
    for data in datas:
        ground_truth_id = data['tags']
        words = data['str_words']

        dwords = Variable(torch.LongTensor(data['words']))
        #print(dwords)
        # We are getting the predicted output from our model
        if use_gpu:
            feats = model(dwords.cuda())
        else:
            feats = model(dwords)
            
        feats = feats.view(-1, feats.shape[-1])
        _, tag_seq = torch.max(feats, 1)
        predicted_id = list(tag_seq.cpu().data)
        predicted_id = [i.item() for i in predicted_id]
        
        y_pred.extend(predicted_id)
        y_true.extend(ground_truth_id)

    y_pred = np.array(y_pred)
    y_true = np.array(y_true)
    report = sklearn.metrics.classification_report(y_true, y_pred, digits=3)
    #print(report)
 
    return report
#dev_F = my_evaluating(model, dev_data)

In [14]:
def adjust_learning_rate(optimizer, lr):
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

In [None]:
## No batch training
count = 0
for epoch in range(number_of_epochs):
    for i, index in enumerate(np.random.permutation(len(train_data))):
        count += 1
        data = train_data[index]

        ##gradient updates for each data entry
        model.zero_grad()

        sentence_in = data['words']
        sentence_in = Variable(torch.LongTensor(sentence_in))
        tags = data['tags']


        targets = torch.LongTensor(tags)

        #we calculate the negative log-likelihood for the predicted tags using the predefined function
        if use_gpu:
            neg_log_likelihood = model.neg_log_likelihood(sentence_in.cuda(), targets.cuda())
        else:
            neg_log_likelihood = model.neg_log_likelihood(sentence_in, targets)
        #print(neg_log_likelihood.item())
        loss += neg_log_likelihood.item() / len(data['words'])
        neg_log_likelihood.backward()

        #we use gradient clipping to avoid exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), gradient_clip)
        optimizer.step()

        #Storing loss
        if count % plot_every == 0:
            loss /= plot_every
            print(f'count {count} : loss = {loss}')
            if losses == []:
                losses.append(loss)
            losses.append(loss)
            loss = 0.0

        #Evaluating on Train, Test, Dev Sets
        #if count % (eval_every) == 0 and count > (eval_every * 20) or \
        #        count % (eval_every*2) == 0 and count < (eval_every * 20):
            #dev_report = my_evaluating(model, dev_data)
            
        #if count >= 20000:  
        if count % len(train_data) == 0:
            adjust_learning_rate(optimizer, lr=learning_rate/(1+decay_rate*count/len(train_data)))


count 2000 : loss = 0.15920651671795286
count 4000 : loss = 0.08616005368902224
count 6000 : loss = 0.076924574431228
count 8000 : loss = 0.06724264426129321
count 10000 : loss = 0.08360639633776672
count 12000 : loss = 0.04799470001893452
count 14000 : loss = 0.052293399750440145
count 16000 : loss = 0.03976079923281013
count 18000 : loss = 0.02180505887243457
count 20000 : loss = 0.024520906471535566
count 22000 : loss = 0.02483374579629412
count 24000 : loss = 0.027542751486814832
count 26000 : loss = 0.024234930539256568
count 28000 : loss = 0.01369591030064631
count 30000 : loss = 0.022127302217403222
count 32000 : loss = 0.014274417325409624
count 34000 : loss = 0.010125220593764589
count 36000 : loss = 0.007528156209995223
count 38000 : loss = 0.013238591375328642
count 40000 : loss = 0.009071351863089463
count 42000 : loss = 0.0098592696842503
count 44000 : loss = 0.00969429588940225
count 46000 : loss = 0.0089571128841015
count 48000 : loss = 0.0056856959960166105
count 50000 

## Save model

In [None]:
dev_report = my_evaluating(model, dev_data)
torch.save(model.state_dict(), 'blstm1.pt')

## Generate dev1.out

In [None]:
## Inference
def my_inference(model, test_data, id_to_tag):
    y_pred=[]
    for data in test_data:
        pre_tag=[]
        dwords = Variable(torch.LongTensor(data['words']))
        #print(dwords)
        # We are getting the predicted output from our model
        if use_gpu:
            feats = model(dwords.cuda())
        else:
            feats = model(dwords)

        feats = feats.view(-1, feats.shape[-1])
        _, tag_seq = torch.max(feats, 1)
        predicted_id = list(tag_seq.cpu().data)
        predicted_id = [i.item() for i in predicted_id]
        for i in predicted_id:
            pre_tag.append(id_to_tag[i])
        y_pred.append(pre_tag)
    return y_pred

In [None]:
## Test taged datasets
y_pred = my_inference(model, dev_data, id_to_tag)
#print(y_pred)
import functools
y_pred= functools.reduce(lambda a, b: a + [''] + b, y_pred)

## Output
with open('./data/dev', 'r') as f:
    sentences = f.readlines()
    #print(len(sentences))
    output = []
    for i in range(len(sentences)):
        line = sentences[i]
        if line != '\n':
            line = line.replace('\n', '') + ' ' + y_pred[i]
        output.append(line)
        
with open('dev1_temp.txt', 'w') as f:
    for line in output:
        if line != '\n':
            f.write(line)
            f.write('\n')
        else:
            f.write(line)

In [None]:
final_outcome = []
for i in range(len(sentences)):
    temp = sentences[i].split()
    if len(temp) == 3:
        temp[2] = y_pred[i]
        final_outcome.append(' '.join(temp))
    else:
        final_outcome.append('')

In [None]:
with open('dev1.out.txt', 'w') as f:
    for i in final_outcome:
        f.write(i)
        f.write('\n')

##  Generate test2.out

In [None]:
y_pred_test = my_inference(model, test_data, id_to_tag)
#print(y_pred)
import functools
y_pred_test= functools.reduce(lambda a, b: a + [''] + b, y_pred_test)

## Output
with open('./data/test', 'r') as f:
    sentences = f.readlines()
    output_test = []
    for i in range(len(sentences)):
        line = sentences[i]
        if line != '\n':
            line = line.replace('\n', '') + ' ' + y_pred_test[i]
        output_test.append(line)

In [None]:
with open('test1.out.txt', 'w') as f:
    for i in output_test:
        f.write(i)
        f.write('\n')

In [None]:
! perl conll03eval.txt <  dev1_temp.txt