## HW4 - PDF Report

In order to explain my answers to task 1 and task 2 more efficiently, I have submitted my Jupyter notebook as a PDF, so that I can explain my answers in markdown cells with corresponding code directly backing my explanations.

In [1]:
from collections import defaultdict
import operator
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
f = open("./data/train","r")
count_dict = defaultdict(int)
label_set = []
for line in f:
    get_words = line.split()
    if len(get_words)!=0:
        count_dict[get_words[1]]+=1
        if get_words[2] not in label_set:
            label_set.append(get_words[2])
f.close()

unkw = 0
for key,val in count_dict.items():
    if val<2:
        unkw += val

sorted_count_list = sorted(count_dict.items(),key=operator.itemgetter(1), reverse=True)

In [3]:
word_index = {}
word_index['<PAD>'] = 0
word_index['<UNK>'] = 1

i=2
for word,count in sorted_count_list:
    if count>=2:
        word_index[word] = i
        i+=1

In [4]:
len(word_index.keys())

11985

#### Above is the custom vocab index created for task 1, with 11985 items, including padding and unknown tokens.

In [5]:
f_train = open("./data/train","r")
sentences = []
tags = []
curr_sent = ""
curr_tags = ""

for line in f_train:
    get_line = line.split()
    if len(get_line)>0:
        curr_sent += get_line[1]
        curr_sent += " "
        curr_tags += get_line[2]
        curr_tags += " "
    else:
        curr_sent = curr_sent[:-1]
        curr_tags = curr_tags[:-1]
        sentences.append(curr_sent)
        tags.append(curr_tags)
        curr_sent = ""
        curr_tags = ""
f_train.close()

curr_sent = curr_sent[:-1]
curr_tags = curr_tags[:-1]
sentences.append(curr_sent)
tags.append(curr_tags)
curr_sent = ""
curr_tags = ""

train_data = pd.DataFrame({'sentences':sentences, 'tags':tags})

f_dev = open("./data/dev","r")
sentences = []
tags = []
curr_sent = ""
curr_tags = ""

for line in f_dev:
    get_line = line.split()
    if len(get_line)>0:
        curr_sent += get_line[1]
        curr_sent += " "
        curr_tags += get_line[2]
        curr_tags += " "
    else:
        curr_sent = curr_sent[:-1]
        curr_tags = curr_tags[:-1]
        sentences.append(curr_sent)
        tags.append(curr_tags)
        curr_sent = ""
        curr_tags = ""
f_dev.close()

curr_sent = curr_sent[:-1]
curr_tags = curr_tags[:-1]
sentences.append(curr_sent)
tags.append(curr_tags)
curr_sent = ""
curr_tags = ""

dev_data = pd.DataFrame({'sentences':sentences, 'tags':tags})

f_test = open("./data/test","r")
sentences = []
# tags = []
curr_sent = ""
# curr_tags = ""

for line in f_test:
    get_line = line.split()
    if len(get_line)>0:
        curr_sent += get_line[1]
        curr_sent += " "
#         curr_tags += get_line[2]
#         curr_tags += " "
    else:
        curr_sent = curr_sent[:-1]
#         curr_tags = curr_tags[:-1]
        sentences.append(curr_sent)
#         tags.append(curr_tags)
        curr_sent = ""
#         curr_tags = ""
f_test.close()

curr_sent = curr_sent[:-1]
# curr_tags = curr_tags[:-1]
sentences.append(curr_sent)
# tags.append(curr_tags)
curr_sent = ""
# curr_tags = ""

test_data = pd.DataFrame({'sentences':sentences})

In [6]:
train_data

Unnamed: 0,sentences,tags
0,EU rejects German call to boycott British lamb .,B-ORG O B-MISC O O O B-MISC O O
1,Peter Blackburn,B-PER I-PER
2,BRUSSELS 1996-08-22,B-LOC O
3,The European Commission said on Thursday it di...,O B-ORG I-ORG O O O O O O B-MISC O O O O O B-M...
4,Germany 's representative to the European Unio...,B-LOC O O O O B-ORG I-ORG O O O B-PER I-PER O ...
...,...,...
14982,Division two,O O
14983,Plymouth 2 Preston 1,B-ORG O B-ORG O
14984,Division three,O O
14985,Swansea 1 Lincoln 2,B-ORG O B-ORG O


Above, I have created dataframes for each: train, dev and test set. This will be helpful in passing data to the Dataset creator and DataLoader.

In [7]:
label_index = {}
i=0
for label in label_set:
    label_index[label] = i
    i+=1
label_index['pad_label'] = -1

In [8]:
label_index

{'B-ORG': 0,
 'O': 1,
 'B-MISC': 2,
 'B-PER': 3,
 'I-PER': 4,
 'B-LOC': 5,
 'I-ORG': 6,
 'I-MISC': 7,
 'I-LOC': 8,
 'pad_label': -1}

Above is the label to index mapper to compute output labels in the model.

Below are the reverse index to label and word mappers to make deductions when necessary.

In [9]:
index_word = {v: k for k, v in word_index.items()}
index_label = {v: k for k, v in label_index.items()}

In [10]:
class TrainDataBiLSTM:
    def __init__(self, sentences, tags, word_index, label_index):
        self.sentences = sentences
        self.tags = tags
        self.word_index = word_index
        self.label_index = label_index
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, i):
        sentence = self.sentences.iloc[i].split()
        ner_tag = self.tags.iloc[i].split()
        
        sentence = [self.word_index.get(word, self.word_index['<UNK>']) for word in sentence]
        ner_tag = [self.label_index[tag] for tag in ner_tag]
        
        sentence = torch.tensor(sentence)
        ner_tag = torch.tensor(ner_tag)
        
        return sentence, ner_tag

def pad_collate(batch):
#     batch = sorted(batch, key=lambda x: len(x[0]), reverse=True)
    
    sentences, ner_tags = zip(*batch)
    
    lengths = torch.tensor([len(sentence) for sentence in sentences])
    sentences = pad_sequence(sentences, batch_first=True, padding_value=0)
    
    ner_tags = pad_sequence(ner_tags, batch_first=True, padding_value=-1)
    
    return sentences, lengths, ner_tags

In [11]:
class DevDataBiLSTM:
    def __init__(self, sentences, tags, word_index, label_index):
        self.sentences = sentences
        self.tags = tags
        self.word_index = word_index
        self.label_index = label_index
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, i):
        sentence = self.sentences.iloc[i].split()
        ner_tag = self.tags.iloc[i].split()
        
        sentence = [self.word_index.get(word, self.word_index['<UNK>']) for word in sentence]
        ner_tag = [self.label_index[tag] for tag in ner_tag]
        
        sentence = torch.tensor(sentence)
        ner_tag = torch.tensor(ner_tag)
        
        return sentence, ner_tag

#### The data loaders for the train and test data are identical. To pad each batch, such that each sentence attains the length of longest sentence of that batch by padding with zeros, I have created a pad_collate function, which will also return original lengths of sentences. This will be passed to DataLoader to get padded data appropriately.

In [12]:
class TestDataBiLSTM:
    def __init__(self, sentences, word_index):
        self.sentences = sentences
#         self.tags = tags
        self.word_index = word_index
#         self.label_index = label_index
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, i):
        sentence = self.sentences.iloc[i].split()
#         ner_tag = self.tags.iloc[i].split()
        
        sentence = [self.word_index.get(word, self.word_index['<UNK>']) for word in sentence]
#         ner_tag = [self.label_index[tag] for tag in ner_tag]
        
        sentence = torch.tensor(sentence)
#         ner_tag = torch.tensor(ner_tag)
        
        return sentence
    
def pad_collate_test(batch):
#     batch = sorted(batch, key=lambda x: len(x[0]), reverse=True)
    
    sentences = batch
    
    lengths = torch.tensor([len(sentence) for sentence in sentences])
    sentences = pad_sequence(sentences, batch_first=True, padding_value=0)
    
#     ner_tags = pad_sequence(ner_tags, batch_first=True, padding_value=-1)
    
    return sentences, lengths

The test dataset is created in a similar manner, except it does not have actual tags.

In [13]:
batch_size=16

train_dataset = TrainDataBiLSTM(train_data['sentences'], train_data['tags'], word_index, label_index)
train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=pad_collate)

dev_dataset = DevDataBiLSTM(dev_data['sentences'], dev_data['tags'], word_index, label_index)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, collate_fn=pad_collate)

test_dataset = TestDataBiLSTM(test_data['sentences'], word_index)
test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=pad_collate_test)

### Please note above that my batch size is 16.

#### I have chosen this as my batch size hyperparameter value after trying out different batch sizes like 32, 64, etc. but 16 gives the fastest convergence to produce the expected result on the dev set.

In [14]:
test_data['sentences']

0       SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRI...
1                                             Nadim Ladki
2                AL-AIN , United Arab Emirates 1996-12-06
3       Japan began the defence of their Asian Cup tit...
4       But China saw their luck desert them in the se...
                              ...                        
3679    " It was the joy that we all had over the peri...
3680    Charlton managed Ireland for 93 matches , duri...
3681    He guided Ireland to two successive World Cup ...
3682    The lanky former Leeds United defender did not...
3683                                           -DOCSTART-
Name: sentences, Length: 3684, dtype: object

In [15]:
len(train_loader.dataset)

14987

### Task 1: Simple Bidirectional LSTM model

#### My Simple LSTM model is defined below:

The fixed hyper-parameters as per the assignment requirement are:

Embedding dimension: 100,

LSTM hidden dim: 256,

LSTM dropout: 0.33,

LSTM layers: 1,

Linear output dim: 128,

ELU layer: Here, I have determined alpha=0.35 gives a slight boost in results.

Classifier layer: 9 outputs (corresponding to tags)

In [18]:
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, lstm_hidden_dim, lstm_dropout, linear_output_dim, num_tags):
        super(BiLSTM, self).__init__()
        
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=lstm_hidden_dim, num_layers=1, 
                            batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.33)
        self.linear = nn.Linear(lstm_hidden_dim*2, linear_output_dim)
        self.elu = nn.ELU(0.35)
        self.classifier = nn.Linear(linear_output_dim, num_tags)
    
    def forward(self, inputs, lengths):
        embedded = self.embedding(inputs)
        packed_embedded = pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)
        output = self.dropout(output)
        linear_output = self.linear(output)
        elu_output = self.elu(linear_output)
        logits = self.classifier(elu_output)
        
        return logits

bilstm_model = BiLSTM(len(word_index.keys()), 100, 256, 0.33, 128, 9)
print(bilstm_model)

BiLSTM(
  (embedding): Embedding(11985, 100, padding_idx=0)
  (lstm): LSTM(100, 256, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.33, inplace=False)
  (linear): Linear(in_features=512, out_features=128, bias=True)
  (elu): ELU(alpha=0.35)
  (classifier): Linear(in_features=128, out_features=9, bias=True)
)


11985 corresponds to custom created vocabulary data size from training data.

In [19]:
criterion = nn.CrossEntropyLoss(ignore_index=-1)
optimizer = torch.optim.SGD(bilstm_model.parameters(), lr=0.33)

As required, SGD optimizer is used. Trying out various learning rates and epochs, I determined that a learning rate = 0.33, and epochs=50 gave best convergence and promising results on the dev dataset.

In [20]:
epochs = 50

validn_min_loss = np.Inf

for epoch in range(epochs):
    train_loss = 0.0
    
    bilstm_model.train()
    for sentences, lengths, labels in train_loader:
        optimizer.zero_grad()
        output = bilstm_model(sentences, lengths)
        output = output.permute(0,2,1)
        loss = criterion(output, labels)
        
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()*sentences.size(0)
    
    train_loss = train_loss/(len(train_loader.dataset))
    
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(epoch+1, train_loss))

Epoch: 1 	Training Loss: 0.678128
Epoch: 2 	Training Loss: 0.466299
Epoch: 3 	Training Loss: 0.353567
Epoch: 4 	Training Loss: 0.274990
Epoch: 5 	Training Loss: 0.220591
Epoch: 6 	Training Loss: 0.185073
Epoch: 7 	Training Loss: 0.157392
Epoch: 8 	Training Loss: 0.135189
Epoch: 9 	Training Loss: 0.118245
Epoch: 10 	Training Loss: 0.107091
Epoch: 11 	Training Loss: 0.094217
Epoch: 12 	Training Loss: 0.084123
Epoch: 13 	Training Loss: 0.078213
Epoch: 14 	Training Loss: 0.068511
Epoch: 15 	Training Loss: 0.062229
Epoch: 16 	Training Loss: 0.058264
Epoch: 17 	Training Loss: 0.052831
Epoch: 18 	Training Loss: 0.049279
Epoch: 19 	Training Loss: 0.045459
Epoch: 20 	Training Loss: 0.041738
Epoch: 21 	Training Loss: 0.038185
Epoch: 22 	Training Loss: 0.035769
Epoch: 23 	Training Loss: 0.033005
Epoch: 24 	Training Loss: 0.031737
Epoch: 25 	Training Loss: 0.030521
Epoch: 26 	Training Loss: 0.027203
Epoch: 27 	Training Loss: 0.026357
Epoch: 28 	Training Loss: 0.024640
Epoch: 29 	Training Loss: 0.0

In [21]:
torch.save(bilstm_model.state_dict(), 'bilstm_model.pt')

In [22]:
# bilstm_model = BiLSTM(len(word_index.keys()), 100, 256, 0.33, 128, 9)
# bilstm_model.load_state_dict(torch.load('bilstm_model.pt'))

In [23]:
def getDevResults(model, dataloader):
    model.eval()
    
    f_read = open("./data/dev","r")
    f_write = open("dev1.out","w")
    for sentences, lengths, labels in dataloader:
        output = model(sentences, lengths)
        max_values, max_indices = torch.max(output, dim=2)
        y = max_indices
        
        for i in range(len(sentences)):
            for j in range(len(sentences[i])):
                read_line = f_read.readline().split()
                if len(read_line)>0:
                    f_write.write(str(read_line[0])+" "+str(read_line[1])+" "+index_label[labels[i][j].item()]+" "+index_label[y[i][j].item()]+"\n")
                else:
                    break
                if j+1>=len(sentences[i]):
                    f_read.readline()
            if len(sentences)==batch_size or i<len(sentences)-1:
                f_write.write("\n")
    f_read.close()
    f_write.close()

In [24]:
getDevResults(bilstm_model, dev_loader)

#### The dev results are as follows:            
              
              
    processed 51578 tokens with 5942 phrases; found: 5453 phrases; correct: 4468.

    accuracy:  95.69%; precision:  81.94%; recall:  75.19%; FB1:  78.42

              LOC: precision:  89.39%; recall:  83.45%; FB1:  86.32  1715
              
             MISC: precision:  80.81%; recall:  75.81%; FB1:  78.23  865
             
              ORG: precision:  75.31%; recall:  67.56%; FB1:  71.23  1203
              
              PER: precision:  79.64%; recall:  72.20%; FB1:  75.74  1670


In [27]:
def getTestResults(model, dataloader):
    model.eval()
    
    f_read = open("./data/test","r")
    f_write = open("test1.out","w")
    for sentences, lengths in dataloader:
        output = model(sentences, lengths)
        max_values, max_indices = torch.max(output, dim=2)
        y = max_indices
        
        for i in range(len(sentences)):
            for j in range(len(sentences[i])):
                read_line = f_read.readline().split()
                if len(read_line)>0:
                    f_write.write(str(read_line[0])+" "+str(read_line[1])+" "+index_label[y[i][j].item()]+"\n")
                else:
                    break
                if j+1>=len(sentences[i]):
                    f_read.readline()
            if len(sentences)==batch_size or i<len(sentences)-1:
                f_write.write("\n")
    f_read.close()
    f_write.close()

In [28]:
getTestResults(bilstm_model, test_loader)

### Task 2: Bi-directional LSTM model with GloVe embeddings

In [31]:
embed_vectors = []
embed_vocab = []
file_embed = open("glove.6B.100d","r")
for line in file_embed:
    line = line.split()
    embed_vocab.append(line[0])
    embed_vectors.append(line[1:])

In [32]:
embed_vocab = np.array(embed_vocab)
embed_vectors = np.array(embed_vectors, dtype=np.float64)

In [33]:
embed_vocab.shape

(400000,)

In [34]:
embed_vectors.shape

(400000, 100)

In [35]:
pad_vector = np.zeros((1,embed_vectors.shape[1]))
unk_vector = np.mean(embed_vectors,axis=0,keepdims=True)

embed_vocab = np.insert(embed_vocab, 0, '<PAD>')
embed_vocab = np.insert(embed_vocab, 1, '<UNK>')

embed_vectors = np.vstack((pad_vector,unk_vector,embed_vectors))

#### Above, word embeddings from GloVe are loaded into vectors. Also, I have added vectors for the padding and unknown token.

Experimentally, I found that assigned 100 dim embeddings of zero for padding token, and average of all available glove word embeddings to unknown token, gave best results.


In [36]:
demo_embed = nn.Embedding.from_pretrained(torch.from_numpy(embed_vectors),padding_idx=0)

In [37]:
demo_embed(torch.LongTensor([2]))

tensor([[-0.0382, -0.2449,  0.7281, -0.3996,  0.0832,  0.0440, -0.3914,  0.3344,
         -0.5755,  0.0875,  0.2879, -0.0673,  0.3091, -0.2638, -0.1323, -0.2076,
          0.3340, -0.3385, -0.3174, -0.4834,  0.1464, -0.3730,  0.3458,  0.0520,
          0.4495, -0.4697,  0.0263, -0.5415, -0.1552, -0.1411, -0.0397,  0.2828,
          0.1439,  0.2346, -0.3102,  0.0862,  0.2040,  0.5262,  0.1716, -0.0824,
         -0.7179, -0.4153,  0.2034, -0.1276,  0.4137,  0.5519,  0.5791, -0.3348,
         -0.3656, -0.5486, -0.0629,  0.2658,  0.3020,  0.9978, -0.8048, -3.0243,
          0.0125, -0.3694,  2.2167,  0.7220, -0.2498,  0.9214,  0.0345,  0.4674,
          1.1079, -0.1936, -0.0746,  0.2335, -0.0521, -0.2204,  0.0572, -0.1581,
         -0.3080, -0.4163,  0.3797,  0.1501, -0.5321, -0.2055, -1.2526,  0.0716,
          0.7056,  0.4974, -0.4206,  0.2615, -1.5380, -0.3022, -0.0734, -0.2831,
          0.3710, -0.2522,  0.0162, -0.0171, -0.3898,  0.8742, -0.7257, -0.5106,
         -0.5203, -0.1459,  

In [38]:
embed_vectors[2]

array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
       -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
        0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
       -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
        0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
       -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
        0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
        0.20335 , -0.12763 ,  0.41367 ,  0.55187 ,  0.57908 , -0.33477 ,
       -0.36559 , -0.54857 , -0.062892,  0.26584 ,  0.30205 ,  0.99775 ,
       -0.80481 , -3.0243  ,  0.01254 , -0.36942 ,  2.2167  ,  0.72201 ,
       -0.24978 ,  0.92136 ,  0.034514,  0.46745 ,  1.1079  , -0.19358 ,
       -0.074575,  0.23353 , -0.052062, -0.22044 ,  0.057162, -0.15806 ,
       -0.30798 , -0.41625 ,  0.37972 ,  0.15006 , -0.53212 , -0.2055  ,
       -1.2526  ,  0.071624,  0.70565 ,  0.49744 , 

In [39]:
embed_vocab

array(['<PAD>', '<UNK>', 'the', ..., 'rolonda', 'zsombor', 'sandberger'],
      dtype='<U68')

Above was some sample code to demonstrate loaded glove embeddings.

In [40]:
# glove_word_index = dict(zip(embed_vocab, embed_vectors))
glove_word_index = {k: v for v, k in enumerate(embed_vocab)}

#### The way train, test and dev datasets are created is similar to task 1, but has some differences.

#### GloVe only has lowercase word embeddings. Hence, to retrieve word embeddings, each word's lowercase form is taken to access corresponding glove index. 

#### Also, an extra vector of is_capitals is passed, which contains 1 if the word is either first word capitlaized or full capitalized, and 0 if the word is completely lowercase. This vector will be concatenated to output of embedding layer in forward pass, and then passed to Bi-LSTLM.

In [41]:
class TrainDataBiLSTMGlove:
    def __init__(self, sentences, tags, glove_word_index, label_index):
        self.sentences = sentences
        self.tags = tags
        self.glove_word_index = glove_word_index
        self.label_index = label_index
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, i):
        sentence = self.sentences.iloc[i].split()
        ner_tag = self.tags.iloc[i].split()
        
        is_capital = [1 if (word.isupper() or word.istitle()) else 0 for word in sentence]
        sentence = [self.glove_word_index.get(word.lower(), self.glove_word_index['<UNK>']) for word in sentence]
        ner_tag = [self.label_index[tag] for tag in ner_tag]
        
        sentence = torch.tensor(sentence)
        is_capital = torch.tensor(is_capital)
        ner_tag = torch.tensor(ner_tag)
        
        return sentence, is_capital, ner_tag

def pad_collate_glove(batch):
    
    sentences, is_capitals, ner_tags = zip(*batch)
    
    lengths = torch.tensor([len(sentence) for sentence in sentences])
    sentences = pad_sequence(sentences, batch_first=True, padding_value=0)
    is_capitals = pad_sequence(is_capitals, batch_first=True, padding_value=-1)
    ner_tags = pad_sequence(ner_tags, batch_first=True, padding_value=-1)
    
    return sentences, is_capitals, lengths, ner_tags

In [42]:
class DevDataBiLSTMGlove:
    def __init__(self, sentences, tags, glove_word_index, label_index):
        self.sentences = sentences
        self.tags = tags
        self.glove_word_index = glove_word_index
        self.label_index = label_index
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, i):
        sentence = self.sentences.iloc[i].split()
        ner_tag = self.tags.iloc[i].split()
        
        is_capital = [1 if (word.isupper() or word.istitle()) else 0 for word in sentence]
        sentence = [self.glove_word_index.get(word.lower(), self.glove_word_index['<UNK>']) for word in sentence]
        ner_tag = [self.label_index[tag] for tag in ner_tag]
        
        sentence = torch.tensor(sentence)
        is_capital = torch.tensor(is_capital)
        ner_tag = torch.tensor(ner_tag)
        
        return sentence, is_capital, ner_tag

In [43]:
class TestDataBiLSTMGlove:
    def __init__(self, sentences, glove_word_index):
        self.sentences = sentences
#         self.tags = tags
        self.glove_word_index = glove_word_index
#         self.label_index = label_index
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, i):
        sentence = self.sentences.iloc[i].split()
#         ner_tag = self.tags.iloc[i].split()
        
        is_capital = [1 if (word.isupper() or word.istitle()) else 0 for word in sentence]
        sentence = [self.glove_word_index.get(word.lower(), self.glove_word_index['<UNK>']) for word in sentence]
#         ner_tag = [self.label_index[tag] for tag in ner_tag]
        
        sentence = torch.tensor(sentence)
        is_capital = torch.tensor(is_capital)
#         ner_tag = torch.tensor(ner_tag)
        
        return sentence, is_capital
    
def pad_collate_glove_test(batch):
#     batch = sorted(batch, key=lambda x: len(x[0]), reverse=True)
    
    sentences, is_capitals = zip(*batch)
    
    lengths = torch.tensor([len(sentence) for sentence in sentences])
    sentences = pad_sequence(sentences, batch_first=True, padding_value=0)
    is_capitals = pad_sequence(is_capitals, batch_first=True, padding_value=-1)
#     ner_tags = pad_sequence(ner_tags, batch_first=True, padding_value=-1)
    
    return sentences, is_capitals, lengths

In [44]:
train_dataset = TrainDataBiLSTMGlove(train_data['sentences'], train_data['tags'], glove_word_index, label_index)
train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=pad_collate_glove)

dev_dataset = DevDataBiLSTMGlove(dev_data['sentences'], dev_data['tags'], glove_word_index, label_index)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, collate_fn=pad_collate_glove)

test_dataset = TestDataBiLSTMGlove(test_data['sentences'], glove_word_index)
test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=pad_collate_glove_test)

#### The Bi-LSTM model with GloVe embeddings is defined below, with almost same hyperparameters as task 1, but a few changes which will be described below:

The fixed hyper-parameters as per the assignment requirement are:

Embedding dimension: 100,

LSTM hidden dim: 256,

LSTM dropout: 0.33,

LSTM layers: 1,

Linear output dim: 128,

ELU layer: Here, alpha=1.0 unlike task 1.

Classifier layer: 9 outputs (corresponding to tags)

The optimizer used is again SGD as required.

The learning rate is also same as task 1: 0.33

Number of epochs: 50

#### Changes are as follows:
The embedding layer contains pre-trained GloVe embeddings. These are set to trainable, so will still keep learning throughout as the model runs epochs.
The size is 400002, which indicates the 400000 words for which embeddings exist, plus the unknown and padding token embeddings (created and explained earlier).

Also the input dim to LSTM will be 101, because the is_capitals tensor is also passed which has values 0 when corresponding word was all lowercase, and 1, when it had one(or more) letters capitalized.
This is concatenated to the output from embedding layer, thus passing in input of size 100+1 to the Bi-LSTM.

In [45]:
class BiLSTMGlove(nn.Module):
    def __init__(self, embedding_dim, lstm_hidden_dim, lstm_dropout, linear_output_dim, num_tags):
        super(BiLSTMGlove, self).__init__()
        
        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(embed_vectors),padding_idx=0)
        self.lstm = nn.LSTM(input_size=embedding_dim+1, hidden_size=lstm_hidden_dim, num_layers=1, 
                            batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(lstm_dropout)
        self.linear = nn.Linear(lstm_hidden_dim*2, linear_output_dim)
        self.elu = nn.ELU()
        self.classifier = nn.Linear(linear_output_dim, num_tags)
    
    def forward(self, inputs, is_capitals, lengths):
        embedded = self.embedding(inputs)
        concatenated_tensor = torch.cat((embedded, is_capitals.unsqueeze(-1)), dim=-1)
        packed_embedded = pack_padded_sequence(concatenated_tensor, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_embedded = packed_embedded.float()
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)
        linear_output = self.linear(output)
        elu_output = self.elu(linear_output)
        logits = self.classifier(elu_output)
        
        return logits

bilstm_glove_model = BiLSTMGlove(100, 256, 0.33, 128, 9)
print(bilstm_glove_model)

BiLSTMGlove(
  (embedding): Embedding(400002, 100, padding_idx=0)
  (lstm): LSTM(101, 256, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.33, inplace=False)
  (linear): Linear(in_features=512, out_features=128, bias=True)
  (elu): ELU(alpha=1.0)
  (classifier): Linear(in_features=128, out_features=9, bias=True)
)


In [46]:
criterion = nn.CrossEntropyLoss(ignore_index=-1)
optimizer = torch.optim.SGD(bilstm_glove_model.parameters(), lr=0.33)

In [47]:
epochs = 50

for epoch in range(epochs):
    train_loss = 0.0
    
    bilstm_glove_model.train()
    for sentences, is_capitals, lengths, labels in train_loader:
        optimizer.zero_grad()
        output = bilstm_glove_model(sentences, is_capitals, lengths)
        output = output.permute(0,2,1)
        loss = criterion(output, labels)
        
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()*sentences.size(0)
    
    train_loss = train_loss/(len(train_dataset))
    
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(epoch+1, train_loss))

Epoch: 1 	Training Loss: 0.319138
Epoch: 2 	Training Loss: 0.133306
Epoch: 3 	Training Loss: 0.104646
Epoch: 4 	Training Loss: 0.091157
Epoch: 5 	Training Loss: 0.081219
Epoch: 6 	Training Loss: 0.073103
Epoch: 7 	Training Loss: 0.066240
Epoch: 8 	Training Loss: 0.060333
Epoch: 9 	Training Loss: 0.055168
Epoch: 10 	Training Loss: 0.050570
Epoch: 11 	Training Loss: 0.046397
Epoch: 12 	Training Loss: 0.042540
Epoch: 13 	Training Loss: 0.038943
Epoch: 14 	Training Loss: 0.035568
Epoch: 15 	Training Loss: 0.032413
Epoch: 16 	Training Loss: 0.029915
Epoch: 17 	Training Loss: 0.026851
Epoch: 18 	Training Loss: 0.024158
Epoch: 19 	Training Loss: 0.021704
Epoch: 20 	Training Loss: 0.019351
Epoch: 21 	Training Loss: 0.017327
Epoch: 22 	Training Loss: 0.015248
Epoch: 23 	Training Loss: 0.013537
Epoch: 24 	Training Loss: 0.011889
Epoch: 25 	Training Loss: 0.010464
Epoch: 26 	Training Loss: 0.009082
Epoch: 27 	Training Loss: 0.007881
Epoch: 28 	Training Loss: 0.006778
Epoch: 29 	Training Loss: 0.0

In [48]:
torch.save(bilstm_glove_model.state_dict(), 'bilstm_glove_model.pt')

In [49]:
# bilstm_glove_model = BiLSTMGlove(100, 256, 0.33, 128, 9)
# bilstm_glove_model.load_state_dict(torch.load('bilstm_glove_model.pt'))

In [50]:
def getDevResultsGlove(model, dataloader):
    
    model.eval()
    f_read = open("./data/dev","r")
    f_write = open("dev2.out","w")
    for sentences, is_capitals, lengths, labels in dataloader:
        output = model(sentences, is_capitals, lengths)
        max_values, max_indices = torch.max(output, dim=2)
        y = max_indices
        
        for i in range(len(sentences)):
            for j in range(len(sentences[i])):
                read_line = f_read.readline().split()
                if len(read_line)>0:
                    f_write.write(str(read_line[0])+" "+str(read_line[1])+" "+index_label[labels[i][j].item()]+" "+index_label[y[i][j].item()]+"\n")
                else:
                    break
                if j+1>=len(sentences[i]):
                    f_read.readline()
            if len(sentences)==batch_size or i<len(sentences)-1:
                f_write.write("\n")
    f_read.close()
    f_write.close()

In [51]:
getDevResultsGlove(bilstm_glove_model, dev_loader)

#### The dev results are as follows:
              
              
    processed 51578 tokens with 5942 phrases; found: 6097 phrases; correct: 5392.

    accuracy:  98.25%; precision:  88.44%; recall:  90.74%; FB1:  89.58

              LOC: precision:  94.00%; recall:  93.85%; FB1:  93.93  1834
              
             MISC: precision:  76.07%; recall:  84.49%; FB1:  80.06  1024
             
              ORG: precision:  82.55%; recall:  85.38%; FB1:  83.94  1387
              
              PER: precision:  94.17%; recall:  94.68%; FB1:  94.42  1852


In [54]:
def getTestResultsGlove(model, dataloader):
    
    model.eval()
    f_read = open("./data/test","r")
    f_write = open("test2.out","w")
    for sentences, is_capitals, lengths in dataloader:
        output = model(sentences, is_capitals, lengths)
        max_values, max_indices = torch.max(output, dim=2)
        y = max_indices
        
        for i in range(len(sentences)):
            for j in range(len(sentences[i])):
                read_line = f_read.readline().split()
                if len(read_line)>0:
                    f_write.write(str(read_line[0])+" "+str(read_line[1])+" "+index_label[y[i][j].item()]+"\n")
                else:
                    break
                if j+1>=len(sentences[i]):
                    f_read.readline()
            if len(sentences)==batch_size or i<len(sentences)-1:
                f_write.write("\n")
    f_read.close()
    f_write.close()

In [55]:
getTestResultsGlove(bilstm_glove_model, test_loader)