In [21]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np

In [22]:
train_file = "data/train"
dev_file = "data/dev"
test_file = "data/test"
glove_file = "glove.6B.100d"

padding_value = 9
unknown_value = 756774
threshold_value = 0
embedding_dim = 101

In [23]:
def mapSentenceToList(sentences, entity):
  sentences = sentences.split("\n")

  if entity:
    sentences = list(map(lambda x: tuple(x.split(" ")[1:]), sentences))
  else:
    sentences = list(map(lambda x: x.split(" ")[1], sentences))

  return sentences


def readFile(file_path, entity=True):
  with open(file_path, "r") as fin:
    sentences = "".join(list(fin)).split("\n\n")
    sentences[-1] = sentences[-1].rstrip()

    sentences = list(
      map(
        lambda x: mapSentenceToList(x, entity),
        sentences
      )
    )

    return sentences

def writeFile(file_path, sentences):
  with open(file_path, "w") as fout:
    for sentence in sentences:
      for index, (word, tag) in enumerate(sentence):
        s = str(index + 1) + " " + word + " " + tag + "\n"
        fout.write(s)
      fout.write("\n")

In [24]:
def readGloveEmbedding(glove_path):
    word_vec = {}

    word_vec["<PAD>"] = np.zeros(101)  
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.strip().split()
            word = values[0]
            vector = np.array(values[1:], dtype=np.float32)
            if word.lower() == word.capitalize():
                lower_vector = np.append(vector, 0)
                word_vec[word.lower()] = lower_vector
            else:
                lower_vector = np.append(vector, 0)
                upper_vector = np.append(vector, 1)
                word_vec[word.lower()] = lower_vector
                word_vec[word.capitalize()] = upper_vector

    
    word_vec["<UNK>"] = np.ones(101)

    return word_vec


def mapWordToGloveIndex(word_vec):
    rev_map = {word: index for index, word in enumerate(word_vec)}
    return rev_map


def getEmbeddingLayerWeights(word_vec, word_index_map):
    vocab_size = len(word_index_map) + 2
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, idx in word_index_map.items():
        if word in word_vec:
            embedding_matrix[idx] = word_vec[word]
    return embedding_matrix

In [25]:
word_vec = readGloveEmbedding(glove_file)

word_index_map = mapWordToGloveIndex(word_vec)

embedding_layer_weights = getEmbeddingLayerWeights(word_vec, word_index_map)

In [26]:
class CustomLSTM(nn.Module):
  def __init__(self, num_classes):
    super(CustomLSTM, self).__init__()
    self.embedding_dim = embedding_dim
    self.hidden_dim = 256
    self.lstm_dropout = 0.33
    self.linear_dim = 128

    self.embedding_layer = nn.Embedding.from_pretrained(
      embeddings=torch.FloatTensor(embedding_layer_weights)
    )

    self.lstm_layer = nn.LSTM(
      input_size=self.embedding_dim,
      hidden_size=self.hidden_dim,
      bidirectional=True,
      batch_first=True
    )

    self.dropout_layer = nn.Dropout(self.lstm_dropout)

    self.linear_layer = nn.Linear(
      in_features=self.hidden_dim * 2,
      out_features=self.linear_dim
    )

    self.elu_layer = nn.ELU()

    self.classifier = nn.Linear(
      in_features=self.linear_dim,
      out_features=num_classes,
    )

  def forward(self, x):
    x = self.embedding_layer(x)
    x, _ = self.lstm_layer(x)
    x = self.dropout_layer(x)
    x = self.linear_layer(x)
    x = self.elu_layer(x)
    x = self.classifier(x)

    return x

In [27]:
def train_lstm_model(dataloader, num_classes, weight_balance):
  model = CustomLSTM(num_classes)

  learning_rate = 0.095
  criterion = nn.CrossEntropyLoss(ignore_index=padding_value, weight=weight_balance)
  optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
  exp_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)
  no_epochs = 40

  for epoch in range(no_epochs):
    train_loss = 0.0

    model.train()

    match = 0
    word_count = 0

    for data, target in dataloader:
      optimizer.zero_grad()

      output = model(data)

      output = output.view(-1, num_classes)
      target = target.view(-1)

      for i in range(output.shape[0]):
        pred = torch.argmax(output[i])
        actual = target[i].item()
        word_count += 1
        if (actual == padding_value):
          continue
        if (pred == actual):
          match += 1

      loss = criterion(output, target)

      loss.backward()

      optimizer.step()

      train_loss += loss.item()

    train_loss = train_loss/len(dataloader.dataset)

    exp_scheduler.step()
    
    accuracy = (match / word_count) * 100

    print(
      'Epoch: {} \tTraining Loss: {:.6f}'.format(
        epoch+1,
        train_loss,
      )
    )
    print("Accuracy: ", accuracy)

  return model

In [28]:
class CustomDataset(Dataset):
  def __init__(self, X, Y):
    self.X = X
    self.Y = Y

  def __len__(self):
    return len(self.X)

  def __getitem__(self, index):
    return self.X[index], self.Y[index]

In [29]:
def custom_collate_fn(batch):
    sentences = [torch.LongTensor(item[0]) for item in batch]
    tags = [torch.LongTensor(item[1]) for item in batch]

    padded_sentences = pad_sequence(
        sentences, batch_first=True)    
    padded_tags = pad_sequence(
        tags, batch_first=True, padding_value=padding_value)

    return padded_sentences, padded_tags


def getDatasetAndDataLoader(X, Y, batch_size=32):
    dataset = CustomDataset(X, Y)

    dataloader = DataLoader(dataset, batch_size=batch_size,
                            shuffle=False, collate_fn=custom_collate_fn)

    return dataloader

In [30]:
train_sentences = readFile(train_file)

train_sentences = sorted(train_sentences, key=lambda x: len(x))

nerTagSet = set()
nerTagFreq = {}
for sentence in train_sentences:
  for word, nerTag in sentence:
    nerTagSet.add(nerTag)
    if nerTag in nerTagFreq:
      nerTagFreq[nerTag] += 1
    else:
      nerTagFreq[nerTag] = 1
  
tag_map = { tag: index for index, tag in enumerate(list(nerTagSet)) }

map_index_to_tag = {
  index: tag for tag, index in tag_map.items()
}

In [31]:
X = []
Y = []

for sentence in train_sentences:
  ip = []
  op = []

  for word, tag in sentence:
    ip.append(word_index_map.get(word, unknown_value))  
    op.append(tag_map[tag])

  X.append(ip)
  Y.append(op)

train_dataloader = getDatasetAndDataLoader(X, Y, batch_size=8)

In [32]:
weights = [0 for _ in range(len(tag_map))]

for index, tag in enumerate(tag_map):
  weights[index] = len(train_sentences) / (len(tag_map) * nerTagFreq[tag])

weight_balance = torch.Tensor(weights)

weight_balance

tensor([0.4496, 0.0098, 0.2634, 0.4844, 1.4393, 1.4418, 0.2523, 0.3678, 0.2332])

In [33]:
model = train_lstm_model(dataloader=train_dataloader, num_classes=len(tag_map), weight_balance=torch.Tensor(weight_balance))

Epoch: 1 	Training Loss: 0.116681
Accuracy:  83.43901165146168
Epoch: 2 	Training Loss: 0.072189
Accuracy:  88.48862920821801
Epoch: 3 	Training Loss: 0.062164
Accuracy:  89.65670410559045
Epoch: 4 	Training Loss: 0.056598
Accuracy:  90.35715854676448
Epoch: 5 	Training Loss: 0.052015
Accuracy:  90.97658480868076
Epoch: 6 	Training Loss: 0.048789
Accuracy:  91.32217487443073
Epoch: 7 	Training Loss: 0.045626
Accuracy:  91.85813234928027
Epoch: 8 	Training Loss: 0.043317
Accuracy:  92.09877627924459
Epoch: 9 	Training Loss: 0.040961
Accuracy:  92.39213733788263
Epoch: 10 	Training Loss: 0.038883
Accuracy:  92.59861275852138
Epoch: 11 	Training Loss: 0.037600
Accuracy:  92.84853099815979
Epoch: 12 	Training Loss: 0.035881
Accuracy:  93.06379260691082
Epoch: 13 	Training Loss: 0.034964
Accuracy:  93.16873874269648
Epoch: 14 	Training Loss: 0.033356
Accuracy:  93.48113654224449
Epoch: 15 	Training Loss: 0.032832
Accuracy:  93.59242825833346
Epoch: 16 	Training Loss: 0.031351
Accuracy:  93.

In [39]:
torch.save(model, "blstm2.pt")

In [34]:
def predict_dev(pred_model, dataloader):
  # pred_model.eval()
  y_actual = []
  y_pred = []
  
  # with torch.no_grad():
  for data, target in dataloader:
    pred = pred_model(data)

    pred = torch.argmax(pred, 2)

    y_pred.extend(pred)
    y_actual.extend(target)

  return y_pred, y_actual

In [35]:
dev_sentences = readFile(dev_file)

X_dev = []
Y_dev = []

for sentence in dev_sentences:
  ip_dev = []
  op_dev = []

  for word, tag in sentence:
    ip_dev.append(word_index_map.get(word, unknown_value))
    op_dev.append(tag_map[tag])

  X_dev.append(ip_dev)
  Y_dev.append(op_dev)

dev_dataloader = getDatasetAndDataLoader(X_dev, Y_dev, batch_size=2)

y_pred, y_actual = predict_dev(model, dev_dataloader)

In [36]:
pred_sentences = []
for index, sentence in enumerate(dev_sentences):
  preds = y_pred[index].tolist()
  if not isinstance(preds, list):
    preds = [preds]
  curr_sentence = []
  for word_index, (word, _) in enumerate(sentence):
    curr_sentence.append((word, map_index_to_tag[preds[word_index]]))
  pred_sentences.append(curr_sentence)

writeFile("dev2.out", pred_sentences)

In [37]:
class TestDataset(Dataset):
    def __init__(self, X):
        self.X = X

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        return self.X[index]


def test_collate_fn(batch):
    sentences = [torch.LongTensor(item) for item in batch]

    padded_sentences = pad_sequence(
        sentences, batch_first=True)

    return padded_sentences


def getTestDataloader(X, batch_size=2):
    dataset = TestDataset(X)

    return DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=test_collate_fn)


def predict_test(pred_model, dataloader):
    # pred_model.eval()
    y_pred = []

    # with torch.no_grad():
    for data in dataloader:
        pred = pred_model(data)

        pred = torch.argmax(pred, 2)

        y_pred.extend(pred)

    return y_pred

In [38]:
test_sentences = readFile(test_file, entity=False)

X_test = []

for sentence in test_sentences:
  ip_test = []
  for word in sentence:
    ip_test.append(word_index_map.get(word, unknown_value))
  X_test.append(ip_test)

test_dataloader = getTestDataloader(X_test)

y_pred = predict_test(model, test_dataloader)

pred_sentences = []
for index, sentence in enumerate(test_sentences):
  preds = y_pred[index].tolist()
  if not isinstance(preds, list):
    preds = [preds]
  curr_sentence = []
  for word_index, word in enumerate(sentence):
    curr_sentence.append((word, map_index_to_tag[preds[word_index]]))
  pred_sentences.append(curr_sentence)

writeFile("test2.out", pred_sentences)

processed 51578 tokens with 5942 phrases; found: 7693 phrases; correct: 4697.

- accuracy:  93.40%; precision:  61.06%; recall:  79.05%; FB1:  68.90

- LOC: precision:  66.88%; recall:  78.61%; FB1:  72.27  2159

- MISC: precision:  41.42%; recall:  67.57%; FB1:  51.36  1504

- ORG: precision:  49.06%; recall:  72.04%; FB1:  58.37  1969

- PER: precision:  80.74%; recall:  90.34%; FB1:  85.27  2061