In [1]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from torch.nn.utils.rnn import pad_sequence

In [2]:
train_file = "data/train"
dev_file = "data/dev"
test_file = "data/test"

padding_value = 9
unknown_value = 21012
threshold_value = 0

In [3]:
def mapSentenceToList(sentences, entity):
  sentences = sentences.split("\n")

  if entity:
    sentences = list(map(lambda x: tuple(x.split(" ")[1:]), sentences))
  else:
    sentences = list(map(lambda x: x.split(" ")[1], sentences))

  return sentences


def readFile(file_path, entity=True):
  with open(file_path, "r") as fin:
    sentences = "".join(list(fin)).split("\n\n")
    sentences[-1] = sentences[-1].rstrip()

    sentences = list(
      map(
        lambda x: mapSentenceToList(x, entity),
        sentences
      )
    )

    return sentences

In [4]:
def writeFile(file_path, sentences):
  with open(file_path, "w") as fout:
    for sentence in sentences:
      for index, (word, tag) in enumerate(sentence):
        s = str(index + 1) + " " + word + " " + tag + "\n"
        fout.write(s)
      fout.write("\n")

In [5]:
class CustomLSTM(nn.Module):
  def __init__(self, vocab_size, num_classes):
    super(CustomLSTM, self).__init__()
    self.embedding_dim = 100
    self.hidden_dim = 256
    self.lstm_dropout = 0.33
    self.linear_dim = 128

    self.embedding_layer = nn.Embedding(
      num_embeddings=vocab_size,
      embedding_dim=100,
      padding_idx=0
    )

    self.lstm_layer = nn.LSTM(
      input_size=self.embedding_dim,
      hidden_size=self.hidden_dim,
      bidirectional=True,
      batch_first=True
    )

    self.dropout_layer = nn.Dropout(self.lstm_dropout)

    self.linear_layer = nn.Linear(
      in_features=self.hidden_dim * 2,
      out_features=self.linear_dim
    )

    self.elu_layer = nn.ELU()

    self.classifier = nn.Linear(
      in_features=self.linear_dim,
      out_features=num_classes,
    )

  def forward(self, x):
    x = self.embedding_layer(x)
    x, _ = self.lstm_layer(x)
    x = self.dropout_layer(x)
    x = self.linear_layer(x)
    x = self.elu_layer(x)
    x = self.classifier(x)

    return x

In [6]:
def train_lstm_model(dataloader, vocab_size, num_classes, weight_balance):
  model = CustomLSTM(vocab_size, num_classes)

  learning_rate = 0.095
  criterion = nn.CrossEntropyLoss(ignore_index=padding_value, weight=weight_balance)
  optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
  exp_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)
  no_epochs = 40

  for epoch in range(no_epochs):
    train_loss = 0.0

    model.train()

    match = 0
    word_count = 0

    for data, target in dataloader:
      optimizer.zero_grad()

      output = model(data)

      output = output.view(-1, num_classes)
      target = target.view(-1)

      for i in range(output.shape[0]):
        pred = torch.argmax(output[i])
        actual = target[i].item()
        word_count += 1
        if (actual == padding_value):
          continue
        if (pred == actual):
          match += 1

      loss = criterion(output, target)

      loss.backward()

      optimizer.step()

      train_loss += loss.item()

    train_loss = train_loss/len(dataloader.dataset)

    exp_scheduler.step()
    
    accuracy = (match / word_count) * 100

    print(
      'Epoch: {} \tTraining Loss: {:.6f}'.format(
        epoch+1,
        train_loss,
      )
    )
    print("Accuracy: ", accuracy)

  return model

In [7]:
class CustomDataset(Dataset):
  def __init__(self, X, Y):
    self.X = X
    self.Y = Y

  def __len__(self):
    return len(self.X)

  def __getitem__(self, index):
    return self.X[index], self.Y[index]

In [8]:
def custom_collate_fn(batch):
    sentences = [torch.LongTensor(item[0]) for item in batch]
    tags = [torch.LongTensor(item[1]) for item in batch]

    padded_sentences = pad_sequence(
        sentences, batch_first=True)
    padded_tags = pad_sequence(
        tags, batch_first=True, padding_value=padding_value)

    return padded_sentences, padded_tags


def getDatasetAndDataLoader(X, Y, batch_size=32):
    dataset = CustomDataset(X, Y)

    dataloader = DataLoader(dataset, batch_size=batch_size,
                            shuffle=False, collate_fn=custom_collate_fn)

    return dataloader

In [9]:
train_sentences = readFile(train_file)

train_sentences = sorted(train_sentences, key=lambda x: len(x))

words_list = []
nerTagSet = set()
nerTagList = []
nerTagFreq = {}
for sentence in train_sentences:
  for word, nerTag in sentence:
    words_list.append(word.lower())
    nerTagSet.add(nerTag)
    nerTagList.append(nerTag)
    if nerTag in nerTagFreq:
      nerTagFreq[nerTag] += 1
    else:
      nerTagFreq[nerTag] = 1
  
words_counter = Counter(words_list)
words_counter = [word for word, count in words_counter.items() if count >= threshold_value]
vocab = {word: index + 1 for index, word in enumerate(words_counter)}
vocab["<PAD>"] = 0
vocab["<UNK>"] = unknown_value

tag_map = { tag: index for index, tag in enumerate(list(nerTagSet)) }

map_index_to_tag = {
  index: tag for tag, index in tag_map.items()
}

print(len(words_counter))
print(len(vocab))
print(tag_map)

21010
21012
{'I-ORG': 0, 'B-PER': 1, 'B-MISC': 2, 'B-LOC': 3, 'B-ORG': 4, 'I-LOC': 5, 'I-PER': 6, 'I-MISC': 7, 'O': 8}


In [10]:
X = []
Y = []

for sentence in train_sentences:
  ip = []
  op = []

  for word, tag in sentence:
    ip.append(vocab.get(word.lower(), unknown_value))
    op.append(tag_map[tag])

  X.append(ip)
  Y.append(op)

train_dataloader = getDatasetAndDataLoader(X, Y, batch_size=8)

In [11]:
weights = [0 for _ in range(len(tag_map))]

for index, tag in enumerate(tag_map):
  weights[index] = len(train_sentences) / (len(tag_map) * nerTagFreq[tag])

weight_balance = torch.Tensor(weights)

weight_balance

tensor([0.4496, 0.2523, 0.4844, 0.2332, 0.2634, 1.4393, 0.3678, 1.4418, 0.0098])

In [12]:
model = train_lstm_model(train_dataloader, len(vocab) + 1, len(tag_map), weight_balance=torch.Tensor(weight_balance))

Epoch: 1 	Training Loss: 0.176754
Accuracy:  51.28986122703998
Epoch: 2 	Training Loss: 0.122137
Accuracy:  60.96735931116285
Epoch: 3 	Training Loss: 0.088448
Accuracy:  67.6092294024904
Epoch: 4 	Training Loss: 0.065097
Accuracy:  72.49532623604583
Epoch: 5 	Training Loss: 0.049974
Accuracy:  76.08985341709499
Epoch: 6 	Training Loss: 0.038772
Accuracy:  78.89801676209444
Epoch: 7 	Training Loss: 0.031037
Accuracy:  81.29274114425456
Epoch: 8 	Training Loss: 0.026076
Accuracy:  83.02313208081341
Epoch: 9 	Training Loss: 0.022166
Accuracy:  84.45918571561062
Epoch: 10 	Training Loss: 0.018915
Accuracy:  85.82055675145338
Epoch: 11 	Training Loss: 0.016891
Accuracy:  86.91980650861291
Epoch: 12 	Training Loss: 0.015038
Accuracy:  87.77450736331376
Epoch: 13 	Training Loss: 0.014151
Accuracy:  88.19575627114176
Epoch: 14 	Training Loss: 0.012490
Accuracy:  89.03142038493267
Epoch: 15 	Training Loss: 0.011243
Accuracy:  89.82705853065647
Epoch: 16 	Training Loss: 0.010675
Accuracy:  90.1

In [13]:
torch.save(model, "blstm1.pt")

In [14]:
def predict_dev(pred_model, dataloader):
  # pred_model.eval()
  y_actual = []
  y_pred = []
  
  # with torch.no_grad():
  for data, target in dataloader:
    pred = pred_model(data)

    pred = torch.argmax(pred, 2)

    y_pred.extend(pred)
    y_actual.extend(target)

  return y_pred, y_actual

In [15]:
dev_sentences = readFile(dev_file)

X_dev = []
Y_dev = []

for sentence in dev_sentences:
  ip_dev = []
  op_dev = []

  for word, tag in sentence:
    ip_dev.append(vocab.get(word.lower(), unknown_value))
    op_dev.append(tag_map[tag])

  X_dev.append(ip_dev)
  Y_dev.append(op_dev)

dev_dataloader = getDatasetAndDataLoader(X_dev, Y_dev, batch_size=2)

y_pred, y_actual = predict_dev(model, dev_dataloader)

In [16]:
pred_sentences = []
for index, sentence in enumerate(dev_sentences):
  preds = y_pred[index].tolist()
  if not isinstance(preds, list):
    preds = [preds]
  curr_sentence = []
  for word_index, (word, _) in enumerate(sentence):
    curr_sentence.append((word, map_index_to_tag[preds[word_index]]))
  pred_sentences.append(curr_sentence)

writeFile("dev1.out", pred_sentences)

In [17]:
class TestDataset(Dataset):
    def __init__(self, X):
        self.X = X

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        return self.X[index]


def test_collate_fn(batch):
    sentences = [torch.LongTensor(item) for item in batch]

    padded_sentences = pad_sequence(
        sentences, batch_first=True)

    return padded_sentences


def getTestDataloader(X, batch_size=2):
    dataset = TestDataset(X)

    return DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=test_collate_fn)


def predict_test(pred_model, dataloader):
    # pred_model.eval()
    y_pred = []

    # with torch.no_grad():
    for data in dataloader:
        pred = pred_model(data)

        pred = torch.argmax(pred, 2)

        y_pred.extend(pred)

    return y_pred

In [18]:
test_sentences = readFile(test_file, entity=False)

X_test = []

for sentence in test_sentences:
  ip_test = []
  for word in sentence:
    ip_test.append(vocab.get(word.lower(), unknown_value))
  X_test.append(ip_test)

test_dataloader = getTestDataloader(X_test)

y_pred = predict_test(model, test_dataloader)

pred_sentences = []
for index, sentence in enumerate(test_sentences):
  preds = y_pred[index].tolist()
  if not isinstance(preds, list):
    preds = [preds]
  curr_sentence = []
  for word_index, word in enumerate(sentence):
    curr_sentence.append((word, map_index_to_tag[preds[word_index]]))
  pred_sentences.append(curr_sentence)

writeFile("test1.out", pred_sentences)

processed 51578 tokens with 5942 phrases; found: 9516 phrases; correct: 4264.
- accuracy:  87.89%; precision:  44.81%; recall:  71.76%; FB1:  55.17
- LOC: precision:  69.04%; recall:  76.59%; FB1:  72.62  2038
- MISC: precision:  47.49%; recall:  68.66%; FB1:  56.14  1333
- ORG: precision:  32.21%; recall:  63.61%; FB1:  42.77  2648
- PER: precision:  39.21%; recall:  74.43%; FB1:  51.36  3497