In [50]:
# Setup and install dependencies
# !pip3 install numpy
# !pip3 install torch

# Import libraries
import os
import numpy as np
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Report how many CUDA devices are visible, then choose the compute device.
print(torch.cuda.device_count())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    # Only select CUDA device 0 when CUDA is actually available; calling
    # torch.cuda.set_device on a machine without CUDA fails, which would
    # defeat the CPU fallback chosen above.
    torch.cuda.set_device(0)
# Fixed seed for PyTorch's random number generator
torch.manual_seed(1)

# Enable floating-point underflow warning
np.seterr(under="warn")

# Dataset file locations, relative to the current directory. Everything is
# assembled with os.path.join so the separators are OS-independent.
_DEV_FILE_NAMES = ("train", "dev.in", "dev.out",
                   "dev.p1.out", "dev.p2.out", "dev.p3.out", "dev.p4.out")
_TEST_FILE_NAMES = ("test.in", "test.out")

# Spanish (ES) files
(es_train_path, es_dev_in_path, es_dev_out_path,
 es_dev_p1_out_path, es_dev_p2_out_path,
 es_dev_p3_out_path, es_dev_p4_out_path) = (
    os.path.join("data", "ES", name) for name in _DEV_FILE_NAMES)
es_test_in_path, es_test_out_path = (
    os.path.join("data", "ES-test", name) for name in _TEST_FILE_NAMES)

# Russian (RU) files
(ru_train_path, ru_dev_in_path, ru_dev_out_path,
 ru_dev_p1_out_path, ru_dev_p2_out_path,
 ru_dev_p3_out_path, ru_dev_p4_out_path) = (
    os.path.join("data", "RU", name) for name in _DEV_FILE_NAMES)
ru_test_in_path, ru_test_out_path = (
    os.path.join("data", "RU-test", name) for name in _TEST_FILE_NAMES)


# Tag inventory. A label's id is its position in id_to_label; N is the number
# of labels.
id_to_label = ["O", "B-positive", "I-positive", "B-neutral", "I-neutral", "B-negative", "I-negative"]
N = 7
O, BPOS, IPOS, BNEU, INEU, BNEG, INEG = range(N)
# Reverse lookup: label string -> integer id
label_to_id = dict(zip(id_to_label, (O, BPOS, IPOS, BNEU, INEU, BNEG, INEG)))

# NumPy random generator with a fixed seed (the sum of three fixed integers),
# so draws from it are reproducible from run to run.
_SEED_PARTS = (1004519, 1004103, 1004555)
rng = np.random.default_rng(sum(_SEED_PARTS))

1


In [51]:
# Read dev.in data
def read_dev_in_data(filepath):
    """Read an unlabelled token file into a list of sentences.

    The file is expected to hold one token per line, with a blank
    (whitespace-only) line marking the end of each sentence.

    Args:
        filepath: Path to a UTF-8 encoded file such as ``dev.in``.

    Returns:
        A list of sentences, each a list of whitespace-stripped token strings.
    """
    results = []
    sentence = []
    with open(filepath, "r", encoding="utf-8") as file:
        for line in file:
            token = line.strip()
            if token != '':
                sentence.append(token)
            else:
                # Blank line: the current sentence is complete
                results.append(sentence)
                sentence = []
    # Flush the final sentence when the file does not end with a blank line;
    # without this the last sentence would be silently dropped.
    if sentence:
        results.append(sentence)
    return results

# Read training data
def read_training_data(filepath):
    """Read a labelled training file and build the vocabulary.

    Each non-boundary line has the form ``<token> <label>``; the split is on
    the LAST space, so a token may itself contain spaces. Any line that does
    not split into two parts (e.g. a blank line) ends the current sentence.

    Args:
        filepath: Path to a UTF-8 encoded training file.

    Returns:
        A ``(results, vocab)`` tuple. ``results`` is a list of
        ``(token_ids, label_ids)`` pairs, one per sentence. ``vocab`` maps each
        token string to its integer id; the empty string is pre-assigned id 0.

    Raises:
        KeyError: If a label is not a key of ``label_to_id``.
    """
    results = []
    vocab = {'': 0}
    tokens = []
    labels = []
    with open(filepath, "r", encoding="utf-8") as file:
        for line in file:
            parts = line.strip().rsplit(" ", 1)
            if len(parts) == 2:
                token, label = parts
                if token not in vocab:
                    # New word: give it the next free id
                    vocab[token] = len(vocab)
                tokens.append(vocab[token])
                labels.append(label_to_id[label])
            else:
                # Sentence boundary
                results.append((tokens, labels))
                tokens = []
                labels = []
    # Flush the final sentence when the file does not end with a blank line;
    # without this the last training sentence would be silently dropped.
    if tokens:
        results.append((tokens, labels))
    return results, vocab

def prepare_sequence(seq, to_ix):
    """Convert a sequence of items into a 1-D long tensor of indices.

    Each item is looked up in ``to_ix``; items that are missing map to 0.
    """
    indices = []
    for item in seq:
        indices.append(to_ix.get(item, 0))
    return torch.tensor(indices, dtype=torch.long)

In [52]:
class LSTMTagger(nn.Module):
    """Sequence tagger: embedding layer -> stacked unidirectional LSTM -> linear -> log-softmax."""

    def __init__(self, embedding_dim, hidden_dim, num_layers, vocab_size, tagset_size):
        """Create the embedding, LSTM and output layers.

        Args:
            embedding_dim: Size of each word embedding vector.
            hidden_dim: Size of the LSTM hidden state.
            num_layers: Number of stacked LSTM layers.
            vocab_size: Number of rows in the embedding table.
            tagset_size: Number of output tags.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # Lookup table from word index to embedding vector
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Embeddings in, hidden states of size hidden_dim out
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers,
            bidirectional=False,
        )

        # Projects each hidden state onto the tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        """Return per-position log-probabilities over tags for one sentence.

        ``sentence`` is a tensor of word indices; the result has one row per
        position (``len(sentence)`` rows), log-softmax-normalised along dim 1.
        """
        seq_len = len(sentence)
        # Reshape to (seq_len, 1, -1): the sentence is fed as a batch of one
        lstm_input = self.word_embeddings(sentence).view(seq_len, 1, -1)
        hidden_states, _ = self.lstm(lstm_input)
        # Drop the batch dimension again before projecting to tag space
        logits = self.hidden2tag(hidden_states.view(seq_len, -1))
        return F.log_softmax(logits, dim=1)

In [58]:
# Language to train on: 'es' or 'ru'
LANG = 'ru'

# Training file for each supported language code
TRAIN_PATHS = {'es': es_train_path, 'ru': ru_train_path}
if LANG not in TRAIN_PATHS:
    # Fail here with a clear message; otherwise training_data and word_to_id
    # would stay undefined and a later cell would raise a NameError.
    raise ValueError("Unsupported LANG {!r}; expected one of {}".format(LANG, sorted(TRAIN_PATHS)))

# load training data:
training_data, word_to_id = read_training_data(TRAIN_PATHS[LANG])
# print(word_to_id)

# Model hyperparameters
EMBEDDING_DIM = 16
HIDDEN_DIM = 16
NUM_LAYERS = 4
# Results recorded with the above settings:
# ES: 30 epochs
# Loss: 0.054697051644325256
# Entity F: 0.5746
# Sentiment F: 0.4474
# RU: 45 epochs
# Loss: 0.029415829107165337
# Entity F: 0.5191
# Sentiment F: 0.3788

In [59]:
# Build the tagger and move it to the selected device (GPU when available,
# CPU otherwise) BEFORE constructing the optimizer, as the PyTorch optimizer
# documentation recommends.
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, NUM_LAYERS, len(word_to_id), len(label_to_id)).to(device)
# The model outputs log-probabilities (log_softmax), which NLLLoss expects
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# with torch.no_grad():
#     inputs = torch.tensor(training_data[0][0], dtype=torch.long, device=device)
#     tag_scores = model(inputs)
#     print(tag_scores)

# Last expression of the cell: display the model summary
model

LSTMTagger(
  (word_embeddings): Embedding(8329, 16)
  (lstm): LSTM(16, 16, num_layers=4)
  (hidden2tag): Linear(in_features=16, out_features=7, bias=True)
)

In [67]:
# Early-stopping state: `s` is 0 until the printed loss first drops below
# 0.01, then 1. Once it is 1, training stops as soon as the printed loss rises
# above 0.05 again.
s = 0
for epoch in range(30):  # upper bound on epochs; the early-stop check below may end training sooner
    print("epoch ", epoch)
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices, created directly on the selected device so
        # the loop also runs on CPU-only machines.
        sentence_in = torch.tensor(sentence, dtype=torch.long, device=device)
        targets = torch.tensor(tags, dtype=torch.long, device=device)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

    # NOTE: this is the loss of the LAST sentence of the epoch only, not an
    # average over the epoch; the early-stop thresholds below apply to it.
    last_loss = loss.item()
    print(last_loss)
    if s == 0 and last_loss < 0.01:
        s = 1
    elif s == 1 and last_loss > 0.05:
        break

# See what the scores are after training
# with torch.no_grad():
#     inputs = prepare_sequence(test_data[0], word_to_id).to(device)
#     tag_scores = model(inputs)

#     print(tag_scores)
print("Done")

epoch  0
0.029119115322828293
epoch  1
0.034608807414770126
epoch  2
0.031016912311315536
epoch  3
0.03697674721479416
epoch  4
0.030748898163437843
Done


In [68]:
# running on dev.in
if LANG == 'es':
    test_data = read_dev_in_data(es_dev_in_path)
    path = es_dev_p4_out_path
elif LANG == 'ru':
    test_data = read_dev_in_data(ru_dev_in_path)
    path = ru_dev_p4_out_path

# Predict a tag for every token and write "<token> <label>" lines, with a
# blank line after each sentence.
with torch.no_grad():
    # Write-only access is sufficient here
    with open(path, "w", encoding="utf-8") as file:
        for sentence in test_data:
            if not sentence:
                # The model cannot process a zero-length sequence; emit only
                # the sentence separator.
                file.write("\n")
                continue
            # Words missing from word_to_id map to index 0 (see prepare_sequence)
            inputs = prepare_sequence(sentence, word_to_id).to(device)
            tag_scores = model(inputs).cpu().numpy()
            # Highest-scoring tag id for each position
            pred_labels = np.argmax(tag_scores, axis=1)
            for token, label_id in zip(sentence, pred_labels):
                file.write("{} {}\n".format(token, id_to_label[label_id]))
            file.write("\n")


In [64]:
# running on test.in
if LANG == 'es':
    test_data = read_dev_in_data(es_test_in_path)
    path = es_test_out_path
elif LANG == 'ru':
    test_data = read_dev_in_data(ru_test_in_path)
    path = ru_test_out_path

# Predict a tag for every token and write "<token> <label>" lines, with a
# blank line after each sentence.
with torch.no_grad():
    # Write-only access is sufficient here
    with open(path, "w", encoding="utf-8") as file:
        for sentence in test_data:
            if not sentence:
                # The model cannot process a zero-length sequence; emit only
                # the sentence separator.
                file.write("\n")
                continue
            # Words missing from word_to_id map to index 0 (see prepare_sequence)
            inputs = prepare_sequence(sentence, word_to_id).to(device)
            tag_scores = model(inputs).cpu().numpy()
            # Highest-scoring tag id for each position
            pred_labels = np.argmax(tag_scores, axis=1)
            for token, label_id in zip(sentence, pred_labels):
                file.write("{} {}\n".format(token, id_to_label[label_id]))
            file.write("\n")
