In [70]:
# Setup and install dependencies
# !pip3 install numpy
# !pip3 install torch

# Import libraries
import os
import numpy as np
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# print(torch.cuda.is_available())
print(torch.cuda.device_count())
torch.cuda.set_device(0)
torch.manual_seed(1)

# Enable floating-point underflow warning
np.seterr(under="warn")

# Set OS-independent paths, relative to current directory
es_train_path = os.path.join("data", "ES", "train")
es_dev_in_path = os.path.join("data", "ES", "dev.in")
es_dev_out_path = os.path.join("data", "ES", "dev.out")
es_dev_p1_out_path = os.path.join("data", "ES", "dev.p1.out")
es_dev_p2_out_path = os.path.join("data", "ES", "dev.p2.out")
es_dev_p3_out_path = os.path.join("data", "ES", "dev.p3.out")
es_dev_p4_out_path = os.path.join("data", "ES", "dev.p4.out")
ru_train_path = os.path.join("data", "RU", "train")
ru_dev_in_path = os.path.join("data", "RU", "dev.in")
ru_dev_out_path = os.path.join("data", "RU", "dev.out")
ru_dev_p1_out_path = os.path.join("data", "RU", "dev.p1.out")
ru_dev_p2_out_path = os.path.join("data", "RU", "dev.p2.out")
ru_dev_p3_out_path = os.path.join("data", "RU", "dev.p3.out")
ru_dev_p4_out_path = os.path.join("data", "RU", "dev.p4.out")

# Define constant variables
N = 7
O, BPOS, IPOS, BNEU, INEU, BNEG, INEG = 0, 1, 2, 3, 4, 5, 6
label_to_id = {"O": O,
          "B-positive": BPOS,
          "I-positive": IPOS,
          "B-neutral": BNEU,
          "I-neutral": INEU,
          "B-negative": BNEG,
          "I-negative": INEG,}
id_to_label = ["O", "B-positive", "I-positive", "B-neutral", "I-neutral", "B-negative", "I-negative"]

# Initialise a random number generator with a fixed seed for reproducible results and deterministic behavior
rng = np.random.default_rng(1004519 + 1004103 + 1004555)

1


In [57]:
# Read dev.in data
def read_dev_in_data(filepath):
    results = []
    with open(filepath, "r", encoding="utf-8") as file:
        lines = file.readlines()
        sentence = []
        for line in lines:
            if line.strip() != '':
                sentence.append(line.strip())  # add zero if meet unkown token
            else:
                results.append(sentence.copy())
                sentence = []
    return results

# Read training data
def read_training_data(filepath):
    results = []
    vocab = {'': 0}
    with open(filepath, "r", encoding="utf-8") as file:
        lines = file.readlines()
        tokens = []
        labels = []
        for line in lines:
            if len(line.strip().rsplit(" ", 1)) == 2:
                token, label = line.strip().rsplit(" ", 1)
                if token not in vocab:
                    vocab[token] = len(vocab)
                tokens.append(vocab[token])
                labels.append(label_to_id[label])
            else:
                results.append((tokens.copy(), labels.copy()))
                tokens = []
                labels = []
    return results, vocab

def prepare_sequence(seq, to_ix):
    idxs = [to_ix.get(w, 0) for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [37]:
# load training data:
training_data, word_to_id = read_training_data(es_train_path)
# print(word_to_id)

# load the test data:
test_data = read_dev_in_data(es_dev_in_path)
# print(test_data)

EMBEDDING_DIM = 6
HIDDEN_DIM = 6

In [20]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [23]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_id), len(label_to_id))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    inputs = torch.tensor(training_data[0][0], dtype=torch.long)
    tag_scores = model(inputs)
    print(tag_scores)

tensor([[-1.6177, -2.1740, -2.1268, -2.3428, -1.5927, -2.1651, -1.8674],
        [-1.6600, -2.1226, -2.0203, -2.3408, -1.6991, -2.1271, -1.8373],
        [-1.6473, -2.1037, -2.0571, -2.3674, -1.6408, -2.1772, -1.8534],
        [-1.7730, -2.0963, -1.9605, -2.3229, -1.6316, -2.2307, -1.7993],
        [-1.6879, -2.1069, -2.0223, -2.3662, -1.6765, -2.1452, -1.8126],
        [-1.6987, -2.1225, -2.0197, -2.3861, -1.5958, -2.2219, -1.8231],
        [-1.6494, -2.0551, -2.0582, -2.3670, -1.6637, -2.2132, -1.8356],
        [-1.6452, -2.1272, -2.0568, -2.4101, -1.5964, -2.2009, -1.8524],
        [-1.6374, -2.0665, -2.0304, -2.3608, -1.6987, -2.1570, -1.8665],
        [-1.6296, -2.0674, -2.0744, -2.3700, -1.6498, -2.1826, -1.8738],
        [-1.6925, -2.0788, -2.0453, -2.2668, -1.6751, -2.1933, -1.8386],
        [-1.6847, -2.1335, -2.0630, -2.3403, -1.5677, -2.2538, -1.8378],
        [-1.6358, -2.1035, -2.0591, -2.3747, -1.6520, -2.1861, -1.8417],
        [-1.7041, -2.1420, -2.0098, -2.3664, -1.634

In [52]:
model.cuda()

LSTMTagger(
  (word_embeddings): Embedding(5348, 6)
  (lstm): LSTM(6, 6)
  (hidden2tag): Linear(in_features=6, out_features=7, bias=True)
)

In [71]:
for epoch in range(1000):  # again, normally you would NOT do 300 epochs, it is toy data
    print("epoch ", epoch)
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = torch.tensor(sentence, dtype=torch.long).type(torch.cuda.LongTensor)
        targets = torch.tensor(tags, dtype=torch.long).type(torch.cuda.LongTensor)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
    print(loss.item())

# See what the scores are after training
# with torch.no_grad():
#     inputs = prepare_sequence(test_data[0], word_to_id).type(torch.cuda.LongTensor)
#     tag_scores = model(inputs)

#     print(tag_scores)
print("Done")

epoch  0
0.11633754521608353
epoch  1
0.11078547686338425
epoch  2
0.1169263944029808
epoch  3
0.1217745766043663
epoch  4
0.11806479096412659
epoch  5
0.12061285972595215
epoch  6
0.12528499960899353
epoch  7
0.11965995281934738
epoch  8
0.12225370109081268
epoch  9
0.12126162648200989
epoch  10
0.11848057061433792
epoch  11
0.12149934470653534
epoch  12
0.12037868052721024
epoch  13
0.12346511334180832
epoch  14
0.12152369320392609
epoch  15
0.12529826164245605
epoch  16
0.12122891843318939
epoch  17
0.1259893774986267
epoch  18
0.1219339445233345
epoch  19
0.11951681226491928
epoch  20
0.12747812271118164
epoch  21
0.12532302737236023
epoch  22
0.12266930937767029
epoch  23
0.1306934654712677
epoch  24
0.125125914812088
epoch  25
0.12236467003822327
epoch  26
0.12498821318149567
epoch  27
0.12341330945491791
epoch  28
0.12230958044528961
epoch  29
0.1263711303472519
epoch  30
0.12294110655784607
epoch  31
0.12145999073982239
epoch  32
0.12499453872442245
epoch  33
0.1230209246277809

In [69]:
with torch.no_grad():
    with open(es_dev_p4_out_path, "w+", encoding="utf-8") as file:
        for sentence in test_data:
            inputs = prepare_sequence(sentence, word_to_id).type(torch.cuda.LongTensor)
            tag_scores = np.array(model(inputs).cpu())
            pred_labels = np.argmax(tag_scores, axis=1)
            for i in range(len(sentence)):
                file.write("{} {}\n".format(sentence[i], id_to_label[pred_labels[i]]))
            file.write("\n")
