In [101]:
# Setup and install dependencies
# !pip3 install numpy
# !pip3 install torch

# Import libraries
import os
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns

# from gensim.models import Word2Vec

from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Dense, Input
from keras.layers import TimeDistributed
from keras.layers import LSTM, GRU, Bidirectional, SimpleRNN, RNN
from keras.models import Model
from keras.preprocessing.text import Tokenizer

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Enable floating-point underflow warning
np.seterr(under="warn")

# Set OS-independent paths, relative to current directory
es_train_path = os.path.join("data", "ES", "train")
es_dev_in_path = os.path.join("data", "ES", "dev.in")
es_dev_out_path = os.path.join("data", "ES", "dev.out")
es_dev_p1_out_path = os.path.join("data", "ES", "dev.p1.out")
es_dev_p2_out_path = os.path.join("data", "ES", "dev.p2.out")
es_dev_p3_out_path = os.path.join("data", "ES", "dev.p3.out")
es_dev_p4_out_path = os.path.join("data", "ES", "dev.p4.out")
es_test_in_path = os.path.join("data", "ES-test", "test.in")
es_test_out_path = os.path.join("data", "ES-test", "test.out")
ru_train_path = os.path.join("data", "RU", "train")
ru_dev_in_path = os.path.join("data", "RU", "dev.in")
ru_dev_out_path = os.path.join("data", "RU", "dev.out")
ru_dev_p1_out_path = os.path.join("data", "RU", "dev.p1.out")
ru_dev_p2_out_path = os.path.join("data", "RU", "dev.p2.out")
ru_dev_p3_out_path = os.path.join("data", "RU", "dev.p3.out")
ru_dev_p4_out_path = os.path.join("data", "RU", "dev.p4.out")
ru_test_in_path = os.path.join("data", "RU-test", "test.in")
ru_test_out_path = os.path.join("data", "RU-test", "test.out")


# Define constant variables
N = 7
O, BPOS, IPOS, BNEU, INEU, BNEG, INEG = 0, 1, 2, 3, 4, 5, 6
label_to_id = {"O": O,
          "B-positive": BPOS,
          "I-positive": IPOS,
          "B-neutral": BNEU,
          "I-neutral": INEU,
          "B-negative": BNEG,
          "I-negative": INEG,}
id_to_label = ["O", "O", "B-positive", "I-positive", "B-neutral", "I-neutral", "B-negative", "I-negative"]

# Initialise a random number generator with a fixed seed for reproducible results and deterministic behavior
# rng = np.random.default_rng(1004519 + 1004103 + 1004555)

In [53]:
# Read dev.in data
def read_dev_in_data(filepath):
    results = []
    with open(filepath, "r", encoding="utf-8") as file:
        lines = file.readlines()
        sentence = []
        for line in lines:
            if line.strip() != '':
                sentence.append(line.strip())  # add zero if meet unkown token
            else:
                results.append(sentence.copy())
                sentence = []
    return results

# Read training data
def read_training_data(filepath):
    X = []
    Y = []
    with open(filepath, "r", encoding="utf-8") as file:
        lines = file.readlines()
        tokens = []
        labels = []
        for line in lines:
            if len(line.strip().rsplit(" ", 1)) == 2:
                token, label = line.strip().rsplit(" ", 1)
                tokens.append(token.lower())
                labels.append(label)
            else:
                X.append(tokens.copy())
                Y.append(labels.copy())
                tokens = []
                labels = []
    return X, Y

# def prepare_sequence(seq, to_ix):
#     idxs = [to_ix.get(w.lower(), 0) for w in seq]
#     return torch.tensor(idxs, dtype=torch.long)

In [71]:
X, Y = read_training_data(es_train_path)

# encode X
word_tokenizer = Tokenizer()              # instantiate tokeniser
word_tokenizer.fit_on_texts(X)            # fit tokeniser on data
# use the tokeniser to encode input sequence
X_encoded = word_tokenizer.texts_to_sequences(X)


# encode Y
tag_tokenizer = Tokenizer()
tag_tokenizer.fit_on_texts(Y)
Y_encoded = tag_tokenizer.texts_to_sequences(Y)

# look at first encoded data point
print("** Raw data point **", "\n", "-"*100, "\n")
print('X: ', X[0], '\n')
print('Y: ', Y[0], '\n')
print()
print("** Encoded data point **", "\n", "-"*100, "\n")
print('X: ', X_encoded[0], '\n')
print('Y: ', Y_encoded[0], '\n')

# make sure that each sequence of input and output is same length
different_length = [1 if len(input) != len(output) else 0 for input, output in zip(X_encoded, Y_encoded)]
print("{} sentences have disparate input-output lengths.".format(sum(different_length)))
#187

** Raw data point ** 
 ---------------------------------------------------------------------------------------------------- 

X:  ['disfrutemos', 'de', 'una', 'buenísima', 'calidad', 'en', 'el', 'producto', 'y', 'una', 'inmejorable', 'relación', 'calidad', 'precio', '.'] 

Y:  ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-positive', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] 


** Encoded data point ** 
 ---------------------------------------------------------------------------------------------------- 

X:  [1904, 3, 18, 760, 27, 8, 7, 228, 4, 18, 229, 75, 27, 35, 2] 

Y:  [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] 

0 sentences have disparate input-output lengths.


In [72]:
# check length of longest sentence
lengths = [len(seq) for seq in X_encoded]
print("Length of longest sentence: {}".format(max(lengths)))

# sns.boxplot(lengths)
# plt.show()

EMBEDDING_SIZE = 300
MAX_SEQ_LENGTH = 200  # sequences greater than 100 in length will be truncated
VOCABULARY_SIZE = len(word_tokenizer.word_index) + 1

X_padded = pad_sequences(X_encoded, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")
Y_padded = pad_sequences(Y_encoded, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")

# print(X_padded[0], "\n")
# print(Y_padded[0])

X, Y = X_padded, Y_padded

Length of longest sentence: 163


In [73]:
Y = to_categorical(Y)
# print(Y.shape)
NUM_CLASSES = Y.shape[2]

# split entire data into training and testing sets
# TEST_SIZE = 0.15
# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=4)

# split training data into training and validation sets
VALID_SIZE = 0.1
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=VALID_SIZE, random_state=4)

# print number of samples in each set
print("TRAINING DATA")
print('Shape of input sequences: {}'.format(X_train.shape))
print('Shape of output sequences: {}'.format(Y_train.shape))
print("-"*50)
print("VALIDATION DATA")
print('Shape of input sequences: {}'.format(X_validation.shape))
print('Shape of output sequences: {}'.format(Y_validation.shape))
print("-"*50)
print("TESTING DATA")
print('Shape of input sequences: {}'.format(X_test.shape))
print('Shape of output sequences: {}'.format(Y_test.shape))

TRAINING DATA
Shape of input sequences: (1858, 200)
Shape of output sequences: (1858, 200, 8)
--------------------------------------------------
VALIDATION DATA
Shape of input sequences: (207, 200)
Shape of output sequences: (207, 200, 8)
--------------------------------------------------
TESTING DATA
Shape of input sequences: (310, 100)
Shape of output sequences: (310, 100, 8)


In [128]:
bidirect_model = Sequential()
bidirect_model.add(Embedding(input_dim     = VOCABULARY_SIZE,
                             output_dim    = EMBEDDING_SIZE,
                             input_length  = MAX_SEQ_LENGTH,
                             trainable     = True
))
bidirect_model.add(Bidirectional(LSTM(128, return_sequences=True)))
# bidirect_model.add(Bidirectional(LSTM(64, return_sequences=True)))
bidirect_model.add(TimeDistributed(Dense(NUM_CLASSES, activation='softmax')))

bidirect_model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

bidirect_model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 200, 300)          1441500   
_________________________________________________________________
bidirectional_6 (Bidirection (None, 200, 256)          439296    
_________________________________________________________________
time_distributed_4 (TimeDist (None, 200, 8)            2056      
Total params: 1,882,852
Trainable params: 1,882,852
Non-trainable params: 0
_________________________________________________________________


In [135]:
bidirect_training = bidirect_model.fit(X_train, Y_train, batch_size=128, epochs=20, validation_data=(X_validation, Y_validation))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [133]:
X_test_raw = read_dev_in_data(es_dev_in_path)
X_test_encoded = word_tokenizer.texts_to_sequences(X_test_raw)
X_test_padded = pad_sequences(X_test_encoded, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")

res = bidirect_model.apply(X_test_padded)
index = np.argmax(res, axis=2)

In [134]:
id_to_label = ["O", "O", "B-positive", "I-positive", "B-neutral", "I-neutral", "B-negative", "I-negative"]
with open(es_dev_p4_out_path, "w+", encoding="utf-8") as file:
    for i in range(len(X_test_raw)):
        sentence = X_test_raw[i]
        pred_labels = index[i][-len(sentence):]
        for j in range(len(sentence)):
            # print("{} {}\n".format(sentence[j], id_to_label[pred_labels[j]]))
            file.write("{} {}\n".format(sentence[j], id_to_label[pred_labels[j]]))
        file.write("\n")

In [134]:
LANG = 'ru'

if LANG == 'es':
    # load training data:
    training_data, word_to_id = read_training_data(es_train_path)
    # print(word_to_id)
elif LANG == 'ru':
    # load training data:
    training_data, word_to_id = read_training_data(ru_train_path)
    # print(word_to_id)

EMBEDDING_DIM = 16
HIDDEN_DIM = 16
NUM_LAYERS = 4

# ES: 25 epochs
# Loss: 0.03923250734806061
# Entity F: 0.6081
# Sentiment F: 0.4955
# RU: 45 epochs
# Loss: 0.036458928138017654
# Entity F: 0.5487
# Sentiment F: 0.4043

In [135]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, NUM_LAYERS, len(word_to_id), len(label_to_id))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# with torch.no_grad():
#     inputs = torch.tensor(training_data[0][0], dtype=torch.long)
#     tag_scores = model(inputs)
#     print(tag_scores)

model.cuda()

LSTMTagger(
  (word_embeddings): Embedding(7480, 16)
  (lstm): LSTM(16, 16, num_layers=4)
  (hidden2tag): Linear(in_features=16, out_features=7, bias=True)
)

In [150]:
# s = 0
for epoch in range(5):  # again, normally you would NOT do 300 epochs, it is toy data   
    print("epoch ", epoch)
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = torch.tensor(sentence, dtype=torch.long).type(torch.cuda.LongTensor)
        targets = torch.tensor(tags, dtype=torch.long).type(torch.cuda.LongTensor)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
    print(loss.item())
    # if s == 0 and loss.item() < 0.01:
    #     s = 1
    # elif s == 1 and loss.item() > 0.05:
    #     break

# See what the scores are after training
# with torch.no_grad():
#     inputs = prepare_sequence(test_data[0], word_to_id).type(torch.cuda.LongTensor)
#     tag_scores = model(inputs)

#     print(tag_scores)
print("Done")

epoch  0
0.026569059118628502
epoch  1
0.03420393541455269
epoch  2
0.031586337834596634
epoch  3
0.030467532575130463
epoch  4
0.027004273608326912
Done


In [149]:
# running on dev.in
if LANG == 'es':
    test_data = read_dev_in_data(es_dev_in_path)
    path = es_dev_p4_out_path
elif LANG == 'ru':
    test_data = read_dev_in_data(ru_dev_in_path)
    path = ru_dev_p4_out_path

with torch.no_grad():
    with open(path, "w+", encoding="utf-8") as file:
        for sentence in test_data:
            inputs = prepare_sequence(sentence, word_to_id).type(torch.cuda.LongTensor)
            tag_scores = np.array(model(inputs).cpu())
            pred_labels = np.argmax(tag_scores, axis=1)
            for i in range(len(sentence)):
                file.write("{} {}\n".format(sentence[i], id_to_label[pred_labels[i]]))
            file.write("\n")


In [113]:
# running on test.in
if LANG == 'es':
    test_data = read_dev_in_data(es_test_in_path)
    path = es_test_out_path
elif LANG == 'ru':
    test_data = read_dev_in_data(ru_test_in_path)
    path = ru_test_out_path

with torch.no_grad():
    with open(path, "w+", encoding="utf-8") as file:
        for sentence in test_data:
            inputs = prepare_sequence(sentence, word_to_id).type(torch.cuda.LongTensor)
            tag_scores = np.array(model(inputs).cpu())
            pred_labels = np.argmax(tag_scores, axis=1)
            for i in range(len(sentence)):
                file.write("{} {}\n".format(sentence[i], id_to_label[pred_labels[i]]))
            file.write("\n")
