In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data import Field, BucketIterator, Dataset, NestedField, Example

from sklearn.metrics import f1_score, precision_recall_fscore_support

import random

In [10]:
def validation_step(model, iterator, loss_function, num_tags):
    model.eval()

    epoch_loss = 0
    y_true = []
    y_pred = []

    total_accuracy = 0
    total_amount = 0

    with torch.no_grad():
        for batch in iterator:
            words = batch.word
            tags = batch.tag

            predictions = model(words)

            tags = tags.view(-1)
            predictions = predictions.view(-1, num_tags)

            loss = loss_function(predictions, tags)

            labels = tags.cpu().numpy()
            predicted_labels = torch.argmax(predictions, dim=1).cpu().numpy()
            y_true.extend(labels)

            _, pred_tags = torch.max(predictions, 1)
            y_pred.extend(pred_tags.cpu().numpy())
            # all_tags.extend(labels)

            mask = labels != 0
            correct_predictions = (predicted_labels[mask] == labels[mask]).sum()
            accuracy = correct_predictions / len(labels[mask])
            
            epoch_loss += loss
            total_amount += 1

    precision, recall, f1_score, support = precision_recall_fscore_support(
        y_true,
        y_pred,
        average='macro',
        zero_division=0
    )


    return (epoch_loss/total_amount)*100, precision*100, recall*100, f1_score*100

In [3]:
INDEX = Field(sequential=False, use_vocab=False)
WORD = Field(sequential=True, tokenize=lambda x: x.split(), lower=True)
# WORD = Field(sequential=True, tokenize=lambda x: x.split(), lower=False)
TAG = Field(sequential=True, tokenize=lambda x: x.split(), is_target=True)

fields = [('index', INDEX), ('word', WORD), ('tag', TAG)]

In [4]:
def load_data(file_path):
    data = []
    with open(file_path, 'r') as f:
        indexs = []
        words = []
        tags = []
        for line in f:
            if not line.strip():
                if len(words) > 0 and len(tags) > 0:
                    data.append(Example.fromlist([0, words, tags], fields))
                    indexs = []
                    words = []
                    tags = []
            else:
                index, word, tag = line.strip().split()
                indexs.append(index)
                words.append(word)
                tags.append(tag)
    return data

In [20]:
train_data = load_data('data/train')
train_dataset = Dataset(train_data, fields)

dev_data = load_data('data/dev')
dev_dataset = Dataset(dev_data, fields)

WORD.build_vocab(train_dataset, min_freq=1)  # min_freq handles unknown
TAG.build_vocab(train_dataset)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 1

train_iterator = BucketIterator(
    train_dataset,
    batch_size=BATCH_SIZE,
    device=device,
    sort_within_batch=True,
    sort_key=lambda x: len(x.word),
    repeat=False,
)
dev_iterator = BucketIterator(
    dev_dataset,
    batch_size=BATCH_SIZE,
    device=device,
    sort_within_batch=True,
    sort_key=lambda x: len(x.word),
    repeat=False,
)

In [40]:
for e in train_iterator:
#     print(e.tag)
#     print(e.word)
    print(translator(e.word, WORD.vocab))
    break

# for l in train_data:
#     print(l.tag)
#     print(l.word)
#     break

['a', 'chain-smoking', 'former', 'paratroop', 'general', 'with', 'a', 'sharp', 'line', 'in', 'deadpan', 'putdowns', 'and', 'a', 'soldier', "'s", 'knack', 'for', 'making', 'life', 'sound', 'simple', ',', 'lebed', 'managed', 'to', 'arrange', 'an', 'ambitious', 'ceasefire', 'in', 'the', 'region', 'last', 'week', ',', 'days', 'after', 'the', 'russian', 'army', 'threatened', 'to', 'bomb', 'its', 'way', 'back', 'into', 'the', 'rebel-held', 'chechen', 'capital', 'grozny', '.']


In [37]:
def translator(tensor, vocab):
    e = []
    for w in tensor:
        e.append(vocab.itos[int(w.data[0])])
    return e

# Task 1

In [24]:
vocab_size = len(WORD.vocab)
num_tags = len(TAG.vocab)

embedding_dim = 100
hidden_dim = 256
num_layers = 1
dropout = 0.33
linear_output_dim = 128

class BiLSTM(nn.Module):
    def __init__(self, vocab_size, linear_output_dim, embedding_dim, hidden_dim, num_layers, dropout):
        super(BiLSTM, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, bidirectional=True, batch_first=True)
        self.linear1 = nn.Linear(hidden_dim * 2, linear_output_dim)
        self.elu = nn.ELU()
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(linear_output_dim, num_tags)

    def forward(self, x):
        # TODO lol fix name conv pls thx
        x = self.embedding(x)
        # print("NO GLOVE Forward",x)
        x, _ = self.lstm(x)
        x = self.linear1(x)
        x = self.elu(x)
        x = self.dropout(x)
        logits = self.linear2(x)

        return logits

In [8]:
loaded_model = BiLSTM(vocab_size, linear_output_dim, embedding_dim, hidden_dim, num_layers, dropout)

saved_state_dict = torch.load("model_1.pt")
loaded_model.load_state_dict(saved_state_dict)
loaded_model.eval()

BiLSTM(
  (embedding): Embedding(21012, 100)
  (lstm): LSTM(100, 256, batch_first=True, bidirectional=True)
  (linear1): Linear(in_features=512, out_features=128, bias=True)
  (elu): ELU(alpha=1.0)
  (dropout): Dropout(p=0.33, inplace=False)
  (linear2): Linear(in_features=128, out_features=11, bias=True)
)

In [60]:
'''
lr
    0.001 - leads to 76%. Good enough, but the first epoch is 76 and doesn't lead to much learning (epoch 10)
    0.0001 - leads to good learning curve, but still only reaches to 76 after 7th epoch
    0.00005 - 
'''

model = BiLSTM(vocab_size, linear_output_dim, embedding_dim, hidden_dim, num_layers, dropout)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1, weight_decay=0.01)
highest_f1_score = 0
epochs = 500
for epoch in range(epochs):
    for batch in train_iterator:
        model.zero_grad()

        words = batch.word
        tags = batch.tag
        print("words", translator(words, WORD.vocab))
        
        predictions = model(words)

        predictions = predictions.view(-1, num_tags)
        tags = tags.view(-1)
#         print("predictions", predictions)
        l = []
        for tag in tags.tolist():
            l.append(TAG.vocab.itos[tag])
        print("tags", l)
        loss = loss_function(predictions, tags)
        loss.backward()

        optimizer.step()
        break
    break

    accuracy,precision,recall,f1_score = validation_step(model, dev_iterator, loss_function, num_tags)
    if f1_score > highest_f1_score:
        highest_f1_score = f1_score
        torch.save(model.state_dict(), "model_1.pt")
    print(f"Epoch {epoch + 1} - DEV accuracy: {accuracy:.4f} precision: {precision:.4f} recall {recall:.4f} f1_score {f1_score:.4f}")

words ['"', 'today', 'i', 'am', 'pleased', 'to', 'announce', 'that', 'we', 'are', 'following', 'through', 'on', 'our', 'commitment', 'to', 'keep', 'track', 'of', 'these', 'criminals', ',', 'not', 'just', 'in', 'a', 'single', 'state', 'but', 'wherever', 'they', 'go', ',', '"', 'he', 'said', '.']
tags ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [10]:
def createFile(model, textFile):
    with open(textFile, 'r') as input_file, open('pred.txt', 'w') as output_file:
        indexs = []
        words = []
        tags = []
        for line in input_file:
            if not line.strip():
                if len(words) > 0 and len(tags) > 0:
                    data = Example.fromlist([0, words, tags], fields)
                    model.eval()
                    
                    with torch.no_grad():
                        prediction = model(word)
                    
                    predictionLine = str(index) + " " + str(word) + " " + str(tag) + " " + str(prediction)
                    output_file.write(predictionLine)
                    
                    indexs = []
                    words = []
                    tags = []
            else:
                index, word, tag = line.strip().split()
                indexs.append(index)
                words.append(word)
                tags.append(tag)
    return data

In [56]:
TAG.vocab.itos[0]

'<unk>'

In [149]:
model.eval()

with torch.no_grad():
    for batch in dev_iterator:
        words = batch.word
        tags = batch.tag

        predictions = model(words)

        tags = tags.view(-1)
        predictions = predictions.view(-1, num_tags)

        labels = tags.cpu().numpy()
        predicted_labels = torch.argmax(predictions, dim=1).cpu().numpy()
        # y_true.extend(labels)
        # print(list(labels))
        _, pred_tags = torch.max(predictions, 1)
        # y_pred.extend(pred_tags.cpu().numpy())
        # print(pred_tags.tolist())
        if len(set(pred_tags.tolist())) >= 2:
            print(pred_tags.tolist())
            break
        # all_tags.extend(labels)

        precision, recall, f1_score, support = precision_recall_fscore_support(
            labels,
            pred_tags.cpu().numpy(),
            average='micro',
            zero_division=0
        )
        # print(precision, recall, f1_score, support)

In [50]:
createFile(model, "data/dev")

TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not str

In [37]:
model(train_data[0])


TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not Example