In [120]:
import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.metrics import f1_score, precision_recall_fscore_support
from torch.utils.data import DataLoader, Dataset

import random

In [121]:
def validation_step(model, iterator, loss_function, num_tags):
    model.eval()

    epoch_loss = 0
    y_true = []
    y_pred = []

    total_accuracy = 0
    total_amount = 0
    total_loss = 0

    with torch.no_grad():
        for batch in iterator:
            word_seqs, tag_seqs = batch
            word_seqs = word_seqs.to(device)
            tag_seqs = tag_seqs.to(device)

            logits = model(word_seqs)
            logits = logits.view(-1, num_tags)
            tag_seqs = tag_seqs.view(-1)

            loss = loss_function(logits, tag_seqs)
            total_loss += loss.item()

            labels = tag_seqs.cpu().numpy()
            predicted_labels = torch.argmax(logits, dim=1).cpu().numpy()
            y_true.extend(labels)

            _, pred_tags = torch.max(logits, 1)
            y_pred.extend(pred_tags.cpu().numpy())
            # all_tags.extend(labels)

            mask = labels != 0
            correct_predictions = (predicted_labels[mask] == labels[mask]).sum()
            accuracy = correct_predictions / len(labels[mask])
            
            total_accuracy += accuracy
            epoch_loss += loss
            total_amount += 1

    precision, recall, f1_score, support = precision_recall_fscore_support(
        y_true,
        y_pred,
        average='macro',
        zero_division=0
    )

    return (epoch_loss/total_amount), (total_accuracy/total_amount)*100, precision*100, recall*100, f1_score*100

In [122]:
def read_data(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
    
    data = []
    words, tags = [], []
    unique_words, unique_tags = set(), set()
    for line in lines:
        if line.strip() == "":
            data.append((words, tags))
            unique_words.update(words)
            unique_tags.update(tags)
            words, tags = [], []
        else:
            _, word, tag = line.strip().split()
            words.append(word)
            tags.append(tag)
    if words and tags:
        data.append((words, tags))
        unique_words.update(words)
        unique_tags.update(tags)

    return data, unique_words, unique_tags


In [123]:

def read_data1(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
    
    data = []
    words, tags = [], []
    for line in lines:
        if line.strip() == "":
            data.append((words, tags))
            words, tags = [], []
        else:
            _, word, tag = line.strip().split()
            words.append(word)
            tags.append(tag)
    if words and tags:
        data.append((words, tags))
    return data

def tokenize(text):
    return text.split()

def pad_sequences(batch, word2idx, tag2idx, pad_token='<pad>', init_token='<s>', eos_token='</s>', unk_token='<unk>'):
    max_len = max([len(seq) + 2 for seq, _ in batch])  # Add 2 to account for <s> and </s> tokens

    padded_word_seqs = []
    padded_tag_seqs = []

    for words, tags in batch:
        padded_words = [init_token] + words + [eos_token]
        padded_words = [word2idx.get(word, word2idx[unk_token]) for word in padded_words] + [word2idx[pad_token]] * (max_len - len(padded_words))
        padded_word_seqs.append(padded_words)

        padded_tags = [init_token] + tags + [eos_token]
        padded_tags = [tag2idx[tag] for tag in padded_tags] + [tag2idx[pad_token]] * (max_len - len(padded_tags))
        padded_tag_seqs.append(padded_tags)

    seq_lengths = torch.tensor([len(seq) for seq in padded_word_seqs])

    return torch.tensor(padded_word_seqs), torch.tensor(padded_tag_seqs), seq_lengths


class CustomDataset(Dataset):
    def __init__(self, data, transform=None):
        self.data = data
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        sample = self.data[idx]

        if self.transform:
            sample = self.transform(sample)

        return sample

In [124]:
train_file = "data/train" 
raw_data, unique_words, unique_tags = read_data(train_file)
tokenized_data = [([word for word in words], [tag for tag in tags]) for words, tags in raw_data]
train_dataset = CustomDataset(tokenized_data)

dev_file = "data/dev" 
raw_data, unique_words, unique_tags = read_data(dev_file)
tokenized_data = [([word for word in words], [tag for tag in tags]) for words, tags in raw_data]
dev_dataset = CustomDataset(tokenized_data)


In [125]:
from collections import Counter

def create_vocab_mappings(raw_data, unique_tags, threshold):
    word_freqs = Counter(word for words, _ in raw_data for word in words)
    filtered_words = [word for word, count in word_freqs.items() if count >= threshold]

    # print(filtered_words)
    word2idx = {word: idx + 4 for idx, word in enumerate(filtered_words)}
    word2idx['<pad>'] = 0
    word2idx['<s>'] = 1
    word2idx['</s>'] = 2
    word2idx['<unk>'] = 3

    tag2idx = {tag: idx + 3 for idx, tag in enumerate(unique_tags)}
    tag2idx['<pad>'] = 0
    tag2idx['<s>'] = 1
    tag2idx['</s>'] = 2

    return word2idx, tag2idx

def pad_sequences(batch, word2idx, tag2idx, pad_token='<pad>', init_token='<s>', eos_token='</s>', unk_token='<unk>'):
    max_len = max([len(seq) + 2 for seq, _ in batch])

    padded_word_seqs = []
    padded_tag_seqs = []

    for words, tags in batch:
        padded_words = [init_token] + words + [eos_token]
        padded_words = [word2idx.get(word, word2idx[unk_token]) for word in padded_words] + [word2idx[pad_token]] * (max_len - len(padded_words))
        padded_word_seqs.append(padded_words)

        padded_tags = [init_token] + tags + [eos_token]
        padded_tags = [tag2idx[tag] for tag in padded_tags] + [tag2idx[pad_token]] * (max_len - len(padded_tags))
        padded_tag_seqs.append(padded_tags)

    return torch.tensor(padded_word_seqs), torch.tensor(padded_tag_seqs)

word2idx, tag2idx = create_vocab_mappings(raw_data, unique_tags, threshold=1)

train_loader = DataLoader(
    train_dataset,
    batch_size=8,
    collate_fn=lambda batch: pad_sequences(batch, word2idx, tag2idx),
    shuffle=True,
)
dev_loader = DataLoader(
    dev_dataset,
    batch_size=8,
    collate_fn=lambda batch: pad_sequences(batch, word2idx, tag2idx),
    shuffle=True,
)

In [126]:
def tokenize_and_pad(text, word2idx, pad_token='<pad>', init_token='<s>', eos_token='</s>', unk_token='<unk>'):
    tokens = text.split()
    padded_tokens = [init_token] + tokens + [eos_token]
    indices = [word2idx.get(word, word2idx[unk_token]) for word in padded_tokens]
    
    return indices

def predict_tags(model, input_text, word2idx, idx2tag):
    model.eval()
    tokenized_input = tokenize_and_pad(input_text, word2idx)
    input_tensor = torch.tensor([tokenized_input]).to(device)
    
    with torch.no_grad():
        logits = model(input_tensor)
    
    predicted_indices = torch.argmax(logits, dim=-1).squeeze().cpu().numpy()
    predicted_tags = [idx2tag[idx] for idx in predicted_indices][1:-1]

    return predicted_tags

def createFile(model, textFile):
    with open(textFile, 'r') as input_file, open('pred.txt', 'w') as output_file:
        indexs = []
        words = []
        tags = []
        for line in input_file:
            if not line.strip():
                if len(words) > 0 and len(tags) > 0:
                    idx2tag = {idx: tag for tag, idx in tag2idx.items()}

                    new_text = " ".join(words)
                    predicted_tags = predict_tags(model, new_text, word2idx, idx2tag)

                    for i in range(len(indexs)):
                        index = indexs[i]
                        word = words[i]
                        tag = tags[i]
                        prediction = predicted_tags[i]

                        predictionLine = str(index) + " " + str(word) + " " + str(tag) + " " + str(prediction) + "\n"
                        output_file.write(predictionLine)
                    
                    indexs = []
                    words = []
                    tags = []
                    output_file.write("\n")
            else:
                index, word, tag = line.strip().split()
                indexs.append(index)
                words.append(word)
                tags.append(tag)

In [138]:
vocab_size = len(word2idx)
num_tags = len(tag2idx)

embedding_dim = 100
hidden_dim = 256
num_layers = 1
dropout = 0.33
linear_output_dim = 128

class BiLSTM(nn.Module):
    def __init__(self, vocab_size, linear_output_dim, embedding_dim, hidden_dim, num_layers, dropout):
        super(BiLSTM, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, bidirectional=True, batch_first=True)
        self.linear1 = nn.Linear(hidden_dim * 2, linear_output_dim)
        self.elu = nn.ELU()
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(linear_output_dim, num_tags)

    def forward(self, x):
        # TODO lol fix name conv pls thx
        x = self.embedding(x)
        # print("NO GLOVE Forward",x)
        x, _ = self.lstm(x)
        x = self.linear1(x)
        x = self.elu(x)
        x = self.dropout(x)
        logits = self.linear2(x)

        return logits
    
    # UTilize pad-packed maybe???

# Task 1

In [133]:
import torch.optim as optim
from torch.nn import CrossEntropyLoss
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.tensorboard import SummaryWriter


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

final_model = None
highest_f1_score = 0

model = BiLSTM(vocab_size, linear_output_dim, embedding_dim, hidden_dim, num_layers, dropout)
model.to(device)

num_epochs = 0

loss_function = CrossEntropyLoss(ignore_index=tag2idx['<pad>']) 
optimizer = optim.SGD(model.parameters(), lr=0.25, momentum=0.9, weight_decay=0.00005)  # TODO add parameters
# scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs)
patience = 6
writer = SummaryWriter()
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=patience, factor=0.5, verbose=True)

early_stopping_counter = 0
best_f1_score = -1
clip_value = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    total_samples = 0

    for batch in train_loader:
        word_seqs, tag_seqs = batch
        word_seqs = word_seqs.to(device)
        tag_seqs = tag_seqs.to(device)

        optimizer.zero_grad()

        logits = model(word_seqs)
        logits = logits.view(-1, num_tags)
        tag_seqs = tag_seqs.view(-1)

        loss = loss_function(logits, tag_seqs)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)

        optimizer.step()

        total_loss += loss.item() * word_seqs.size(0)
        total_samples += word_seqs.size(0)

    avg_train_loss = total_loss / total_samples
    writer.add_scalar("Loss/train", avg_train_loss, epoch)

    val_loss, val_accuracy, val_precision, val_recall, val_f1_score = validation_step(model, dev_loader, loss_function, num_tags)

    scheduler.step(val_loss)

    if val_f1_score > best_f1_score:
        best_f1_score = val_f1_score
        final_model = model
        torch.save(model.state_dict(), "model_1.pt")
    #     early_stopping_counter = 0
    # else:
    #     early_stopping_counter += 1
    #     if early_stopping_counter >= patience:
    #         print("Early stopping.")
    #         break

    writer.add_scalar("Loss/val", val_loss, epoch)
    writer.add_scalar("F1_score/val", val_f1_score, epoch)

    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.4f}, Precision: {val_precision:.4f}, Recall {val_recall:.4f}, F1_score {val_f1_score:.4f}")

writer.close()

Epoch 1/30, Train Loss: 0.3426, Val Loss: 0.2646, Accuracy: 93.0054, Precision: 74.8382, Recall 60.8802, F1_score 64.5026
Epoch 2/30, Train Loss: 0.2101, Val Loss: 0.2131, Accuracy: 94.3782, Precision: 75.4201, Recall 68.7162, F1_score 69.8750
Epoch 3/30, Train Loss: 0.1874, Val Loss: 0.1959, Accuracy: 94.5244, Precision: 74.6869, Recall 70.7534, F1_score 70.7019
Epoch 4/30, Train Loss: 0.1746, Val Loss: 0.1769, Accuracy: 95.1696, Precision: 76.0531, Recall 71.2838, F1_score 71.6576
Epoch 5/30, Train Loss: 0.1678, Val Loss: 0.1901, Accuracy: 94.9656, Precision: 78.5072, Recall 69.6892, F1_score 71.7756
Epoch 6/30, Train Loss: 0.1612, Val Loss: 0.1868, Accuracy: 95.2181, Precision: 78.3773, Recall 70.7337, F1_score 72.4869
Epoch 7/30, Train Loss: 0.1545, Val Loss: 0.1845, Accuracy: 95.2681, Precision: 76.7846, Recall 72.6587, F1_score 73.0453


KeyboardInterrupt: 

In [149]:
loaded_model = BiLSTM(9971, linear_output_dim, embedding_dim, hidden_dim, num_layers, dropout)

saved_state_dict = torch.load("model_task_1_final.pt")
loaded_model.load_state_dict(saved_state_dict)
loaded_model.eval()

BiLSTM(
  (embedding): Embedding(9971, 100)
  (lstm): LSTM(100, 256, batch_first=True, bidirectional=True)
  (linear1): Linear(in_features=512, out_features=128, bias=True)
  (elu): ELU(alpha=1.0)
  (dropout): Dropout(p=0.33, inplace=False)
  (linear2): Linear(in_features=128, out_features=12, bias=True)
)

In [151]:
createFile(loaded_model, "data/dev")