In [88]:
import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.metrics import f1_score, precision_recall_fscore_support
from torch.utils.data import DataLoader, Dataset

import random

In [89]:
def validation_step(model, iterator, loss_function, num_tags):
    model.eval()

    epoch_loss = 0
    y_true = []
    y_pred = []

    total_accuracy = 0
    total_amount = 0
    total_loss = 0

    with torch.no_grad():
        for batch in iterator:
            word_seqs, upper_seqs, tag_seqs = batch
            word_seqs = word_seqs.to(device)
            upper_seqs = upper_seqs.to(device)
            tag_seqs = tag_seqs.to(device)

            logits = model(word_seqs, upper_seqs)
            logits = logits.view(-1, num_tags)
            tag_seqs = tag_seqs.view(-1)

            loss = loss_function(logits, tag_seqs)
            total_loss += loss.item()

            labels = tag_seqs.cpu().numpy()
            predicted_labels = torch.argmax(logits, dim=1).cpu().numpy()
            y_true.extend(labels)

            _, pred_tags = torch.max(logits, 1)
            y_pred.extend(pred_tags.cpu().numpy())
            # all_tags.extend(labels)

            mask = labels != 0
            correct_predictions = (predicted_labels[mask] == labels[mask]).sum()
            accuracy = correct_predictions / len(labels[mask])
            
            total_accuracy += accuracy
            epoch_loss += loss
            total_amount += 1

    precision, recall, f1_score, support = precision_recall_fscore_support(
        y_true,
        y_pred,
        average='macro',
        zero_division=0
    )

    return (epoch_loss/total_amount), (total_accuracy/total_amount)*100, precision*100, recall*100, f1_score*100

In [90]:
def read_data(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
    
    data = []
    words, tags = [], []
    unique_words, unique_tags = set(), set()
    for line in lines:
        if line.strip() == "":
            data.append((words, tags))
            unique_words.update(words)
            unique_tags.update(tags)
            words, tags = [], []
        else:
            _, word, tag = line.strip().split()
            words.append(word)
            tags.append(tag)
    if words and tags:
        data.append((words, tags))
        unique_words.update(words)
        unique_tags.update(tags)

    return data, unique_words, unique_tags


In [91]:

def read_data1(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
    
    data = []
    words, tags = [], []
    for line in lines:
        if line.strip() == "":
            data.append((words, tags))
            words, tags = [], []
        else:
            _, word, tag = line.strip().split()
            words.append(word)
            tags.append(tag)
    if words and tags:
        data.append((words, tags))
    return data

def tokenize(text):
    return text.split()

def pad_sequences(batch, word2idx, tag2idx, pad_token='<pad>', init_token='<s>', eos_token='</s>', unk_token='<unk>'):
    max_len = max([len(seq) + 2 for seq, _ in batch])  # Add 2 to account for <s> and </s> tokens

    padded_word_seqs = []
    padded_tag_seqs = []

    for words, tags in batch:
        padded_words = [init_token] + words + [eos_token]
        padded_words = [word2idx.get(word, word2idx[unk_token]) for word in padded_words] + [word2idx[pad_token]] * (max_len - len(padded_words))
        padded_word_seqs.append(padded_words)

        padded_tags = [init_token] + tags + [eos_token]
        padded_tags = [tag2idx[tag] for tag in padded_tags] + [tag2idx[pad_token]] * (max_len - len(padded_tags))
        padded_tag_seqs.append(padded_tags)

    seq_lengths = torch.tensor([len(seq) for seq in padded_word_seqs])

    return torch.tensor(padded_word_seqs), torch.tensor(padded_tag_seqs), seq_lengths


class CustomDataset(Dataset):
    def __init__(self, data, transform=None):
        self.data = data
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        sample = self.data[idx]

        if self.transform:
            sample = self.transform(sample)

        return sample

In [92]:
train_file = "data/train" 
raw_data, unique_words, unique_tags = read_data(train_file)
tokenized_data = [([word for word in words], [tag for tag in tags]) for words, tags in raw_data]
train_dataset = CustomDataset(tokenized_data)

dev_file = "data/dev" 
raw_data, unique_words, unique_tags = read_data(dev_file)
tokenized_data = [([word for word in words], [tag for tag in tags]) for words, tags in raw_data]
dev_dataset = CustomDataset(tokenized_data)


In [93]:
from collections import Counter

def create_vocab_mappings(raw_data, unique_tags, threshold):
    word_freqs = Counter(word.lower() for words, _ in raw_data for word in words)
    filtered_words = [word.lower() for word, count in word_freqs.items() if count >= threshold]
    
    # print(filtered_words)
    word2idx = {word: idx + 4 for idx, word in enumerate(filtered_words)}
    word2idx['<pad>'] = 0
    word2idx['<s>'] = 1
    word2idx['</s>'] = 2
    word2idx['<unk>'] = 3

    tag2idx = {tag: idx + 3 for idx, tag in enumerate(unique_tags)}
    tag2idx['<pad>'] = 0
    tag2idx['<s>'] = 1
    tag2idx['</s>'] = 2

    return word2idx, tag2idx

def pad_sequences(batch, word2idx, tag2idx, pad_token='<pad>', init_token='<s>', eos_token='</s>', unk_token='<unk>'):
    max_len = max([len(seq) + 2 for seq, _ in batch])  # Add 2 to account for <s> and </s> tokens

    padded_word_seqs = []
    padded_upper_seqs = []
    padded_tag_seqs = []

    for words, tags in batch:
        lower_words = [word.lower() for word in words]

        padded_words = [init_token] + lower_words + [eos_token]
        padded_words = [word2idx.get(word, word2idx[unk_token]) for word in padded_words] + [word2idx[pad_token]] * (max_len - len(padded_words))
        padded_word_seqs.append(padded_words)

        padded_uppers = [0] + [int(word[0].isupper()) for word in words] + [0] + [0] * (max_len - len(words) - 2)
        padded_upper_seqs.append(padded_uppers)

        padded_tags = [init_token] + tags + [eos_token]
        padded_tags = [tag2idx[tag] for tag in padded_tags] + [tag2idx[pad_token]] * (max_len - len(padded_tags))
        padded_tag_seqs.append(padded_tags)

    return torch.tensor(padded_word_seqs), torch.tensor(padded_upper_seqs), torch.tensor(padded_tag_seqs)

word2idx, tag2idx = create_vocab_mappings(raw_data, unique_tags, threshold=1)
train_loader = DataLoader(
    train_dataset,
    batch_size=8,
    collate_fn=lambda batch: pad_sequences(batch, word2idx, tag2idx),
    shuffle=True,
)
dev_loader = DataLoader(
    dev_dataset,
    batch_size=8,
    collate_fn=lambda batch: pad_sequences(batch, word2idx, tag2idx),
    shuffle=True,
)

In [94]:
def tokenize_and_pad(text, word2idx, pad_token='<pad>', init_token='<s>', eos_token='</s>', unk_token='<unk>'):
    tokens = text.split()

    lower_tokens = text.lower().split()
    padded_tokens = [init_token] + lower_tokens + [eos_token]
    indices = [word2idx.get(word, word2idx[unk_token]) for word in padded_tokens]
    
    upper_indices = [0] + [int(token[0].isupper()) for token in tokens] + [0]
    
    return indices, upper_indices

def predict_tags(model, input_text, word2idx, idx2tag):
    model.eval()
    tokenized_input, upper_input = tokenize_and_pad(input_text, word2idx)
    input_tensor = torch.tensor([tokenized_input]).to(device)
    upper_tensor = torch.tensor([upper_input]).to(device)
    
    with torch.no_grad():
        logits = model(input_tensor, upper_tensor)
    
    predicted_indices = torch.argmax(logits, dim=-1).squeeze().cpu().numpy()
    predicted_tags = [idx2tag[idx] for idx in predicted_indices][1:-1]

    return predicted_tags

def createFile(model, textFile):
    with open(textFile, 'r') as input_file, open('pred.txt', 'w') as output_file:
        indexs = []
        words = []
        tags = []
        for line in input_file:
            if not line.strip():
                if len(words) > 0 and len(tags) > 0:
                    idx2tag = {idx: tag for tag, idx in tag2idx.items()}

                    new_text = " ".join(words)
                    predicted_tags = predict_tags(model, new_text, word2idx, idx2tag)

                    for i in range(len(indexs)):
                        index = indexs[i]
                        word = words[i]
                        tag = tags[i]
                        prediction = predicted_tags[i]

                        predictionLine = str(index) + " " + str(word) + " " + str(tag) + " " + str(prediction) + "\n"
                        output_file.write(predictionLine)
                    
                    indexs = []
                    words = []
                    tags = []
                    output_file.write("\n")
            else:
                index, word, tag = line.strip().split()
                indexs.append(index)
                words.append(word)
                tags.append(tag)

In [114]:
vocab_size = len(word2idx)
num_tags = len(tag2idx)
print(vocab_size)
print(num_tags)
embedding_dim = 100
hidden_dim = 256
num_layers = 1
dropout = 0.33
linear_output_dim = 128

class BiLSTM(nn.Module):
    def __init__(self, vocab_size, linear_output_dim, embedding_dim, hidden_dim, num_layers, dropout):
        super(BiLSTM, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.upper_embedding = nn.Embedding(2, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim * 2, hidden_dim, num_layers, bidirectional=True, batch_first=True)
        self.linear1 = nn.Linear(hidden_dim * 2, linear_output_dim)
        self.elu = nn.ELU()
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(linear_output_dim, num_tags)

    def forward(self, x, upper_x):
        x = self.embedding(x)
        upper_x = self.upper_embedding(upper_x)
        x = torch.cat([x, upper_x], dim=-1)
        x, _ = self.lstm(x)
        x = self.linear1(x)
        x = self.elu(x)
        x = self.dropout(x)
        logits = self.linear2(x)

        return logits

9007
12


In [96]:
# num_tags = len(tag2idx)
print(tag2idx)

weight_list = [0,0,0,1,1,1,0.7,1,1,1,1,1]

for i,w in tag2idx.items():
    print(i, weight_list[w])
print(len(weight_list))
weight_tensor = torch.tensor(weight_list, dtype=torch.float)

{'I-MISC': 3, 'B-MISC': 4, 'B-ORG': 5, 'O': 6, 'B-LOC': 7, 'I-ORG': 8, 'B-PER': 9, 'I-PER': 10, 'I-LOC': 11, '<pad>': 0, '<s>': 1, '</s>': 2}
I-MISC 1
B-MISC 1
B-ORG 1
O 0.7
B-LOC 1
I-ORG 1
B-PER 1
I-PER 1
I-LOC 1
<pad> 0
<s> 0
</s> 0
12


# Task 1

In [111]:
import torch.optim as optim
from torch.nn import CrossEntropyLoss
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.tensorboard import SummaryWriter


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

final_model = None
highest_f1_score = 0

model = BiLSTM(vocab_size, linear_output_dim, embedding_dim, hidden_dim, num_layers, dropout)
model.to(device)

num_epochs = 20

loss_function = CrossEntropyLoss(ignore_index=tag2idx['<pad>'], weight=weight_tensor)
optimizer = optim.SGD(model.parameters(), lr=0.3, momentum=0.9, weight_decay=0.00005)  # TODO add parameters
# scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs)
patience = 5
writer = SummaryWriter()
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=patience, factor=0.5, verbose=True)

early_stopping_counter = 0
best_f1_score = -1
clip_value = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    total_samples = 0

    for batch in train_loader:
        word_seqs, upper_seqs, tag_seqs = batch
        word_seqs = word_seqs.to(device)
        upper_seqs = upper_seqs.to(device)
        tag_seqs = tag_seqs.to(device)

        optimizer.zero_grad()

        logits = model(word_seqs, upper_seqs)
        logits = logits.view(-1, num_tags)
        tag_seqs = tag_seqs.view(-1)

        loss = loss_function(logits, tag_seqs)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)

        optimizer.step()

        total_loss += loss.item() * word_seqs.size(0)
        total_samples += word_seqs.size(0)

    avg_train_loss = total_loss / total_samples
    writer.add_scalar("Loss/train", avg_train_loss, epoch)

    val_loss, val_accuracy, val_precision, val_recall, val_f1_score = validation_step(model, dev_loader, loss_function, num_tags)

    scheduler.step(val_loss)

    if val_f1_score > best_f1_score:
        best_f1_score = val_f1_score
        final_model = model
    #     early_stopping_counter = 0
    # else:
    #     early_stopping_counter += 1
    #     if early_stopping_counter >= patience:
    #         print("Early stopping.")
    #         break

    writer.add_scalar("Loss/val", val_loss, epoch)
    writer.add_scalar("F1_score/val", val_f1_score, epoch)

    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.4f}, Precision: {val_precision:.4f}, Recall {val_recall:.4f}, F1_score {val_f1_score:.4f}")

writer.close()

Epoch 1/20, Train Loss: 0.3187, Val Loss: 0.2187, Accuracy: 82.7629, Precision: 54.1126, Recall 53.8521, F1_score 52.1396
Epoch 2/20, Train Loss: 0.1826, Val Loss: 0.1914, Accuracy: 83.3659, Precision: 56.8016, Recall 56.9725, F1_score 55.1727
Epoch 3/20, Train Loss: 0.1435, Val Loss: 0.1789, Accuracy: 83.8003, Precision: 55.5443, Recall 59.6139, F1_score 56.0740
Epoch 4/20, Train Loss: 0.1238, Val Loss: 0.2008, Accuracy: 83.7593, Precision: 60.0016, Recall 58.5248, F1_score 57.3221
Epoch 5/20, Train Loss: 0.1120, Val Loss: 0.1767, Accuracy: 84.1539, Precision: 60.3346, Recall 60.5250, F1_score 58.2451
Epoch 6/20, Train Loss: 0.1035, Val Loss: 0.1996, Accuracy: 83.3918, Precision: 55.3077, Recall 61.7764, F1_score 56.0244
Epoch 7/20, Train Loss: 0.0968, Val Loss: 0.1402, Accuracy: 84.7236, Precision: 61.1088, Recall 63.0459, F1_score 60.6008
Epoch 8/20, Train Loss: 0.0918, Val Loss: 0.1372, Accuracy: 84.8373, Precision: 62.1709, Recall 63.7555, F1_score 61.5182
Epoch 9/20, Train Loss: 

In [112]:
createFile(model, "data/dev")

In [113]:
# torch.save(model.state_dict(), "model_2.pt")

In [115]:
loaded_model = BiLSTM(vocab_size, linear_output_dim, embedding_dim, hidden_dim, num_layers, dropout)

saved_state_dict = torch.load("model_2.pt")
loaded_model.load_state_dict(saved_state_dict)
loaded_model.eval()

BiLSTM(
  (embedding): Embedding(9007, 100)
  (upper_embedding): Embedding(2, 100)
  (lstm): LSTM(200, 256, batch_first=True, bidirectional=True)
  (linear1): Linear(in_features=512, out_features=128, bias=True)
  (elu): ELU(alpha=1.0)
  (dropout): Dropout(p=0.33, inplace=False)
  (linear2): Linear(in_features=128, out_features=12, bias=True)
)

In [None]:
createFile(loaded_model, "data/dev")