In [3]:
import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.metrics import f1_score, precision_recall_fscore_support
from torch.utils.data import DataLoader, Dataset

import random

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [30]:
def validation_step(model, iterator, loss_function, num_tags):
    model.eval()

    epoch_loss = 0
    y_true = []
    y_pred = []

    total_accuracy = 0
    total_amount = 0
    total_loss = 0

    with torch.no_grad():
        for batch in iterator:
            inputs, upper_inputs, char_inputs, labels = batch

            logits = model(inputs, upper_inputs, char_inputs)

            logits = logits.view(-1, logits.shape[-1])
            labels = labels.view(-1)

            loss = loss_function(logits, labels)
            total_loss += loss.item()

            labels = logits.cpu().numpy()
            predicted_labels = torch.argmax(logits, dim=1).cpu().numpy()
            y_true.extend(labels)

            _, pred_tags = torch.max(logits, 1)
            y_pred.extend(pred_tags.cpu().numpy())
            # all_tags.extend(labels)

            mask = labels != 0
            correct_predictions = (predicted_labels[mask] == labels[mask]).sum()
            accuracy = correct_predictions / len(labels[mask])
            
            total_accuracy += accuracy
            epoch_loss += loss
            total_amount += 1

    precision, recall, f1_score, support = precision_recall_fscore_support(
        y_true,
        y_pred,
        average='macro',
        zero_division=0
    )

    return (epoch_loss/total_amount), (total_accuracy/total_amount)*100, precision*100, recall*100, f1_score*100

In [5]:
def read_data(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
    
    data = []
    words, tags = [], []
    unique_words, unique_tags = set(), set()
    for line in lines:
        if line.strip() == "":
            data.append((words, tags))
            unique_words.update(words)
            unique_tags.update(tags)
            words, tags = [], []
        else:
            _, word, tag = line.strip().split()
            words.append(word)
            tags.append(tag)
    if words and tags:
        data.append((words, tags))
        unique_words.update(words)
        unique_tags.update(tags)

    return data, unique_words, unique_tags


In [6]:
def tokenize(text):
    return text.split()

def pad_sequences(batch, word2idx, tag2idx, pad_token='<pad>', init_token='<s>', eos_token='</s>', unk_token='<unk>'):
    max_len = max([len(seq) + 2 for seq, _ in batch]) 
    padded_word_seqs = []
    padded_tag_seqs = []

    for words, tags in batch:
        padded_words = [init_token] + words + [eos_token]
        padded_words = [word2idx.get(word, word2idx[unk_token]) for word in padded_words] + [word2idx[pad_token]] * (max_len - len(padded_words))
        padded_word_seqs.append(padded_words)

        padded_tags = [init_token] + tags + [eos_token]
        padded_tags = [tag2idx[tag] for tag in padded_tags] + [tag2idx[pad_token]] * (max_len - len(padded_tags))
        padded_tag_seqs.append(padded_tags)

    seq_lengths = torch.tensor([len(seq) for seq in padded_word_seqs])

    return torch.tensor(padded_word_seqs), torch.tensor(padded_tag_seqs), seq_lengths


class CustomDataset(Dataset):
    def __init__(self, data, transform=None):
        self.data = data
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        sample = self.data[idx]

        if self.transform:
            sample = self.transform(sample)

        return sample

In [7]:
train_file = "data/train" 
raw_data, unique_words, unique_tags = read_data(train_file)
tokenized_data = [([word for word in words], [tag for tag in tags]) for words, tags in raw_data]
train_dataset = CustomDataset(tokenized_data)

dev_file = "data/dev" 
raw_data, unique_words, unique_tags = read_data(dev_file)
tokenized_data = [([word for word in words], [tag for tag in tags]) for words, tags in raw_data]
dev_dataset = CustomDataset(tokenized_data)


In [27]:
from collections import Counter

def create_vocab_mappings(raw_data, unique_tags, threshold):
    word_freqs = Counter(word.lower() for words, _ in raw_data for word in words)
    filtered_words = [word.lower() for word, count in word_freqs.items() if count >= threshold]
    
    # print(filtered_words)
    word2idx = {word: idx + 4 for idx, word in enumerate(filtered_words)}
    word2idx['<pad>'] = 0
    word2idx['<s>'] = 1
    word2idx['</s>'] = 2
    word2idx['<unk>'] = 3

    tag2idx = {tag: idx + 3 for idx, tag in enumerate(unique_tags)}
    tag2idx['<pad>'] = 0
    tag2idx['<s>'] = 1
    tag2idx['</s>'] = 2

    all_chars = {char for words, _ in raw_data for word in words for char in word}
    char2idx = {char: idx + 2 for idx, char in enumerate(all_chars)}
    char2idx['<pad>'] = 0
    char2idx['<unk>'] = 1

    return word2idx, tag2idx, char2idx

def pad_word_chars(chars, max_word_len, pad_idx):
    return chars + [pad_idx] * (max_word_len - len(chars))

def pad_sequences(batch, word2idx, tag2idx, char2idx, pad_token='<pad>', init_token='<s>', eos_token='</s>', unk_token='<unk>'):
    max_len = max([len(seq) + 2 for seq, _ in batch])
    max_word_len = max([len(word) for words, _ in batch for word in words])

    padded_word_seqs = []
    padded_upper_seqs = []
    padded_char_seqs = []
    padded_tag_seqs = []

    for words, tags in batch:
        lower_words = [word.lower() for word in words]

        padded_words = [init_token] + lower_words + [eos_token]
        padded_words = [word2idx.get(word, word2idx[unk_token]) for word in padded_words] + [word2idx[pad_token]] * (max_len - len(padded_words))
        padded_word_seqs.append(padded_words)

        padded_uppers = [0] + [int(word[0].isupper()) for word in words] + [0] + [0] * (max_len - len(words) - 2)
        padded_upper_seqs.append(padded_uppers)

        padded_tags = [init_token] + tags + [eos_token]
        padded_tags = [tag2idx[tag] for tag in padded_tags] + [tag2idx[pad_token]] * (max_len - len(padded_tags))
        padded_tag_seqs.append(padded_tags)

        padded_chars = [[char2idx.get(char, char2idx['<unk>']) for char in word] for word in words]
        padded_chars = [pad_word_chars(chars, max_word_len, char2idx[pad_token]) for chars in padded_chars]
        padded_chars.insert(0, [char2idx[pad_token]] * max_word_len)
        padded_chars.append([char2idx[pad_token]] * max_word_len)
        padded_chars += [[char2idx[pad_token]] * max_word_len] * (max_len - len(padded_chars))
        padded_char_seqs.append(padded_chars)

    return torch.tensor(padded_word_seqs), torch.tensor(padded_upper_seqs), torch.tensor(padded_char_seqs), torch.tensor(padded_tag_seqs)

word2idx, tag2idx, char2idx = create_vocab_mappings(raw_data, unique_tags, threshold=1)
train_loader = DataLoader(
    train_dataset,
    batch_size=8,
    collate_fn=lambda batch: pad_sequences(batch, word2idx, tag2idx, char2idx),
    shuffle=True,
)
dev_loader = DataLoader(
    dev_dataset,
    batch_size=8,
    collate_fn=lambda batch: pad_sequences(batch, word2idx, tag2idx, char2idx),
    shuffle=True,
)

In [39]:
def tokenize_and_pad(text, word2idx, char2idx, pad_token='<pad>', init_token='<s>', eos_token='</s>', unk_token='<unk>'):
    tokens = text.split()

    lower_tokens = text.lower().split()
    padded_tokens = [init_token] + lower_tokens + [eos_token]
    indices = [word2idx.get(word, word2idx[unk_token]) for word in padded_tokens]
    
    upper_indices = [0] + [int(token[0].isupper()) for token in tokens] + [0]

    char_indices = [[char2idx.get(char, char2idx[unk_token]) for char in word] for word in tokens]
    max_word_len = max([len(word_chars) for word_chars in char_indices]) + 2
    char_indices = [[char2idx[pad_token]] * max_word_len] + char_indices + [[char2idx[pad_token]] * max_word_len]
    char_indices_padded = [word_chars + [char2idx[pad_token]] * (max_word_len - len(word_chars)) for word_chars in char_indices]

    return indices, upper_indices, char_indices_padded


def predict_tags(model, input_text, word2idx, char2idx, idx2tag):
    model.eval()
    tokenized_input, upper_input, char_input = tokenize_and_pad(input_text, word2idx, char2idx)
    input_tensor = torch.tensor([tokenized_input]).to(device)
    upper_tensor = torch.tensor([upper_input]).to(device)
    char_input_tensor = torch.tensor([char_input]).to(device)
    
    with torch.no_grad():
        logits = model(input_tensor, upper_tensor, char_input_tensor)
    
    predicted_indices = torch.argmax(logits, dim=-1).squeeze().cpu().numpy()
    predicted_tags = [idx2tag[idx] for idx in predicted_indices][1:-1]

    return predicted_tags

def createFile(model, textFile, char2idx):
    with open(textFile, 'r') as input_file, open('pred.txt', 'w') as output_file:
        indexs = []
        words = []
        tags = []
        for line in input_file:
            if not line.strip():
                if len(words) > 0 and len(tags) > 0:
                    idx2tag = {idx: tag for tag, idx in tag2idx.items()}

                    new_text = " ".join(words)
                    predicted_tags = predict_tags(model, new_text, word2idx, char2idx, idx2tag)

                    for i in range(len(indexs)):
                        index = indexs[i]
                        word = words[i]
                        tag = tags[i]
                        prediction = predicted_tags[i]

                        predictionLine = str(index) + " " + str(word) + " " + str(tag) + " " + str(prediction) + "\n"
                        output_file.write(predictionLine)
                    
                    indexs = []
                    words = []
                    tags = []
                    output_file.write("\n")
            else:
                index, word, tag = line.strip().split()
                indexs.append(index)
                words.append(word)
                tags.append(tag)

In [17]:
vocab_size = len(word2idx)
char_vocab_size = len(char2idx)
num_tags = len(tag2idx)

char_embedding_dim = 30
embedding_dim = 100
hidden_dim = 256
num_layers = 1
dropout = 0.33
linear_output_dim = 128

class BiLSTM_CNN(nn.Module):
    def __init__(self, vocab_size, char_vocab_size, num_tags, char_embedding_dim, embedding_dim, hidden_dim, num_layers, dropout, linear_output_dim):
        super(BiLSTM_CNN, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.upper_embedding = nn.Embedding(2, embedding_dim)
        
        self.char_embedding = nn.Embedding(char_vocab_size, char_embedding_dim)
        self.char_cnn = nn.Conv1d(char_embedding_dim, embedding_dim, kernel_size=3)
        
        self.lstm = nn.LSTM(embedding_dim * 3, hidden_dim, num_layers, bidirectional=True, batch_first=True)
        self.linear1 = nn.Linear(hidden_dim * 2, linear_output_dim)
        self.elu = nn.ELU()
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(linear_output_dim, num_tags)

    def forward(self, x, upper_x, chars):
        x = self.embedding(x)
        upper_x = self.upper_embedding(upper_x)
        
        chars = self.char_embedding(chars)
        batch_size, max_seq_len, max_word_len, _ = chars.shape
        chars = chars.view(batch_size * max_seq_len, max_word_len, -1).permute(0, 2, 1)
        # print(chars)

        char_features = self.char_cnn(chars)
        char_features = nn.functional.relu(char_features)
        char_features, _ = torch.max(char_features, dim=-1)
        char_features = char_features.view(batch_size, max_seq_len, -1)
        # print(char_features)
        
        x = torch.cat([x, upper_x, char_features], dim=-1)
        x, _ = self.lstm(x)
        x = self.linear1(x)
        x = self.elu(x)
        x = self.dropout(x)
        logits = self.linear2(x)

        return logits

In [18]:
# num_tags = len(tag2idx)
print(tag2idx)

weight_list = [0,0,0,1,1,1,1,0.7,1,1,1,1]

for i,w in tag2idx.items():
    print(i, weight_list[w])
print(len(weight_list))
weight_tensor = torch.tensor(weight_list, dtype=torch.float)

{'B-PER': 3, 'I-LOC': 4, 'I-PER': 5, 'B-MISC': 6, 'O': 7, 'I-MISC': 8, 'B-LOC': 9, 'I-ORG': 10, 'B-ORG': 11, '<pad>': 0, '<s>': 1, '</s>': 2}
B-PER 1
I-LOC 1
I-PER 1
B-MISC 1
O 0.7
I-MISC 1
B-LOC 1
I-ORG 1
B-ORG 1
<pad> 0
<s> 0
</s> 0
12


# Task 1

In [45]:
import torch.optim as optim
from torch.nn import CrossEntropyLoss
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.tensorboard import SummaryWriter


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

final_model = None
highest_f1_score = 0

model = BiLSTM_CNN(vocab_size, char_vocab_size, num_tags, char_embedding_dim, embedding_dim, hidden_dim, num_layers, dropout, linear_output_dim)
model.to(device)

num_epochs = 30

loss_function = CrossEntropyLoss(ignore_index=tag2idx['<pad>'], weight=weight_tensor)
optimizer = optim.SGD(model.parameters(), lr=0.25, momentum=0.9, weight_decay=0.00005)
# scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs)
patience = 5
writer = SummaryWriter()
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=patience, factor=0.5, verbose=True)

early_stopping_counter = 0
best_f1_score = -1
clip_value = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    total_samples = 0

    for batch in train_loader:
        inputs, upper_inputs, char_inputs, labels = batch

        optimizer.zero_grad()

        logits = model(inputs, upper_inputs, char_inputs)

        logits = logits.view(-1, logits.shape[-1])
        labels = labels.view(-1)

        loss = loss_function(logits, labels)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)

        optimizer.step()

        total_loss += loss.item() * 16
        total_samples += 16

    avg_train_loss = total_loss / total_samples
    writer.add_scalar("Loss/train", avg_train_loss, epoch)
    print(epoch+1, avg_train_loss)

writer.close()

1 0.27817624615179326
2 0.15618788975092415
3 0.12170213573200284
4 0.0992425325163988
5 0.09329219489848067
6 0.08299665355144452
7 0.0801492516161862
8 0.07257359968415417
9 0.06940187964204618
10 0.0668046612387201
11 0.06635492756799273
12 0.06584975788360425
13 0.06038852361889207
14 0.0650046519076335
15 0.061099910566728886
16 0.05687368435103486
17 0.058552491307279755
18 0.05333677606576452
19 0.05450703007711628
20 0.05733181348644907
21 0.05249355073396546
22 0.054668751738832046
23 0.05327963031606056
24 0.05068245541836792
25 0.05175394057383342
26 0.05232099312806633
27 0.05240830238551468
28 0.0514260288782175
29 0.04969350725377177
30 0.04824242050565956


In [50]:
createFile(model, "data/dev", char2idx)

In [51]:

import pickle

with open("tag2idx_task_3.pickle", "wb") as f:
    pickle.dump(tag2idx, f, protocol=pickle.HIGHEST_PROTOCOL)

with open("word2idx_task_3.pickle", "wb") as f:
    pickle.dump(word2idx, f, protocol=pickle.HIGHEST_PROTOCOL)

torch.save(model.state_dict(), "model_3.pt")

# Task 2