In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data import Field, BucketIterator, Dataset, NestedField, Example

from sklearn.metrics import f1_score, precision_recall_fscore_support

import os
from torchtext.data import Field, NestedField, Example, Dataset, BucketIterator


In [124]:
def validation_step(model, iterator, TAG, num_tags):
    model.eval()

    all_predictions = []
    all_tags = []

    total_accuracy = 0
    total_amount = 0

    for batch in iterator:
        model.zero_grad()
        
        words, tags, uppercase_features = batch.word, batch.tag, batch.uppercase
        f = torch.transpose(uppercase_features, 0, 1)

        predictions = model(words, f)

        tags = tags.view(-1)
        predictions = predictions.view(-1, num_tags)

        labels = tags.cpu().numpy()
        predicted_labels = torch.argmax(predictions, dim=1).cpu().numpy()
        all_predictions.extend(predicted_labels)
        all_tags.extend(labels)

        mask = labels != 0
        correct_predictions = (predicted_labels[mask] == labels[mask]).sum()
        accuracy = correct_predictions / len(labels[mask])
        
        total_accuracy += accuracy
        total_amount += 1

    precision, recall, f1_score, support = precision_recall_fscore_support(
        all_tags,
        all_predictions,
        average='macro',
        zero_division=0,
        labels=list(range(1, len(TAG.vocab)))
    )


    return (total_accuracy/total_amount)*100, precision, recall, f1_score

In [5]:
INDEX = Field(sequential=False, use_vocab=False, dtype=torch.int64)
WORD = Field(sequential=True, use_vocab=True, lower=True, batch_first=True)
TAG = Field(sequential=True, use_vocab=True, batch_first=True)
UPPERCASE = Field(sequential=True, use_vocab=False, dtype=torch.float, batch_first=True)


In [68]:
def read_data(file_path):
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        sentence = {"word": [], "tag": [], "uppercase": []}
        for line in f:
            line = line.strip()
            if not line:
                # End of a sentence
                if sentence["word"]:
                    data.append(Example.fromdict(sentence, fields={"word": ("word", WORD),
                                                                   "tag": ("tag", TAG),
                                                                   "uppercase": ("uppercase", UPPERCASE)}))
                    sentence = {"index": [], "word": [], "tag": [], "uppercase": []}
                continue

            index, word, tag = line.split()
            sentence["word"].append(word)
            sentence["tag"].append(tag)
            sentence["uppercase"].append(1 if word[0].isupper() else 0)

    return Dataset(data, fields={"word": WORD, "tag": TAG, "uppercase": UPPERCASE})


In [78]:
train_data = read_data('data/train')
dev_data = read_data('data/dev')

In [79]:
WORD.build_vocab(train_data)
TAG.build_vocab(train_data)

train_iter = BucketIterator(train_data, batch_size=1, sort_key=lambda x: len(x.word), device=torch.device('cpu'))
dev_iter = BucketIterator(dev_data, batch_size=1, sort_key=lambda x: len(x.word), device=torch.device('cpu'))

# Task 2

In [74]:
print(len(WORD.vocab))

21012


In [33]:
print(train_dataset[0].word)
print(train_dataset[0].case)

AttributeError: 'tuple' object has no attribute 'word'

In [121]:
vocab_size = len(WORD.vocab)
num_tags = len(TAG.vocab)

embedding_dim = 100
hidden_dim = 256
num_layers = 1
dropout = 0.33
linear_output_dim = 128

class BiLSTM_GLOVE(nn.Module):
    def __init__(self, vocab_size, linear_output_dim, embedding_dim, hidden_dim, num_layers, dropout, embedding_matrix):
        super(BiLSTM_GLOVE, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight.data.copy_(embedding_matrix)
        self.case_embedding = nn.Embedding(2, 1)
        self.lstm = nn.LSTM(embedding_dim + 1, hidden_dim, num_layers, bidirectional=True, batch_first=True)
        self.linear1 = nn.Linear(hidden_dim * 2, linear_output_dim)
        self.elu = nn.ELU()
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(linear_output_dim, num_tags)

    def forward(self, x, case_x):
        x = self.embedding(x)
        
        case_x = case_x.long()  # Convert case_x to long data type
        case_x = self.case_embedding(case_x).squeeze(-1)
        x = torch.cat((x, case_x.unsqueeze(-1)), dim=-1)
        x, _ = self.lstm(x)
        x = self.linear1(x)
        x = self.elu(x)
        x = self.dropout(x)
        logits = self.linear2(x)

        return logits

In [42]:
import gzip
import shutil

glove_gz_path = "glove.6B.100d.gz"
glove_txt_path = "glove.6B.100d.txt"

with gzip.open(glove_gz_path, 'rb') as f_in:
    with open(glove_txt_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

In [91]:
def load_glove_embeddings(glove_path):
    embeddings = {}
    with open(glove_path, "r", encoding="utf-8") as file:
        for line in file:
            values = line.split()
            word = values[0]
            vector = torch.tensor([float(val) for val in values[1:]])
            embeddings[word] = vector
    return embeddings

glove_embeddings = load_glove_embeddings(glove_txt_path)

In [92]:
embedding_matrix = torch.zeros((len(WORD.vocab), 100))
for i, word in enumerate(WORD.vocab.itos):
    if word in glove_embeddings:
        embedding_matrix[i] = glove_embeddings[word]
    else:
        # If the word is not in the GloVe vocabulary, initialize it with a random vector
        embedding_matrix[i] = torch.rand(100) * 2 - 1  # Generate a random vector with values between -1 and 1

In [84]:
print(accuracy,precision,recall,f1_score)

78.56210939550724 0.08332008453380382 0.1 0.09090120675614219


In [125]:
'''
lr
    0.001 - leads to 76%. Good enough, but the first epoch is 76 and doesn't lead to much learning (epoch 10)
    0.0001 - leads to good learning curve, but still only reaches to 76 after 7th epoch
    0.00005 - horrible, still 76
    0.002 - best, 
'''

# model.embedding.weight.requires_grad = True

model = BiLSTM_GLOVE(vocab_size, linear_output_dim, embedding_dim, hidden_dim, num_layers, dropout, embedding_matrix)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.002)

epochs = 10
for epoch in range(epochs):
    for batch in train_iter:
        model.zero_grad()
        
        words, tags, uppercase_features = batch.word, batch.tag, batch.uppercase
        f = torch.transpose(uppercase_features, 0, 1)

        predictions = model(words, f)
        
        predictions = predictions.view(-1, num_tags)
        tags = tags.view(-1)

        loss = loss_function(predictions, tags)
        loss.backward()

        optimizer.step()

    accuracy,precision,recall,f1_score = validation_step(model, dev_iter, TAG, num_tags)
    print(f"Epoch {epoch + 1} - DEV accuracy: {accuracy:.4f} precision: {precision:.4f} recall {recall:.4f} f1_score {f1_score:.4f}")

Epoch 1 - DEV accuracy: 83.3657 precision: 0.1412 recall 0.1850 f1_score 0.1373
Epoch 2 - DEV accuracy: 87.7763 precision: 0.2464 recall 0.2840 f1_score 0.2505
Epoch 3 - DEV accuracy: 88.8514 precision: 0.2475 recall 0.3164 f1_score 0.2734


KeyboardInterrupt: 

In [265]:
print("Original Embedding", embedding_matrix[1968])
f = torch.tensor([1968])
print(WORD.vocab.itos[1968])
model = BiLSTM_GLOVE(vocab_size, linear_output_dim, embedding_dim, hidden_dim, num_layers, dropout, embedding_matrix)
model(f)
model = BiLSTM(vocab_size, linear_output_dim, embedding_dim, hidden_dim, num_layers, dropout)
model(f)


Original Embedding tensor([-0.6646, -0.2997, -0.3024, -0.3122, -0.5956,  0.1813, -0.5200, -0.8990,
         0.6687, -0.0509,  0.2027, -0.3327,  0.1811,  0.9803,  0.2843, -0.0340,
        -0.7580, -0.2475,  0.1174,  0.6668, -0.5180,  0.9893, -0.2191,  0.1035,
        -0.5759,  0.3146,  0.7633, -0.3048,  0.3675, -0.0197,  0.2618, -0.8493,
        -0.6716,  0.0912,  0.4773, -0.5021, -0.9139,  0.4568, -0.5791, -0.1815,
        -0.0344,  0.1114, -0.9152,  0.7184, -0.2381, -0.7187, -0.7080, -0.2259,
        -0.7770, -0.0739,  0.7061, -0.0501,  0.0456,  0.6191, -0.7305,  0.2588,
         0.9421, -0.4163, -0.7916, -0.4794, -0.9400,  0.2369, -0.3231, -0.1243,
        -0.9973, -0.5886, -0.2724,  0.6379,  0.8822, -0.0026, -0.8083, -0.2886,
        -0.8846, -0.8714, -0.2287, -0.2878, -0.1113,  0.5348, -0.1951, -0.6304,
         0.7549, -0.0912,  0.9251, -0.9501,  0.4688, -0.7820, -0.6845,  0.3313,
        -0.3404, -0.6320, -0.1865,  0.6554,  0.6044, -0.7518, -0.7201,  0.7517,
        -0.1269, -0.1

tensor([[ 0.0084, -0.0427,  0.0679, -0.1092,  0.0983, -0.1093, -0.0016,  0.1013,
          0.1938,  0.0689,  0.0850]], grad_fn=<AddmmBackward0>)

In [None]:
model = BiLSTM_GLOVE(vocab_size, linear_output_dim, embedding_dim, hidden_dim, num_layers, dropout, embedding_matrix)
model.embedding

Embedding(21012, 100)

In [None]:
print(num_tags)

11


In [None]:
glove_embeddings[WORD.vocab.itos[1968]]

tensor([-0.2287,  0.2687,  0.1696, -0.8279,  0.1421,  0.0879,  0.1527,  0.2460,
        -0.6633,  0.2472,  0.7877, -0.7255, -0.2742,  0.1237,  0.4148, -0.6657,
         0.7754, -0.3948,  0.0852,  0.5552,  0.1641,  0.0209,  0.1017, -0.2413,
         0.0568, -0.2173, -0.6946,  0.1280, -0.4027, -0.6178, -0.8943,  0.5707,
         0.0754, -0.1523, -0.2299,  0.2230, -0.4698,  0.2938, -0.3666, -0.4318,
        -0.2113, -0.3330,  0.2782, -0.5088,  1.0083,  0.5549,  0.4845,  0.2277,
         1.2120,  0.6580,  0.9587, -0.8638, -0.2186,  0.2400,  0.0465, -1.0641,
        -0.3987, -1.5180, -0.3917, -0.3801, -0.7571, -0.1654, -1.3888,  0.1465,
        -0.2775,  0.5438, -0.4989,  1.1819, -0.6496,  0.2811, -0.3472,  0.4645,
         0.7467, -1.2753, -0.0139, -0.5497,  0.2577,  0.1996, -0.4776,  0.1634,
         0.2045, -0.7177, -0.1845,  0.9144, -0.0851, -0.6853, -0.5390,  0.4724,
         0.1316,  0.1568, -1.1030, -0.2084,  0.3321,  0.9873, -0.5288, -0.3076,
         0.9719,  0.9091,  0.4079, -0.39

In [None]:
embedding_matrix[1968]

tensor([-0.2287,  0.2687,  0.1696, -0.8279,  0.1421,  0.0879,  0.1527,  0.2460,
        -0.6633,  0.2472,  0.7877, -0.7255, -0.2742,  0.1237,  0.4148, -0.6657,
         0.7754, -0.3948,  0.0852,  0.5552,  0.1641,  0.0209,  0.1017, -0.2413,
         0.0568, -0.2173, -0.6946,  0.1280, -0.4027, -0.6178, -0.8943,  0.5707,
         0.0754, -0.1523, -0.2299,  0.2230, -0.4698,  0.2938, -0.3666, -0.4318,
        -0.2113, -0.3330,  0.2782, -0.5088,  1.0083,  0.5549,  0.4845,  0.2277,
         1.2120,  0.6580,  0.9587, -0.8638, -0.2186,  0.2400,  0.0465, -1.0641,
        -0.3987, -1.5180, -0.3917, -0.3801, -0.7571, -0.1654, -1.3888,  0.1465,
        -0.2775,  0.5438, -0.4989,  1.1819, -0.6496,  0.2811, -0.3472,  0.4645,
         0.7467, -1.2753, -0.0139, -0.5497,  0.2577,  0.1996, -0.4776,  0.1634,
         0.2045, -0.7177, -0.1845,  0.9144, -0.0851, -0.6853, -0.5390,  0.4724,
         0.1316,  0.1568, -1.1030, -0.2084,  0.3321,  0.9873, -0.5288, -0.3076,
         0.9719,  0.9091,  0.4079, -0.39

In [None]:
from torchtext.data import NestedField, Dataset, Example, TabularDataset

INDEX = Field(sequential=False, use_vocab=False)
CASE = Field(sequential=False, use_vocab=False, dtype=torch.float)
WORD = Field(lower=True)
TAG = Field()

fields = [('index', INDEX), ('word', WORD), ('case', CASE), ('tag', TAG)]
class CustomNERDataset(Dataset):
    def __init__(self, file_path, word_field, tag_field):
        self.word_field = word_field
        self.tag_field = tag_field
        self.examples = []

        with open(file_path, 'r') as f:
            example = []
            for line in f:
                if not line.isspace():
                    fields = line.strip().split()
                    word = fields[1]
                    tag = fields[2]
                    example.append((word, tag))
                else:
                    if example:
                        self.examples.append(example)
                        example = []

        if example:
            self.examples.append(example)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, index):
        example = self.examples[index]
        words = [self.word_field.preprocess(w) for w, _ in example]
        tags = [self.tag_field.preprocess(t) for _, t in example]
        return words, tags
    
train_dataset = CustomNERDataset('data/train', WORD, TAG)
valid_dataset = CustomNERDataset('data/dev', WORD, TAG)

WORD.build_vocab(train_dataset, min_freq=1)  # min_freq handles unknown
TAG.build_vocab(train_dataset)

class CustomBatch:
    def __init__(self, batch):
        self.index = batch.index
        self.word = batch.word
        self.case = torch.tensor([[1.0 if w[0].isupper() else 0.0 for w in words] for words in batch.word], dtype=torch.float).to(device)
        self.tag = batch.tag

    def __iter__(self):
        return (self.index, self.word, self.case, self.tag)

    def __len__(self):
        return len(self.word)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 32
train_iterator = BucketIterator(train_dataset, batch_size=BATCH_SIZE, sort_key=lambda x: len(x.word), sort_within_batch=True, device=device, batch_class=CustomBatch)
valid_iterator = BucketIterator(valid_dataset, batch_size=BATCH_SIZE, sort_key=lambda x: len(x.word), sort_within_batch=True, device=device, batch_class=CustomBatch)

In [None]:
def load_glove_embeddings(glove_path):
    embeddings = {}
    with open(glove_path, "r", encoding="utf-8") as file:
        for line in file:
            values = line.split()
            word = values[0]
            vector = torch.tensor([float(val) for val in values[1:]])
            embeddings[word] = vector
    return embeddings

glove_embeddings = load_glove_embeddings("glove.6B.100d.txt")