<center><h1>Hemnani_Hitika_HW4</h1></center>
<br>
<br>

## Task 1: Simple Bidirectional LSTM model

Importing Libraries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import precision_score, recall_score, f1_score
import pandas as pd
from collections import Counter
import random
from torch.nn import CrossEntropyLoss
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.tensorboard import SummaryWriter
import gzip
import numpy as np

Converting Raw Data to Dataframe

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
##reading files line by line and extracting word-level information and 
##storing sentence IDs, word indices, words, and NER tags in a DataFrame.
    
def load_data_to_dataframe(file_path):
    data = []
    with open(file_path, 'r') as f:
        sentence_id = 0
        for line in f:
            if line.strip() == "":
                sentence_id += 1
            else:
                parts = line.strip().split()
                data.append({
                    'sentence_id': sentence_id,
                    'index': parts[0],
                    'word': parts[1],
                    'ner_tag': parts[2]
                })
    return pd.DataFrame(data)

train_df = load_data_to_dataframe('data/train')
dev_df = load_data_to_dataframe('data/dev')

def load_test_data_to_dataframe(file_path):
    data = []
    with open(file_path, 'r') as f:
        sentence_id = 0
        for line in f:
            if line.strip() == "":
                sentence_id += 1
            else:
                parts = line.strip().split()
                if len(parts) >= 2:  # Ensure the line has at least index and word
                    data.append({
                        'sentence_id': sentence_id,
                        'index': parts[0],
                        'word': parts[1]
                    })
    return pd.DataFrame(data)


In [4]:
#train_df

Creating vocabulary from training data

In [5]:
train_words = set(train_df['word'].unique())
special_tag = ['<PAD>', '<UNK>']
word_index= {}

##adding Padding and Unkown to the index
word_index = {token: idx for idx, token in enumerate(special_tag)}
word_index.update({word: idx + len(special_tag) for idx, word in enumerate(train_words)})

Extracting all unique NER tags from the ner_tag column in train_df

In [6]:
#Extracting unique NER tags from the training dataset and storing them in a set
norm_tags = set(train_df['ner_tag'].unique())
tag_index = {tag: i for i, tag in enumerate(norm_tags)}
#Adding a special padding token at the end of the dictionary with a new index
tag_index['<PAD>'] = len(tag_index)

In [7]:
print(f"Vocabulary size: {len(word_index)}")
print(f"Number of NER tags: {len(tag_index)}")

Vocabulary size: 23626
Number of NER tags: 10


Mapping words and tags to its following index

In [8]:
# Mapping each word in dataframe to its corresponding index from the word_index dictionary
# Using '<UNK>' as a default index for words that are not found in the dictionary
train_df['word_idx'] = train_df['word'].map(lambda x: word_index.get(x, word_index['<UNK>']))
train_df['tag_idx'] = train_df['ner_tag'].map(tag_index)

dev_df['word_idx'] = dev_df['word'].map(lambda x: word_index.get(x, word_index['<UNK>']))
dev_df['tag_idx'] = dev_df['ner_tag'].map(tag_index)

Creating Sentences and using index to group them

In [9]:
# Grouping indices by sentence ID in the dataframes and converting them into lists
train_sentences = train_df.groupby('sentence_id')['word_idx'].apply(list).tolist()
train_labels = train_df.groupby('sentence_id')['tag_idx'].apply(list).tolist()

dev_sentences = dev_df.groupby('sentence_id')['word_idx'].apply(list).tolist()
dev_labels = dev_df.groupby('sentence_id')['tag_idx'].apply(list).tolist()


In [10]:
# Loading test data
test_df = load_test_data_to_dataframe('data/test')
test_df['word_idx'] = test_df['word'].map(lambda x: word_index.get(x, word_index['<UNK>']))
test_sentences = test_df.groupby('sentence_id')['word_idx'].apply(list).tolist()
test_sentence_dataset = list(zip(test_sentences, [None] * len(test_sentences)))  # No labels

In [11]:
from torch.nn.utils.rnn import pad_sequence
#padding sequences of word and label indices to a uniform length
def pad_seq(batch):
    sentences, labels = zip(*batch)
    # Converting sentences to tensors and padding them with the predefined padding index
    sentences_padded = pad_sequence([torch.tensor(s) for s in sentences], batch_first=True, padding_value=word_index['<PAD>'])
    # Converting Labels to tensors and padding them with the predefined padding index
    labels_padded = pad_sequence([torch.tensor(l) for l in labels], batch_first=True, padding_value=tag_index['<PAD>'])
    return sentences_padded, labels_padded
    
train_sentence_dataset = list(zip(train_sentences, train_labels))
dev_sentence_dataset = list(zip(dev_sentences, dev_labels))

train_loader = DataLoader(train_sentence_dataset, batch_size=32, shuffle=True, collate_fn=pad_seq)
dev_loader = DataLoader(dev_sentence_dataset, batch_size=32, shuffle=False, collate_fn=pad_seq)

test_loader = DataLoader(test_sentence_dataset, batch_size=32, shuffle=False, collate_fn=pad_seq)

In [12]:
# HYPER-PARAMETERS
embedding_dim = 100
hidden_dim = 256
num_layers = 1
dropout = 0.33
linear_output_dim = 128
batch_size = 32
learning_rate = 0.1
num_epochs = 100
clip_value = 5
patience = 6
num_tags=len(tag_index)
vocab=len(word_index)

In [13]:
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, linear_output_dim, embedding_dim, hidden_dim, num_layers, dropout):
        super(BiLSTM, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, bidirectional=True, batch_first=True)
        self.linear1 = nn.Linear(hidden_dim * 2, linear_output_dim)
        self.elu = nn.ELU()
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(linear_output_dim, num_tags)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.linear1(x)
        x = self.elu(x)
        x = self.dropout(x)
        logits = self.linear2(x)
        
        return logits

In [14]:
##If a GPU is available, device will be set to "cuda"; otherwise, it will fall back to "cpu" ...trying to prevent memory error
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
##Moving the model to device
model = BiLSTM(vocab, linear_output_dim, embedding_dim, hidden_dim, num_layers, dropout)
model.to(device)

BiLSTM(
  (embedding): Embedding(23626, 100)
  (lstm): LSTM(100, 256, batch_first=True, bidirectional=True)
  (linear1): Linear(in_features=512, out_features=128, bias=True)
  (elu): ELU(alpha=1.0)
  (dropout): Dropout(p=0.33, inplace=False)
  (linear2): Linear(in_features=128, out_features=10, bias=True)
)

In [15]:
Loss_Function = nn.CrossEntropyLoss(ignore_index=tag_index['<PAD>'])
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Learning rate scheduler
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=6, factor=0.5, verbose=True)

# Early stopping
early_stopping_counter = 0
best_f1_score = -1
patience = 6
clip_value = 5
num_epochs=50



In [20]:
#training the BiLSTM model with a learning rate scheduler

def train_with_scheduler(model, train_loader, Loss_Function, optimizer, scheduler, num_epochs):
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        total_samples = 0

        for sentences, labels in train_loader:
            sentences = sentences.to(device)
            labels = labels.to(device)
             # Resetting gradients before backpropagation
            optimizer.zero_grad()

            predictions = model(sentences)
            # Reshaping predictions and labels for loss calculation
            predictions = predictions.view(-1, len(tag_index)) 
            labels = labels.view(-1)

            loss = Loss_Function(predictions, labels)
            loss.backward() # Performing backpropagation

            # Applying gradient clipping to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)

            optimizer.step() # Updating model parameters

            total_loss += loss.item() * sentences.size(0)
            total_samples += sentences.size(0)

        # Update learning rate
        scheduler.step(total_loss / total_samples)

        avg_loss = total_loss / total_samples
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")

In [17]:
#validating the BiLSTM model using loss, accuracy, precision, recall, and F1-score
def validate_with_metrics(model, dev_loader, Loss_Function, num_labels):
    model.eval()
    epoch_loss = 0
    total_accuracy = 0
    total_samples = 0
    true_labels = []
    predicted_labels = []

    with torch.no_grad():# Disabling gradient calculations
        for sentences, labels in dev_loader:
            sentences = sentences.to(device)
            labels = labels.to(device)

            logits = model(sentences)
            logits = logits.view(-1, num_labels)# Reshaping logits for loss calculation
            labels = labels.view(-1)

            loss = Loss_Function(logits, labels)
            epoch_loss += loss.item()

            labels_cpu = labels.cpu().numpy()
            predictions = torch.argmax(logits, dim=1).cpu().numpy()
            true_labels.extend(labels_cpu)
            predicted_labels.extend(predictions)

            # Creating a mask to exclude padding tokens from accuracy calculation
            mask = labels != tag_index['<PAD>']
            correct_predictions = (predictions[mask] == labels_cpu[mask]).sum()
            accuracy = correct_predictions / len(labels_cpu[mask])

            total_accuracy += accuracy
            total_samples += 1

    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predicted_labels, average='macro', zero_division=0)

    avg_loss = epoch_loss / total_samples
    avg_accuracy = (total_accuracy / total_samples) * 100

    print(f"Validation Loss: {avg_loss:.4f}, Accuracy: {avg_accuracy:.2f}%")
    print(f"Precision: {precision * 100:.2f}%, Recall: {recall * 100:.2f}%, F1: {f1 * 100:.2f}%")

    return avg_loss, avg_accuracy, precision, recall, f1

In [18]:
def train_and_validate(model, train_loader, dev_loader, Loss_Function, optimizer, scheduler, num_epochs):
    best_f1_score = -1
    patience = 6

    for epoch in range(num_epochs):
        print(f"Training Epoch {epoch + 1}/{num_epochs}")
        train_with_scheduler(model, train_loader, Loss_Function, optimizer, scheduler, num_epochs=1)

        print(f"Validating Epoch {epoch + 1}/{num_epochs}")
        avg_loss, avg_accuracy, precision, recall, f1 = validate_with_metrics(model, dev_loader, Loss_Function, num_labels=len(tag_index))

        if f1 > best_f1_score:
            best_f1_score = f1
            torch.save(model.state_dict(), "Blstm1.pt")

In [21]:
from sklearn.metrics import precision_recall_fscore_support
train_and_validate(model, train_loader, dev_loader, Loss_Function, optimizer, scheduler, num_epochs)

Training Epoch 1/50
Epoch 1/1, Loss: 0.6687
Validating Epoch 1/50
Validation Loss: 0.7005, Accuracy: 81.01%
Precision: 24.05%, Recall: 11.17%, F1: 6.79%
Training Epoch 2/50
Epoch 1/1, Loss: 0.6149
Validating Epoch 2/50
Validation Loss: 0.6387, Accuracy: 82.31%
Precision: 28.87%, Recall: 14.16%, F1: 11.51%
Training Epoch 3/50
Epoch 1/1, Loss: 0.5522
Validating Epoch 3/50
Validation Loss: 0.5740, Accuracy: 83.92%
Precision: 24.90%, Recall: 19.01%, F1: 17.04%
Training Epoch 4/50
Epoch 1/1, Loss: 0.4944
Validating Epoch 4/50
Validation Loss: 0.5204, Accuracy: 84.94%
Precision: 32.40%, Recall: 22.08%, F1: 19.54%
Training Epoch 5/50
Epoch 1/1, Loss: 0.4518
Validating Epoch 5/50
Validation Loss: 0.4880, Accuracy: 85.74%
Precision: 37.69%, Recall: 26.27%, F1: 22.47%
Training Epoch 6/50
Epoch 1/1, Loss: 0.4152
Validating Epoch 6/50
Validation Loss: 0.4535, Accuracy: 86.55%
Precision: 45.97%, Recall: 28.27%, F1: 25.16%
Training Epoch 7/50
Epoch 1/1, Loss: 0.3852
Validating Epoch 7/50
Validation 

In [22]:
def save_predictions_dev(model, data_loader, output_file, original_data, tag_index):
    model.eval()
    predictions = []

    with torch.no_grad():
        for sentences, _ in data_loader:
            sentences = sentences.to(device)

            # Forward pass
            logits = model(sentences)
            preds = torch.argmax(logits, dim=2)  # Get predicted tags

            # Append predictions
            predictions.extend(preds.cpu().numpy())

    # Creating a reverse mapping from tag index to tag label
    index_to_tag = {index: tag for tag, index in tag_index.items()}

    # Saving predictions to file ## we dont need to load test data earlier
    with open(output_file, "w") as f:
        for sentence_id, preds in enumerate(predictions):
            sentence_data = original_data[original_data['sentence_id'] == sentence_id]
            words = sentence_data['word'].tolist()
            indices = sentence_data['index'].tolist()

            for idx, word, pred_tag in zip(indices, words, preds):
                # Convert the predicted tag index back to the actual tag
                tag = index_to_tag[pred_tag]
                f.write(f"{idx} {word} {tag}\n")
            f.write("\n")  # Add a newline after each sentence
save_predictions_dev(model, dev_loader, "dev1.out", dev_df, tag_index)


In [None]:
#!python eval.py -p dev1.out -g data/dev

Dealing with Test Data

In [24]:
def save_predictions_test(model, data_loader, output_file, original_data, tag_index):
    model.eval()
    predictions = []

    with torch.no_grad():
        for sentences, _ in data_loader:
            sentences = sentences.to(device)

            # Forward pass
            logits = model(sentences)
            preds = torch.argmax(logits, dim=2)  # Get predicted tags

            # Append predictions
            predictions.extend(preds.cpu().numpy())

    # Creating a reverse mapping from tag index to tag label
    index_to_tag = {index: tag for tag, index in tag_index.items()}

    # Saving predictions to file
    with open(output_file, "w") as f:
        for sentence_id, preds in enumerate(predictions):
            # Get the original words and indices for this sentence
            sentence_data = original_data[original_data['sentence_id'] == sentence_id]
            words = sentence_data['word'].tolist()
            indices = sentence_data['index'].tolist()

            # Write each word, index, and predicted tag to the file
            for idx, word, pred_tag in zip(indices, words, preds):
                # Converting the predicted tag index back to the actual tag
                tag = index_to_tag[pred_tag]
                f.write(f"{idx} {word} {tag}\n")
            f.write("\n") 
save_predictions_test(model, dev_loader, "test1.out", test_df, tag_index)

## Task 2: Using GloVe word embeddings

Reading Data for GloVe word embeddings

as we have to take care of upper case as well redoing everything

In [25]:
#Found a better method to load data 
#Loading sequence data without labels from a file into a list of (words, tags) tuples
def load_data_to_dataframe(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
    
    data = []
    words, tags = [], []
    unique_words, unique_tags = set(), set()
    for line in lines:
        if line.strip() == "":
            data.append((words, tags))
            unique_words.update(words)
            unique_tags.update(tags)
            words, tags = [], []
        else:
            _, word, tag = line.strip().split()
            words.append(word)
            tags.append(tag)
    if words and tags:
        data.append((words, tags))
        unique_words.update(words)
        unique_tags.update(tags)

    return data, unique_words, unique_tags


def load_test_data_to_dataframe(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
    
    data = []
    words, tags = [], []
    for line in lines:
        if line.strip() == "":
            data.append((words, tags))
            words, tags = [], []
        else:
            _, word, tag = line.strip().split()
            words.append(word)
            tags.append(tag)
    if words and tags:
        data.append((words, tags))
    return data

Preprocessing Data
Using different functions and preprocessing techniques to get a better F1 score and handle Upper Cases as well <br> not getting resonable F1 score so Trying to use "init" and "eos" tag to get better accuracy ref:https://arxiv.org/abs/1409.3215

In [26]:
# CReating Mappings Considering Upper case letters as well
def case_sensitive_mappings(raw_data, unique_tags, threshold):
    word_freqs = Counter(word.lower() for words, _ in raw_data for word in words)
    filtered_words = [word.lower() for word, count in word_freqs.items() if count >= threshold]
    
    word_index = {word: idx + 4 for idx, word in enumerate(filtered_words)}
    word_index['<pad>'] = 0
    word_index['<s>'] = 1
    word_index['</s>'] = 2
    word_index['<unk>'] = 3

    tag_index = {tag: idx + 3 for idx, tag in enumerate(unique_tags)}
    tag_index['<pad>'] = 0
    tag_index['<s>'] = 1
    tag_index['</s>'] = 2

    return word_index, tag_index
##Pad Sequences for the tags
def pad_sequences(batch, word_index, tag_index, pad_token='<pad>', init='<s>', eos='</s>', unk='<unk>'):
    max_len = max([len(seq) + 2 for seq, _ in batch])  # Add 2 to account for <s> and </s> tokens

    padded_word_seqs = []
    padded_upper_seqs = []
    padded_tag_seqs = []

    for words, tags in batch:
        lower_words = [word.lower() for word in words]

        padded_words = [init] + lower_words + [eos]
        padded_words = [word_index.get(word, word_index[unk]) for word in padded_words] + [word_index[pad_token]] * (max_len - len(padded_words))
        padded_word_seqs.append(padded_words)

        padded_uppers = [0] + [int(word[0].isupper()) for word in words] + [0] + [0] * (max_len - len(words) - 2)
        padded_upper_seqs.append(padded_uppers)

        padded_tags = [init] + tags + [eos]
        padded_tags = [tag_index[tag] for tag in padded_tags] + [tag_index[pad_token]] * (max_len - len(padded_tags))
        padded_tag_seqs.append(padded_tags)

    return torch.tensor(padded_word_seqs), torch.tensor(padded_upper_seqs), torch.tensor(padded_tag_seqs)

def preprocess(text, word_index, pad_token='<pad>', init='<s>', eos='</s>', unk='<unk>'):
    tokens = text.split()

    lower_tokens = text.lower().split()
    padded_tokens = [init] + lower_tokens + [eos]
    indices = [word_index.get(word, word_index[unk]) for word in padded_tokens]
    
    upper_indices = [0] + [int(token[0].isupper()) for token in tokens] + [0]
    
    return indices, upper_indices

Adding Custom Dataset

In [27]:
##Did not receive a good F1 score so Trying to Create Custom Dataset
class CustomDataset(Dataset):
    def __init__(self, data, transform=None):
        self.data = data
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        sample = self.data[idx]

        if self.transform:
            sample = self.transform(sample)

        return sample

In [28]:
# Data Loaders and Mapping
train_file = "data/train" 
raw_data, unique_words, unique_tags = load_data_to_dataframe(train_file)
tokenized_data = [([word for word in words], [tag for tag in tags]) for words, tags in raw_data]
train_dataset = CustomDataset(tokenized_data)

dev_file = "data/dev" 
raw_data, unique_words, unique_tags = load_data_to_dataframe(dev_file)
tokenized_data = [([word for word in words], [tag for tag in tags]) for words, tags in raw_data]
dev_dataset = CustomDataset(tokenized_data)

word_index, tag_index = case_sensitive_mappings(raw_data, unique_tags, threshold=1)
train_loader = DataLoader(train_dataset,batch_size=8,collate_fn=lambda batch: pad_sequences(batch, word_index, tag_index),shuffle=True,)
dev_loader = DataLoader(dev_dataset,batch_size=8,collate_fn=lambda batch: pad_sequences(batch, word_index, tag_index),shuffle=True,)



In [29]:
##Same Function from Task1
def validate_with_metrics(model, dev_loader, loss_function, num_tags):
    model.eval()

    epoch_loss = 0
    y_true = []
    y_pred = []

    total_accuracy = 0
    total_amount = 0
    total_loss = 0

    with torch.no_grad():
        for batch in dev_loader:
            word_seqs, upper_seqs, tag_seqs = batch
            word_seqs = word_seqs.to(device)
            upper_seqs = upper_seqs.to(device)
            tag_seqs = tag_seqs.to(device)

            logits = model(word_seqs, upper_seqs)
            logits = logits.view(-1, num_tags)
            tag_seqs = tag_seqs.view(-1)

            loss = loss_function(logits, tag_seqs)
            total_loss += loss.item()

            labels = tag_seqs.cpu().numpy()
            predicted_labels = torch.argmax(logits, dim=1).cpu().numpy()
            y_true.extend(labels)

            _, pred_tags = torch.max(logits, 1)
            y_pred.extend(pred_tags.cpu().numpy())
            # all_tags.extend(labels)

            mask = labels != 0
            correct_predictions = (predicted_labels[mask] == labels[mask]).sum()
            accuracy = correct_predictions / len(labels[mask])
            
            total_accuracy += accuracy
            epoch_loss += loss
            total_amount += 1

    precision, recall, f1_score, support = precision_recall_fscore_support(y_true,y_pred,average='macro',zero_division=0)

    print(f"Validation Loss: {(epoch_loss/total_amount)}, Accuracy: {(total_accuracy/total_amount)*100}%")
    print(f"Precision: {precision * 100:.2f}%, Recall: {recall * 100:.2f}%, F1: {f1_score * 100:.2f}%")
    return (epoch_loss/total_amount), (total_accuracy/total_amount)*100, precision*100, recall*100, f1_score*100

In [30]:
# Predicting Results
def predict_tags(model, input_text, word_index, tag_index):
    model.eval()
    tokenized_input, upper_input = preprocess(input_text, word_index)
    input_tensor = torch.tensor([tokenized_input]).to(device)
    upper_tensor = torch.tensor([upper_input]).to(device)
    
    with torch.no_grad():
        logits = model(input_tensor, upper_tensor)
    
    predicted_indices = torch.argmax(logits, dim=-1).squeeze().cpu().numpy()
    predicted_tags = [tag_index[idx] for idx in predicted_indices][1:-1]

    return predicted_tags

In [31]:
# HYPER-PARAMETERS
vocab_size = len(word_index)
num_tags = len(tag_index)
embedding_dim = 100
hidden_dim = 256
num_layers = 1
dropout = 0.33
linear_output_dim = 128


# Load pre-trained GloVe embeddings from the gzip-compressed file
def load_glove_embeddings(file_path):
    embeddings_index = {}
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.array(values[1:], dtype='float32')
            embeddings_index[word] = vector
    return embeddings_index

glove_file = "glove.6B.100d.gz"
glove_embeddings = load_glove_embeddings(glove_file)

embedding_matrix = np.zeros((vocab_size, 100)) 

for word, idx in word_index.items():
    if word in glove_embeddings:
        embedding_matrix[idx] = glove_embeddings[word]

embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)


In [32]:
# Modifying BiLSTM model to use float32 data type for parameters
class BiLSTM_glove(nn.Module):
    def __init__(self, embedding_matrix, linear_output_dim, hidden_dim, num_layers, dropout):
        super(BiLSTM_glove, self).__init__()
        
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False).to(torch.float32)
        self.upper_embedding = nn.Embedding(2, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim * 2, hidden_dim, num_layers, bidirectional=True, batch_first=True)
        self.linear1 = nn.Linear(hidden_dim * 2, linear_output_dim)
        self.elu = nn.ELU()
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(linear_output_dim, num_tags)

    def forward(self, x, upper_x):
        x = self.embedding(x)
        upper_x = self.upper_embedding(upper_x)
        x = torch.cat([x, upper_x], dim=-1)
        x, _ = self.lstm(x)
        x = self.linear1(x)
        x = self.elu(x)
        x = self.dropout(x)
        logits = self.linear2(x)

        return logits

In [36]:
##Same as Task 1
def train_with_scheduler(model, train_loader, loss_function, optimizer, scheduler, num_epochs, clip_value, device, num_tags):

    
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        total_samples = 0
        
        for batch in train_loader:
            word_seqs, upper_seqs, tag_seqs = batch
            word_seqs, upper_seqs, tag_seqs = word_seqs.to(device), upper_seqs.to(device), tag_seqs.to(device)
            
            optimizer.zero_grad()
            logits = model(word_seqs, upper_seqs)
            logits = logits.view(-1, num_tags)
            tag_seqs = tag_seqs.view(-1)
            
            loss = loss_function(logits, tag_seqs)
            loss.backward()
            #Gradienr Clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)
            optimizer.step()
            
            total_loss += loss.item() * word_seqs.size(0)
            total_samples += word_seqs.size(0)
        
        avg_loss = total_loss / total_samples if total_samples > 0 else 0
        scheduler.step(avg_loss)
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")

def train_and_validate(model, train_loader, dev_loader, loss_function, optimizer, scheduler, num_epochs, clip_value, device, num_tags):
    best_f1_score = -1
    early_stopping_counter = 0
    patience = 5
    writer = SummaryWriter()
    
    for epoch in range(num_epochs):
        print(f"Training Epoch {epoch + 1}/{num_epochs}")
        train_with_scheduler(model, train_loader, loss_function, optimizer, scheduler, 1, clip_value, device, num_tags)
        
        print(f"Validating Epoch {epoch + 1}/{num_epochs}")
        val_loss, val_accuracy, val_precision, val_recall, val_f1_score = validate_with_metrics(model, dev_loader, loss_function, num_tags)
        
        writer.add_scalar("Loss/val", val_loss, epoch)
        writer.add_scalar("F1_score/val", val_f1_score, epoch)
        
        if val_f1_score > best_f1_score:
            best_f1_score = val_f1_score
            final_model=model
            print("updated")
            # early_stopping_counter = 0
            torch.save(model.state_dict(), "Blstm2.pt")
    writer.close()
    return final_model

In [37]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

final_model = None
highest_f1_score = 0

# # Initializing the model with pre-trained embeddings
model = BiLSTM_glove(embedding_matrix, linear_output_dim, hidden_dim, num_layers, dropout)
model.to(device)

num_epochs = 50

loss_function = CrossEntropyLoss(ignore_index=tag_index['<pad>'])
optimizer = optim.SGD(model.parameters(), lr=0.15, momentum=0.9, weight_decay=0.00005)

patience = 5
writer = SummaryWriter()
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=patience, factor=0.5, verbose=True)

best_f1_score = -1
clip_value = 5



In [38]:
final_model=train_and_validate(model, train_loader, dev_loader, loss_function, optimizer, scheduler, num_epochs, clip_value, device, num_tags)

Training Epoch 1/50
Epoch 1/1, Loss: 0.2248
Validating Epoch 1/50
Validation Loss: 0.10901833325624466, Accuracy: 96.8455454708737%
Precision: 74.43%, Recall: 76.85%, F1: 73.88%
updated
Training Epoch 2/50
Epoch 1/1, Loss: 0.0932
Validating Epoch 2/50
Validation Loss: 0.08257260173559189, Accuracy: 97.78758724820814%
Precision: 79.28%, Recall: 80.03%, F1: 78.22%
updated
Training Epoch 3/50
Epoch 1/1, Loss: 0.0744
Validating Epoch 3/50
Validation Loss: 0.0762222558259964, Accuracy: 97.85914558767426%
Precision: 78.29%, Recall: 80.65%, F1: 78.08%
Training Epoch 4/50
Epoch 1/1, Loss: 0.0638
Validating Epoch 4/50
Validation Loss: 0.11592167615890503, Accuracy: 96.76841526260652%
Precision: 76.85%, Recall: 77.81%, F1: 74.99%
Training Epoch 5/50
Epoch 1/1, Loss: 0.0568
Validating Epoch 5/50
Validation Loss: 0.08500494062900543, Accuracy: 97.84277932289086%
Precision: 80.09%, Recall: 79.62%, F1: 78.34%
updated
Training Epoch 6/50
Epoch 1/1, Loss: 0.0505
Validating Epoch 6/50
Validation Loss: 

In [39]:
#Saving Predictions
def save_predictions_dev(model, text_file, output_file, tag_index, word_index):
    with open(text_file, 'r') as input_file, open(output_file, 'w') as output_file:
        indices = []
        words = []
        tags = []
        for line in input_file:
            if not line.strip():
                if len(words) > 0 and len(tags) > 0:
                    idx_to_tag = {idx: tag for tag, idx in tag_index.items()}

                    new_text = " ".join(words)
                    predicted_tags = predict_tags(model, new_text, word_index, idx_to_tag)

                    for i in range(len(indices)):
                        index = indices[i]
                        word = words[i]
                        tag = tags[i]
                        prediction = predicted_tags[i]

                        prediction_line = str(index) + " " + str(word) + " " + str(tag) + " " + str(prediction) + "\n"
                        output_file.write(prediction_line)

                    indices = []
                    words = []
                    tags = []
                    output_file.write("\n")
            else:
                index, word, tag = line.strip().split()
                indices.append(index)
                words.append(word)
                tags.append(tag)

def save_predictions_test(model, textFile, outputFile, tag_index, word_index):
    with open(textFile, 'r') as input_file, open(outputFile, 'w') as output_file:
        indexs = []
        words = []
        for line in input_file:
            if not line.strip():
                if len(words) > 0:
                    tag_index = {idx: tag for tag, idx in tag2idx.items()}

                    new_text = " ".join(words)
                    predicted_tags = predict_tags(model, new_text, word_index, tag_index)

                    for i in range(len(indexs)):
                        index = indexs[i]
                        word = words[i]
                        prediction = predicted_tags[i]

                        predictionLine = str(index) + " " + str(word) + " " + str(prediction) + "\n"
                        output_file.write(predictionLine)
                    
                    indexs = []
                    words = []
                    output_file.write("\n")
            else:
                index, word = line.strip().split()
                indexs.append(index)
                words.append(word)

In [40]:
# CREATING OUTPUT FILES
save_predictions_dev(final_model, "data/dev", "dev2.out", tag_index, word_index)

save_predictions_test(final_model, "data/test", "test2.out", tag_index, word_index)

final_model.eval()

BiLSTM_glove(
  (embedding): Embedding(9007, 100)
  (upper_embedding): Embedding(2, 100)
  (lstm): LSTM(200, 256, batch_first=True, bidirectional=True)
  (linear1): Linear(in_features=512, out_features=128, bias=True)
  (elu): ELU(alpha=1.0)
  (dropout): Dropout(p=0.33, inplace=False)
  (linear2): Linear(in_features=128, out_features=12, bias=True)
)

In [41]:
#!python eval.py -p dev2.out -g data/dev

processed 51577 tokens with 5942 phrases; found: 6294 phrases; correct: 5212.
accuracy:  97.45%; precision:  82.81%; recall:  87.71%; FB1:  85.19
              LOC: precision:  89.31%; recall:  92.32%; FB1:  90.79  1899
             MISC: precision:  78.54%; recall:  79.39%; FB1:  78.96  932
              ORG: precision:  75.86%; recall:  81.80%; FB1:  78.72  1446
              PER: precision:  83.64%; recall:  91.59%; FB1:  87.43  2017


## Bonus: LSTM-CNN model 

In [61]:
# Function to Create Mappings Considering upper Case Letters and Individual  CREATE MAPPINGS Letters[for CNN]
def cnn_vocab_mappings(raw_data, unique_tags, threshold):
    word_freqs = Counter(word.lower() for words, _ in raw_data for word in words)
    filtered_words = [word.lower() for word, count in word_freqs.items() if count >= threshold]
    
    # print(filtered_words)
    word_index = {word: idx + 4 for idx, word in enumerate(filtered_words)}
    word_index['<pad>'] = 0
    word_index['<s>'] = 1
    word_index['</s>'] = 2
    word_index['<unk>'] = 3

    tag_index = {tag: idx + 3 for idx, tag in enumerate(unique_tags)}
    tag_index['<pad>'] = 0
    tag_index['<s>'] = 1
    tag_index['</s>'] = 2

    all_chars = {char for words, _ in raw_data for word in words for char in word}
    char_index = {char: idx + 2 for idx, char in enumerate(all_chars)}
    char_index['<pad>'] = 0
    char_index['<unk>'] = 1

    return word_index, tag_index, char_index

def pad_word_chars(chars, max_word_len, pad_idx):
    return chars + [pad_idx] * (max_word_len - len(chars))

def pad_sequences(batch, word_index, tag_index, char_index, pad_token='<pad>', init='<s>', eos='</s>', unk='<unk>'):
    max_len = max([len(seq) + 2 for seq, _ in batch])
    max_word_len = max([len(word) for words, _ in batch for word in words])

    padded_word_seqs = []
    padded_upper_seqs = []
    padded_char_seqs = []
    padded_tag_seqs = []

    for words, tags in batch:
        lower_words = [word.lower() for word in words]

        padded_words = [init] + lower_words + [eos]
        padded_words = [word_index.get(word, word_index[unk]) for word in padded_words] + [word_index[pad_token]] * (max_len - len(padded_words))
        padded_word_seqs.append(padded_words)

        padded_uppers = [0] + [int(word[0].isupper()) for word in words] + [0] + [0] * (max_len - len(words) - 2)
        padded_upper_seqs.append(padded_uppers)

        padded_tags = [init] + tags + [eos]
        padded_tags = [tag_index[tag] for tag in padded_tags] + [tag_index[pad_token]] * (max_len - len(padded_tags))
        padded_tag_seqs.append(padded_tags)

        padded_chars = [[char_index.get(char, char_index['<unk>']) for char in word] for word in words]
        padded_chars = [pad_word_chars(chars, max_word_len, char_index[pad_token]) for chars in padded_chars]
        padded_chars.insert(0, [char_index[pad_token]] * max_word_len)
        padded_chars.append([char_index[pad_token]] * max_word_len)
        padded_chars += [[char_index[pad_token]] * max_word_len] * (max_len - len(padded_chars))
        padded_char_seqs.append(padded_chars)

    return torch.tensor(padded_word_seqs), torch.tensor(padded_upper_seqs), torch.tensor(padded_char_seqs), torch.tensor(padded_tag_seqs)

def preprocess(text, word_index, char_index, pad_token='<pad>', init='<s>', eos='</s>', unk='<unk>'):
    tokens = text.split()

    lower_tokens = text.lower().split()
    padded_tokens = [init] + lower_tokens + [eos]
    indices = [word_index.get(word, word_index[unk]) for word in padded_tokens]
    
    upper_indices = [0] + [int(token[0].isupper()) for token in tokens] + [0]

    char_indices = [[char_index.get(char, char_index[unk]) for char in word] for word in tokens]
    max_word_len = max([len(word_chars) for word_chars in char_indices]) + 2
    char_indices = [[char_index[pad_token]] * max_word_len] + char_indices + [[char_index[pad_token]] * max_word_len]
    char_indices_padded = [word_chars + [char_index[pad_token]] * (max_word_len - len(word_chars)) for word_chars in char_indices]

    return indices, upper_indices, char_indices_padded

# FUNCTION TO PREDICT RESULTS
def predict_tags(model, input_text, word_index, char_index, tag_index):
    model.eval()
    tokenized_input, upper_input, char_input = preprocess(input_text, word_index, char_index)
    input_tensor = torch.tensor([tokenized_input]).to(device)
    upper_tensor = torch.tensor([upper_input]).to(device)
    char_input_tensor = torch.tensor([char_input]).to(device)
    
    with torch.no_grad():
        logits = model(input_tensor, upper_tensor, char_input_tensor)
    
    predicted_indices = torch.argmax(logits, dim=-1).squeeze().cpu().numpy()
    predicted_tags = [tag_index[idx] for idx in predicted_indices][1:-1]

    return predicted_tags

In [43]:
# Everything same as Task 2 just Adding Char_index for CNN
train_file = "data/train" 
raw_data, unique_words, unique_tags = load_data_to_dataframe(train_file)
tokenized_data = [([word for word in words], [tag for tag in tags]) for words, tags in raw_data]
train_dataset = CustomDataset(tokenized_data)

dev_file = "data/dev" 
raw_data, unique_words, unique_tags = load_data_to_dataframe(dev_file)
tokenized_data = [([word for word in words], [tag for tag in tags]) for words, tags in raw_data]
dev_dataset = CustomDataset(tokenized_data)

word_index, tag_index, char_index = cnn_vocab_mappings(raw_data, unique_tags, threshold=1)
train_loader = DataLoader(
    train_dataset,
    batch_size=8,
    collate_fn=lambda batch: pad_sequences(batch, word_index, tag_index, char_index),
    shuffle=True,
)
dev_loader = DataLoader(
    dev_dataset,
    batch_size=8,
    collate_fn=lambda batch: pad_sequences(batch, word_index, tag_index, char_index),
    shuffle=True,
)

In [44]:
# HYPER-PARAMETERS
vocab_size = len(word_index)
num_tags = len(tag_index)
embedding_dim = 100
hidden_dim = 256
num_layers = 1
dropout = 0.33
linear_output_dim = 128

# HYPER-PARAMETERS
vocab_size = len(word_index)
char_vocab_size = len(char_index)
num_tags = len(tag_index)

char_embedding_dim = 30
embedding_dim = 100
hidden_dim = 256
num_layers = 1
dropout = 0.33
linear_output_dim = 128


In [45]:
##Same as Task 2 along with Char_inputs for CNN
def validate_with_metrics(model, dev_loader, loss_function, num_tags):
    model.eval()

    epoch_loss = 0
    y_true = []
    y_pred = []

    total_accuracy = 0
    total_amount = 0
    total_loss = 0

    with torch.no_grad():
        for batch in dev_loader:
            word_seqs, upper_seqs, char_inputs, tag_seqs = batch
            word_seqs = word_seqs.to(device)
            upper_seqs = upper_seqs.to(device)
            char_inputs = char_inputs.to(device)
            tag_seqs = tag_seqs.to(device)
            # Pass char_inputs to the model
            logits = model(word_seqs, upper_seqs, char_inputs)
            logits = logits.view(-1, num_tags)
            tag_seqs = tag_seqs.view(-1)

            loss = loss_function(logits, tag_seqs)
            total_loss += loss.item()

            labels = tag_seqs.cpu().numpy()
            predicted_labels = torch.argmax(logits, dim=1).cpu().numpy()
            y_true.extend(labels)

            _, pred_tags = torch.max(logits, 1)
            y_pred.extend(pred_tags.cpu().numpy())

            mask = labels != 0
            correct_predictions = (predicted_labels[mask] == labels[mask]).sum()
            accuracy = correct_predictions / len(labels[mask])
            
            total_accuracy += accuracy
            epoch_loss += loss
            total_amount += 1

    precision, recall, f1_score, support = precision_recall_fscore_support(y_true, y_pred, average='macro', zero_division=0)

    print(f"Validation Loss: {(epoch_loss/total_amount)}, Accuracy: {(total_accuracy/total_amount)*100}%")
    print(f"Precision: {precision * 100:.2f}%, Recall: {recall * 100:.2f}%, F1: {f1_score * 100:.2f}%")
    return (epoch_loss/total_amount), (total_accuracy/total_amount)*100, precision*100, recall*100, f1_score*100

In [46]:
class BiLSTM_CNN(nn.Module):
    def __init__(self, embedding_matrix, char_vocab_size, num_tags, char_embedding_dim, embedding_dim, hidden_dim, num_layers, dropout, linear_output_dim):
        super(BiLSTM_CNN, self).__init__()

        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False).to(torch.float32)
        self.upper_embedding = nn.Embedding(2, embedding_dim)
        
        self.char_embedding = nn.Embedding(char_vocab_size, char_embedding_dim)
        self.char_cnn = nn.Conv1d(char_embedding_dim, embedding_dim, kernel_size=3)
        
        self.lstm = nn.LSTM(embedding_dim * 3, hidden_dim, num_layers, bidirectional=True, batch_first=True)
        self.linear1 = nn.Linear(hidden_dim * 2, linear_output_dim)
        self.elu = nn.ELU()
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(linear_output_dim, num_tags)

    def forward(self, x, upper_x, chars):
        x = self.embedding(x)
        upper_x = self.upper_embedding(upper_x)
        
        chars = self.char_embedding(chars)
        batch_size, max_seq_len, max_word_len, _ = chars.shape
        chars = chars.view(batch_size * max_seq_len, max_word_len, -1).permute(0, 2, 1)

        char_features = self.char_cnn(chars)
        char_features = nn.functional.relu(char_features)
        char_features, _ = torch.max(char_features, dim=-1)
        char_features = char_features.view(batch_size, max_seq_len, -1)
        # print(char_features)
        
        x = torch.cat([x, upper_x, char_features], dim=-1)
        x, _ = self.lstm(x)
        x = self.linear1(x)
        x = self.elu(x)
        x = self.dropout(x)
        logits = self.linear2(x)

        return logits
        

In [47]:
# Training and Predictions and saving the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

final_model = None
highest_f1_score = 0

model = BiLSTM_CNN(embedding_matrix, char_vocab_size, num_tags, char_embedding_dim, embedding_dim, hidden_dim, num_layers, dropout, linear_output_dim)
model.to(device)

num_epochs = 50

loss_function = CrossEntropyLoss(ignore_index=tag_index['<pad>'])
optimizer = optim.SGD(model.parameters(), lr=0.25, momentum=0.9, weight_decay=0.00005)

patience = 5
writer = SummaryWriter()
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=patience, factor=0.5, verbose=True)

early_stopping_counter = 0
best_f1_score = -1
clip_value = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    total_samples = 0

    for batch in train_loader:
        inputs, upper_inputs, char_inputs, labels = batch

        optimizer.zero_grad()

        logits = model(inputs, upper_inputs, char_inputs)

        logits = logits.view(-1, logits.shape[-1])
        labels = labels.view(-1)

        loss = loss_function(logits, labels)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)

        optimizer.step()

        total_loss += loss.item() * 16
        total_samples += 16

    avg_train_loss = total_loss / total_samples
    writer.add_scalar("Loss/train", avg_train_loss, epoch)
    
    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}")
    
    val_loss, val_accuracy, val_precision, val_recall, val_f1_score = validate_with_metrics(model, dev_loader, loss_function, num_tags)
torch.save(model.state_dict(), "Blstm_bonus.pt")
writer.close()



Epoch 1/50, Train Loss: 0.2544
Validation Loss: 0.17113442718982697, Accuracy: 94.3049021037741%
Precision: 62.51%, Recall: 62.89%, F1: 58.11%
Epoch 2/50, Train Loss: 0.1289
Validation Loss: 0.10949129611253738, Accuracy: 96.7271199085005%
Precision: 73.49%, Recall: 74.92%, F1: 72.27%
Epoch 3/50, Train Loss: 0.0921
Validation Loss: 0.08842873573303223, Accuracy: 97.42239549995104%
Precision: 76.68%, Recall: 78.81%, F1: 76.27%
Epoch 4/50, Train Loss: 0.0728
Validation Loss: 0.08052228391170502, Accuracy: 97.60897806476953%
Precision: 76.54%, Recall: 80.34%, F1: 77.06%
Epoch 5/50, Train Loss: 0.0664
Validation Loss: 0.08060210198163986, Accuracy: 97.70596031446863%
Precision: 77.12%, Recall: 81.99%, F1: 78.16%
Epoch 6/50, Train Loss: 0.0600
Validation Loss: 0.07155390083789825, Accuracy: 97.932711853478%
Precision: 78.53%, Recall: 80.93%, F1: 78.38%
Epoch 7/50, Train Loss: 0.0548
Validation Loss: 0.07590071856975555, Accuracy: 97.86991462635703%
Precision: 77.72%, Recall: 81.56%, F1: 78.

In [62]:
#FUNCTION TO CREATE OUTPUT FILES
def save_predictions_dev(model, text_file, output_file, tag_to_index, word_to_index, char_to_index):
    with open(text_file, 'r') as input_file, open(output_file, 'w') as output_file:
        indices = []
        words = []
        tags = []
        for line in input_file:
            if not line.strip():
                if len(words) > 0 and len(tags) > 0:
                    idx_to_tag = {idx: tag for tag, idx in tag_to_index.items()}

                    new_text = " ".join(words)
                    predicted_tags = predict_tags(model, new_text, word_to_index, char_to_index, idx_to_tag)

                    for i in range(len(indices)):
                        index = indices[i]
                        word = words[i]
                        tag = tags[i]
                        prediction = predicted_tags[i]

                        prediction_line = str(index) + " " + str(word) + " " + str(tag) + " " + str(prediction) + "\n"
                        output_file.write(prediction_line)

                    indices = []
                    words = []
                    tags = []
                    output_file.write("\n")
            else:
                index, word, tag = line.strip().split()
                indices.append(index)
                words.append(word)
                tags.append(tag)

def save_predictions_test(model, text_file, output_file, tag_to_index, word_to_index, char_to_index):
    with open(text_file, 'r') as input_file, open(output_file, 'w') as output_file:
        indexs = []
        words = []
        for line in input_file:
            if not line.strip():
                if len(words) > 0:
                    idx2tag = {idx: tag for tag, idx in tag_to_index.items()}

                    new_text = " ".join(words)
                    predicted_tags = predict_tags(model, new_text, word_to_index, char_to_index, idx2tag)

                    for i in range(len(indexs)):
                        index = indexs[i]
                        word = words[i]
                        prediction = predicted_tags[i]

                        predictionLine = str(index) + " " + str(word) + " " + str(prediction) + "\n"
                        output_file.write(predictionLine)
                    
                    indexs = []
                    words = []
                    output_file.write("\n")
            else:
                index, word = line.strip().split()
                indexs.append(index)
                words.append(word)

In [1]:
# CREATING OUTPUT FILES
save_predictions_dev(model, "data/dev", "dev_bonus.out", tag_index, word_index,char_to_index)

save_predictions_test(model, "data/test", "test_bonus.out", tag_index, word_index,char_to_index)

model.eval()

In [2]:
#!python eval.py -p dev_bonus.out -g data/dev