## Preprocessing

In [1]:
import json
import re

def preprocess(input_file, output_file):
    # Read the JSON file
    with open(input_file, 'r', encoding='utf-8') as f:
        input_data = json.load(f)
    
    output_data = []
    
    for sentence_data in input_data:
        sentence = sentence_data['sentence'].lower() 
        sentence = re.sub(r'[^a-zA-Z0-9\s]', ' ', sentence)  # Remove punctuation and special characters
        tokens = sentence.split()
        labels = ['O'] * len(tokens)
        aspect_terms = []
        
        # Process aspect terms
        for aspect in sentence_data.get('aspect_terms', []):
            term = aspect['term'].lower()
            term = re.sub(r'[^a-zA-Z0-9\s]', ' ', term)  # Remove punctuation and special characters
            term_tokens = term.split()
            aspect_terms.append(term)
            
            # Assign BIO labels
            start_idx = tokens.index(term_tokens[0])  # Find first token index
            labels[start_idx] = 'B'
            for i in range(1, len(term_tokens)):
                labels[start_idx + i] = 'I'
        
        output_data.append({
            'sentence': sentence,
            'tokens': tokens,
            'labels': labels,
            'aspect_terms': aspect_terms
        })
    
    # Write to output JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=4)


preprocess("/kaggle/input/nlp-a2/train.json", "train_task_1.json")
preprocess("/kaggle/input/nlp-a2/val.json", "val_task_1.json")

## Pre-Trained Word Embeddings

In [2]:
import torch

class WordEmbeddings:
    PAD_IDX = 0
    UNK_IDX = 1
    def __init__(self, file_path, skip_first=False):
        """
        Args:
            file_path (str): Path to the word embedding file (GloVe or FastText).
            skip_first (bool): Whether to skip the first line (needed for FastText).
        """
        self.idx2word, self.embeddings = self.load_embeddings(file_path, skip_first)

        # Add special tokens <PAD> (index = 0) and <UNK> (index = 1)
        self.idx2word = ["<PAD>", "<UNK>"] + self.idx2word

        # Convert embeddings to a tensor
        self.embeddings = torch.as_tensor(self.embeddings, dtype=torch.float32)

        # Generate word2idx mapping
        self.word2idx = {word: idx for idx, word in enumerate(self.idx2word)}

        # Append new embeddings (zeros for PAD, small random for UNK)
        pad_embedding = torch.zeros((1, self.embeddings.shape[1]))  # <PAD> = all zeros
        unk_embedding = torch.rand((1, self.embeddings.shape[1])) * 0.01  # Small random values for <UNK>
        self.embeddings = torch.cat([pad_embedding, unk_embedding, self.embeddings], dim=0)

    def load_embeddings(self, file_path, skip_first):
        """Loads embeddings from a file."""
        idx2word = []
        embeddings = []

        with open(file_path, "r", encoding="utf-8") as f:
            if skip_first:
                next(f)  # Skip first line 
            for line in f:
                values = line.strip().split()
                word = values[0]
                vector = list(map(float, values[1:]))
                idx2word.append(word)
                embeddings.append(vector)

        return idx2word, embeddings

    def get_embedding(self, word):
        """Returns the embedding for a given word or the <UNK> embedding if not found."""
        idx = self.word2idx.get(word, self.word2idx["<UNK>"])
        return self.embeddings[idx]

    def __len__(self):
        """Returns the vocabulary size."""
        return len(self.idx2word)


### GloVe and fastText

In [3]:
glove_path = "/kaggle/input/nlp-a2/glove.6B/glove.6B.300d.txt"
GloVe = WordEmbeddings(glove_path, skip_first=False) 

fasttext_path = "/kaggle/input/nlp-a2/wiki-news-300d-1M.vec/wiki-news-300d-1M.vec"
fastText = WordEmbeddings(fasttext_path, skip_first=True)

## Aspect Term Extraction (ATE) Dataset using BIO encoding

In [4]:
from torch.utils.data import Dataset, DataLoader

class ATE_Dataset(Dataset):
    # BIO Label Mapping
    BIOlabel = {"B": 0, "I": 1, "O": 2}
    PADlabel = -100
    
    def __init__(self, filepath, word_embeddings):
        """
        Args:
            filepath (str): Path to the JSON file containing BIO-tagged data.
            word_embeddings (WordEmbeddings): Preloaded WordEmbeddings object.
        """
        self.word_embeddings = word_embeddings

        # Load and Process JSON
        self.data = []
        with open(filepath, "r", encoding="utf-8") as f:
            samples = json.load(f)

        for sample in samples:
            tokens_idx = [
                self.word_embeddings.word2idx.get(token, self.word_embeddings.word2idx["<UNK>"])
                for token in sample["tokens"]
            ]
            labels_idx = [ATE_Dataset.BIOlabel[label] for label in sample["labels"]]

            self.data.append((torch.tensor(tokens_idx, dtype=torch.long),
                              torch.tensor(labels_idx, dtype=torch.long)))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Returns token indices and label indices for one sample."""
        return self.data[idx]

    @staticmethod
    def collate_fn(batch):
        """
        Custom collate function to pad sequences to the max length in a batch.
        """
        tokens, labels = zip(*batch)  # Unpacking list of tuples
    
        # Get max sequence length
        max_len = max(len(t) for t in tokens)
    
        # Padding
        pad_idx = 0  #  <PAD> has index 0
        tokens_padded = [torch.cat([t, torch.full((max_len - len(t),), WordEmbeddings.PAD_IDX, dtype=torch.long)]) for t in tokens]
        labels_padded = [torch.cat([l, torch.full((max_len - len(l),), ATE_Dataset.PADlabel, dtype=torch.long)]) for l in labels]
    
        return torch.stack(tokens_padded), torch.stack(labels_padded)

## Models
###  RNN

In [5]:
import torch.nn as nn

class ATE_RNN(nn.Module):
    def __init__(self, pretrained_embeddings, hidden_dim, output_dim):
        super(ATE_RNN, self).__init__()
        vocab_size, embed_dim = pretrained_embeddings.shape
        
        # Load pretrained embeddings
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=True, padding_idx=WordEmbeddings.PAD_IDX)
        
        # RNN layer
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        
        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)  # Convert indices to word embeddings
        h, _ = self.rnn(x)   # Pass through RNN
        out = self.fc(h)     # Pass hidden state to Fully connected layer for classification
        return out  # Output shape: (batch, seq_len, num_classes)

### GRU

In [6]:
class ATE_GRU(nn.Module):
    def __init__(self, pretrained_embeddings, hidden_dim, output_dim):
        super(ATE_GRU, self).__init__()
        vocab_size, embed_dim = pretrained_embeddings.shape
        
        # Load pretrained embeddings
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=True, padding_idx=WordEmbeddings.PAD_IDX)
        
        # GRU layer
        self.gru = nn.GRU(embed_dim, hidden_dim, batch_first=True)
        
        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)  # Convert indices to word embeddings
        h, _ = self.gru(x)   # Pass through GRU
        out = self.fc(h)     # Pass hidden state to Fully connected layer for classification
        return out  # Output shape: (batch, seq_len, num_classes)

## Model Training

In [7]:
import torch.nn as nn
import torch.optim as optim
import wandb
from kaggle_secrets import UserSecretsClient

# Login to W&B
user_secrets = UserSecretsClient()
wandb_api = user_secrets.get_secret("WANDB_API_KEY")
wandb.login(key=wandb_api)

# Define Device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train(model, train_loader, val_loader, epochs, learning_rate, model_name="Model"):
    """
    Train a sequence labeling model for BIO tagging and log metrics to wandb.

    Args:
        model (nn.Module): The PyTorch model (GRU, RNN, etc.).
        train_loader (DataLoader): DataLoader for training data.
        val_loader (DataLoader): DataLoader for validation data.
        epochs (int): Number of training epochs.
        learning_rate (float): Learning rate.
        model_name (str): Name of the model for logging.
    """

    # Initialize WandB
    wandb.init(project="NLP_A2_Task1", name=model_name)
    wandb.config.update({
        "epochs": epochs,
        "learning rate": learning_rate,
        "batch_size": train_loader.batch_size,
        "loss_function": "CrossEntropyLoss",
        "optimizer": "Adam"
    })

    model = model.to(device)  # Move model to GPU/CPU
    criterion = nn.CrossEntropyLoss(ignore_index=ATE_Dataset.PADlabel)  # Ignore padding token in loss
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)  # Adam Optimizer

    print(f"------------------------------------- TRAINING {model_name} -------------------------------------")
    for epoch in range(epochs):
        # Train Phase
        model.train()

        for tokens, labels in train_loader:
            tokens, labels = tokens.to(device), labels.to(device)

            optimizer.zero_grad()  # Reset gradients
            outputs = model(tokens)  # Forward pass

            # Reshape outputs & labels for loss calculation
            loss = criterion(outputs.view(-1, outputs.shape[-1]), labels.view(-1))
            loss.backward()  # Backpropagation
            optimizer.step()  # Update weights
            
       # Evaluate on train set after weight updates
        train_loss,_,train_f1 = evaluate_BIO_tagger(model, train_loader)

        # Validation Phase
        val_loss,_,val_f1 = evaluate_BIO_tagger(model, val_loader)

        # Log to WandB (only Loss and F1-score)
        wandb.log({
            "Train Loss": train_loss,
            "Train F1-score": train_f1,
            "Val Loss": val_loss,
            "Val F1-score": val_f1
        })

        print(f"Epoch [{epoch+1}/{epochs}] -> Train Loss: {train_loss:.4f}, Train F1: {train_f1:.4f} | Val Loss: {val_loss:.4f}, Val F1: {val_f1:.4f}")
        
    print(f"--------------------------------------- TRAINING COMPLETED -------------------------------------")
    wandb.finish()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mharshu04[0m ([33mharshu04-indraprastha-institute-of-information-technolog[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## Evaluation

In [8]:
!pip install conlleval --quiet
from conlleval import evaluate 
import pprint

def evaluate_BIO_tagger(model, data_loader, verbose=False):
    """
    Evaluate a trained model on a dataset for BIO tagging.

    Args:
        model (nn.Module): Trained PyTorch model.
        data_loader (DataLoader): DataLoader for the dataset (test/validation).
        verbose (bool): Whether to print the evaluation table.
        
    Returns:
        tuple: A tuple containing (loss, tag-level F1, chunk-level F1)
    """
    model.eval()  # Set to evaluation mode
    total_loss = 0.0
    total_tokens = 0  # Track total number of valid tokens
    criterion = nn.CrossEntropyLoss(ignore_index=ATE_Dataset.PADlabel)  # Ignore padding token in loss

    # Create idx2tag
    idx2tag = {label: tag for tag, label in ATE_Dataset.BIOlabel.items()} 
    
    eval_data = []  # Input format for conlleval : "# gold_tag pred_tag" 

    with torch.no_grad():
        for tokens, labels in data_loader:
            tokens, labels = tokens.to(device), labels.to(device)

            outputs = model(tokens)
            loss = criterion(outputs.view(-1, outputs.shape[-1]), labels.view(-1))

            # Count valid tokens (excluding padding)
            valid_tokens = (labels != ATE_Dataset.PADlabel).sum().item()
            total_loss += loss.item() * valid_tokens  # Scale loss correctly
            total_tokens += valid_tokens  # Only count valid tokens

            preds = torch.argmax(outputs, dim=-1)
            mask = labels != ATE_Dataset.PADlabel  # Ignore padding tokens
            
            for i in range(labels.shape[0]):  # Iterate over batch
                gold_tags = [idx2tag[idx.item()] for idx in labels[i][mask[i]]]
                pred_tags = [idx2tag[idx.item()] for idx in preds[i][mask[i]]]
                for gt, pt in zip(gold_tags, pred_tags):
                    eval_data.append(f"# {gt} {pt}")
    
    # Compute final loss by averaging over all valid (non-padding) tokens
    avg_loss = total_loss / total_tokens if total_tokens > 0 else 0.0 
    
    # Compute evaluation metrics using conlleval
    results = evaluate(eval_data)
    results['loss'] = avg_loss
    tag_f1 = results['overall']['tags']['evals']['f1']
    chunk_f1 = results['overall']['chunks']['evals']['f1']
    
    if verbose:
        pprint.pprint(results)  

    return avg_loss, tag_f1, chunk_f1

In [9]:
import os 

# Create a folder for saving weights if it doesn't exist
os.makedirs("models", exist_ok=True)

wandb.finish()
for EmbeddingObject_name, EmbeddingObject in zip(["GloVe", "fastText"], [GloVe, fastText]): 
    # Load datasets
    train_dataset = ATE_Dataset("train_task_1.json", EmbeddingObject)
    val_dataset = ATE_Dataset("val_task_1.json", EmbeddingObject)
    # Create DataLoaders 
    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, collate_fn=ATE_Dataset.collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False, collate_fn=ATE_Dataset.collate_fn)

    for ModelClass_name, ModelClass in zip(["RNN", "GRU"], [ATE_RNN, ATE_GRU]):
        model_name = f"{ModelClass_name}_{EmbeddingObject_name}"
        # Initialize model
        model = ModelClass(EmbeddingObject.embeddings, hidden_dim=128, output_dim=len(ATE_Dataset.BIOlabel))
        # Train the model
        train(model, train_loader, val_loader, epochs=5, learning_rate=5e-4, model_name=model_name)
        # Save the model
        torch.save(model.state_dict(), f"models/{model_name}.pt")

------------------------------------- TRAINING RNN_GloVe -------------------------------------
Epoch [1/5] -> Train Loss: 0.1689, Train F1: 0.6660 | Val Loss: 0.1554, Val F1: 0.6675
Epoch [2/5] -> Train Loss: 0.1511, Train F1: 0.6867 | Val Loss: 0.1397, Val F1: 0.6852
Epoch [3/5] -> Train Loss: 0.1260, Train F1: 0.7492 | Val Loss: 0.1284, Val F1: 0.7050
Epoch [4/5] -> Train Loss: 0.1216, Train F1: 0.7365 | Val Loss: 0.1275, Val F1: 0.7072
Epoch [5/5] -> Train Loss: 0.1038, Train F1: 0.7747 | Val Loss: 0.1197, Val F1: 0.7181
--------------------------------------- TRAINING COMPLETED -------------------------------------


0,1
Train F1-score,▁▂▆▆█
Train Loss,█▆▃▃▁
Val F1-score,▁▃▆▆█
Val Loss,█▅▃▃▁

0,1
Train F1-score,0.77467
Train Loss,0.10376
Val F1-score,0.71809
Val Loss,0.11974


------------------------------------- TRAINING GRU_GloVe -------------------------------------
Epoch [1/5] -> Train Loss: 0.1483, Train F1: 0.7055 | Val Loss: 0.1367, Val F1: 0.6960
Epoch [2/5] -> Train Loss: 0.1221, Train F1: 0.7497 | Val Loss: 0.1281, Val F1: 0.7221
Epoch [3/5] -> Train Loss: 0.0968, Train F1: 0.7901 | Val Loss: 0.1165, Val F1: 0.7239
Epoch [4/5] -> Train Loss: 0.0763, Train F1: 0.8318 | Val Loss: 0.1142, Val F1: 0.7294
Epoch [5/5] -> Train Loss: 0.0587, Train F1: 0.8732 | Val Loss: 0.1113, Val F1: 0.7380
--------------------------------------- TRAINING COMPLETED -------------------------------------


0,1
Train F1-score,▁▃▅▆█
Train Loss,█▆▄▂▁
Val F1-score,▁▅▆▇█
Val Loss,█▆▂▂▁

0,1
Train F1-score,0.8732
Train Loss,0.05874
Val F1-score,0.73797
Val Loss,0.11128


------------------------------------- TRAINING RNN_fastText -------------------------------------
Epoch [1/5] -> Train Loss: 0.1539, Train F1: 0.6858 | Val Loss: 0.1347, Val F1: 0.6981
Epoch [2/5] -> Train Loss: 0.1371, Train F1: 0.7130 | Val Loss: 0.1219, Val F1: 0.7135
Epoch [3/5] -> Train Loss: 0.1278, Train F1: 0.7329 | Val Loss: 0.1224, Val F1: 0.7225
Epoch [4/5] -> Train Loss: 0.1258, Train F1: 0.7425 | Val Loss: 0.1228, Val F1: 0.7202
Epoch [5/5] -> Train Loss: 0.1117, Train F1: 0.7537 | Val Loss: 0.1198, Val F1: 0.7087
--------------------------------------- TRAINING COMPLETED -------------------------------------


0,1
Train F1-score,▁▄▆▇█
Train Loss,█▅▄▃▁
Val F1-score,▁▅█▇▄
Val Loss,█▂▂▂▁

0,1
Train F1-score,0.75372
Train Loss,0.1117
Val F1-score,0.70872
Val Loss,0.11978


------------------------------------- TRAINING GRU_fastText -------------------------------------
Epoch [1/5] -> Train Loss: 0.1486, Train F1: 0.7061 | Val Loss: 0.1359, Val F1: 0.7036
Epoch [2/5] -> Train Loss: 0.1299, Train F1: 0.7115 | Val Loss: 0.1187, Val F1: 0.7137
Epoch [3/5] -> Train Loss: 0.1177, Train F1: 0.7365 | Val Loss: 0.1123, Val F1: 0.7328
Epoch [4/5] -> Train Loss: 0.1120, Train F1: 0.7453 | Val Loss: 0.1116, Val F1: 0.7149
Epoch [5/5] -> Train Loss: 0.0988, Train F1: 0.7955 | Val Loss: 0.1149, Val F1: 0.7378
--------------------------------------- TRAINING COMPLETED -------------------------------------


0,1
Train F1-score,▁▁▃▄█
Train Loss,█▅▄▃▁
Val F1-score,▁▃▇▃█
Val Loss,█▃▁▁▂

0,1
Train F1-score,0.79551
Train Loss,0.09882
Val F1-score,0.73779
Val Loss,0.1149


## Testing and Model Inference

In [10]:
# preprocess("/kaggle/input/nlp-a2/test.json", "test_task_1.json")

# for EmbeddingObject_name, EmbeddingObject in zip(["GloVe", "fastText"], [GloVe, fastText]): 
#     # Load datasets
#     test_dataset = ATE_Dataset("test_task_1.json", EmbeddingObject)
#     # Create DataLoaders 
#     test_loader = DataLoader(test_dataset, batch_size=256, shuffle=True, collate_fn=ATE_Dataset.collate_fn)

#     for ModelClass_name, ModelClass in zip(["RNN", "GRU"], [ATE_RNN, ATE_GRU]):
#         model_name = f"{ModelClass_name}_{EmbeddingObject_name}"
#         # Initialize model
#         model = ModelClass(EmbeddingObject.embeddings, hidden_dim=128, output_dim=len(ATE_Dataset.BIOlabel))
#         # Load  Weights
#         weight_path = f"models/{model_name}.pt" 
#         model.load_state_dict(torch.load(weight_path, map_location=device, weights_only=True))
#         model.to(device)

#         print(f"-------------------------------- Evaluating {model_name} --------------------------------")
#         evaluate_BIO_tagger(model, test_loader, verbose = True)        