## Preprocessing

In [4]:
import json
import re

def _tokenize(text):
    """Lower-case `text`, replace every non-alphanumeric, non-whitespace character with a space, and split on whitespace."""
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', ' ', text.lower())
    return cleaned.split()


def _find_aspect_index(tokens, aspect_term):
    """Return the position in `tokens` where `aspect_term` starts, or None if it cannot be located."""
    if not aspect_term:
        return None

    # Prefer a match of the whole token sequence so that a multi-word aspect
    # is not anchored on an earlier, unrelated occurrence of its first word.
    span = len(aspect_term)
    for start in range(len(tokens) - span + 1):
        if tokens[start:start + span] == aspect_term:
            return start

    # Fall back to the first token alone when the full sequence is not
    # contiguous in the sentence.
    try:
        return tokens.index(aspect_term[0])
    except ValueError:
        return None


def preprocess(input_file, output_file):
    """
    Flatten an aspect-annotated JSON file into one record per aspect term.

    Sentences and aspect terms are normalised identically: lower-cased, every
    character other than ASCII letters, digits and whitespace replaced by a
    space, then split on whitespace.

    Args:
        input_file (str): Path to a JSON list whose items carry a 'sentence'
            string and, optionally, an 'aspect_terms' list of objects with
            'term' and 'polarity' keys.
        output_file (str): Path of the JSON file to write. Each record holds
            'tokens', 'polarity', 'aspect_term' and 'index' (start position of
            the aspect term inside 'tokens').

    Aspect terms that are empty after normalisation, or that cannot be found
    in their sentence, are skipped and reported in a single summary line
    instead of aborting the whole run.
    """
    # Read the JSON file
    with open(input_file, 'r', encoding='utf-8') as f:
        input_data = json.load(f)

    output_data = []
    skipped = 0

    for sentence_data in input_data:
        tokens = _tokenize(sentence_data['sentence'])

        # Process aspect terms
        for aspect in sentence_data.get('aspect_terms', []):
            polarity = aspect['polarity'].lower()
            aspect_term = _tokenize(aspect['term'])

            index = _find_aspect_index(tokens, aspect_term)
            if index is None:
                skipped += 1
                continue

            output_data.append({
                'tokens': tokens,
                'polarity': polarity,
                'aspect_term': aspect_term,
                'index': index,
            })

    if skipped:
        print(f"preprocess: skipped {skipped} aspect term(s) that could not be located in {input_file}")

    # Write to output JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=4)


# Build the flattened task-2 files for the training and validation splits.
for src_path, dst_path in (
    ("/kaggle/input/nlp-a2/train.json", "train_task_2.json"),
    ("/kaggle/input/nlp-a2/val.json", "val_task_2.json"),
):
    preprocess(src_path, dst_path)

## Pre-Trained Word Embeddings

In [5]:
import torch

class WordEmbeddings:
    """
    Vocabulary plus embedding matrix loaded from a whitespace-separated text file.

    Two special rows are prepended to whatever the file provides:
    index 0 is "<PAD>" (all zeros) and index 1 is "<UNK>" (small random values).
    """
    PAD_IDX = 0  # index of the "<PAD>" token / all-zero row
    UNK_IDX = 1  # index of the "<UNK>" token used for out-of-vocabulary words

    def __init__(self, file_path, skip_first=False):
        """
        Args:
            file_path (str): Path to the word embedding file (GloVe or FastText).
            skip_first (bool): Whether to skip the first line (needed for FastText).
        """
        file_words, file_vectors = self.load_embeddings(file_path, skip_first)

        # Add special tokens <PAD> (index = 0) and <UNK> (index = 1)
        self.idx2word = ["<PAD>", "<UNK>"] + file_words

        # Convert embeddings to a tensor
        file_vectors = torch.as_tensor(file_vectors, dtype=torch.float32)

        # Generate word2idx mapping
        self.word2idx = {word: idx for idx, word in enumerate(self.idx2word)}
        # If the file itself contains "<PAD>" or "<UNK>", the comprehension
        # above would point them at the later row; pin them to the reserved
        # indices so PAD_IDX / UNK_IDX always agree with word2idx.
        self.word2idx["<PAD>"] = self.PAD_IDX
        self.word2idx["<UNK>"] = self.UNK_IDX

        # Prepend new embeddings (zeros for PAD, small random for UNK)
        dim = file_vectors.shape[1]
        pad_embedding = torch.zeros((1, dim))  # <PAD> = all zeros
        unk_embedding = torch.rand((1, dim)) * 0.01  # Small random values for <UNK>
        self.embeddings = torch.cat([pad_embedding, unk_embedding, file_vectors], dim=0)

    def load_embeddings(self, file_path, skip_first):
        """
        Loads embeddings from a file.

        Each non-blank line is split on whitespace; the first field is the
        word and the remaining fields are parsed as floats.

        Returns:
            tuple: (list of words, list of float vectors) in file order.
        """
        idx2word = []
        embeddings = []

        with open(file_path, "r", encoding="utf-8") as f:
            if skip_first:
                next(f)  # Skip first line
            for line in f:
                values = line.strip().split()
                if not values:
                    # Blank lines (e.g. a trailing newline) carry no entry.
                    continue
                idx2word.append(values[0])
                embeddings.append(list(map(float, values[1:])))

        return idx2word, embeddings

    def get_embedding(self, word):
        """Returns the embedding for a given word or the <UNK> embedding if not found."""
        idx = self.word2idx.get(word, self.UNK_IDX)
        return self.embeddings[idx]

    def __len__(self):
        """Returns the vocabulary size."""
        return len(self.idx2word)


### GloVe and fastText

In [6]:
# GloVe text files have no header line, so nothing is skipped on load.
glove_path = "/kaggle/input/nlp-a2/glove.6B/glove.6B.300d.txt"
GloVe = WordEmbeddings(file_path=glove_path, skip_first=False)

# fasttext_path = "/kaggle/input/nlp-a2/wiki-news-300d-1M.vec/wiki-news-300d-1M.vec"
# fastText = WordEmbeddings(fasttext_path, skip_first=True)

## Aspect Sentiment Classification (ASC) Dataset 

In [7]:
import json
import torch
from torch.utils.data import Dataset, DataLoader

class ASC_Dataset(Dataset):
    """Dataset of (sentence token ids, aspect token ids, sentiment label) triples."""

    # Maps the polarity string found in the JSON to its class id
    sentiment_labels = {"negative": 0, "neutral": 1, "positive": 2, "conflict": 3}

    def __init__(self, filepath, word_embeddings):
        """
        Args:
            filepath (str): JSON file with records carrying "tokens",
                "aspect_term" and "polarity".
            word_embeddings (WordEmbeddings): Object exposing a `word2idx` dict
                that contains an "<UNK>" entry.
        """
        self.word_embeddings = word_embeddings
        self.data = []

        def to_ids(words):
            # Out-of-vocabulary words fall back to the "<UNK>" index.
            vocab = self.word_embeddings.word2idx
            return torch.tensor(
                [vocab.get(word, vocab["<UNK>"]) for word in words],
                dtype=torch.long,
            )

        with open(filepath, "r", encoding="utf-8") as f:
            records = json.load(f)

        for record in records:
            sentence_ids = to_ids(record["tokens"])
            aspect_ids = to_ids(record["aspect_term"])
            label = torch.tensor(
                ASC_Dataset.sentiment_labels[record["polarity"]],
                dtype=torch.long,
            )
            self.data.append((sentence_ids, aspect_ids, label))

    def __len__(self):
        """Number of (sentence, aspect) samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the stored (token ids, aspect ids, label) triple at `idx`."""
        return self.data[idx]

    @staticmethod
    def collate_fn(batch):
        """
        Right-pad sentences and aspects with PAD_IDX to the longest item of the
        batch and stack everything into tensors.
        """
        tokens, aspects, labels = zip(*batch)

        def pad_stack(sequences):
            # Start from a matrix full of PAD and copy each sequence into its row.
            width = max(len(seq) for seq in sequences)
            padded = torch.full((len(sequences), width), WordEmbeddings.PAD_IDX, dtype=torch.long)
            for row, seq in enumerate(sequences):
                padded[row, :len(seq)] = seq
            return padded

        tokens_padded = pad_stack(tokens)
        aspects_padded = pad_stack(aspects)

        return tokens_padded, aspects_padded, torch.tensor(labels)

## Model

In [82]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ATAE_LSTM(nn.Module):
    """
    Bidirectional LSTM with aspect-conditioned attention.

    The mean aspect embedding is concatenated to every word embedding before
    the LSTM and again to every LSTM output before attention scoring. The
    attention-weighted sum of LSTM outputs is classified by a two-layer head.
    """

    def __init__(self, pretrained_embeddings, hidden_dim, output_dim, padding_idx=None):
        """
        Args:
            pretrained_embeddings (Tensor): (vocab_size, embedding_dim) matrix;
                loaded frozen.
            hidden_dim (int): LSTM hidden size per direction.
            output_dim (int): Number of output classes.
            padding_idx (int, optional): Token id treated as padding in both
                sentence and aspect. Defaults to WordEmbeddings.PAD_IDX.
        """
        super(ATAE_LSTM, self).__init__()

        if padding_idx is None:
            padding_idx = WordEmbeddings.PAD_IDX
        self.padding_idx = padding_idx

        # Load pretrained embeddings
        vocab_size, embedding_dim = pretrained_embeddings.shape
        aspect_dim = embedding_dim
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=True, padding_idx=padding_idx)

        # LSTM Layer
        self.lstm = nn.LSTM(input_size=embedding_dim + aspect_dim,
                            hidden_size=hidden_dim,
                            batch_first=True,
                            bidirectional=True)

        # Attention Mechanism
        self.attention_M = nn.Linear(hidden_dim * 2 + aspect_dim, hidden_dim * 2 + aspect_dim)
        self.attention_alpha = nn.Linear(hidden_dim * 2 + aspect_dim, 1)

        # Fully Connected Layers
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)  # Final output layer

    def forward(self, sentence, aspect):
        """
        Args:
            sentence (LongTensor): (batch, seq_len) token ids, padded with `padding_idx`.
            aspect (LongTensor): (batch, aspect_len) token ids, padded with `padding_idx`.

        Returns:
            tuple: (logits of shape (batch, output_dim),
                    attention weights of shape (batch, seq_len)).
        """
        # Embed sentence and aspect
        sentence_emb = self.embedding(sentence)  # (batch, seq_len, embedding_dim)
        aspect_emb = self.embedding(aspect)  # (batch, aspect_len, embedding_dim)

        # Aspect mask (1 for real tokens, 0 for padding)
        aspect_mask = (aspect != self.padding_idx).float()  # (batch, aspect_len)

        # Mean of aspect embeddings, ignoring padding. The count is clamped to
        # at least 1 so an aspect made only of padding gives a zero vector
        # instead of 0/0 = NaN, which would poison the whole batch's loss.
        aspect_len = aspect_mask.sum(dim=1, keepdim=True).clamp(min=1.0)  # (batch, 1)
        aspect_emb = (aspect_emb * aspect_mask.unsqueeze(-1)).sum(dim=1) / aspect_len  # (batch, embedding_dim)

        # Expand aspect embedding across sentence length
        aspect_expanded = aspect_emb.unsqueeze(1).expand(-1, sentence_emb.size(1), -1)  # (batch, seq_len, aspect_dim)

        # Concatenate sentence embedding and aspect embedding
        lstm_input = torch.cat((sentence_emb, aspect_expanded), dim=2)  # (batch, seq_len, embedding_dim + aspect_dim)

        # LSTM Forward Pass
        lstm_out, _ = self.lstm(lstm_input)  # (batch, seq_len, hidden_dim * 2)

        # Compute Attention Scores
        M_input = torch.cat((lstm_out, aspect_expanded), dim=2)  # (batch, seq_len, hidden_dim * 2 + aspect_dim)
        M = torch.tanh(self.attention_M(M_input))  # (batch, seq_len, hidden_dim * 2 + aspect_dim)
        attention_scores = self.attention_alpha(M).squeeze(-1)  # (batch, seq_len)

        # Padding positions get a very negative score so softmax assigns them ~0 weight
        padding_positions = sentence == self.padding_idx  # (batch, seq_len)
        attention_scores = attention_scores.masked_fill(padding_positions, -1e9)
        attention_weights = F.softmax(attention_scores, dim=1)  # (batch, seq_len)

        # Context vector (weighted sum of LSTM outputs)
        context = torch.sum(lstm_out * attention_weights.unsqueeze(-1), dim=1)  # (batch, hidden_dim * 2)

        # fc1 + relu, then final layer
        hidden = F.relu(self.fc1(context))  # (batch, hidden_dim)
        output = self.fc2(hidden)  # (batch, output_dim)

        return output, attention_weights

## Model Training

In [85]:
import torch
import torch.optim as optim
import wandb
from kaggle_secrets import UserSecretsClient

# Authenticate with Weights & Biases using the key kept in Kaggle secrets
user_secrets = UserSecretsClient()
wandb_api = user_secrets.get_secret("WANDB_API_KEY")
wandb.login(key=wandb_api)

# Run on CUDA when it is available, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def train(model, train_loader, val_loader, epochs, learning_rate, model_name="Model"):
    """
    Train ATAE-LSTM model for aspect-based sentiment classification.

    After every epoch the model is evaluated on both loaders with
    `evaluate_ASC`, and the four metrics are logged to W&B and printed.

    Args:
        model (nn.Module): The ATAE-LSTM model.
        train_loader (DataLoader): DataLoader for training data.
        val_loader (DataLoader): DataLoader for validation data.
        epochs (int): Number of training epochs.
        learning_rate (float): Learning rate.
        model_name (str): Name of the model for logging.

    The W&B run opened here is always closed: normally on success, and with a
    non-zero exit code if training raises (the exception is re-raised).
    """

    # Initialize WandB
    wandb.init(project="NLP_A2_Task2", name=model_name)
    try:
        wandb.config.update({
            "epochs": epochs,
            "learning rate": learning_rate,
            "batch_size": train_loader.batch_size,
            "loss_function": "CrossEntropyLoss",
            "optimizer": "Adam"
        })

        model = model.to(device)  # Move model to GPU/CPU
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        print(f"---------------- TRAINING {model_name} ----------------")
        for epoch in range(epochs):
            # Train Phase (evaluate_ASC leaves the model in eval mode, so
            # training mode has to be re-enabled every epoch)
            model.train()

            for sentence, aspect, labels in train_loader:
                sentence, aspect, labels = sentence.to(device), aspect.to(device), labels.to(device)

                optimizer.zero_grad()  # Reset gradients
                outputs, _ = model(sentence, aspect)  # Forward pass

                loss = criterion(outputs, labels)  # Compute loss
                loss.backward()  # Backpropagation
                optimizer.step()  # Update weights

            # Evaluate on train set
            train_loss, train_acc = evaluate_ASC(model, train_loader)

            # Evaluate on Validation set
            val_loss, val_acc = evaluate_ASC(model, val_loader)

            # Log to WandB (Loss and Accuracy)
            wandb.log({
                "Train Loss": train_loss,
                "Train Accuracy": train_acc,
                "Val Loss": val_loss,
                "Val Accuracy": val_acc
            })

            print(f"Epoch [{epoch+1}/{epochs}] -> Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} | Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

        print("---------------- TRAINING COMPLETED ----------------")
    except BaseException:
        # Close the run as failed (also on KeyboardInterrupt) so it does not
        # stay open in the kernel and leak into the next wandb.init().
        wandb.finish(exit_code=1)
        raise
    else:
        wandb.finish()

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## Evaluation

In [10]:
def evaluate_ASC(model, data_loader, verbose=False):
    """
    Evaluate a trained model on a dataset for Aspect-Based Sentiment Classification (ASC).

    Batches are moved to the device the model's parameters live on, so the
    function does not depend on a module-level `device` variable.

    Args:
        model (nn.Module): Trained PyTorch model (e.g., ATAE-LSTM) whose forward
            takes (sentence, aspect) and returns (logits, extra).
        data_loader (iterable): Yields (sentence, aspect, labels) tensor batches.
        verbose (bool): If True, prints accuracy details.

    Returns:
         average_loss (float): Sample-weighted mean cross-entropy loss.
         average_accuracy (float): Fraction of correctly classified samples.

    Raises:
        ValueError: If `data_loader` yields no samples.
    """
    model.eval()  # Set model to evaluation mode
    criterion = nn.CrossEntropyLoss()  # CrossEntropyLoss for multi-class classification

    # Follow the model's own device; a parameter-free model leaves batches where they are.
    first_param = next(model.parameters(), None)
    eval_device = first_param.device if first_param is not None else None

    total_loss, total_correct, total_samples = 0.0, 0, 0

    with torch.no_grad():  # Disable gradient computation
        for sentence, aspect, labels in data_loader:
            if eval_device is not None:
                sentence = sentence.to(eval_device)
                aspect = aspect.to(eval_device)
                labels = labels.to(eval_device)

            outputs, _ = model(sentence, aspect)  # Forward pass
            loss = criterion(outputs, labels)  # Mean loss over this batch

            # Convert logits to predicted class (argmax) and count matches
            predicted_labels = torch.argmax(outputs, dim=1)
            total_correct += (predicted_labels == labels).sum().item()

            # Re-weight the batch mean by batch size so the final average is per sample
            batch_size = labels.size(0)
            total_loss += loss.item() * batch_size
            total_samples += batch_size

    if total_samples == 0:
        raise ValueError("evaluate_ASC received no samples: data_loader is empty")

    accuracy = total_correct / total_samples

    # Print details if verbose
    if verbose:
        print(f"Total Samples: {total_samples}")
        print(f"Correct Predictions: {total_correct}")
        print(f"Validation Accuracy: {accuracy:.4f}")

    average_loss = total_loss / total_samples

    return average_loss, accuracy

In [106]:
import os 

# Make sure the checkpoint directory is there before training starts
os.makedirs("models", exist_ok=True)

# Datasets built from the preprocessed task-2 files
train_dataset, val_dataset = (
    ASC_Dataset(split_file, GloVe)
    for split_file in ("train_task_2.json", "val_task_2.json")
)

# Shuffled mini-batches for training, larger fixed-order batches for validation
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=ASC_Dataset.collate_fn,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=256,
    shuffle=False,
    collate_fn=ASC_Dataset.collate_fn,
)

# One output unit per sentiment label
model = ATAE_LSTM(
    GloVe.embeddings,
    hidden_dim=128,
    output_dim=len(ASC_Dataset.sentiment_labels),
)

# Fit the model, then persist its weights
train(
    model,
    train_loader,
    val_loader,
    epochs=25,
    learning_rate=0.00003,
    model_name="ATAE_LSTM",
)
torch.save(model.state_dict(), "models/ATAE_LSTM.pt")

---------------- TRAINING ATAE_LSTM ----------------
Epoch [1/25] -> Train Loss: 1.2587, Train Acc: 0.5863 | Val Loss: 1.2745, Val Acc: 0.5633
Epoch [2/25] -> Train Loss: 1.0876, Train Acc: 0.5870 | Val Loss: 1.1285, Val Acc: 0.5633
Epoch [3/25] -> Train Loss: 1.0339, Train Acc: 0.5870 | Val Loss: 1.0915, Val Acc: 0.5633
Epoch [4/25] -> Train Loss: 1.0049, Train Acc: 0.5870 | Val Loss: 1.0731, Val Acc: 0.5633
Epoch [5/25] -> Train Loss: 0.9807, Train Acc: 0.5876 | Val Loss: 1.0587, Val Acc: 0.5660
Epoch [6/25] -> Train Loss: 0.9499, Train Acc: 0.5910 | Val Loss: 1.0406, Val Acc: 0.5660
Epoch [7/25] -> Train Loss: 0.9122, Train Acc: 0.6086 | Val Loss: 1.0224, Val Acc: 0.5930
Epoch [8/25] -> Train Loss: 0.8814, Train Acc: 0.6201 | Val Loss: 1.0137, Val Acc: 0.5957
Epoch [9/25] -> Train Loss: 0.8421, Train Acc: 0.6424 | Val Loss: 1.0023, Val Acc: 0.6253
Epoch [10/25] -> Train Loss: 0.8128, Train Acc: 0.6677 | Val Loss: 0.9923, Val Acc: 0.6038
Epoch [11/25] -> Train Loss: 0.7896, Train Acc

0,1
Train Accuracy,▁▁▁▁▁▁▂▂▃▄▄▄▅▅▅▅▆▆▆▆▇▇███
Train Loss,█▆▆▆▅▅▅▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
Val Accuracy,▁▁▁▁▁▁▃▃▅▄▄▄▄▅▅▅▆▅▆▆▇▇▇██
Val Loss,█▅▄▄▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁

0,1
Train Accuracy,0.81155
Train Loss,0.52421
Val Accuracy,0.66577
Val Loss,0.96356


## Testing and Model Inference

In [12]:
# # preprocess test.json
# preprocess("/kaggle/input/nlp-a2/test.json", "test_task_2.json")

# # Load test dataset
# test_dataset = ASC_Dataset("test_task_2.json", GloVe)

# # Create DataLoader
# test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False, collate_fn=ASC_Dataset.collate_fn)

# # Initialize model
# model = ATAE_LSTM(GloVe.embeddings, hidden_dim=128, output_dim=len(ASC_Dataset.sentiment_labels))

# # Load  Weights 
# model.load_state_dict(torch.load("models/ATAE_LSTM.pt", map_location=device, weights_only=True))
# model.to(device)

# print("-------------------------------- Evaluating ATAE_LSTM --------------------------------")
# evaluate_ASC(model, test_loader, verbose = True)        