<a href="https://colab.research.google.com/github/hendrikyong/CVNL_Assignment_1/blob/main/RNN_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split

import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from collections import Counter
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Download necessary NLTK data
nltk.download("punkt")
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [2]:
# Define dataset paths from Hugging Face
splits = {
    'train': 'hf://datasets/dair-ai/emotion/split/train-00000-of-00001.parquet',
    'validation': 'hf://datasets/dair-ai/emotion/split/validation-00000-of-00001.parquet',
    'test': 'hf://datasets/dair-ai/emotion/split/test-00000-of-00001.parquet'
}

# Load datasets using pandas
train_df = pd.read_parquet(splits["train"])
val_df = pd.read_parquet(splits["validation"])
test_df = pd.read_parquet(splits["test"])

# Display dataset samples
print("Train Data Sample:\n", train_df.head())
print("Validation Data Sample:\n", val_df.head())
print("Test Data Sample:\n", test_df.head())

# Extract text and labels
train_texts, train_labels = train_df["text"].tolist(), train_df["label"].tolist()
val_texts, val_labels = val_df["text"].tolist(), val_df["label"].tolist()
test_texts, test_labels = test_df["text"].tolist(), test_df["label"].tolist()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Train Data Sample:
                                                 text  label
0                            i didnt feel humiliated      0
1  i can go from feeling so hopeless to so damned...      0
2   im grabbing a minute to post i feel greedy wrong      3
3  i am ever feeling nostalgic about the fireplac...      2
4                               i am feeling grouchy      3
Validation Data Sample:
                                                 text  label
0  im feeling quite sad and sorry for myself but ...      0
1  i feel like i am still looking at a blank canv...      0
2                     i feel like a faithful servant      2
3                  i am just feeling cranky and blue      3
4  i can have for a treat or if i am feeling festive      1
Test Data Sample:
                                                 text  label
0  im feeling rather rotten so im not very ambiti...      0
1          im updating my blog because i feel shitty      0
2  i never make her separate from me

In [3]:
# Tokenize all words from training set
all_words = [word for sentence in train_texts for word in word_tokenize(sentence.lower())]

# Create vocabulary mapping
word_to_idx = {word: idx + 2 for idx, word in enumerate(Counter(all_words))}
word_to_idx["<PAD>"] = 0
word_to_idx["<UNK>"] = 1  # For unknown words

# Define max sequence length
MAX_LENGTH = 30

# Convert text to sequence of tokens
def encode_text(text, word_to_idx, max_len=MAX_LENGTH):
    tokens = [word_to_idx.get(word, word_to_idx["<UNK>"]) for word in word_tokenize(text.lower())]
    return tokens[:max_len] + [word_to_idx["<PAD>"]] * (max_len - len(tokens))

# Encode datasets
train_sequences = [encode_text(text, word_to_idx) for text in train_texts]
val_sequences = [encode_text(text, word_to_idx) for text in val_texts]
test_sequences = [encode_text(text, word_to_idx) for text in test_texts]

# Convert labels to tensor
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)
test_labels = torch.tensor(test_labels)

print("Sample encoded text:", train_sequences[0])

Sample encoded text: [2, 3, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [4]:
class EmotionDataset(Dataset):
    def __init__(self, text_sequences, labels):
        self.text_sequences = text_sequences
        self.labels = labels

    def __len__(self):
        return len(self.text_sequences)

    def __getitem__(self, idx):
        return torch.tensor(self.text_sequences[idx]), torch.tensor(self.labels[idx])

# Create dataset instances
train_dataset = EmotionDataset(train_sequences, train_labels)
val_dataset = EmotionDataset(val_sequences, val_labels)
test_dataset = EmotionDataset(test_sequences, test_labels)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

print(f"Dataset Sizes - Train: {len(train_dataset)}, Val: {len(val_dataset)}, Test: {len(test_dataset)}")

Dataset Sizes - Train: 16000, Val: 2000, Test: 2000


In [5]:
class EmotionLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, num_layers=2, dropout=0.3):
        super(EmotionLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        x_embedded = self.embedding(x)
        lstm_out, _ = self.lstm(x_embedded)
        last_hidden = lstm_out[:, -1, :]
        return self.fc(last_hidden)

# Initialize model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = EmotionLSTM(len(word_to_idx), embed_dim=128, hidden_dim=32, num_classes=6).to(device)

print(model)

EmotionLSTM(
  (embedding): Embedding(15212, 128, padding_idx=0)
  (lstm): LSTM(128, 32, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  (fc): Linear(in_features=64, out_features=6, bias=True)
)


In [None]:
# Define optimizer, loss function
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=5e-3)
criterion = nn.CrossEntropyLoss()

# Training function with validation and test accuracy evaluated only after all epochs
def train_model_with_test_eval(model, train_loader, val_loader, test_loader, optimizer, criterion, device, epochs=15):
    train_losses = []   # List to track training losses
    train_accuracies = []   # List to track training accuracies

    for epoch in range(epochs):
        model.train()
        train_loss, train_correct, train_total = 0, 0, 0

        with tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}") as pbar:
            for texts, labels in pbar:
                texts, labels = texts.to(device), labels.to(device)
                optimizer.zero_grad()

                # Forward pass
                outputs = model(texts)
                loss = criterion(outputs, labels)

                # Backward pass and optimization
                loss.backward()
                optimizer.step()

                # Track loss and accuracy
                train_loss += loss.item()
                train_correct += (torch.argmax(outputs, dim=1) == labels).sum().item()
                train_total += labels.size(0)

                # Update progress bar
                pbar.set_postfix(loss=f"{loss.item():.4f}")

        # Calculate training metrics
        train_loss /= len(train_loader)
        train_accuracy = (train_correct / train_total) * 100

        # Append training metrics for plotting later
        train_losses.append(train_loss)
        train_accuracies.append(train_accuracy)
        print(f"Epoch {epoch+1}:\n  Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%")

    # Return training metrics for visualization
    return train_losses, train_accuracies

    # Final Validation Step
    model.eval()
    val_loss, val_correct, val_total = 0, 0, 0

    with torch.no_grad():
        for texts, labels in val_loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            loss = criterion(outputs, labels)

            val_loss += loss.item()
            val_correct += (torch.argmax(outputs, dim=1) == labels).sum().item()
            val_total += labels.size(0)

    # Calculate validation metrics
    val_loss /= len(val_loader)
    val_accuracy = (val_correct / val_total) * 100
    print("\nFinal Validation Results:")
    print(f"  Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%")

    # Final Test Evaluation
    test_loss, test_correct, test_total = 0, 0, 0

    with torch.no_grad():
        for texts, labels in test_loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            loss = criterion(outputs, labels)

            test_loss += loss.item()
            test_correct += (torch.argmax(outputs, dim=1) == labels).sum().item()
            test_total += labels.size(0)

    # Calculate test metrics
    test_loss /= len(test_loader)
    test_accuracy = (test_correct / test_total) * 100
    print("\nFinal Test Results:")
    print(f"  Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")

# Call the updated training function
train_losses, train_accuracies = train_model_with_test_eval(model, train_loader, val_loader, test_loader, optimizer, criterion, device, epochs=15)

  return torch.tensor(self.text_sequences[idx]), torch.tensor(self.labels[idx])
Epoch 1/15:  75%|███████▍  | 374/500 [00:27<00:07, 15.81it/s, loss=1.4079]

In [None]:
# Evaluation function to calculate test loss, accuracy, and predictions
def evaluate_model(model, test_loader, device):
    model.eval()
    y_true, y_pred = [], []
    test_loss = 0.0
    correct, total = 0, 0

    with torch.no_grad():
        for texts, labels in test_loader:
            texts, labels = texts.to(device), labels.to(device)

            outputs = model(texts)
            loss = criterion(outputs, labels)

            test_loss += loss.item()
            _, predictions = torch.max(outputs, 1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predictions.cpu().numpy())
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    avg_test_loss = test_loss / len(test_loader)
    accuracy = correct / total * 100
    return avg_test_loss, accuracy, y_true, y_pred

In [None]:
# Visualize Train Loss vs Train Accuracy
epochs = range(1, len(train_losses) + 1)

fig, ax1 = plt.subplots(figsize=(12, 5))

# Plot Train Loss on the first axis
ax1.set_xlabel('Training Epochs')
ax1.set_ylabel('Train Loss', color='tab:blue')
ax1.plot(epochs, train_losses, label='Train Loss', color='tab:blue', marker='o')
ax1.tick_params(axis='y', labelcolor='tab:blue')

# Create a second y-axis for accuracy
ax2 = ax1.twinx()
ax2.set_ylabel('Train Accuracy (%)', color='tab:green')
ax2.plot(epochs, train_accuracies, label='Train Accuracy', color='tab:green', marker='o')
ax2.tick_params(axis='y', labelcolor='tab:green')

# Title and grid
plt.title('Train Loss vs Train Accuracy')
ax1.grid(True)

# Combine legends from both axes
fig.legend(loc='upper center', bbox_to_anchor=(0.5, 1.05), ncol=2)

plt.show()

In [None]:
# Evaluate the model on the test dataset
test_loss, test_accuracy, y_true, y_pred = evaluate_model(model, test_loader, device)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")

# Generate classification report
print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=[str(i) for i in range(6)]))  # Replace with class names if available

# Generate confusion matrix
confusion_mat = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(confusion_mat, annot=True, fmt="d", cmap="Blues", xticklabels=[str(i) for i in range(6)], yticklabels=[str(i) for i in range(6)])
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.show()