<a href="https://colab.research.google.com/github/jaouni24/Character-Recognition-and-Subjectivity-Detection/blob/main/AI_Lab_Case_Study_2_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task 2: Subjectivity Recognition


## LSTM

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
import nltk

# Fetch the NLTK resources this notebook downloads up front
for nltk_resource in ('punkt_tab', 'punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Dataset class
class TextDataset(Dataset):
    """Sentence dataset read from a TSV file and embedded with Word2Vec.

    The file is read with a tab separator and must contain a ``sentence``
    and a ``label`` column. Each item is a ``(max_seq_len, vector_size)``
    float32 tensor of token vectors (zero rows for out-of-vocabulary tokens
    and for end padding) together with the encoded label.
    """

    def __init__(self, file_path, word2vec_model, max_seq_len, label_encoder):
        """Load the file, tokenize every sentence and encode the labels.

        Args:
            file_path: Path of the tab-separated input file.
            word2vec_model: Object exposing ``wv`` (token -> vector lookup
                supporting ``in``) and ``vector_size``.
            max_seq_len: Number of timesteps every item is padded or
                truncated to.
            label_encoder: Already fitted encoder; its ``transform`` is
                applied to the raw ``label`` column.
        """
        self.data = pd.read_csv(file_path, sep='\t')
        self.sentences = self.data['sentence'].apply(self.preprocess).tolist()
        self.labels = label_encoder.transform(self.data['label'])  # Encode labels
        self.word2vec = word2vec_model
        self.max_seq_len = max_seq_len

    def preprocess(self, sentence):
        """Return the lowercased alphabetic tokens of ``sentence``.

        Missing values (``None`` / NaN, which pandas produces for empty
        cells) yield an empty token list instead of raising; any other
        non-string value is converted with ``str`` first.
        """
        if not isinstance(sentence, str):
            if sentence is None or pd.isna(sentence):
                return []
            sentence = str(sentence)
        # Tokenization and lowercasing
        tokens = word_tokenize(sentence.lower())
        # Retain only alphabetic tokens
        return [word for word in tokens if word.isalpha()]

    def __len__(self):
        """Return the number of sentences in the dataset."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``(vectors, label)`` for the sentence at position ``idx``.

        ``vectors`` is a float32 tensor of shape
        ``(max_seq_len, vector_size)``; ``label`` is an integer tensor.
        """
        # Truncate first so only tokens that fit are ever looked up.
        tokens = self.sentences[idx][:self.max_seq_len]
        keyed_vectors = self.word2vec.wv

        # Start from an all-zero matrix: rows that are never written are
        # exactly the end padding and the out-of-vocabulary tokens.
        matrix = np.zeros(
            (self.max_seq_len, self.word2vec.vector_size), dtype=np.float32
        )
        for position, token in enumerate(tokens):
            if token in keyed_vectors:
                matrix[position] = keyed_vectors[token]

        label = int(self.labels[idx])  # Ensure label is an integer
        return torch.from_numpy(matrix), torch.tensor(label)

# Define LSTM model
class SentimentClassifierLSTM(nn.Module):
    """Stacked LSTM followed by a linear layer producing two class logits.

    Args:
        embedding_size: Size of each input timestep vector.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, embedding_size, hidden_size, num_layers):
        super().__init__()
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 2)  # Output 2 classes (SUBJ and OBJ)

    def forward(self, x):
        """Return logits of shape ``(batch, 2)`` for ``x`` of shape ``(batch, seq, embedding_size)``.

        Timesteps whose vector is entirely zero are treated as padding. The
        classifier reads the LSTM output at the last non-zero timestep of
        each sequence rather than at the final position, so trailing zero
        padding does not change the prediction. A sequence that is all
        zeros falls back to timestep 0.
        """
        out, _ = self.lstm(x)

        # (batch, seq) mask of timesteps carrying a non-zero vector.
        has_content = (x != 0).any(dim=-1)
        step_index = torch.arange(x.size(1), device=x.device)
        # Largest index with content per row; 0 when the row has none.
        last_step = (has_content.long() * step_index).max(dim=1).values

        batch_index = torch.arange(x.size(0), device=x.device)
        out = self.fc(out[batch_index, last_step])  # Last real hidden state
        return out  # Logits for CrossEntropyLoss

# Load data and train Word2Vec
train_file = "/content/train_en.tsv"
test_file = "/content/test_en_gold.tsv"

train_data = pd.read_csv(train_file, sep='\t')
train_data['sentence'] = train_data['sentence'].fillna("").astype(str)  # Ensure all inputs are valid strings

# Encode labels (the encoder is fitted on the training labels only)
label_encoder = LabelEncoder()
train_data['label'] = label_encoder.fit_transform(train_data['label'])
test_data = pd.read_csv(test_file, sep='\t')
test_data['label'] = label_encoder.transform(test_data['label'])

# Print label mapping
print("Label Mapping:", dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))

# Hyperparameter grid and fixed settings
learning_rates = [0.01, 0.001, 0.003, 0.005, 0.0001]
batch_sizes = [32, 64, 128]
num_layers_list = [2, 3, 4, 5]
max_seq_len = 50
embedding_size = 100
hidden_size = 128
epochs = 20

# Train Word2Vec model; the vector size is tied to embedding_size so the
# embeddings and the LSTM input width cannot drift apart.
sentences = train_data['sentence'].apply(lambda x: word_tokenize(x.lower())).tolist()
word2vec = Word2Vec(sentences, vector_size=embedding_size, window=5, min_count=1, sg=1)

# Load datasets once: they do not depend on any searched hyperparameter, so
# re-reading and re-tokenizing the files for every combination is wasted work.
train_dataset = TextDataset(train_file, word2vec, max_seq_len, label_encoder)
test_dataset = TextDataset(test_file, word2vec, max_seq_len, label_encoder)

# Results storage
results = []

# Hyperparameter search
# NOTE(review): configurations are scored and selected on the test split, so
# the "best" score is an optimistic estimate; a held-out validation split
# would be needed for an unbiased final number.
print("\nEvaluating Hyperparameter Combinations:\n")

for lr in learning_rates:
    for batch_size in batch_sizes:
        # Loaders only depend on the batch size, so build them per batch size.
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        for num_layers in num_layers_list:
            # Initialize model, loss, and optimizer
            model = SentimentClassifierLSTM(embedding_size, hidden_size, num_layers).to(device)
            class_weights = torch.tensor([1.0, 1.0]).to(device)  # Adjust class weights if needed
            criterion = nn.CrossEntropyLoss(weight=class_weights)
            optimizer = torch.optim.Adam(model.parameters(), lr=lr)

            # Training loop
            for epoch in range(epochs):
                model.train()
                total_loss = 0  # Summed batch loss of the current epoch (kept for inspection)
                for inputs, labels in train_loader:
                    inputs, labels = inputs.to(device), labels.to(device)
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)

                    optimizer.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
                    optimizer.step()

                    total_loss += loss.item()

            # Evaluation
            model.eval()
            y_true, y_pred = [], []
            with torch.no_grad():
                for inputs, labels in test_loader:
                    inputs, labels = inputs.to(device), labels.to(device)
                    outputs = model(inputs)
                    predictions = torch.argmax(outputs, dim=1)
                    y_true.extend(labels.cpu().numpy())
                    y_pred.extend(predictions.cpu().numpy())

            # Compute metrics (macro-averaged over both classes)
            accuracy = accuracy_score(y_true, y_pred)
            precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
            recall = recall_score(y_true, y_pred, average='macro', zero_division=0)
            f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)

            # Store the results
            results.append((lr, batch_size, num_layers, accuracy, precision, recall, f1))
            print(f"LR: {lr}, Batch Size: {batch_size}, Layers: {num_layers} => Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

# Convert results to a DataFrame
results_df = pd.DataFrame(results, columns=['Learning Rate', 'Batch Size', 'Num Layers', 'Accuracy', 'Precision', 'Recall', 'F1'])

# Display best configuration (row with the highest macro F1)
best_config = results_df.loc[results_df['F1'].idxmax()]
print("\nBest Configuration:")
print(best_config)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Label Mapping: {'OBJ': 0, 'SUBJ': 1}

Evaluating Hyperparameter Combinations:

LR: 0.01, Batch Size: 32, Layers: 2 => Accuracy: 0.4774, Precision: 0.2387, Recall: 0.5000, F1: 0.3231
LR: 0.01, Batch Size: 32, Layers: 3 => Accuracy: 0.4774, Precision: 0.2387, Recall: 0.5000, F1: 0.3231
LR: 0.01, Batch Size: 32, Layers: 4 => Accuracy: 0.4774, Precision: 0.4885, Recall: 0.4993, F1: 0.3364
LR: 0.01, Batch Size: 32, Layers: 5 => Accuracy: 0.4774, Precision: 0.2387, Recall: 0.5000, F1: 0.3231
LR: 0.01, Batch Size: 64, Layers: 2 => Accuracy: 0.4815, Precision: 0.5729, Recall: 0.5036, F1: 0.3384
LR: 0.01, Batch Size: 64, Layers: 3 => Accuracy: 0.4774, Precision: 0.2387, Recall: 0.5000, F1: 0.3231
LR: 0.01, Batch Size: 64, Layers: 4 => Accuracy: 0.4774, Precision: 0.2387, Recall: 0.5000, F1: 0.3231
LR: 0.01, Batch Size: 64, Layers: 5 => Accuracy: 0.4774, Precision: 0.2387, Recall: 0.5000, F1: 0.3231
LR: 0.01, Batch Size: 128, Layers: 2 => Accuracy: 0.4897, Precision: 0.5987, Recall: 0.5111, F1: 