# 10 — RNN (Bidirectional LSTM)

A **Recurrent Neural Network** using a Bidirectional LSTM for text classification.

Unlike CNN (which captures local n-gram patterns), LSTM processes the entire sequence
step-by-step, capturing **long-range dependencies** and contextual meaning.

Bidirectional processing reads the text both forward and backward for richer representations.

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from gensim.models import Word2Vec
from sklearn.metrics import accuracy_score, classification_report
import os

In [2]:
%load_ext watermark
%watermark -v -n -m -p numpy,pandas,torch,gensim,sklearn

Python implementation: CPython
Python version       : 3.12.12
IPython version      : 9.10.0

numpy  : 1.26.4
pandas : 3.0.0
torch  : 2.2.2
gensim : 4.4.0
sklearn: 1.8.0

Compiler    : Clang 17.0.0 (clang-1700.6.3.2)
OS          : Darwin
Release     : 25.2.0
Machine     : x86_64
Processor   : i386
CPU cores   : 8
Architecture: 64bit



## 1. Data Preparation

In [3]:
MAX_LEN = 50
EMBED_DIM = 100
LABEL_MAP = {'NEGATIVE': 0, 'POSITIVE': 1}

def build_embedding_matrix(w2v_model):
    vocab = w2v_model.wv.key_to_index
    matrix = np.zeros((len(vocab) + 1, EMBED_DIM))
    word2idx = {'<PAD>': 0}
    for word, idx in vocab.items():
        word2idx[word] = idx + 1
        matrix[idx + 1] = w2v_model.wv[word]
    return matrix, word2idx

def texts_to_sequences(texts, word2idx, max_len):
    sequences = []
    for text in texts:
        tokens = str(text).split()
        seq = [word2idx.get(w, 0) for w in tokens[:max_len]]
        seq += [0] * (max_len - len(seq))
        sequences.append(seq)
    return np.array(sequences)

## 2. Model Definition

Embedding → Bidirectional LSTM (hidden=64) → Dropout → FC → Sigmoid

In [None]:
class BiLSTM(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim=64, dropout=0.3):
        super().__init__()
        vocab_size, embed_dim = embedding_matrix.shape
        
        # Pre-trained embedding (frozen)
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.embedding.weight = nn.Parameter(torch.FloatTensor(embedding_matrix))
        self.embedding.weight.requires_grad = False
        
        # Bidirectional LSTM
        self.lstm = nn.LSTM(
            embed_dim, hidden_dim,
            batch_first=True,
            bidirectional=True,
            num_layers=1
        )
        
        self.dropout = nn.Dropout(dropout)
        # Bidirectional → hidden_dim * 2
        self.fc = nn.Linear(hidden_dim * 2, 1)
    
    def forward(self, x):
        # x: (batch, seq_len)
        x = self.embedding(x)  # (batch, seq_len, embed_dim)
        
        # LSTM output: (batch, seq_len, hidden*2)
        lstm_out, (hidden, _) = self.lstm(x)
        
        # Concatenate final hidden states from both directions
        # hidden: (num_layers*2, batch, hidden_dim)
        hidden_fwd = hidden[-2]  # Last layer, forward
        hidden_bwd = hidden[-1]  # Last layer, backward
        combined = torch.cat((hidden_fwd, hidden_bwd), dim=1)  # (batch, hidden*2)
        
        out = self.dropout(combined)
        out = torch.sigmoid(self.fc(out)).squeeze(1)
        return out

## 3. Training Function

In [5]:
def train_rnn(variation_name, data_dir, w2v_path, output_dir, epochs=20, lr=1e-3, batch_size=32):
    print(f"\n{'='*20} BiLSTM: {variation_name} {'='*20}")
    
    # Load Word2Vec
    w2v = Word2Vec.load(w2v_path)
    embed_matrix, word2idx = build_embedding_matrix(w2v)
    print(f"Embedding matrix: {embed_matrix.shape}")
    
    # Load data
    train_df = pd.read_csv(f'{data_dir}/train.csv').fillna('')
    test_df  = pd.read_csv(f'{data_dir}/test.csv').fillna('')
    
    X_train = texts_to_sequences(train_df['text_clean'], word2idx, MAX_LEN)
    X_test  = texts_to_sequences(test_df['text_clean'], word2idx, MAX_LEN)
    y_train = train_df['label'].map(LABEL_MAP).values.astype(np.float32)
    y_test  = test_df['label'].map(LABEL_MAP).values.astype(np.float32)
    
    # DataLoaders
    train_ds = TensorDataset(torch.LongTensor(X_train), torch.FloatTensor(y_train))
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    
    # Model
    model = BiLSTM(embed_matrix)
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, model.parameters()), lr=lr
    )
    
    # Train
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch_x, batch_y in train_loader:
            optimizer.zero_grad()
            output = model(batch_x)
            loss = criterion(output, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        if (epoch + 1) % 5 == 0:
            print(f"  Epoch {epoch+1}/{epochs} — Loss: {total_loss/len(train_loader):.4f}")
    
    # Evaluate
    model.eval()
    with torch.no_grad():
        preds = model(torch.LongTensor(X_test))
        y_pred = (preds >= 0.5).int().numpy()
    
    acc = accuracy_score(y_test, y_pred)
    print(f"\nBiLSTM ({variation_name}) Accuracy: {acc:.4f}")
    print(classification_report(y_test.astype(int), y_pred))
    
    # Save
    os.makedirs(output_dir, exist_ok=True)
    torch.save(model.state_dict(), f'{output_dir}/model.pt')
    print(f"Model saved to {output_dir}/model.pt")
    
    return acc

## 4. Run Both Pipelines

In [6]:
torch.manual_seed(42)
np.random.seed(42)

acc_standard = train_rnn(
    "Standard",
    "../data/processed/standard",
    "../models/word2vec/standard/word2vec.model",
    "../models/rnn/standard"
)

acc_irony = train_rnn(
    "Irony",
    "../data/processed/irony",
    "../models/word2vec/irony/word2vec.model",
    "../models/rnn/irony"
)


Embedding matrix: (2248, 100)


  Epoch 5/20 — Loss: 0.4657


  Epoch 10/20 — Loss: 0.4079


  Epoch 15/20 — Loss: 0.3291


  Epoch 20/20 — Loss: 0.2623

BiLSTM (Standard) Accuracy: 0.7822
              precision    recall  f1-score   support

           0       0.77      0.80      0.79       225
           1       0.79      0.76      0.78       225

    accuracy                           0.78       450
   macro avg       0.78      0.78      0.78       450
weighted avg       0.78      0.78      0.78       450

Model saved to ../models/rnn/standard/model.pt

Embedding matrix: (2239, 100)


  Epoch 5/20 — Loss: 0.4694


  Epoch 10/20 — Loss: 0.4020


  Epoch 15/20 — Loss: 0.3201


  Epoch 20/20 — Loss: 0.2517

BiLSTM (Irony) Accuracy: 0.7800
              precision    recall  f1-score   support

           0       0.76      0.81      0.79       225
           1       0.80      0.75      0.77       225

    accuracy                           0.78       450
   macro avg       0.78      0.78      0.78       450
weighted avg       0.78      0.78      0.78       450

Model saved to ../models/rnn/irony/model.pt


## 5. Comparison

In [7]:
print("\n=== Final Comparison ===")
print(f"Standard: {acc_standard:.4f}")
print(f"Irony:    {acc_irony:.4f}")
diff = acc_irony - acc_standard
print(f"Impact of Irony features: {diff:+.4f}")


=== Final Comparison ===
Standard: 0.7822
Irony:    0.7800
Impact of Irony features: -0.0022
