In [1]:
# Cell 1: Imports, NLTK setup & GPU check

import torch
import pandas as pd
import numpy as np
import sklearn
import nltk
from bs4 import BeautifulSoup
import re

# NLTK downloads (only the first time)
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# GPU check
print("CUDA available?", torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


CUDA available? True
Using device: cuda


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\akram\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\akram\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\akram\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Cell 2: Load CSVs & train/val/test split

from sklearn.model_selection import train_test_split

# Adjust paths if needed
train_df = pd.read_csv(r'C:\NLP project\Sentiment-Analysis-using-LSTM\train.csv')
test_df  = pd.read_csv(r'C:\NLP project\Sentiment-Analysis-using-LSTM\test.csv').iloc[:10000].reset_index(drop=True)

# Split 25k train → 20k train + 5k val
train_df, val_df = train_test_split(
    train_df,
    test_size=5000,
    random_state=42,
    stratify=train_df['label']
)

print("Train:", train_df.shape, "Val:", val_df.shape, "Test:", test_df.shape)
train_df.head()


Train: (20000, 4) Val: (5000, 4) Test: (10000, 3)


Unnamed: 0,text,text_len,score,label
7522,Many reviews I've read reveals that most peopl...,2621,9,1
17493,The effect achieved in this story about a psyc...,1123,3,0
12260,I really enjoyed the performances of the main ...,650,8,1
1275,'Midnight Cowboy' was rated X with the origina...,1122,9,1
8004,"As the maker of ""This Darkness,"" I admit we ne...",747,10,1


In [3]:
# Cell 3: Text cleaning + token preprocessing

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    # 1. Remove HTML
    text = BeautifulSoup(text, 'html.parser').get_text()
    # 2. Lowercase
    text = text.lower()
    # 3. Remove non-alphanumeric
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    # 4. Tokenize, remove stopwords, lemmatize
    tokens = []
    for w in text.split():
        if w in stop_words:
            continue
        tokens.append( lemmatizer.lemmatize(w) )
    return " ".join(tokens)

# Apply
for df in (train_df, val_df, test_df):
    df['clean_text'] = df['text'].map(clean_text)

print("Sample cleaned review:")
print(train_df['clean_text'].iloc[0])


Sample cleaned review:
many review read reveals people tend like part one better part two feel exactly opposite part one played around bit much trying find different way showing che guevara personality different type film stock different location cutting back forth interview cuban revolution part structured finely somewhat distracting part two che enters bolivia along changing geographical location rule structure change gone spacial jump switching stock documentary realism treatise instead literally trapped che desaturated depopulated landscape people exist burdened far life anything survival option posit dark turn che life real reason people prefer part one part two change geographic location also signifies least che part one che part two fact second two act three act structure begun motorcycle diary motorcycle diary che coming age appropriately coming ideal argentina che part one military leadership cuba che part two downfall bolivia movie completely illustrate life missing experienc

In [None]:
# Cell 4: Tokenization & DataLoaders (using Hugging Face tokenizer)

from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader

model_name = "roberta-base"  # for tokenizer only
tokenizer = AutoTokenizer.from_pretrained(model_name)

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        item = {k: v.squeeze(0) for k, v in encoding.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Datasets
train_ds = TextDataset(train_df['clean_text'].tolist(), train_df['label'].tolist(), tokenizer)
val_ds   = TextDataset(val_df['clean_text'].tolist(),   val_df['label'].tolist(),   tokenizer)
test_ds  = TextDataset(test_df['clean_text'].tolist(),  test_df['label'].tolist(),  tokenizer)

# DataLoaders
batch_size = 32
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False)
test_loader  = DataLoader(test_ds,  batch_size=batch_size, shuffle=False)

print(f"Batches → train: {len(train_loader)}, val: {len(val_loader)}, test: {len(test_loader)}")


  from .autonotebook import tqdm as notebook_tqdm


Batches → train: 625, val: 157, test: 313


In [7]:
# Cell 5: Define a CNN + BiLSTM model from scratch

import torch.nn as nn

class CNNLSTMSentiment(nn.Module):
    def __init__(self,
                 vocab_size,
                 embed_dim=128,
                 cnn_filters=128,
                 lstm_hidden=128,
                 lstm_layers=1,
                 kernel_size=3,
                 dropout=0.3):
        super().__init__()
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=tokenizer.pad_token_id)
        # 1D convolution over the embedding dimension
        self.conv1 = nn.Conv1d(
            in_channels=embed_dim,
            out_channels=cnn_filters,
            kernel_size=kernel_size,
            padding=kernel_size//2
        )
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size=2)
        # Bi-directional LSTM on top of conv features
        self.lstm = nn.LSTM(
            input_size=cnn_filters,
            hidden_size=lstm_hidden,
            num_layers=lstm_layers,
            bidirectional=True,
            batch_first=True,
            dropout=dropout if lstm_layers > 1 else 0
        )
        self.dropout = nn.Dropout(dropout)
        # Final classification layer
        self.fc = nn.Linear(lstm_hidden * 2, 2)

    def forward(self, input_ids, attention_mask=None):
        # (batch, seq_len) → (batch, seq_len, embed_dim)
        x = self.embedding(input_ids)
        x = self.dropout(x)
        # Prepare for conv: (batch, embed_dim, seq_len)
        x = x.permute(0, 2, 1)
        x = self.conv1(x)
        x = self.relu(x)
        x = self.pool(x)                  # (batch, cnn_filters, seq_len/2)
        # Prepare for LSTM: (batch, seq_len/2, cnn_filters)
        x = x.permute(0, 2, 1)
        # LSTM: we only care about the hidden state
        _, (h_n, _) = self.lstm(x)
        # Concatenate forward & backward final hidden states
        h = torch.cat((h_n[-2], h_n[-1]), dim=1)
        h = self.dropout(h)
        return self.fc(h)

# Instantiate model
vocab_size = tokenizer.vocab_size
model = CNNLSTMSentiment(vocab_size).to(device)
print(model)



CNNLSTMSentiment(
  (embedding): Embedding(50265, 128, padding_idx=1)
  (conv1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (relu): ReLU()
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (lstm): LSTM(128, 128, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=2, bias=True)
)


In [8]:
# Cell 6: Training loop with early stopping

from torch.optim import Adam
from sklearn.metrics import accuracy_score
from tqdm.auto import tqdm

optimizer = Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

max_epochs = 10
patience = 2
best_val_acc = 0
epochs_no_improve = 0

def evaluate(loader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for b in loader:
            inputs = b['input_ids'].to(device)
            labels = b['labels'].to(device)
            logits = model(inputs)
            preds = logits.argmax(dim=1)
            all_preds += preds.cpu().tolist()
            all_labels += labels.cpu().tolist()
    return accuracy_score(all_labels, all_preds)

for epoch in range(1, max_epochs+1):
    model.train()
    loop = tqdm(train_loader, desc=f"Epoch {epoch}")
    for batch in loop:
        inputs = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        logits = model(inputs)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        loop.set_postfix(loss=loss.item())

    val_acc = evaluate(val_loader)
    print(f"→ Epoch {epoch} Val Acc: {val_acc:.4f}")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        epochs_no_improve = 0
        torch.save(model.state_dict(), 'best_lstm.pth')
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(f"Early stopping after {epoch} epochs")
            break


Epoch 1: 100%|██████████| 625/625 [00:14<00:00, 44.04it/s, loss=0.426]


→ Epoch 1 Val Acc: 0.7850


Epoch 2: 100%|██████████| 625/625 [00:13<00:00, 47.43it/s, loss=0.373]


→ Epoch 2 Val Acc: 0.8294


Epoch 3: 100%|██████████| 625/625 [00:13<00:00, 47.70it/s, loss=0.287] 


→ Epoch 3 Val Acc: 0.8378


Epoch 4: 100%|██████████| 625/625 [00:12<00:00, 48.32it/s, loss=0.245] 


→ Epoch 4 Val Acc: 0.8450


Epoch 5: 100%|██████████| 625/625 [00:13<00:00, 47.67it/s, loss=0.191] 


→ Epoch 5 Val Acc: 0.8464


Epoch 6: 100%|██████████| 625/625 [00:12<00:00, 48.11it/s, loss=0.236] 


→ Epoch 6 Val Acc: 0.8450


Epoch 7: 100%|██████████| 625/625 [00:13<00:00, 47.67it/s, loss=0.204] 


→ Epoch 7 Val Acc: 0.8490


Epoch 8: 100%|██████████| 625/625 [00:12<00:00, 48.16it/s, loss=0.151] 


→ Epoch 8 Val Acc: 0.8478


Epoch 9: 100%|██████████| 625/625 [00:13<00:00, 46.34it/s, loss=0.114] 


→ Epoch 9 Val Acc: 0.8530


Epoch 10: 100%|██████████| 625/625 [00:13<00:00, 46.92it/s, loss=0.233]  


→ Epoch 10 Val Acc: 0.8468


In [9]:
# Cell 7: Load best model and evaluate on test set

model.load_state_dict(torch.load('best_lstm.pth'))
test_acc = evaluate(test_loader)
print(f"🔥 Best LSTM Test Accuracy: {test_acc:.4f}")


🔥 Best LSTM Test Accuracy: 0.8479
