In [1]:
import sys
print(sys.executable)


c:\ProgramData\anaconda3\envs\sentiment2\python.exe


In [2]:
import sys, torch

print("Python exec:", sys.executable)
print("Torch lib:", torch.__file__)
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)
print("cuDNN version:", torch.backends.cudnn.version())
if torch.cuda.is_available():
    print("  → GPU:", torch.cuda.get_device_name(0))


Python exec: c:\ProgramData\anaconda3\envs\sentiment2\python.exe
Torch lib: c:\ProgramData\anaconda3\envs\sentiment2\lib\site-packages\torch\__init__.py
Torch version: 2.1.1+cu118
CUDA available: True
CUDA version: 11.8
cuDNN version: 8700
  → GPU: NVIDIA GeForce RTX 3060


In [3]:
# Cell 1: Imports & environment check

import torch
import transformers
import pandas as pd
import numpy as np
import sklearn

# 1. GPU check
print("Torch CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("  → GPU detected:", torch.cuda.get_device_name(0))

# 2. Version info
print("Torch version:       ", torch.__version__)
print("Transformers version:", transformers.__version__)
print("Pandas version:      ", pd.__version__)
print("NumPy version:       ", np.__version__)
print("Scikit-learn version:", sklearn.__version__)


  from .autonotebook import tqdm as notebook_tqdm


Torch CUDA available: True
  → GPU detected: NVIDIA GeForce RTX 3060
Torch version:        2.1.1+cu118
Transformers version: 4.51.0
Pandas version:       2.2.3
NumPy version:        1.26.3
Scikit-learn version: 1.6.1


In [4]:
# Cell 2: Load and inspect the raw training data
import pandas as pd

# TODO: update this to the real path on your system, e.g.
# train_path = r'C:\Users\akram\data\train.csv'
train_path = r'C:\NLP project\Sentiment-Analysis-using-LSTM\train.csv'

raw_df = pd.read_csv(train_path)
print(f"Raw train.csv shape: {raw_df.shape}")
raw_df.head()


Raw train.csv shape: (25000, 4)


Unnamed: 0,text,text_len,score,label
0,Bromwell High is a cartoon comedy. It ran at t...,806,9,1
1,Homelessness (or Houselessness as George Carli...,2318,8,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,841,10,1
3,This is easily the most underrated film inn th...,663,7,1
4,This is not the typical Mel Brooks film. It wa...,647,8,1


In [5]:
# Cell 3: Split into Train/Validation and subset Test

import pandas as pd
from sklearn.model_selection import train_test_split

# ── paths to my train and test  ────────────────────────────────────────────────────────
train_path = r'C:\NLP project\Sentiment-Analysis-using-LSTM\train.csv'
test_path  = r'C:\NLP project\Sentiment-Analysis-using-LSTM\test.csv'
# ──────────────────────────────────────────────────────────────────────────────

# 1. Load full training data and split
raw_df = pd.read_csv(train_path)
train_df, val_df = train_test_split(
    raw_df,
    test_size=5000,
    random_state=42,
    stratify=raw_df['label']
)

print("Train split shape:     ", train_df.shape)
print("Validation split shape:", val_df.shape)

# 2. Load test and subset to 10k
test_df = pd.read_csv(test_path).iloc[:10000]
print("Test subset shape:     ", test_df.shape)

# 3. Preview first rows
print("\n--- Train head ---")
print(train_df.head().to_string(index=False))

print("\n--- Validation head ---")
print(val_df.head().to_string(index=False))

print("\n--- Test head ---")
print(test_df.head().to_string(index=False))


Train split shape:      (20000, 4)
Validation split shape: (5000, 4)
Test subset shape:      (10000, 3)

--- Train head ---
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            

In [6]:
# Cell 4: Tokenizer, Dataset & DataLoaders

from transformers import AutoTokenizer
import torch
from torch.utils.data import Dataset, DataLoader

# 1. Load pretrained tokenizer
model_name = 'distilbert-base-uncased'  # feel free to swap this
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 2. Define a Dataset class
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        # Squeeze the batch dimension
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(label)
        return item

# 3. Instantiate datasets
train_ds = SentimentDataset(train_df['text'].tolist(), train_df['label'].tolist(), tokenizer)
val_ds   = SentimentDataset(val_df['text'].tolist(),   val_df['label'].tolist(),   tokenizer)
test_ds  = SentimentDataset(test_df['text'].tolist(),  test_df['label'].tolist(),  tokenizer)

# 4. Create DataLoaders
batch_size = 16
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False)
test_loader  = DataLoader(test_ds,  batch_size=batch_size, shuffle=False)

# 5. Inspect batch counts
print(f"Num training batches:   {len(train_loader)}")
print(f"Num validation batches: {len(val_loader)}")
print(f"Num test batches:       {len(test_loader)}")


Num training batches:   1250
Num validation batches: 313
Num test batches:       625


In [7]:
# Cell 5 (updated): Model + Optimizer + Scheduler
from transformers import AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
import torch

# 1. Load the model and send to GPU if available
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 2. Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# 3. Scheduler
num_epochs = 3
total_steps = len(train_loader) * num_epochs
warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from sklearn.metrics import accuracy_score
from tqdm.auto import tqdm

def evaluate(loader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
            labels = batch["labels"].to(device)
            outputs = model(**inputs)
            preds = outputs.logits.argmax(dim=-1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())
    return accuracy_score(all_labels, all_preds)

for epoch in range(1, num_epochs+1):
    # Training
    model.train()
    loop = tqdm(train_loader, desc=f"Epoch {epoch} ▶")
    for batch in loop:
        optimizer.zero_grad()
        inputs = {k: v.to(device) for k, v in batch.items()}
        outputs = model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            labels=inputs["labels"]
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        loop.set_postfix(loss=loss.item())

    # Validation
    val_acc = evaluate(val_loader)
    print(f"Epoch {epoch} ◀ Validation Accuracy: {val_acc:.4f}")


Epoch 1 ▶: 100%|██████████| 1250/1250 [03:00<00:00,  6.93it/s, loss=0.339] 


Epoch 1 ◀ Validation Accuracy: 0.8618


Epoch 2 ▶: 100%|██████████| 1250/1250 [02:51<00:00,  7.28it/s, loss=0.144] 


Epoch 2 ◀ Validation Accuracy: 0.8738


Epoch 3 ▶: 100%|██████████| 1250/1250 [02:49<00:00,  7.37it/s, loss=0.07]   


Epoch 3 ◀ Validation Accuracy: 0.8732


In [9]:
test_acc = evaluate(test_loader)
print(f"🔥 Test set accuracy: {test_acc:.4f}")


🔥 Test set accuracy: 0.8930
