In [13]:
import os
import requests
from tqdm import tqdm
import zipfile

def download_file_with_progress(url, destination):
    response = requests.get(url, stream=True)
    total_size = int(response.headers.get('content-length', 0))
    block_size = 1024  # 1 Kibibyte
    with open(destination, 'wb') as file, tqdm(
        desc=destination,
        total=total_size,
        unit='iB',
        unit_scale=True,
        unit_divisor=1024,
    ) as bar:
        for data in response.iter_content(block_size):
            file.write(data)
            bar.update(len(data))

# Define the URL and destination paths


glove_url = "https://nlp.stanford.edu/data/glove.840B.300d.zip"
glove_zip_path = "glove.840B.zip"
glove_txt_file = "glove.840B.300d.txt"


# glove_url = "http://nlp.stanford.edu/data/glove.6B.zip"
# glove_zip_path = "glove.6B.zip"
# glove_txt_file = "glove.6B.50d.txt"

# Download the GloVe zip file with progress bar if it doesn't exist
if not os.path.exists(glove_zip_path):
    download_file_with_progress(glove_url, glove_zip_path)

# Extract the required GloVe file if it doesn't exist
if not os.path.exists(glove_txt_file):
    with zipfile.ZipFile(glove_zip_path, 'r') as zip_ref:
        zip_ref.extract(glove_txt_file)


glove.840B.zip: 100%|██████████| 2.03G/2.03G [06:49<00:00, 5.31MiB/s]


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Configuration
CFG = {
    'embedding_dim': 300,
    'hidden_dim': 256,
    'batch_size': 2048,
    'num_epochs': 50,
    'learning_rate': 3e-4,
    'num_workers': 4,
    'glove_path': 'glove.840B.300d.txt'
}

# Dataset class
class IMDBDataset(Dataset):
    def __init__(self, texts, labels, word2idx):
        self.texts = [self.encode_text(text, word2idx) for text in texts]
        self.labels = labels

    def encode_text(self, text, word2idx):
        return torch.tensor([word2idx.get(word, word2idx['<unk>']) for word in text], dtype=torch.long)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

# Collate function for DataLoader
def collate_fn(batch):
    texts, labels = zip(*batch)
    lengths = torch.tensor([len(t) for t in texts])
    padded = pad_sequence(texts, batch_first=True, padding_value=0)
    return padded, lengths, torch.tensor(labels)

# Model architecture
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_matrix):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.lstm = nn.LSTM(
            input_size=CFG['embedding_dim'],
            hidden_size=CFG['hidden_dim'],
            num_layers=10,
            bidirectional=True,
            dropout=0.3,
            batch_first=True
        )
        self.classifier = nn.Sequential(
            nn.Linear(CFG['hidden_dim']*2, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, 2)
        )

    def forward(self, x, lengths):
        embedded = self.embedding(x)
        packed = pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        output, (hidden, cell) = self.lstm(packed)
        hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)
        return self.classifier(hidden)

# Load GloVe embeddings
def load_glove_embeddings(glove_path, word2idx):
    embeddings = np.random.uniform(-0.25, 0.25, (len(word2idx), CFG['embedding_dim']))
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc='Loading GloVe'):
            parts = line.strip().split()
            word = parts[0]
            vector = parts[1:]
            if word in word2idx:
                try:
                    embeddings[word2idx[word]] = np.array(vector, dtype=np.float32)
                except ValueError:
                    print(f"Skipping malformed line: {line[:50]}...")
    return torch.tensor(embeddings, dtype=torch.float32)


# Training function
def train_model():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load and prepare data
    imdb_data = pd.read_csv('https://media.githubusercontent.com/media/EXC3ll3NTrhyTHM/NLP_datasets/refs/heads/main/IMDB_datasets/IMDB_cleaned.csv')
    texts = imdb_data['oov_processed_text'].astype(str).str.lower().str.split().tolist()
    labels = imdb_data['label'].astype(int).tolist()

    # Build vocabulary
    word_counts = {}
    for sentence in texts:
        for word in sentence:
            word_counts[word] = word_counts.get(word, 0) + 1
    word2idx = {'<pad>': 0, '<unk>': 1}
    for word in word_counts:
        if word_counts[word] > 5:
            word2idx[word] = len(word2idx)

    # Prepare datasets
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )

    train_dataset = IMDBDataset(train_texts, train_labels, word2idx)
    val_dataset = IMDBDataset(val_texts, val_labels, word2idx)

    train_loader = DataLoader(
        train_dataset,
        batch_size=CFG['batch_size'],
        shuffle=True,
        collate_fn=collate_fn,
        num_workers=CFG['num_workers'],
        pin_memory=True
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=CFG['batch_size'],
        shuffle=False,
        collate_fn=collate_fn,
        num_workers=CFG['num_workers'],
        pin_memory=True
    )

    # Initialize model
    embedding_matrix = load_glove_embeddings(CFG['glove_path'], word2idx)
    # model = SentimentLSTM(len(word2idx), embedding_matrix).to(device)
    
    model = SentimentLSTM(len(word2idx), embedding_matrix)
    if torch.cuda.device_count() > 1:
        print(f"Using {torch.cuda.device_count()} GPUs!")
        model = nn.DataParallel(model)
    model = model.to(device)


    optimizer = torch.optim.AdamW(model.parameters(), lr=CFG['learning_rate'])
    criterion = nn.CrossEntropyLoss()

    # Training loop
    for epoch in range(CFG['num_epochs']):
        model.train()
        total_loss = 0
        for inputs, lengths, labels in tqdm(train_loader, desc=f'Epoch {epoch+1}'):
            inputs, lengths, labels = inputs.to(device), lengths.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs, lengths)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch+1} | Training Loss: {avg_loss:.4f}')

        # Validation
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for inputs, lengths, labels in val_loader:
                inputs, lengths, labels = inputs.to(device), lengths.to(device), labels.to(device)
                outputs = model(inputs, lengths)
                preds = torch.argmax(outputs, dim=1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)

        accuracy = correct / total
        print(f'Epoch {epoch+1} | Validation Accuracy: {accuracy:.2%}')
        
    return model, val_dataset

# Execute training
model, val_dataset = train_model()


Loading GloVe: 547252it [00:04, 112819.04it/s]

Skipping malformed line: contact name@domain.com 0.016426 0.13728 0.18781 0...


Loading GloVe: 2196017it [00:19, 113211.37it/s]


Using 4 GPUs!


Epoch 1: 100%|██████████| 20/20 [00:06<00:00,  3.12it/s]

Epoch 1 | Training Loss: 0.6934





Epoch 1 | Validation Accuracy: 49.90%


Epoch 2: 100%|██████████| 20/20 [00:05<00:00,  3.59it/s]

Epoch 2 | Training Loss: 0.5812





Epoch 2 | Validation Accuracy: 78.14%


Epoch 3: 100%|██████████| 20/20 [00:05<00:00,  3.66it/s]

Epoch 3 | Training Loss: 0.4956





Epoch 3 | Validation Accuracy: 78.69%


Epoch 4: 100%|██████████| 20/20 [00:05<00:00,  3.66it/s]

Epoch 4 | Training Loss: 0.4681





Epoch 4 | Validation Accuracy: 78.18%


Epoch 5: 100%|██████████| 20/20 [00:05<00:00,  3.61it/s]

Epoch 5 | Training Loss: 0.4807





Epoch 5 | Validation Accuracy: 81.31%


Epoch 6: 100%|██████████| 20/20 [00:05<00:00,  3.50it/s]

Epoch 6 | Training Loss: 0.4338





Epoch 6 | Validation Accuracy: 83.11%


Epoch 7: 100%|██████████| 20/20 [00:05<00:00,  3.49it/s]

Epoch 7 | Training Loss: 0.3739





Epoch 7 | Validation Accuracy: 84.64%


Epoch 8: 100%|██████████| 20/20 [00:05<00:00,  3.47it/s]

Epoch 8 | Training Loss: 0.3330





Epoch 8 | Validation Accuracy: 85.15%


Epoch 9: 100%|██████████| 20/20 [00:05<00:00,  3.51it/s]

Epoch 9 | Training Loss: 0.3152





Epoch 9 | Validation Accuracy: 86.00%


Epoch 10: 100%|██████████| 20/20 [00:05<00:00,  3.57it/s]

Epoch 10 | Training Loss: 0.2997





Epoch 10 | Validation Accuracy: 86.40%


Epoch 11: 100%|██████████| 20/20 [00:06<00:00,  3.21it/s]

Epoch 11 | Training Loss: 0.2888





Epoch 11 | Validation Accuracy: 87.17%


Epoch 12: 100%|██████████| 20/20 [00:05<00:00,  3.41it/s]

Epoch 12 | Training Loss: 0.2761





Epoch 12 | Validation Accuracy: 87.41%


Epoch 13: 100%|██████████| 20/20 [00:05<00:00,  3.54it/s]

Epoch 13 | Training Loss: 0.2703





Epoch 13 | Validation Accuracy: 87.40%


Epoch 14: 100%|██████████| 20/20 [00:05<00:00,  3.48it/s]

Epoch 14 | Training Loss: 0.2692





Epoch 14 | Validation Accuracy: 88.05%


Epoch 15: 100%|██████████| 20/20 [00:05<00:00,  3.53it/s]

Epoch 15 | Training Loss: 0.2561





Epoch 15 | Validation Accuracy: 88.24%


Epoch 16: 100%|██████████| 20/20 [00:05<00:00,  3.49it/s]

Epoch 16 | Training Loss: 0.2427





Epoch 16 | Validation Accuracy: 88.43%


Epoch 17: 100%|██████████| 20/20 [00:05<00:00,  3.49it/s]

Epoch 17 | Training Loss: 0.2350





Epoch 17 | Validation Accuracy: 88.52%


Epoch 18: 100%|██████████| 20/20 [00:05<00:00,  3.50it/s]

Epoch 18 | Training Loss: 0.2284





Epoch 18 | Validation Accuracy: 88.82%


Epoch 19: 100%|██████████| 20/20 [00:05<00:00,  3.53it/s]

Epoch 19 | Training Loss: 0.2229





Epoch 19 | Validation Accuracy: 88.51%


Epoch 20: 100%|██████████| 20/20 [00:05<00:00,  3.49it/s]

Epoch 20 | Training Loss: 0.2198





Epoch 20 | Validation Accuracy: 88.82%


Epoch 21: 100%|██████████| 20/20 [00:05<00:00,  3.42it/s]

Epoch 21 | Training Loss: 0.2103





Epoch 21 | Validation Accuracy: 88.81%


Epoch 22: 100%|██████████| 20/20 [00:05<00:00,  3.56it/s]

Epoch 22 | Training Loss: 0.2081





Epoch 22 | Validation Accuracy: 88.80%


Epoch 23: 100%|██████████| 20/20 [00:05<00:00,  3.58it/s]

Epoch 23 | Training Loss: 0.2083





Epoch 23 | Validation Accuracy: 88.73%


Epoch 24: 100%|██████████| 20/20 [00:05<00:00,  3.51it/s]

Epoch 24 | Training Loss: 0.2076





Epoch 24 | Validation Accuracy: 88.92%


Epoch 25: 100%|██████████| 20/20 [00:05<00:00,  3.55it/s]

Epoch 25 | Training Loss: 0.1953





Epoch 25 | Validation Accuracy: 89.12%


Epoch 26: 100%|██████████| 20/20 [00:05<00:00,  3.49it/s]

Epoch 26 | Training Loss: 0.1883





Epoch 26 | Validation Accuracy: 89.02%


Epoch 27: 100%|██████████| 20/20 [00:05<00:00,  3.48it/s]

Epoch 27 | Training Loss: 0.1940





Epoch 27 | Validation Accuracy: 88.94%


Epoch 28: 100%|██████████| 20/20 [00:05<00:00,  3.49it/s]

Epoch 28 | Training Loss: 0.1983





Epoch 28 | Validation Accuracy: 88.76%


Epoch 29: 100%|██████████| 20/20 [00:05<00:00,  3.46it/s]

Epoch 29 | Training Loss: 0.1805





Epoch 29 | Validation Accuracy: 89.00%


Epoch 30: 100%|██████████| 20/20 [00:05<00:00,  3.52it/s]

Epoch 30 | Training Loss: 0.1751





Epoch 30 | Validation Accuracy: 88.66%


Epoch 31: 100%|██████████| 20/20 [00:05<00:00,  3.35it/s]

Epoch 31 | Training Loss: 0.1729





Epoch 31 | Validation Accuracy: 88.85%


Epoch 32: 100%|██████████| 20/20 [00:05<00:00,  3.41it/s]

Epoch 32 | Training Loss: 0.1629





Epoch 32 | Validation Accuracy: 88.40%


Epoch 33: 100%|██████████| 20/20 [00:06<00:00,  3.22it/s]

Epoch 33 | Training Loss: 0.1589





Epoch 33 | Validation Accuracy: 88.35%


Epoch 34: 100%|██████████| 20/20 [00:05<00:00,  3.50it/s]

Epoch 34 | Training Loss: 0.1571





Epoch 34 | Validation Accuracy: 88.34%


Epoch 35: 100%|██████████| 20/20 [00:05<00:00,  3.46it/s]

Epoch 35 | Training Loss: 0.1506





Epoch 35 | Validation Accuracy: 88.55%


Epoch 36: 100%|██████████| 20/20 [00:05<00:00,  3.52it/s]

Epoch 36 | Training Loss: 0.1499





Epoch 36 | Validation Accuracy: 88.51%


Epoch 37: 100%|██████████| 20/20 [00:05<00:00,  3.53it/s]

Epoch 37 | Training Loss: 0.1461





Epoch 37 | Validation Accuracy: 88.28%


Epoch 38: 100%|██████████| 20/20 [00:05<00:00,  3.53it/s]

Epoch 38 | Training Loss: 0.1479





Epoch 38 | Validation Accuracy: 88.40%


Epoch 39: 100%|██████████| 20/20 [00:05<00:00,  3.53it/s]

Epoch 39 | Training Loss: 0.1335





Epoch 39 | Validation Accuracy: 87.76%


Epoch 40: 100%|██████████| 20/20 [00:05<00:00,  3.45it/s]

Epoch 40 | Training Loss: 0.1356





Epoch 40 | Validation Accuracy: 88.16%


Epoch 41: 100%|██████████| 20/20 [00:05<00:00,  3.48it/s]

Epoch 41 | Training Loss: 0.1270





Epoch 41 | Validation Accuracy: 87.99%


Epoch 42: 100%|██████████| 20/20 [00:05<00:00,  3.49it/s]

Epoch 42 | Training Loss: 0.1364





Epoch 42 | Validation Accuracy: 88.17%


Epoch 43: 100%|██████████| 20/20 [00:05<00:00,  3.52it/s]

Epoch 43 | Training Loss: 0.1181





Epoch 43 | Validation Accuracy: 87.89%


Epoch 44: 100%|██████████| 20/20 [00:05<00:00,  3.48it/s]

Epoch 44 | Training Loss: 0.1157





Epoch 44 | Validation Accuracy: 87.81%


Epoch 45: 100%|██████████| 20/20 [00:05<00:00,  3.50it/s]

Epoch 45 | Training Loss: 0.1111





Epoch 45 | Validation Accuracy: 88.06%


Epoch 46: 100%|██████████| 20/20 [00:05<00:00,  3.51it/s]

Epoch 46 | Training Loss: 0.1423





Epoch 46 | Validation Accuracy: 88.31%


Epoch 47: 100%|██████████| 20/20 [00:05<00:00,  3.48it/s]

Epoch 47 | Training Loss: 0.1151





Epoch 47 | Validation Accuracy: 88.23%


Epoch 48: 100%|██████████| 20/20 [00:05<00:00,  3.47it/s]

Epoch 48 | Training Loss: 0.1074





Epoch 48 | Validation Accuracy: 87.58%


Epoch 49: 100%|██████████| 20/20 [00:05<00:00,  3.39it/s]

Epoch 49 | Training Loss: 0.1045





Epoch 49 | Validation Accuracy: 87.58%


Epoch 50: 100%|██████████| 20/20 [00:05<00:00,  3.48it/s]

Epoch 50 | Training Loss: 0.0966





Epoch 50 | Validation Accuracy: 87.77%
