In [None]:
import sys
from pathlib import Path

project_root = Path().resolve().parents[0]
sys.path.append(str(project_root))

In [None]:
from collections import Counter
import torch
from torch.utils.data import Dataset


from src.data_loader import find_data_dir, load_data, clean_html
from src.dataset import IMDBDataset

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def build_vocab(texts, max_vocab_size=30000):
    counter = Counter()
    for text in texts:
        counter.update(text.split())
    vocab = {"<PAD>": 0, "<UNK>": 1}
    for word, _ in counter.most_common(max_vocab_size - 2):
        vocab[word] = len(vocab)
    return vocab


In [5]:
X_train, X_test, y_train, y_test = load_data()

In [6]:
vocab = build_vocab(X_train)
vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 30000


In [7]:
from torch.utils.data import DataLoader

train_ds = IMDBDataset(X_train, y_train, vocab)
test_ds   = IMDBDataset(X_test, y_test, vocab)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
test_loader   = DataLoader(test_ds, batch_size=64)

In [8]:
import torch.nn as nn
import torch.nn.functional as F

class TextCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.conv = nn.Conv1d(embed_dim, 128, kernel_size=5, padding=2)
        self.fc = nn.Linear(128, 1)

    def forward(self, x):
        x = self.embedding(x)          # (B, L, E)
        x = x.permute(0, 2, 1)         # (B, E, L)
        x = F.relu(self.conv(x))       # (B, C, L')
        x = F.max_pool1d(x, x.shape[2]).squeeze(2)
        return torch.sigmoid(self.fc(x)).squeeze(1)


In [9]:
def train_model(model, train_loader, test_loader, epochs=5):
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.BCELoss()

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for x, y in train_loader:
            x = x.to(device)
            y = y.to(device)

            optimizer.zero_grad()
            preds = model(x)
            loss = criterion(preds, y)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}, Train Loss: {total_loss:.4f}")


In [10]:
from sklearn.metrics import accuracy_score

def evaluate(model, loader):
    model.eval()
    preds, labels = [], []

    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            out = model(x)
            preds.extend((out > 0.5).int().cpu().tolist())
            labels.extend(y.int().cpu().tolist())

    return accuracy_score(labels, preds)



In [11]:
cnn = TextCNN(vocab_size)
cnn.to(device)
train_model(cnn, train_loader, test_loader)


Epoch 1, Train Loss: 315.5629
Epoch 2, Train Loss: 191.2802
Epoch 3, Train Loss: 106.8197
Epoch 4, Train Loss: 45.4632
Epoch 5, Train Loss: 15.0414


In [12]:
evaluate(cnn, test_loader)

0.8637

In [13]:
class TextLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            batch_first=True
        )
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = self.embedding(x)
        _, (h, _) = self.lstm(x)
        x = self.dropout(h[-1])
        return torch.sigmoid(self.fc(x)).squeeze(1)

In [14]:
lstm = TextLSTM(vocab_size)
lstm.to(device)
train_model(lstm, train_loader, test_loader, epochs=10)


Epoch 1, Train Loss: 433.1283
Epoch 2, Train Loss: 428.9181
Epoch 3, Train Loss: 420.9850
Epoch 4, Train Loss: 406.5806
Epoch 5, Train Loss: 354.6346
Epoch 6, Train Loss: 248.7169
Epoch 7, Train Loss: 182.4997
Epoch 8, Train Loss: 140.6117
Epoch 9, Train Loss: 113.2298
Epoch 10, Train Loss: 84.0772


In [15]:
evaluate(lstm, test_loader)

0.8483

In [16]:
class TextBiLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = self.embedding(x)
        _, (h, _) = self.lstm(x)
        x = self.dropout(h[-1])
        return torch.sigmoid(self.fc(x)).squeeze(1)

In [17]:
bilstm = TextBiLSTM(vocab_size)
bilstm.to(device)
train_model(bilstm, train_loader, test_loader, epochs=10)

Epoch 1, Train Loss: 407.8379
Epoch 2, Train Loss: 397.3550
Epoch 3, Train Loss: 337.7436
Epoch 4, Train Loss: 223.9385
Epoch 5, Train Loss: 165.3608
Epoch 6, Train Loss: 127.1533
Epoch 7, Train Loss: 93.5595
Epoch 8, Train Loss: 70.8617
Epoch 9, Train Loss: 53.0177
Epoch 10, Train Loss: 38.9279


In [18]:
evaluate(bilstm, test_loader)

0.8591