In [8]:
!pip install torch




[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
%%time 

import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import warnings

# Set random seeds for reproducibility
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True

warnings.filterwarnings('ignore')

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

CPU times: total: 0 ns
Wall time: 4.98 ms


## DATABASE

In [10]:
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from transformers import BertTokenizer
from torch.utils.data import DataLoader, Dataset

# Load and prepare data
class MovieReviewDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_len = max_len
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        review = self.data.iloc[index]
        inputs = self.tokenizer.encode_plus(
            review['Content'],
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        label = 1 if review['Sentiment'] == 'Positive' else 0
        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Example data loading and preparation
data = []
folder_path = 'movies/docs'  # Adjust the path to your dataset folder

for filename in os.listdir(folder_path):
    if filename.endswith('.txt'):
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            content = file.read().replace('\n', ' ')
        sentiment = 'Negative' if filename.startswith('negR') else 'Positive'
        id = f"N{filename[5:8]}" if sentiment == 'Negative' else f"P{filename[5:8]}"
        data.append({'Content': content, 'Sentiment': sentiment, 'id': id})

# Convert to DataFrame
df = pd.DataFrame(data)

# Sort the DataFrame by 'id'
df = df.sort_values(by='id').reset_index(drop=True)

# Separate the dataset
train_neg = df[df['Sentiment'] == 'Negative'][:800]
val_neg = df[df['Sentiment'] == 'Negative'][800:900]
test_neg = df[df['Sentiment'] == 'Negative'][900:1000]

train_pos = df[df['Sentiment'] == 'Positive'][:800]
val_pos = df[df['Sentiment'] == 'Positive'][800:900]
test_pos = df[df['Sentiment'] == 'Positive'][900:1000]

# Concatenate the splits
train_df = pd.concat([train_neg, train_pos]).sample(frac=1).reset_index(drop=True)
val_df = pd.concat([val_neg, val_pos]).sample(frac=1).reset_index(drop=True)
test_df = pd.concat([test_neg, test_pos]).sample(frac=1).reset_index(drop=True)

# Create datasets and dataloaders
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = MovieReviewDataset(train_df.sample(frac=1, random_state=200), tokenizer, max_len=128)
val_dataset = MovieReviewDataset(val_df, tokenizer, max_len=128)
test_dataset = MovieReviewDataset(test_df, tokenizer, max_len=128)


batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [11]:
train_df.to_csv('movies_reviews_train.csv', index=False)
val_df.to_csv('movies_reviews_val.csv', index=False)
test_df.to_csv('movies_reviews_test.csv', index=False)


## MODEL

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout):
        super(CNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([nn.Conv2d(1, n_filters, (fs, embedding_dim)) for fs in filter_sizes])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embedded = self.embedding(text)  # [batch size, sent len, emb dim]
        embedded = embedded.unsqueeze(1)  # [batch size, 1, sent len, emb dim]
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)

# Create the CNN instance
INPUT_DIM = len(tokenizer.get_vocab())  # Use tokenizer vocabulary size
EMBEDDING_DIM = 512
N_FILTERS = 100
FILTER_SIZES = [2, 3, 4]
OUTPUT_DIM = 2  # Two output classes for binary classification
DROPOUT = 0.5

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)
model = model.to(device)

import torch.optim as optim

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    
    for batch in iterator:
        texts = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        predictions = model(texts)
        
        loss = criterion(predictions, labels)
        acc = binary_accuracy(predictions, labels)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            texts = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            predictions = model(texts)
            loss = criterion(predictions, labels)
            acc = binary_accuracy(predictions, labels)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def binary_accuracy(preds, y):
    max_preds = preds.argmax(dim=1, keepdim=True)  # Get the index of the max probability
    correct = max_preds.squeeze(1).eq(y)
    acc = correct.sum().float() / float(y.size(0))
    return acc

NUM_EPOCHS = 20

for epoch in range(NUM_EPOCHS):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, val_loader, criterion)
    
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

torch.save(model.state_dict(), 'model.pth')

Epoch: 01
	Train Loss: 0.872 | Train Acc: 51.68%
	 Val. Loss: 0.786 |  Val. Acc: 50.09%
Epoch: 02
	Train Loss: 0.717 | Train Acc: 62.62%
	 Val. Loss: 0.650 |  Val. Acc: 59.77%
Epoch: 03
	Train Loss: 0.548 | Train Acc: 73.02%
	 Val. Loss: 0.642 |  Val. Acc: 64.58%
Epoch: 04
	Train Loss: 0.459 | Train Acc: 78.97%
	 Val. Loss: 0.630 |  Val. Acc: 64.58%
Epoch: 05
	Train Loss: 0.386 | Train Acc: 82.93%
	 Val. Loss: 0.617 |  Val. Acc: 68.19%
Epoch: 06
	Train Loss: 0.344 | Train Acc: 85.82%
	 Val. Loss: 0.646 |  Val. Acc: 61.68%
Epoch: 07
	Train Loss: 0.279 | Train Acc: 89.42%
	 Val. Loss: 0.621 |  Val. Acc: 66.75%
Epoch: 08
	Train Loss: 0.239 | Train Acc: 91.53%
	 Val. Loss: 0.619 |  Val. Acc: 64.58%
Epoch: 09
	Train Loss: 0.194 | Train Acc: 93.51%
	 Val. Loss: 0.618 |  Val. Acc: 67.62%
Epoch: 10
	Train Loss: 0.162 | Train Acc: 94.89%
	 Val. Loss: 0.608 |  Val. Acc: 66.93%
Epoch: 11
	Train Loss: 0.133 | Train Acc: 95.97%
	 Val. Loss: 0.607 |  Val. Acc: 69.79%
Epoch: 12
	Train Loss: 0.118 | T