In [1]:
import os
import torch
import pandas as pd
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from dotenv import load_dotenv

import mlflow
import mlflow.pytorch
from torch.utils.tensorboard import SummaryWriter
from transformers import AutoModel, AutoTokenizer

In [None]:
# os.cpu_count() returns None when the core count cannot be determined.
# Fall back to 0 so `num_workers` is always an int; for a DataLoader, 0 means
# "load batches in the main process".
num_workers = os.cpu_count() or 0
print(f"Number of workers: {num_workers}")

In [3]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
# To force CPU execution (e.g. for debugging), set device = torch.device("cpu").
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [4]:
def print_gpu_memory():
    """Print a table of total, allocated and remaining memory (GB) per GPU.

    "Allocated" is the value reported by ``torch.cuda.memory_allocated`` and
    "Available" is simply total minus allocated. Prints a single
    "No GPU available." line when CUDA is not usable. Returns ``None``.
    """
    if not torch.cuda.is_available():
        print("No GPU available.")
        return

    row_template = "{:<10} {:<15} {:<15} {:<15}"
    bytes_per_gb = 1e9

    print(row_template.format("GPU", "Total (GB)", "Allocated (GB)", "Available (GB)"))
    print("-" * 60)

    for gpu_index in range(torch.cuda.device_count()):
        total_gb = (
            torch.cuda.get_device_properties(gpu_index).total_memory / bytes_per_gb
        )
        used_gb = torch.cuda.memory_allocated(gpu_index) / bytes_per_gb
        free_gb = total_gb - used_gb
        print(
            row_template.format(
                f"GPU_{gpu_index}",
                f"{total_gb:.2f}",
                f"{used_gb:.2f}",
                f"{free_gb:.2f}",
            )
        )
        # Blank separator line after every device row.
        print()

In [None]:
# Show the per-GPU memory table (or "No GPU available.") before anything is loaded.
print_gpu_memory()

#### Load dataset

In [6]:
# Read both IMDB parquet files; they are pooled here and re-split further down.
df1 = pd.read_parquet("../../data/imdb/train-00000-of-00001.parquet")
df2 = pd.read_parquet("../../data/imdb/test-00000-of-00001.parquet")

# Stack the two frames and renumber the rows 0..n-1.
full_df = pd.concat([df1, df2], ignore_index=True)

In [None]:
# Extract the review texts and their labels from the pooled frame as NumPy arrays.
X, y = full_df["text"].to_numpy(), full_df["label"].to_numpy()
X.shape, y.shape

In [None]:
# Keep a stratified 5% subsample of the pooled data for fast experiments.
# The subsample is drawn from `full_df` rather than from the previously bound
# X/y, so re-running this cell always yields the same arrays instead of
# shrinking the data again on every execution. The discarded 95% is bound to
# explicit throwaway names so IPython's `_` (last cell output) is not clobbered.
_all_texts = full_df["text"].to_numpy()
_all_labels = full_df["label"].to_numpy()
X, _X_discarded, y, _y_discarded = train_test_split(
    _all_texts, _all_labels, stratify=_all_labels, test_size=0.95, random_state=1
)
del _X_discarded, _y_discarded

# 80/20 stratified split of the subsample into train+val and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=1
)

# 80/20 stratified split of the remainder into train and validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, stratify=y_train, test_size=0.2, random_state=1
)

X_train.shape, X_val.shape, X_test.shape

#### Define Model Path

In [9]:
# Model identifier handed to AutoTokenizer.from_pretrained and
# AutoModel.from_pretrained below, so tokenizer and encoder always match.
MODEL_PATH = "prajjwal1/bert-tiny"

#### Prepare dataset and dataloader

In [10]:
class IMDBDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        X: Indexable, sized collection of raw texts.
        y: Indexable, sized collection of integer labels aligned with ``X``.
        tokenizer: Optional pre-loaded tokenizer. When omitted, one is loaded
            with ``AutoTokenizer.from_pretrained(MODEL_PATH)``. Passing a
            shared instance avoids reloading it for every split.
        max_length: Length passed to the tokenizer for padding and truncation
            (default 512).

    Raises:
        ValueError: If ``X`` and ``y`` have different lengths.
    """

    def __init__(self, X, y, tokenizer=None, max_length=512):
        # Fail early: a mismatch would otherwise surface later as an
        # IndexError, or silently ignore trailing texts.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y
        self.max_length = max_length
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
        self.tokenizer = tokenizer

    def __getitem__(self, index):
        """Return the tokenized text and label at ``index`` as a dict of tensors."""
        text = self.X[index]
        label = self.y[index]

        inputs = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # The tokenizer is asked for "pt" tensors with a leading batch axis of
        # size 1; squeeze it so the DataLoader can stack items itself.
        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),
        }

    def __len__(self):
        """Number of samples, taken from the label collection."""
        return len(self.y)

In [None]:
# Wrap each split (train, validation, test) in its own IMDBDataset.
train_ds, val_ds, test_ds = (
    IMDBDataset(texts, labels)
    for texts, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

In [12]:
# Overrides the CPU-count value printed earlier: with 0 workers the DataLoader
# fetches batches in the main process.
num_workers = 0

# Only the training loader shuffles; validation and test keep dataset order.
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True, num_workers=num_workers)
val_dl = DataLoader(val_ds, batch_size=32, shuffle=False, num_workers=num_workers)
test_dl = DataLoader(test_ds, batch_size=32, shuffle=False, num_workers=num_workers)

#### Define the BERT model

In [13]:
class BERTSentimentClassifier(nn.Module):
    """Pretrained encoder followed by dropout and a linear classification head.

    Args:
        num_classes: Number of output features of the final linear layer.
    """

    def __init__(self, num_classes):
        super().__init__()
        # Encoder weights come from MODEL_PATH; the head is sized from its config.
        self.bert = AutoModel.from_pretrained(MODEL_PATH)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the linear head's output for the encoder's ``pooler_output``."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.drop(encoded.pooler_output)
        logits = self.out(pooled)
        return logits

#### Instantiate Model and Define Loss & Optimizer

In [None]:
# Seed PyTorch's global RNG before anything random happens. The new linear
# head's initialisation, the dropout masks and the shuffling of the training
# DataLoader (created without its own generator) all draw from it, so a fixed
# seed makes Restart & Run All repeatable as far as the backend allows.
# The value matches the random_state used for the data splits.
torch.manual_seed(1)

model = BERTSentimentClassifier(num_classes=2).to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

#### Start training

In [15]:
def train_epoch(model, dataloader, loss_fn, optimizer, device):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        dataloader: Iterable of dict batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors. It does not need to
            implement ``__len__``.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        optimizer: Optimizer over the model's parameters.
        device: Device the batch tensors are moved to.

    Returns:
        float: Sum of per-batch losses divided by the number of batches seen.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.train()
    running_loss = 0.0
    # Count batches while iterating instead of calling len(dataloader), so
    # loaders without a length work and the divisor matches what was
    # actually processed.
    num_batches = 0
    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, labels)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        raise ValueError("train_epoch received an empty dataloader")

    return running_loss / num_batches

In [16]:
def evaluate(model, dataloader, loss_fn, device):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``;
            it is switched to eval mode and left that way.
        dataloader: Iterable of dict batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors. It does not need to
            implement ``__len__``.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        device: Device the batch tensors are moved to.

    Returns:
        tuple[float, float]: ``(mean_batch_loss, accuracy)``. The loss is the
        sum of per-batch losses divided by the number of batches. Accuracy is
        the number of correct argmax predictions divided by the number of samples.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.eval()
    running_loss = 0.0
    num_batches = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            running_loss += loss_fn(outputs, labels).item()
            num_batches += 1

            # Predicted class is the index of the largest output per row.
            predicted = outputs.argmax(dim=1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    # Guard both divisions: no batches means no samples either.
    if total == 0:
        raise ValueError("evaluate received a dataloader with no samples")

    accuracy = correct / total
    return running_loss / num_batches, accuracy

In [None]:
n_epochs = 100
verbose = 2  # print metrics every `verbose` epochs; a falsy value disables printing
train_losses, val_losses = [], []

for epoch in tqdm(range(n_epochs)):
    # One optimisation pass over the training data, then a validation pass.
    train_loss = train_epoch(model, train_dl, loss_fn, optimizer, device)
    val_loss, val_accuracy = evaluate(model, val_dl, loss_fn, device)

    # Keep the per-epoch loss history.
    train_losses.append(train_loss)
    val_losses.append(val_loss)

    # Skip reporting unless this is a multiple of `verbose` (1-based epoch count).
    if not verbose or (epoch + 1) % verbose != 0:
        continue

    print(
        f"Epoch {epoch + 1}/{n_epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_accuracy:.4f}"
    )

print("Training complete.")