In [1]:
# Reload imported modules before each cell executes, so edits to the local
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

Dependencies

In [2]:
import pathlib as pb

# Environment: every location is derived from the directory above the notebook's
# working directory.
ROOT_PATH = pb.Path('..')

# Input data
DATA_DIR_PATH = ROOT_PATH.joinpath('data')
TRAIN_DATA_FILE = DATA_DIR_PATH.joinpath('train_data.csv')
TEST_DATA_FILE = DATA_DIR_PATH.joinpath('test_data.csv')

# Local caches (one sub-folder per library)
CACHE_DIR_PATH = ROOT_PATH.joinpath('.cache')
TRANSFORMERS_CACHE_DIR_PATH = CACHE_DIR_PATH.joinpath('transformers')
DATASETS_CACHE_DIR_PATH = CACHE_DIR_PATH.joinpath('datasets')

# Generated submission files
SUBMISSIONS_DIR_PATH = ROOT_PATH.joinpath('submissions')

In [3]:
import os

# Configure the libraries through environment variables in a single step:
# cache folders for transformers / datasets, the cuBLAS workspace setting and
# the tokenizers parallelism switch.
os.environ.update({
    'TRANSFORMERS_CACHE': str(TRANSFORMERS_CACHE_DIR_PATH),
    'HF_DATASETS_CACHE': str(DATASETS_CACHE_DIR_PATH),
    'CUBLAS_WORKSPACE_CONFIG': ':4096:8',
    'TOKENIZERS_PARALLELISM': 'true',
})

In [4]:
from transformers import AutoTokenizer, AutoModel
import torch
import torchdata
import torchtext
from torch import Tensor
from torch import nn
from torch import optim
from torch.utils.data import Subset, DataLoader, Dataset
from torch import backends
import typing
import pathlib as pb
import os
import gc
from typing import List, Tuple, Dict, Set, Callable, Any
import random
import numpy as np
import pandas as pd
import platform
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import balanced_accuracy_score, accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from utils import get_available_device, read_data, silence_warnings
from preprocess import BERTPreprocessor
from data import SexismDataset
from models import Args, Output, BertFlatClassModel
from train import evaluate

Environment Setup

In [6]:
# Use available GPU (the device is chosen by the project helper)
DEVICE: torch.device = get_available_device()

# Deterministic experiments: seed each global RNG used by the notebook.
# The former `np.random.RandomState(SEED)` call was dropped: it only built a
# separate generator object that was immediately discarded, seeding nothing.
SEED = 61
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Ask PyTorch to use deterministic implementations where it has them
torch.use_deterministic_algorithms(True)

# Adjust package settings
silence_warnings()

Dataset & Preprocessing

In [7]:
# Read the raw train / test data from the data directory
train_data_raw, test_data_raw = read_data(DATA_DIR_PATH)

# One shared pretrained preprocessor instance for both splits
preprocessor = BERTPreprocessor()

# Wrap each raw split in a SexismDataset (train first, then test)
train_dataset, test_dataset = [
    SexismDataset(raw_split, preprocessor)
    for raw_split in (train_data_raw, test_data_raw)
]

Model Setup

In [None]:
# Training configuration, kept apart from the training loop
args = Args()


def model_factory() -> nn.Module:
    """Build a new compiled `BertFlatClassModel(unfreeze='none')` and move it to DEVICE."""
    return torch.compile(BertFlatClassModel(unfreeze='none')).to(DEVICE)


def optim_factory(params) -> optim.Optimizer:
    """Create an AdamW optimizer over `params` with the learning rate and weight decay from `args`."""
    return optim.AdamW(params, lr=args.learning_rate, weight_decay=args.weight_decay)


# Cross-entropy loss weighted with the training dataset's `weights` tensor
loss_fn = nn.CrossEntropyLoss(weight=train_dataset.weights.to(DEVICE))

Hyperparameter Tuning using K-Fold Cross-Validation

In [24]:
# Inspired by: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
# Stratified 5-fold cross-validation: every fold trains a freshly built model on
# the training split and evaluates it on the held-out split after each epoch.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)


for i, (train_idx, valid_idx) in enumerate(kf.split(train_dataset, train_dataset.classes)):
    print('K-FOLD: {}'.format(i))

    # Reinitialize the model and optimizer so nothing carries over between folds
    model: nn.Module = model_factory()
    optimizer: optim.Optimizer = optim_factory(model.parameters())

    # Split the data
    train_subset = Subset(train_dataset, train_idx)
    valid_subset = Subset(train_dataset, valid_idx)

    # Create the dataloaders.
    # The validation loader must wrap the held-out subset; it previously wrapped
    # `train_subset`, so the "validation" metrics were measured on training data.
    # NOTE(review): the training loader does not shuffle, so batches are visited
    # in the same order every epoch - consider `shuffle=True`.
    train_loader = DataLoader(train_subset, batch_size=args.batch_size)
    valid_loader = DataLoader(valid_subset, batch_size=args.batch_size)

    # ---  Training  ---
    for epoch in range(args.num_epochs):
        model.train()
        epoch_loss: List[float] = []
        epoch_accy: List[float] = []

        for batch in train_loader:
            # Send batch to GPU
            batch = {k: v.to(DEVICE) for k, v in batch.items()}

            # Make predictions (call the module itself rather than `.forward`)
            optimizer.zero_grad()
            y_true: Tensor = batch['label']
            y_pred: Tensor = model(batch)

            # Compute the loss and update the weights
            loss: Tensor = loss_fn(y_pred, y_true)
            loss.backward()
            optimizer.step()

            # Compute the batch accuracy on CPU copies, keeping the tensors
            # and the numpy arrays under separate names
            labels_np: np.ndarray = y_true.detach().cpu().numpy()
            preds_np: np.ndarray = y_pred.detach().argmax(dim=1).cpu().numpy()
            epoch_accy.append(balanced_accuracy_score(labels_np, preds_np))

            # Track progress as a plain Python float
            epoch_loss.append(loss.item())

        # Epoch metrics are the unweighted mean of the per-batch values
        mean_loss: float = float(np.mean(epoch_loss))
        mean_accy: float = float(np.mean(epoch_accy))
        print('Train Epoch {} - Loss: {}, Accuracy: {}'.format(epoch, mean_loss, mean_accy))

        # --- Validation ---
        valid_output: Output = evaluate(
            model=model,
            loss_fn=loss_fn,
            data_loader=valid_loader,
            with_labels=True,
            device=DEVICE
        )
        print('Validation - Loss: {}, Accuracy: {}'.format(valid_output.loss_mean, valid_output.accy_mean))

AttributeError: 'SexismDataset' object has no attribute 'dataset_pro'

Training on the Whole Dataset

Prediction

In [None]:
# Batch the test dataset in its original order and run it through `evaluate`
# without labels.
# NOTE(review): as far as this notebook shows, `model` is the instance left over
# from the last cross-validation fold (the "Training on the Whole Dataset"
# section contains no code) - confirm this is the model meant for submission.
test_loader: DataLoader = DataLoader(
    dataset=test_dataset,
    batch_size=args.batch_size,
    shuffle=False,
)
test_output: Output = evaluate(model=model, loss_fn=loss_fn, data_loader=test_loader, with_labels=False, device=DEVICE)

In [None]:
# Build the submission table in one chain: a `Label` column taken from
# `test_output.predictions_as_text`, plus the row position exposed as `Id`.
output: pd.DataFrame = (
    pd.DataFrame({ 'Label': pd.Series(data=test_output.predictions_as_text) })
    .reset_index()
    .rename(columns={ 'index': 'Id' })
)

# `to_csv` does not create missing folders, so make sure the target exists
SUBMISSIONS_DIR_PATH.mkdir(parents=True, exist_ok=True)
output.to_csv(SUBMISSIONS_DIR_PATH / 'submission_8.csv', index=False)

# Histogram of the predicted labels
output['Label'].hist()

In [None]:
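# Free memory (intentionally disabled): uncomment the lines below to drop the model and release cached GPU memory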
# del model
# gc.collect()
# torch.cuda.empty_cache()