In [1]:
%load_ext autoreload
%autoreload 2

Dependencies

In [2]:
import pathlib as pb

# Environment
ROOT_PATH = pb.Path('..')
DATA_DIR_PATH = ROOT_PATH / 'data'
CACHE_DIR_PATH = ROOT_PATH / '.cache'
TRANSFORMERS_CACHE_DIR_PATH = CACHE_DIR_PATH / 'transformers'
DATASETS_CACHE_DIR_PATH = CACHE_DIR_PATH / 'datasets'
TEST_DATA_FILE = DATA_DIR_PATH / 'test_data.csv'
TRAIN_DATA_FILE = DATA_DIR_PATH / 'train_data.csv'
SUBMISSIONS_DIR_PATH = ROOT_PATH / 'submissions'

In [3]:
import os

os.environ['TRANSFORMERS_CACHE'] = str(TRANSFORMERS_CACHE_DIR_PATH)
os.environ['HF_DATASETS_CACHE'] = str(DATASETS_CACHE_DIR_PATH)
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
os.environ['TOKENIZERS_PARALLELISM'] = 'true'
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'

In [4]:
from transformers import AutoTokenizer, AutoModel
import torch
import torchdata
import torchtext
from torch import Tensor
from torch import nn
from torch import optim
from torch.utils.data import Subset, DataLoader, Dataset, WeightedRandomSampler
from torch import backends
import typing
import pathlib as pb
import os
import gc
from typing import List, Tuple, Dict, Set, Callable, Any
import random
import numpy as np
import pandas as pd
import platform
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import balanced_accuracy_score, accuracy_score
from balanced_loss import Loss

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from utils import get_available_device, read_data, silence_warnings
from preprocess import BERTPreprocessor, MT5Preprocessor, RobertPreprocessor
from data import SexismDataset
from models import Args, Output, Ensemble, RoBertFlatClassModel, BertFlatClassModel, MT5FlatClassModel
from train import evaluate, train

Environment Setup

In [6]:
# Use available GPU
DEVICE: torch.device = get_available_device()

# Deterministic experiments
SEED = 61
random.seed(SEED)
np.random.seed(SEED)
np.random.RandomState(SEED)
torch.manual_seed(SEED)
torch.use_deterministic_algorithms(True)

# Adjust package settings
silence_warnings()

Dataset & Preprocessing

In [7]:
# Load raw dataset
train_data_raw, test_data_raw = read_data(DATA_DIR_PATH)

# Initialize custom pretraiend preprocessor
preprocessor = BERTPreprocessor()

# Create and preprocess the datasets
train_dataset = SexismDataset(train_data_raw, preprocessor)
test_dataset = SexismDataset(test_data_raw, preprocessor)

# Build common interface
model_setup = [
    # {
    #     'name': 'bert',
    #     'model_factory': lambda: BertFlatClassModel().to(DEVICE),
    #     'optim_factory': lambda params: optim.AdamW(params, lr=Args().learning_rate, weight_decay=Args().weight_decay),
    #     'loss_fn': nn.CrossEntropyLoss(weight=train_dataset.weights.to(DEVICE)),
    #     'train_dataset': SexismDataset(train_data_raw, preprocessor),
    #     'test_dataset': SexismDataset(test_data_raw, preprocessor),
    # },
    # {
    #     'name': 'mt5',
    #     'model_factory': lambda: MT5FlatClassModel().to(DEVICE),
    #     'optim_factory': lambda params: optim.AdamW(params, lr=Args().learning_rate, weight_decay=Args().weight_decay),
    #     'loss_fn': nn.CrossEntropyLoss(weight=train_dataset.weights.to(DEVICE)),
    #     'train_dataset': SexismDataset(train_data_raw, MT5Preprocessor()),
    #     'test_dataset': SexismDataset(test_data_raw, MT5Preprocessor()),
    # },
    {
        'name': 'robert',
        'model_factory': lambda: RoBertFlatClassModel().to(DEVICE),
        'optim_factory': lambda params: optim.AdamW(params, lr=Args().learning_rate, weight_decay=Args().weight_decay),
        'loss_fn': nn.CrossEntropyLoss(weight=train_dataset.weights.to(DEVICE)),
        'train_dataset': SexismDataset(train_data_raw, RobertPreprocessor()),
        'test_dataset': SexismDataset(test_data_raw, RobertPreprocessor()),
    }
]

In [8]:
train_dataset.weights, train_dataset.class_to_freq

(tensor([0.9444, 0.9613, 0.9944, 0.2106, 0.8893]),
 {0: tensor(0.0005),
  1: tensor(0.0007),
  2: tensor(0.0045),
  3: tensor(3.2438e-05),
  4: tensor(0.0002)})

Model Setup

In [9]:
# Keep the training setup separately
args = Args()
model_factory: Callable[[], nn.Module] = lambda: MT5FlatClassModel().to(DEVICE)
optim_factory: Callable[..., optim.Optimizer] = lambda params: optim.AdamW(params, lr=args.learning_rate, weight_decay=args.weight_decay)
loss_fn = nn.CrossEntropyLoss(weight=train_dataset.weights.to(DEVICE))

HyperParameter Tuning using K-Fold Cross-Validation

In [10]:
# Inspired from: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
n_splits = 5
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
models: List[nn.Module] = []

for i, (train_idx, valid_idx) in enumerate(kf.split(train_dataset, train_dataset.classes)):
    # Map to dynamic model
    setup = model_setup[i % len(model_setup)]
    print('K-FOLD: {} - Model: {}'.format(i, setup['name']))

    # Reinitialize the model
    model: nn.Module = setup['model_factory']()
    optimizer: optim.Optimizer = setup['optim_factory'](model.parameters())

    # Split the data
    train_subset = Subset(setup['train_dataset'], train_idx[:256])
    valid_subset = Subset(setup['train_dataset'], valid_idx[:64])

    # Inverse frequency count for random sampling
    inv_freq = torch.tensor(list(train_dataset.class_to_freq.get(int(t['label'])) for t in train_subset))

    # Create WeightedRandomSampler
    wrs = WeightedRandomSampler(inv_freq, len(train_subset), replacement=True)

    # Create the dataloaders
    train_loader = DataLoader(train_subset, args.batch_size, sampler=wrs)
    valid_loader = DataLoader(train_subset, args.batch_size)

    # --- Validation ---
    def validation_pass() -> None:
        # --- Validation ---
        valid_output: Output = evaluate(
            model=model,
            loss_fn=loss_fn,
            data_loader=valid_loader,
            with_labels=True,
            class_to_label=train_dataset.class_to_label,
            device=DEVICE
        )
        print('Validation - Loss: {}, Accuracy: {}'.format(valid_output.loss_mean, valid_output.accy_mean))

    # --- Training ---
    train_output: Output = train(
        model=model,
        optimizer=optimizer,
        loss_fn=loss_fn,
        data_loader=train_loader,
        class_to_label=train_dataset.class_to_label,
        args=args,
        device=DEVICE,
        valid_callback=validation_pass,
    )
    print('Last Epoch - Loss: {}, Accuracy: {}'.format(train_output.loss_mean, train_output.accy_mean))
    models.append(model)

K-FOLD: 0 - Model: robert


Some weights of the model checkpoint at readerbench/RoBERT-base were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Train Epoch 0 - Loss: 1.365743637084961, Accuracy: 0.40052489177489176
Validation - Loss: 1.6318612098693848, Accuracy: 0.44107142857142856
Train Epoch 1 - Loss: 0.8493609428405762, Accuracy: 0.5149702380952381
Validation - Loss: 1.4898476600646973, Accuracy: 0.5861607142857144
Train Epoch 2 - Loss: 0.5181704759597778, Accuracy: 0.7152083333333332
Validation - Loss: 1.319649338722229, Accuracy: 0.6632440476190476
Last Epoch - Loss: 0.5181704759597778, Accuracy: 0.7152083333333332
K-FOLD: 1 - Model: robert


Some weights of the model checkpoint at readerbench/RoBERT-base were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Train Epoch 0 - Loss: 1.3033376932144165, Accuracy: 0.33756651334776333
Validation - Loss: 1.4897652864456177, Accuracy: 0.35277777777777775
Train Epoch 1 - Loss: 0.7163115739822388, Accuracy: 0.4375
Validation - Loss: 1.4073853492736816, Accuracy: 0.47604166666666664
Train Epoch 2 - Loss: 0.5639907717704773, Accuracy: 0.559375
Validation - Loss: 1.2712986469268799, Accuracy: 0.6364583333333333
Last Epoch - Loss: 0.5639907717704773, Accuracy: 0.559375
K-FOLD: 2 - Model: robert


Some weights of the model checkpoint at readerbench/RoBERT-base were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Train Epoch 0 - Loss: 1.1892975568771362, Accuracy: 0.40528273809523807
Validation - Loss: 1.4128891229629517, Accuracy: 0.39374999999999993
Train Epoch 1 - Loss: 0.7976636290550232, Accuracy: 0.5086309523809525
Validation - Loss: 1.3154172897338867, Accuracy: 0.6479166666666667
Train Epoch 2 - Loss: 0.5463498830795288, Accuracy: 0.7204166666666667
Validation - Loss: 1.1941900253295898, Accuracy: 0.7041666666666666
Last Epoch - Loss: 0.5463498830795288, Accuracy: 0.7204166666666667
K-FOLD: 3 - Model: robert


Some weights of the model checkpoint at readerbench/RoBERT-base were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Train Epoch 0 - Loss: 1.2648372650146484, Accuracy: 0.331073717948718
Validation - Loss: 1.5429790019989014, Accuracy: 0.3944940476190476
Train Epoch 1 - Loss: 0.8548659086227417, Accuracy: 0.4451388888888889
Validation - Loss: 1.455643653869629, Accuracy: 0.5020833333333333
Train Epoch 2 - Loss: 0.5925307273864746, Accuracy: 0.6214475108225108
Validation - Loss: 1.2834300994873047, Accuracy: 0.6020833333333333
Last Epoch - Loss: 0.5925307273864746, Accuracy: 0.6214475108225108
K-FOLD: 4 - Model: robert


Some weights of the model checkpoint at readerbench/RoBERT-base were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Train Epoch 0 - Loss: 1.3183951377868652, Accuracy: 0.3341612554112554
Validation - Loss: 1.511131763458252, Accuracy: 0.3291666666666666
Train Epoch 1 - Loss: 0.9492776393890381, Accuracy: 0.5072420634920635
Validation - Loss: 1.4610319137573242, Accuracy: 0.6315972222222221
Train Epoch 2 - Loss: 0.6164517402648926, Accuracy: 0.7032589285714286
Validation - Loss: 1.3270971775054932, Accuracy: 0.7041666666666666
Last Epoch - Loss: 0.6164517402648926, Accuracy: 0.7032589285714286


In [11]:
majority = Ensemble(models)

Training on the Whole Dataset

In [12]:
# Reinitialize the model
model: nn.Module = model_factory()
optimizer: optim.Optimizer = optim_factory(model.parameters())

# Create the dataloaders
train_loader = DataLoader(train_dataset, args.batch_size, shuffle=True)

# --- Training ---
train_output: Output = train(
    model=model,
    optimizer=optimizer,
    loss_fn=loss_fn,
    data_loader=train_loader,
    class_to_label=train_dataset.class_to_label,
    args=args,
    device=DEVICE,
)
print('Last Epoch - Loss: {}, Accuracy: {}'.format(train_output.loss_mean, train_output.accy_mean))

You are using a model of type t5 to instantiate a model of type mt5. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at dumitrescustefan/mt5-base-romanian were not used when initializing MT5EncoderModel: ['decoder.block.10.layer.1.layer_norm.weight', 'decoder.block.4.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.1.layer.0.SelfAttention.q.weight', 'decoder.block.10.layer.0.layer_norm.weight', 'decoder.block.4.layer.2.DenseReluDense.wo.weight', 'decoder.block.11.layer.0.layer_norm.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.5.layer.0.layer_norm.weight', 'decoder.block.10.layer.1.EncDecAttention.v.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.1.layer.0.SelfAttention.v.weight', 'decoder.block.8.layer.1.EncDecAttention.k.weight', 'decoder.block.1.layer.1.EncDecAttention.v.weight', 'decoder.block.10.layer.2.DenseReluDense.wi_1.weight', 'decoder.block.7.layer.2.layer_

Prediction

In [None]:
test_loader: DataLoader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False)
test_output: Output = evaluate(
    model=majority,
    loss_fn=loss_fn,
    data_loader=test_loader,
    with_labels=False,
    class_to_label=train_dataset.class_to_label,
    device=DEVICE
)

In [None]:
output: pd.DataFrame = pd.DataFrame({ 'Label': pd.Series(data=test_output.predictions_as_text) })
output = output.reset_index()
output = output.rename(columns={ 'index': 'Id' })
output.to_csv(SUBMISSIONS_DIR_PATH / 'submission_18.csv', index=False)
output['Label'].hist()

In [None]:
# Free memory
# del model
# gc.collect()
# torch.cuda.empty_cache()