In [6]:
%load_ext autoreload
%autoreload 2

Dependencies

In [7]:
import pathlib as pb

# Environment
ROOT_PATH = pb.Path('..')
DATA_DIR_PATH = ROOT_PATH / 'data'
CACHE_DIR_PATH = ROOT_PATH / '.cache'
TRANSFORMERS_CACHE_DIR_PATH = CACHE_DIR_PATH / 'transformers'
DATASETS_CACHE_DIR_PATH = CACHE_DIR_PATH / 'datasets'
TEST_DATA_FILE = DATA_DIR_PATH / 'test_data.csv'
TRAIN_DATA_FILE = DATA_DIR_PATH / 'train_data.csv'

# Model Repositories
XLM_ROBERTA_MODEL_REPO = 'xlm-roberta-base'

In [8]:

import os

os.environ['TRANSFORMERS_CACHE'] = str(TRANSFORMERS_CACHE_DIR_PATH)
os.environ['HF_DATASETS_CACHE'] = str(DATASETS_CACHE_DIR_PATH)
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

In [23]:
from transformers import AutoTokenizer, AutoModel
import torch
import torchdata
import torchtext
from torch import Tensor
from torch import nn
from torch import optim
from torch.utils.data import Subset, DataLoader, Dataset
from torch import backends
import typing
import pathlib as pb
import os
import gc
from typing import List, Tuple, Dict, Set
import random
import numpy as np
import pandas as pd
import platform
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import KFold
from sklearn.metrics import balanced_accuracy_score

In [30]:
from utils import get_available_device, read_data
from preprocess import BertPreprocessor, RobertaPreprocessor
from data import SexismDataset
from models import BertFlatClassModel
from transformers import XLMRobertaForSequenceClassification, TrainingArguments, Trainer

Environment Setup

In [11]:
# Use available GPU
DEVICE: torch.device = get_available_device()
print(DEVICE)
# Deterministic experiments
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
np.random.RandomState(SEED)
torch.manual_seed(SEED)
torch.use_deterministic_algorithms(True)

mps


Dataset & Preprocessing

In [12]:
train_data_raw, test_data_raw = read_data(DATA_DIR_PATH)

In [13]:
train_data_raw.head()

Unnamed: 0,text,label
0,@CorinaTomescu05 Încă nu ...dar am trecut prin...,non-offensive
1,@emosaphicbitch sau rosu ca mine,non-offensive
2,@DanaMinodora Ce frumoasa ești.. Arăți foarte ...,non-offensive
3,Din fericire în extaz!Ai dus covoareleeee?? Pu...,offensive
4,cand aveam vreo 5 ani credeam ca romana e sing...,non-offensive


In [31]:
train_dataset = SexismDataset(train_data_raw, RobertaPreprocessor(train_data_raw, XLM_ROBERTA_MODEL_REPO))
test_dataset = SexismDataset(test_data_raw, RobertaPreprocessor(test_data_raw, XLM_ROBERTA_MODEL_REPO))

In [33]:
train_dataset[0]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'input_ids': tensor([     0,   1374,  50886,    885, 108340,  18271,   8194, 189460,    315,
            153,   

Model Training

In [15]:
# Training specifications
batch_size = 8
num_epochs = 10
learning_rate = 1e-5

In [16]:
# Instantiate language model
# Inspired from: https://luv-bansal.medium.com/fine-tuning-bert-for-text-classification-in-pytorch-503d97342db2
model = XLMRobertaForSequenceClassification.from_pretrained(XLM_ROBERTA_MODEL_REPO, num_labels=5)
loss_fn = nn.CrossEntropyLoss() # TODO: weighted loss
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_p

In [34]:
# Inspired from: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

for i, (train_idx, valid_idx) in enumerate(kf.split(train_dataset)):
    # Split the data
    train_subset = Subset(train_dataset, train_idx)
    valid_subset = Subset(train_dataset, valid_idx)

    # Create the dataloaders
    train_loader = DataLoader(
        train_subset,
        batch_size,
        pin_memory=True,
        # num_workers=num_workers
    )
    valid_loader = DataLoader(
        train_subset,
        batch_size,
        pin_memory=True,
        # num_workers=num_workers
    )

    # ---  Training  ---
    for epoch in range(num_epochs):
        epoch_loss = []
        epoch_accy = []

        for batch, X in train_loader:
            # Send batch to GPU
            y_true = X['label'].to(DEVICE)
            input_ids = X['input_ids'].to(DEVICE)
            attention_mask = X['attention_mask'].to(DEVICE)

            # Make predictions
            y_pred = model.forward({
                'input_ids': input_ids,
                'attention_mask': attention_mask
            })

            # Compute the loss
            optimizer.zero_grad()
            loss: Tensor = loss_fn(y_pred, y_true)
            loss.backward()
            optimizer.step()

            # Compute the accuracy
            y_pred = torch.argmax(y_pred, dim=1).detach().cpu().numpy()
            epoch_accy.append(balanced_accuracy_score(y_true, y_pred))

            # Track progress
            epoch_loss.append(loss.detach().cpu().numpy())

        mean_loss = np.array(epoch_loss).mean()
        mean_accy = np.array(epoch_accy).mean()
        print('Epoch {} - Loss: {}, Accuracy: {}'.format(epoch, mean_loss, mean_accy))

    # --- Validation ---

ValueError: too many values to unpack (expected 2)

In [20]:
# Free memory
del model
gc.collect()
torch.cuda.empty_cache()