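This notebook fine-tunes a DeBERTa-v3-small model for multi-label classification of harm-related text (`self_harm`, `harming_others`, `harmed_by_others`, `reference_to_harm`). To reproduce the run, point the `PATHS` class at your own Google Drive locations for the training file and the saved model.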

In [None]:
# Attach Google Drive to the Colab runtime; the paths in PATHS live under this mount.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
!pip install datasets

# Setup

In [None]:
import os
# Expose GPUs 0 and 1 to this process. This is set before `torch` is imported below
# so that it is already in place when CUDA is initialised.
os.environ['CUDA_VISIBLE_DEVICES']='0,1'
import warnings
import numpy as np, pandas as pd
import random
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from datasets import Dataset
import matplotlib.pyplot as plt
import seaborn as sns
# Suppress all Python warnings for the rest of the session.
warnings.simplefilter('ignore')

In [None]:
class PATHS:
    """Locations on the mounted Drive that this notebook reads from and writes to."""
    # Root folder; the two paths below are built from it.
    save = '/content/drive/MyDrive'
    # Tab-separated training table loaded with pandas.
    final_train = save + '/harm_train.tsv'
    # Folder the fine-tuned model is written to at the end of the notebook.
    save_model = save + '/harm-deberta-v3-small'

In [None]:
class CFG:
    """Model checkpoint, training hyperparameters and label columns for this run."""
    # Checkpoint used for the tokenizer, the config and the model weights.
    model_name = 'microsoft/deberta-v3-small'
    # Truncation limit handed to the tokenizer.
    max_length = 512

    # Values forwarded to TrainingArguments.
    train_epochs = 4
    train_batch_size = 32  # used as the per-device batch size
    lr = 1e-5
    weight_decay = 0.01
    warmup_ratio = 0.0

    # Target columns; their order fixes the column order of the label matrix.
    labels = [
        'self_harm',
        'harming_others',
        'harmed_by_others',
        'reference_to_harm',
    ]

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for deterministic
    behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Changing PYTHONHASHSEED at runtime does not re-seed the hashing of the
    # already running interpreter; it is picked up by processes started later.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (a no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different kernels from run to run, which
    # defeats the deterministic setting above, so it must be disabled.
    torch.backends.cudnn.benchmark = False


seed_everything(42)

# Tokenize Train

Now we tokenize the full training set; the model is then fine-tuned on it in the Training section below.

In [None]:
# Load the tab-separated training table; `id` is read with the pandas string dtype
# instead of being type-inferred.
train = pd.read_csv(
    PATHS.final_train,
    sep='\t',
    dtype={'id': 'string'},
)

In [None]:
# Load the tokenizer belonging to the checkpoint named in CFG.model_name.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)

In [None]:
def tokenize_function(batch):
    """Tokenize a batch of texts and attach the multi-label target matrix.

    Args:
        batch: Mapping from column name to a list of values. It must contain
            'text' and one column per entry of CFG.labels.

    Returns:
        The tokenizer output with an extra 'labels' entry: a float32 array of
        shape (number of texts, number of labels) whose columns follow the
        order of CFG.labels.
    """
    # Sequences are truncated but deliberately NOT padded here. Padding inside
    # a batched `map` would pad every example to the longest text in its map
    # batch and store those pad tokens in the dataset. The Trainer is given the
    # tokenizer and pads each training batch to its own longest sequence.
    tokenized_inputs = tokenizer(
        batch['text'],
        truncation=True,
        max_length=CFG.max_length,
    )
    # One column per label, as floats for the multi-label classification loss.
    tokenized_inputs['labels'] = np.column_stack(
        [np.asarray(batch[label], dtype=np.float32) for label in CFG.labels]
    )
    return tokenized_inputs

In [None]:
# Pull the columns needed for training out of the DataFrame as plain Python lists,
# keyed by column name.
train_dict = {
    name: train[name].tolist()
    for name in ['id', 'text'] + CFG.labels
}

In [None]:
# Wrap the column lists in a Hugging Face Dataset.
train_ds = Dataset.from_dict(train_dict)
# Apply tokenize_function to the dataset in batches (batched=True hands it lists of values).
tokenized_train_ds = train_ds.map(tokenize_function, batched=True)
# Have the dataset return torch tensors when indexed.
tokenized_train_ds.set_format('torch')

Map:   0%|          | 0/132485 [00:00<?, ? examples/s]

In [None]:
# Lookups between class index and label name, following the order of CFG.labels.
id2label = dict(enumerate(CFG.labels))
label2id = {name: idx for idx, name in id2label.items()}

# Training

In [None]:
# Load the checkpoint's configuration with the multi-label head settings applied as overrides.
config = AutoConfig.from_pretrained(
    CFG.model_name,
    num_labels=len(CFG.labels),
    problem_type='multi_label_classification',
    id2label=id2label,
    label2id=label2id,
)
# Build the sequence-classification model from the checkpoint using that configuration.
model = AutoModelForSequenceClassification.from_pretrained(CFG.model_name, config=config)
# Resize the token embeddings to the tokenizer's length; the return value is the cell output.
model.resize_token_embeddings(len(tokenizer))

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(128001, 768, padding_idx=0)

In [None]:
training_args = TrainingArguments(
    # Where checkpoints are written, and how many are kept.
    output_dir='output',
    save_strategy='epoch',
    save_total_limit=1,
    # Optimiser and learning-rate schedule.
    optim='adamw_torch',
    learning_rate=CFG.lr,
    weight_decay=CFG.weight_decay,
    lr_scheduler_type='linear',
    warmup_ratio=CFG.warmup_ratio,
    # Run length and batch size.
    num_train_epochs=CFG.train_epochs,
    per_device_train_batch_size=CFG.train_batch_size,
    # Mixed-precision training.
    fp16=True,
    # Logging: include the very first step, and send nothing to external trackers.
    logging_first_step=True,
    report_to='none',
)

# No evaluation set is passed; the tokenizer is handed over together with the model.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_train_ds,
)

In [None]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell output.
trainer.train()

Step,Training Loss
1,0.6637
500,0.1471
1000,0.0643
1500,0.0521
2000,0.0453
2500,0.043
3000,0.0396
3500,0.0373
4000,0.036
4500,0.0339


TrainOutput(global_step=16564, training_loss=0.032952358629990824, metrics={'train_runtime': 16939.9275, 'train_samples_per_second': 31.283, 'train_steps_per_second': 0.978, 'total_flos': 6.983164548956825e+16, 'train_loss': 0.032952358629990824, 'epoch': 4.0})

In [None]:
# Write the fine-tuned model to the Drive folder configured in PATHS.save_model.
trainer.save_model(PATHS.save_model)