In [1]:
# ------- NOTEBOOK CONFIGURATION -------

model_name = "roberta-base" # "roberta-base" or "xlm-roberta-base"
pre_finetuned = True
finetuning_language = "en" # "en" or "fr" or "de" or "es" or "it" # only "en" implemented so far
dataset_name = "maptask" # 'dihana' (Spanish), 'ilisten' (Italian), 'loria' (French), 'maptask' (English) or 'vm2' (German) 
last_layer_only = True

# ------ DO NOT CHANGE UNDER HERE ------

n_train = 1986 # number of training examples in the Ilisten dataset (minimum of all five datasets)
n_test  =  971 # number of     test examples in the Ilisten dataset (minimum of all five datasets)

if model_name=="roberta-base" and dataset_name != "maptask":
    print("Warning: RoBERTa is only trainable on the Maptask dataset.")

if pre_finetuned:
    model_path  = './results/' + model_name + '--open-subtitle-' + finetuning_language + "--last-3-layers"
    saving_path =  model_path + '--' + dataset_name + '--' + ('last-layer-only' if last_layer_only else 'full')
else :
    model_path = model_name
    saving_path = './results/' + model_name + '--' + dataset_name + '--' + ('last-layer-only' if last_layer_only else 'full')

print(f"Loading pre-finetuned on Open Subtitle model from {model_path}")
print(f"Writing to {saving_path}")

Loading pre-finetuned on Open Subtitle model from ./results/roberta-base--open-subtitle-en--last-3-layers
Writing to ./results/roberta-base--open-subtitle-en--last-3-layers--maptask--last-layer-only


In [37]:
! pip3 install torch transformers ipywidgets datasets accelerate evaluate wheel scikit-learn

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting scikit-learn
  Downloading scikit_learn-1.2.1-cp39-cp39-macosx_12_0_arm64.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting joblib>=1.1.1
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting scipy>=1.3.2
  Downloading scipy-1.10.1-cp39-cp39-macosx_12_0_arm64.whl (28.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m28.9/28.9 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting threadpoolctl>=2.0.

In [2]:
import numpy as np

In [3]:
# Based on roBERTa and using the transformers library
# and the https://huggingface.co/datasets/silicone dataset
# train a classifier of each "utterance" into the categories 0 to 3

# Ideally, the classification should depend on the dialog context
# using the Dialogue_ID and Idx variables
# respectively giving the dialogue identifier and the utterance order.

# Inspired by https://huggingface.co/docs/transformers/training

import datasets
dataset = datasets.load_dataset('miam', dataset_name)
n_intent_classes = len(dataset['train'].info.features['Label'].names)

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=n_intent_classes)

Found cached dataset miam (/Users/katossky/.cache/huggingface/datasets/miam/maptask/1.0.0/3cb25c5337f9e60db1dc6d90344763a6ef79d7a4ac3c5f215ce6e8afe99db26c)


  0%|          | 0/3 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [4]:
def tokenize_function(doc):
    return tokenizer(doc["Utterance"], padding="max_length", truncation=True)

In [5]:
columns_to_remove = np.setdiff1d(dataset['train'].column_names, ['Label'])
dataset_train = dataset['train']\
    .shuffle(seed=42).select(range(n_train))\
    .rename_column("Label", "label")\
    .map(tokenize_function, batched=True)\
    .remove_columns(columns_to_remove)

Loading cached shuffled indices for dataset at /Users/katossky/.cache/huggingface/datasets/miam/maptask/1.0.0/3cb25c5337f9e60db1dc6d90344763a6ef79d7a4ac3c5f215ce6e8afe99db26c/cache-476c0eb896c3a622.arrow
Loading cached processed dataset at /Users/katossky/.cache/huggingface/datasets/miam/maptask/1.0.0/3cb25c5337f9e60db1dc6d90344763a6ef79d7a4ac3c5f215ce6e8afe99db26c/cache-4a6e1277ce062944.arrow


In [6]:
columns_to_remove = np.setdiff1d(dataset['test'].column_names, ['Label'])
dataset_test = dataset['test']\
    .shuffle(seed=42).select(range(n_test))\
    .rename_column("Label", "label")\
    .map(tokenize_function, batched=True)\
    .remove_columns(columns_to_remove)

Loading cached shuffled indices for dataset at /Users/katossky/.cache/huggingface/datasets/miam/maptask/1.0.0/3cb25c5337f9e60db1dc6d90344763a6ef79d7a4ac3c5f215ce6e8afe99db26c/cache-22c3088f9085857b.arrow


Map:   0%|          | 0/971 [00:00<?, ? examples/s]

In [7]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [8]:
# freeze the model but the last layer
if last_layer_only:
    for param in model.roberta.parameters():
        param.requires_grad = False

In [91]:
# count the number of trainable parameters
# sum(p.numel() for p in model.parameters() if p.requires_grad)

# model.classifier
# the head consists in two linear layers
# the first has 768 features in and 768 out
# the second has 768 features in and 4 out
# this is a total of 768 * 768 + 768 + 768 * 4 + 4 = 593668

# (model.classifier.dense.in_features + 1) * model.classifier.dense.out_features +\
# model.classifier.dense.out_features+1) * n_intent_classes
# yeaaaaah! this matches the number of trainable parameters

In [9]:
# train the model

# seems like worth it to test metaparameters
# in particular, I get no logging, and that
# may be due to the batch seize not beeing
# a multiple of gradient_accumulation_steps

training_args = TrainingArguments(
    seed=42,                         # random seed for initialization
    output_dir=saving_path,          # output directory
    evaluation_strategy="steps",     # evaluation strategy to adopt during training
    eval_steps=100,                  # evaluation step
    logging_steps=100,               # log step
    optim="adamw_torch",             # optimizer
    learning_rate=1e-3,              # learning rate
    weight_decay=0.01,               # strength of weight decay
    num_train_epochs=3,              # total # of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=50,                 # number of warmup steps for learning rate scheduler
    save_strategy="epoch",           # strategy to adopt when saving checkpoints
    use_mps_device=True,             # use the new Apple M1 chip
)

In [10]:
trainer = Trainer(
    model = model,                         # the instantiated 🤗 Transformers model to be trained
    args = training_args,                  # training arguments, defined above
    train_dataset = dataset_train,         # training dataset
    eval_dataset  = dataset_test,          # evaluation dataset
    compute_metrics = compute_metrics,
)

In [11]:
trainer.train()

***** Running training *****
  Num examples = 1986
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 375
  Number of trainable parameters = 599820


  0%|          | 0/375 [00:00<?, ?it/s]

  incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
***** Running Evaluation *****
  Num examples = 971
  Batch size = 64


{'loss': 2.3173, 'learning_rate': 0.0008461538461538462, 'epoch': 0.8}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 2.1120247840881348, 'eval_accuracy': 0.26776519052523173, 'eval_runtime': 41.9615, 'eval_samples_per_second': 23.14, 'eval_steps_per_second': 0.381, 'epoch': 0.8}


Saving model checkpoint to ./results/roberta-base--open-subtitle-en--last-3-layers--maptask--last-layer-only/checkpoint-125
Configuration saved in ./results/roberta-base--open-subtitle-en--last-3-layers--maptask--last-layer-only/checkpoint-125/config.json
Model weights saved in ./results/roberta-base--open-subtitle-en--last-3-layers--maptask--last-layer-only/checkpoint-125/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 971
  Batch size = 64


{'loss': 2.0328, 'learning_rate': 0.0005384615384615384, 'epoch': 1.6}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 1.8446063995361328, 'eval_accuracy': 0.3553038105046344, 'eval_runtime': 41.3521, 'eval_samples_per_second': 23.481, 'eval_steps_per_second': 0.387, 'epoch': 1.6}


Saving model checkpoint to ./results/roberta-base--open-subtitle-en--last-3-layers--maptask--last-layer-only/checkpoint-250
Configuration saved in ./results/roberta-base--open-subtitle-en--last-3-layers--maptask--last-layer-only/checkpoint-250/config.json
Model weights saved in ./results/roberta-base--open-subtitle-en--last-3-layers--maptask--last-layer-only/checkpoint-250/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 971
  Batch size = 64


{'loss': 1.851, 'learning_rate': 0.0002307692307692308, 'epoch': 2.4}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 1.7855678796768188, 'eval_accuracy': 0.39237899073120497, 'eval_runtime': 42.9519, 'eval_samples_per_second': 22.607, 'eval_steps_per_second': 0.373, 'epoch': 2.4}


Saving model checkpoint to ./results/roberta-base--open-subtitle-en--last-3-layers--maptask--last-layer-only/checkpoint-375
Configuration saved in ./results/roberta-base--open-subtitle-en--last-3-layers--maptask--last-layer-only/checkpoint-375/config.json
Model weights saved in ./results/roberta-base--open-subtitle-en--last-3-layers--maptask--last-layer-only/checkpoint-375/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 357.7699, 'train_samples_per_second': 16.653, 'train_steps_per_second': 1.048, 'train_loss': 2.0210760498046874, 'epoch': 3.0}


TrainOutput(global_step=375, training_loss=2.0210760498046874, metrics={'train_runtime': 357.7699, 'train_samples_per_second': 16.653, 'train_steps_per_second': 1.048, 'train_loss': 2.0210760498046874, 'epoch': 3.0})