In [1]:
# Fine-tune RoBERTa with the transformers library
# on the SILICONE dataset (https://huggingface.co/datasets/silicone)
# to classify each "utterance" into one of the categories 0 to 3.

# Ideally, the classification should depend on the dialog context
# using the Dialogue_ID and Idx variables
# respectively giving the dialogue identifier and the utterance order.

# Inspired by https://huggingface.co/docs/transformers/training

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

import datasets

# Single source of truth for the checkpoint and the size of the
# classification head, so tokenizer and model cannot drift apart.
MODEL_CHECKPOINT = "roberta-base"
NUM_LABELS = 4  # the utterance categories 0 to 3

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=NUM_LABELS,
)

# 'dyda_da' configuration of the SILICONE benchmark.
dataset = datasets.load_dataset('silicone', 'dyda_da')

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

  0%|          | 0/3 [00:00<?, ?it/s]

In [22]:
# Number of records (one per utterance) in the training split.
len(dataset['train'])

87170

In [23]:
# Peek at the first training record to see which fields are available.
dataset['train'][0]

{'Utterance': 'say , jim , how about going for a few beers after dinner ?',
 'Dialogue_Act': 'directive',
 'Dialogue_ID': '1',
 'Label': 1,
 'Idx': 0}

In [2]:
def tokenize_function(doc, max_length=None):
    """Tokenize the ``Utterance`` field of a record (or batch of records).

    Args:
        doc: Mapping with an ``"Utterance"`` entry. When this function is
            applied through ``Dataset.map(..., batched=True)`` the entry is
            a list of strings.
        max_length: Optional length every example is padded/truncated to.
            ``None`` (the default) keeps the previous behaviour: the
            tokenizer falls back to the model maximum, which yields 512
            ids per example in this notebook. A smaller value can be
            supplied via ``Dataset.map(..., fn_kwargs={"max_length": ...})``
            to cut the compute spent on padding tokens.

    Returns:
        The tokenizer output (``input_ids`` and ``attention_mask``), with
        every example padded and truncated to the same length.
    """
    return tokenizer(
        doc["Utterance"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

In [9]:
# Raw columns dropped once the utterances have been tokenized.
RAW_COLUMNS = ['Utterance', 'Dialogue_Act', 'Dialogue_ID', 'Idx']

# 1) rename the target column to "label", 2) tokenize every split in
# batches, 3) drop the raw columns.
dataset_renamed = dataset.rename_column("Label", "label")
dataset_encoded = dataset_renamed.map(tokenize_function, batched=True)
dataset_tokenized = dataset_encoded.remove_columns(RAW_COLUMNS)

# The full splits are used despite the "small_" prefix; for a quick smoke
# test, append `.shuffle(seed=42).select(range(500))` to either line.
# NOTE(review): the 'test' split is what the Trainer evaluates on during
# training -- consider a held-out validation split for monitoring instead.
small_train_dataset = dataset_tokenized['train']
small_test_dataset = dataset_tokenized['test']

Map:   0%|          | 0/87170 [00:00<?, ? examples/s]

Loading cached processed dataset at /Users/katossky/.cache/huggingface/datasets/silicone/dyda_da/1.0.0/af617406c94e3f78da85f7ea74ebfbd3f297a9665cb54adbae305b03bc4442a5/cache-02fc435480a05403.arrow
Loading cached processed dataset at /Users/katossky/.cache/huggingface/datasets/silicone/dyda_da/1.0.0/af617406c94e3f78da85f7ea74ebfbd3f297a9665cb54adbae305b03bc4442a5/cache-c2d579ded85024ea.arrow


In [11]:
# Sizes of the (train, test) datasets that are handed to the Trainer.
(len(small_train_dataset), len(small_test_dataset))

(87170, 7740)

In [89]:
# Columns present on a tokenized training record.
# NOTE(review): the recorded output below still lists the raw columns
# (Utterance, Dialogue_Act, Dialogue_ID, Idx) although the tokenization cell
# removes them; the execution counts are out of order, so this output is
# presumably from an earlier run -- re-run on a fresh kernel to confirm.
small_train_dataset[0].keys()

dict_keys(['Utterance', 'Dialogue_Act', 'Dialogue_ID', 'label', 'Idx', 'input_ids', 'attention_mask'])

In [68]:
# Length of one tokenized example; every example is padded to this length
# because tokenize_function uses padding="max_length".
len(small_train_dataset[0]['input_ids'])

512

In [4]:
import evaluate

# Accuracy metric from the `evaluate` library; consumed by compute_metrics.
metric = evaluate.load("accuracy")

In [5]:
import numpy as np

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class of
    each row is the index of its largest logit along the last axis; those
    predictions and the reference labels are passed to ``metric.compute``,
    whose result is returned unchanged.
    """
    scores, gold_labels = eval_pred
    predicted_labels = np.asarray(scores).argmax(axis=-1)
    return metric.compute(predictions=predicted_labels, references=gold_labels)

In [12]:
# train the model
training_args = TrainingArguments(
    seed=42,                         # random seed for initialization
    output_dir='./results',          # output directory
    do_train=True,                   # do training
    evaluation_strategy="steps",     # evaluation strategy to adopt during training
    eval_steps=100,                  # evaluation step
    #num_train_epochs=3,              # total # of training epochs
    #per_device_train_batch_size=16,  # batch size per device during training
    #per_device_eval_batch_size=64,   # batch size for evaluation
    #warmup_steps=500,                # number of warmup steps for learning rate scheduler
    #weight_decay=0.01,               # strength of weight decay
    log_level='debug',
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,               # eval_steps is set to the same value
    save_strategy="epoch",
    auto_find_batch_size=True,       # automatically find the best batch size
    use_mps_device=True,             # use the new Apple M1 chip
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [13]:
# Wire the model, run configuration, data and metric callback together.
# No tokenizer is handed to the Trainer; training is launched in the next cell.
trainer_kwargs = dict(
    model=model,                          # the instantiated Transformers model to fine-tune
    args=training_args,                   # run configuration defined above
    train_dataset=small_train_dataset,    # examples used for optimisation
    eval_dataset=small_test_dataset,      # examples used for periodic evaluation
    compute_metrics=compute_metrics,      # turns (logits, labels) into accuracy
)
trainer = Trainer(**trainer_kwargs)

In [14]:
# Launch fine-tuning; periodic evaluation and logging follow training_args.
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Idx, Dialogue_Act, Utterance, Dialogue_ID. If Idx, Dialogue_Act, Utterance, Dialogue_ID are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 87170
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 32691
  Number of trainable parameters = 124648708


  0%|          | 0/32691 [00:00<?, ?it/s]

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Idx, Dialogue_Act, Utterance, Dialogue_ID. If Idx, Dialogue_Act, Utterance, Dialogue_ID are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 7740
  Batch size = 8


{'loss': 0.7563, 'learning_rate': 4.9847052705637644e-05, 'epoch': 0.01}


  0%|          | 0/968 [00:00<?, ?it/s]

{'eval_loss': 0.6663451790809631, 'eval_accuracy': 0.7465116279069768, 'eval_runtime': 304.8435, 'eval_samples_per_second': 25.39, 'eval_steps_per_second': 3.175, 'epoch': 0.01}


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Idx, Dialogue_Act, Utterance, Dialogue_ID. If Idx, Dialogue_Act, Utterance, Dialogue_ID are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 7740
  Batch size = 8


{'loss': 0.7472, 'learning_rate': 4.969410541127528e-05, 'epoch': 0.02}


  0%|          | 0/968 [00:00<?, ?it/s]

{'eval_loss': 0.711063802242279, 'eval_accuracy': 0.7024547803617571, 'eval_runtime': 304.3257, 'eval_samples_per_second': 25.433, 'eval_steps_per_second': 3.181, 'epoch': 0.02}


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Idx, Dialogue_Act, Utterance, Dialogue_ID. If Idx, Dialogue_Act, Utterance, Dialogue_ID are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 7740
  Batch size = 8


{'loss': 0.675, 'learning_rate': 4.954115811691292e-05, 'epoch': 0.03}


  0%|          | 0/968 [00:00<?, ?it/s]

{'eval_loss': 0.6733798384666443, 'eval_accuracy': 0.7804909560723514, 'eval_runtime': 306.6239, 'eval_samples_per_second': 25.243, 'eval_steps_per_second': 3.157, 'epoch': 0.03}


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Idx, Dialogue_Act, Utterance, Dialogue_ID. If Idx, Dialogue_Act, Utterance, Dialogue_ID are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 7740
  Batch size = 8


{'loss': 0.7474, 'learning_rate': 4.9388210822550554e-05, 'epoch': 0.04}


  0%|          | 0/968 [00:00<?, ?it/s]

In [37]:
! pip3 install torch transformers ipywidgets datasets accelerate evaluate wheel scikit-learn

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting scikit-learn
  Downloading scikit_learn-1.2.1-cp39-cp39-macosx_12_0_arm64.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting joblib>=1.1.1
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting scipy>=1.3.2
  Downloading scipy-1.10.1-cp39-cp39-macosx_12_0_arm64.whl (28.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m28.9/28.9 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting threadpoolctl>=2.0.