In [1]:
import torch
# Confirm that PyTorch can see a CUDA device before any training cell is run.
cuda_visible = torch.cuda.is_available()
print(cuda_visible)

True


In [2]:
!pip install optuna
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install --upgrade accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-3.1.1-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.7/365.7 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.10.4-py3-none-any.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.9/212.9 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cmaes>=0.9.1 (from optuna)
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully 

In [3]:
from datasets import Dataset, Features, Value, ClassLabel, DatasetDict, load_from_disk
from transformers import DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import evaluate
import numpy as np
import os
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
import pickle

In [4]:
# Load the dataset that was previously saved to disk in Arrow format.
# NOTE(review): the path lives under /content/drive, so this presumably requires
# Google Drive to be mounted already -- no mount step is visible in this notebook.
arrow_cache_dir = "/content/drive/MyDrive/data/arrow_cache"
raw_datasets = load_from_disk(arrow_cache_dir)

In [5]:
# Show which split names are available in the loaded dataset.
raw_datasets.keys()

dict_keys(['train', 'validation', 'test'])

In [6]:
# Number of examples in the training split.
raw_datasets["train"].num_rows

512940

In [7]:
# Peek at ten consecutive raw training examples (text plus integer label).
preview_rows = slice(10000, 10010)
raw_datasets["train"][preview_rows]

{'text': ['brooch box',
  'padre pio prayer for healing',
  'magazines for teenage guys',
  'bbq wedding',
  'phq cards price list',
  'deadpool onesie',
  'harry potter scene it 2nd edition',
  'heated trolley',
  'private reg on car',
  'orthodontist cardiff'],
 'label': [452, 1187, 34, 315, 626, 698, 371, 368, 937, 584]}

In [8]:
# Pretrained model identifier; used below to load both the tokenizer and the
# sequence-classification model.
checkpoint = "distilbert-base-uncased"

In [9]:
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the ``"text"`` entry of a batch of examples.

    Args:
        example: A batch produced by ``Dataset.map(..., batched=True)``; only
            its ``"text"`` field is read.

    Returns:
        The tokenizer output for the batch (the preview cell shows
        ``input_ids`` and ``attention_mask`` columns being added).
    """
    # truncation=True caps every sequence at the tokenizer's maximum model
    # length, so a single unusually long text cannot produce an input the model
    # is unable to process. Padding is deliberately left to the data collator,
    # which pads each batch dynamically.
    return tokenizer(example["text"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# Pads each batch to the length of its longest sequence at collation time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



In [10]:
# Same ten training rows as the raw preview, now with the tokenizer's columns added.
tokenized_train = tokenized_datasets["train"]
tokenized_train[10000:10010]

{'text': ['brooch box',
  'padre pio prayer for healing',
  'magazines for teenage guys',
  'bbq wedding',
  'phq cards price list',
  'deadpool onesie',
  'harry potter scene it 2nd edition',
  'heated trolley',
  'private reg on car',
  'orthodontist cardiff'],
 'label': [452, 1187, 34, 315, 626, 698, 371, 368, 937, 584],
 'input_ids': [[101, 22953, 11663, 3482, 102],
  [101, 28612, 14255, 2080, 7083, 2005, 8907, 102],
  [101, 7298, 2005, 9454, 4364, 102],
  [101, 22861, 4160, 5030, 102],
  [101, 6887, 4160, 5329, 3976, 2862, 102],
  [101, 2757, 16869, 3924, 2666, 102],
  [101, 4302, 10693, 3496, 2009, 3416, 3179, 102],
  [101, 9685, 20820, 102],
  [101, 2797, 19723, 2006, 2482, 102],
  [101, 2030, 2705, 28716, 16774, 10149, 102]],
 'attention_mask': [[1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1]]}

In [11]:
metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute macro-averaged F1 from an evaluation prediction pair.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. The predictions are
            reduced with ``argmax`` along axis 1 to obtain one predicted class
            id per row.

    Returns:
        The result of ``metric.compute`` with ``average="macro"``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    result = metric.compute(
        predictions=predicted_ids,
        references=references,
        average="macro",
    )
    return result

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [12]:
# Baseline Trainer configuration. The objective function overwrites
# learning_rate, num_train_epochs, per_device_train_batch_size, warmup_steps and
# weight_decay on this object for every trial; everything else stays fixed.
training_args = TrainingArguments(
    # --- where checkpoints and logs go ---
    output_dir="/content/model_distilbert_optimization/results/best_model",
    logging_dir="/content/model_distilbert_optimization/logs",
    logging_steps=10,
    push_to_hub=False,
    # --- optimisation defaults (tuned per trial) ---
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    # --- fixed training settings ---
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=1,
    seed=42,
    # --- evaluate and checkpoint once per epoch, keep the best by F1 ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

In [13]:
# Floor for the "save only if better" check in objective(). It starts at -inf and
# is raised whenever a trial's model is saved. The effective threshold also
# consults the study's own completed trials, so no score from an earlier run has
# to be pasted in by hand (a pasted value silently goes stale once a later run
# beats it, letting a worse model overwrite the saved best one).
max_f1_score = float("-inf")


def _best_completed_f1(study):
    """Return the highest value among the study's COMPLETE trials, or -inf if none."""
    completed = study.get_trials(
        deepcopy=False,
        states=(optuna.trial.TrialState.COMPLETE,),
    )
    return max(
        (t.value for t in completed if t.value is not None),
        default=float("-inf"),
    )


def objective(trial):
    """Optuna objective: fine-tune a fresh model and return its validation F1.

    Samples hyperparameters from the trial, writes them onto the shared
    ``training_args`` object, trains on the ``"train"`` split, evaluates on the
    ``"validation"`` split, and saves the model (locally and to Drive) when its
    ``eval_f1`` beats every completed trial in the study and the module-level
    ``max_f1_score`` floor.

    No intermediate values are reported through ``trial.report``, so a pruner
    attached to the study has nothing to act on.

    Args:
        trial: The ``optuna`` trial supplying hyperparameter suggestions.

    Returns:
        The ``eval_f1`` value from ``trainer.evaluate()``.
    """
    global max_f1_score

    # Define hyperparameter search space
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    num_train_epochs = trial.suggest_int("num_train_epochs", 1, 3)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16])
    warmup_steps = trial.suggest_int("warmup_steps", 0, 1000, step=100)
    weight_decay = trial.suggest_float("weight_decay", 0, 0.1, step=0.01)

    # Update training arguments with hyperparameter values.
    # The shared object is mutated in place; all five fields are rewritten on
    # every trial, so nothing leaks from one trial to the next.
    training_args.learning_rate = learning_rate
    training_args.num_train_epochs = num_train_epochs
    training_args.per_device_train_batch_size = per_device_train_batch_size
    training_args.warmup_steps = warmup_steps
    training_args.weight_decay = weight_decay

    # Instantiate a fresh model for this trial.
    # NOTE(review): 1419 is presumably the number of distinct label ids in the
    # dataset -- confirm against the data rather than relying on the literal.
    model = DistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=1419)

    # Train the model
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer
    )

    trainer.train()

    # Evaluate the model
    metrics = trainer.evaluate()
    eval_f1 = metrics["eval_f1"]

    # The running trial is not COMPLETE yet, so it is excluded from the lookup
    # and the comparison is strictly against earlier results.
    best_so_far = max(max_f1_score, _best_completed_f1(trial.study))

    # Save only when this trial beats everything seen so far. On an empty study
    # the threshold is -inf, so the first finished trial is always saved.
    if eval_f1 > best_so_far:
        max_f1_score = eval_f1
        save_dirs = (
            "/content/model_distilbert_optimization/results/best_model",
            "/content/drive/MyDrive/models/model_distilbert_optimization/results/best_model",
        )
        for save_dir in save_dirs:
            os.makedirs(save_dir, exist_ok=True)
            model.save_pretrained(save_dir)

    # Return the metric we want to optimize
    return eval_f1

# Persist the tokenizer both locally and on Drive. The Drive call is kept as the
# cell's last expression so the file paths it returns are displayed.
local_tokenizer_dir = "/content/model_distilbert_optimization/results/tokenizer"
drive_tokenizer_dir = "/content/drive/MyDrive/models/model_distilbert_optimization/tokenizer"
tokenizer.save_pretrained(local_tokenizer_dir)
tokenizer.save_pretrained(drive_tokenizer_dir)

('/content/drive/MyDrive/models/model_distilbert_optimization/tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/models/model_distilbert_optimization/tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/models/model_distilbert_optimization/tokenizer/vocab.txt',
 '/content/drive/MyDrive/models/model_distilbert_optimization/tokenizer/added_tokens.json')

In [14]:
# Storage URL of the SQLite database that persists the Optuna study on Drive.
# The "sqlite:///" prefix followed by an absolute path yields the four slashes.
storage_name = 'sqlite:////content/drive/MyDrive/models/model_distilbert_optimization/optuna_study/my_study.db'

In [15]:
# Median-based pruner with its default settings.
# NOTE(review): the objective function in this notebook never calls
# trial.report() or trial.should_prune(), so this pruner currently has no effect.
pruner = MedianPruner()

In [16]:
# Attach to the study stored at `storage_name`, creating it only if it does not
# exist yet (load_if_exists=True), so repeated sessions keep adding trials to
# the same history.
study = optuna.create_study(
    study_name="my_study",
    storage=storage_name,
    load_if_exists=True,
    direction="maximize",
    sampler=TPESampler(),
    pruner=pruner,
)

[32m[I 2023-05-11 11:03:27,967][0m Using an existing study with name 'my_study' instead of creating a new one.[0m


In [17]:
num_trials_per_round = 5
num_rounds = 4  # 4*5=20 trials

study_pickle_path = '/content/drive/MyDrive/models/model_distilbert_optimization/optuna_studies_pickles/my_study.pkl'

# Create the snapshot directory before any training starts. Otherwise a missing
# directory would only surface as a FileNotFoundError after a whole round of
# trials has already been trained.
os.makedirs(os.path.dirname(study_pickle_path), exist_ok=True)

for round_index in range(num_rounds):
    # Run one round: up to num_trials_per_round trials, with a 36000-second
    # (10-hour) timeout passed to Optuna for the round.
    study.optimize(objective, n_trials=num_trials_per_round, timeout=36000)
    # Snapshot the study object after every round, overwriting the previous snapshot.
    with open(study_pickle_path, 'wb') as f:
        pickle.dump(study, f)

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_clas

Epoch,Training Loss,Validation Loss,F1
1,2.3032,2.090856,0.486101
2,1.9361,1.809354,0.534981
3,1.3089,1.734658,0.557574


[32m[I 2023-05-11 11:45:12,138][0m Trial 12 finished with value: 0.5575740395327388 and parameters: {'learning_rate': 3.0249407292055203e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'warmup_steps': 200, 'weight_decay': 0.1}. Best is trial 5 with value: 0.5730164524449647.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initiali

Epoch,Training Loss,Validation Loss,F1
1,2.1184,2.055657,0.491254
2,1.9614,1.786682,0.53838
3,1.1861,1.714887,0.560482


[32m[I 2023-05-11 12:25:44,627][0m Trial 13 finished with value: 0.560481729606608 and parameters: {'learning_rate': 3.314842591497098e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'warmup_steps': 500, 'weight_decay': 0.1}. Best is trial 5 with value: 0.5730164524449647.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializi

Epoch,Training Loss,Validation Loss,F1
1,2.0195,1.970595,0.506273
2,1.8649,1.732305,0.553569


[32m[I 2023-05-11 12:53:00,301][0m Trial 14 finished with value: 0.553568814306009 and parameters: {'learning_rate': 4.996109361849392e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 16, 'warmup_steps': 500, 'weight_decay': 0.08}. Best is trial 5 with value: 0.5730164524449647.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializ

Epoch,Training Loss,Validation Loss,F1
1,2.2625,2.051209,0.492468
2,1.8596,1.777623,0.541322
3,1.1826,1.711202,0.564546


[32m[I 2023-05-11 13:33:53,084][0m Trial 15 finished with value: 0.5645460820332646 and parameters: {'learning_rate': 3.536712492885842e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'warmup_steps': 500, 'weight_decay': 0.09}. Best is trial 5 with value: 0.5730164524449647.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initiali

Epoch,Training Loss,Validation Loss,F1
1,2.2319,2.002936,0.500988
2,1.5274,1.743333,0.549989
3,0.9296,1.688272,0.575977


[32m[I 2023-05-11 14:54:08,157][0m Trial 16 finished with value: 0.5759765440927339 and parameters: {'learning_rate': 3.9829283945570575e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 8, 'warmup_steps': 900, 'weight_decay': 0.08}. Best is trial 16 with value: 0.5759765440927339.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initial

Epoch,Training Loss,Validation Loss,F1
1,2.2539,1.972984,0.506979
2,1.5516,1.722952,0.557027


[32m[I 2023-05-11 15:48:34,228][0m Trial 17 finished with value: 0.5570273741843186 and parameters: {'learning_rate': 4.038560436240331e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 8, 'warmup_steps': 1000, 'weight_decay': 0.08}. Best is trial 16 with value: 0.5759765440927339.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initial

Epoch,Training Loss,Validation Loss,F1
1,2.2767,1.992214,0.504861
2,1.6026,1.737311,0.553199
3,0.9155,1.704982,0.577579


[32m[I 2023-05-11 17:10:32,844][0m Trial 18 finished with value: 0.5775788958626737 and parameters: {'learning_rate': 4.935678907206822e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 8, 'warmup_steps': 800, 'weight_decay': 0.05}. Best is trial 18 with value: 0.5775788958626737.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initiali

Epoch,Training Loss,Validation Loss,F1
1,2.1042,1.948383,0.510168
2,1.4679,1.708505,0.56187


[32m[I 2023-05-11 18:04:35,260][0m Trial 19 finished with value: 0.5618703982854784 and parameters: {'learning_rate': 4.962270119042742e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 8, 'warmup_steps': 800, 'weight_decay': 0.04}. Best is trial 18 with value: 0.5775788958626737.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initiali

Epoch,Training Loss,Validation Loss,F1
1,2.2213,1.984015,0.504973
2,1.5407,1.730215,0.55379
3,1.0175,1.695429,0.576727


[32m[I 2023-05-11 19:26:32,292][0m Trial 20 finished with value: 0.5767266675337379 and parameters: {'learning_rate': 4.1108988034452255e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 8, 'warmup_steps': 900, 'weight_decay': 0.05}. Best is trial 18 with value: 0.5775788958626737.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initial

Epoch,Training Loss,Validation Loss,F1
1,2.0989,1.986116,0.503211
2,1.5657,1.735418,0.554532
3,0.9358,1.697508,0.576482
