In [1]:
import torch

# Confirm that PyTorch can see a CUDA device before any training is started.
cuda_ready = torch.cuda.is_available()
print(cuda_ready)

True


In [2]:
!pip install optuna
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install --upgrade accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting accelerate
  Downloading accelerate-0.19.0-py3-none-any.whl (219 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.1/219.1 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.19.0


In [3]:
from datasets import Dataset, Features, Value, ClassLabel, DatasetDict, load_from_disk
from transformers import DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import evaluate
import numpy as np
import os
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
import pickle

In [4]:
# Location of the dataset previously saved to Google Drive in Arrow format.
ARROW_CACHE_DIR = "/content/drive/MyDrive/data/arrow_cache"
raw_datasets = load_from_disk(ARROW_CACHE_DIR)

In [5]:
# Show which splits the loaded dataset contains.
raw_datasets.keys()

dict_keys(['train', 'validation', 'test'])

In [6]:
# Number of examples in the training split.
len(raw_datasets["train"])

512940

In [7]:
# Peek at ten consecutive training rows (indices 10000-10009): each row has a
# "text" string and an integer "label".
raw_datasets["train"][10000:10010]

{'text': ['brooch box',
  'padre pio prayer for healing',
  'magazines for teenage guys',
  'bbq wedding',
  'phq cards price list',
  'deadpool onesie',
  'harry potter scene it 2nd edition',
  'heated trolley',
  'private reg on car',
  'orthodontist cardiff'],
 'label': [452, 1187, 34, 315, 626, 698, 371, 368, 937, 584]}

In [8]:
# Name of the pretrained checkpoint; the tokenizer and the classification model
# in the cells below are both loaded from it.
checkpoint = "distilbert-base-uncased"

In [9]:
# Tokenizer that matches the pretrained checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the ``"text"`` field of a batch of examples.

    Args:
        example: A batch of rows containing a ``"text"`` entry, as supplied by
            ``map(..., batched=True)``.

    Returns:
        The tokenizer output for the batch; ``map`` adds its fields
        (``input_ids``, ``attention_mask``) as new columns.
    """
    # truncation=True caps every sequence at the tokenizer's maximum length.
    # Without it a single over-long text would produce more tokens than the
    # model accepts and fail during training. Short texts are unaffected.
    return tokenizer(example["text"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# No padding is applied at tokenization time; the collator pads each batch
# when it is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



In [10]:
# The same ten rows after tokenization: "input_ids" and "attention_mask" now sit
# alongside the original "text" and "label" columns.
tokenized_datasets["train"][10000:10010]

{'text': ['brooch box',
  'padre pio prayer for healing',
  'magazines for teenage guys',
  'bbq wedding',
  'phq cards price list',
  'deadpool onesie',
  'harry potter scene it 2nd edition',
  'heated trolley',
  'private reg on car',
  'orthodontist cardiff'],
 'label': [452, 1187, 34, 315, 626, 698, 371, 368, 937, 584],
 'input_ids': [[101, 22953, 11663, 3482, 102],
  [101, 28612, 14255, 2080, 7083, 2005, 8907, 102],
  [101, 7298, 2005, 9454, 4364, 102],
  [101, 22861, 4160, 5030, 102],
  [101, 6887, 4160, 5329, 3976, 2862, 102],
  [101, 2757, 16869, 3924, 2666, 102],
  [101, 4302, 10693, 3496, 2009, 3416, 3179, 102],
  [101, 9685, 20820, 102],
  [101, 2797, 19723, 2006, 2482, 102],
  [101, 2030, 2705, 28716, 16774, 10149, 102]],
 'attention_mask': [[1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1]]}

In [11]:
# F1 metric object from the `evaluate` library, shared by every evaluation call.
metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute the macro-averaged F1 score for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. The predictions hold one
            score per class for every example; the predicted class is the index
            of the highest score along axis 1.

    Returns:
        The result of ``metric.compute`` with ``average="macro"``.
    """
    class_scores, references = eval_pred
    predicted_ids = np.argmax(class_scores, axis=1)
    return metric.compute(
        predictions=predicted_ids,
        references=references,
        average="macro",
    )

In [12]:
# Baseline training configuration. The Optuna objective further down overrides
# learning rate, epochs, train batch size, warmup steps and weight decay for
# each trial; everything else stays as defined here.
training_args = TrainingArguments(
    # Output locations.
    output_dir="/content/model_distilbert_optimization/results/best_model",
    logging_dir="/content/model_distilbert_optimization/logs",
    push_to_hub=False,
    # Optimisation settings.
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=1,
    warmup_steps=500,
    weight_decay=0.01,
    seed=42,
    # Logging, evaluation and checkpointing: evaluate and save once per epoch,
    # keep at most two checkpoints, and reload the checkpoint with the highest
    # F1 when training finishes.
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

In [13]:
# Best validation F1 observed so far. Kept as a module-level name so it can be
# inspected from other cells; `objective` keeps it in sync with the study.
max_f1_score = 0


def objective(trial):
    """Train and evaluate one DistilBERT configuration for an Optuna trial.

    Samples a set of hyperparameters, fine-tunes a freshly loaded model with
    them, evaluates it on the validation split and, if the result beats every
    trial already completed in the study, saves the model locally and to
    Google Drive.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        The validation macro-F1 (``eval_f1``) of the trained model.

    Notes:
        * The "best so far" threshold is read from the study's completed
          trials rather than from the in-memory ``max_f1_score`` alone. The
          global restarts at 0 in a new kernel, so when a persisted study is
          resumed it would otherwise let the first trial of the new session
          overwrite a better model saved earlier.
        * No intermediate values are reported to the trial, so a pruner
          attached to the study has nothing to act on.
    """
    import copy  # local import so this cell does not depend on the import cell being edited

    global max_f1_score

    # Define hyperparameter search space
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    num_train_epochs = trial.suggest_int("num_train_epochs", 1, 3)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16])
    warmup_steps = trial.suggest_int("warmup_steps", 0, 1000, step=100)
    weight_decay = trial.suggest_float("weight_decay", 0, 0.1, step=0.01)

    # Apply the sampled values to a per-trial copy so the shared `training_args`
    # object is not left holding the last trial's hyperparameters.
    trial_args = copy.copy(training_args)
    trial_args.learning_rate = learning_rate
    trial_args.num_train_epochs = num_train_epochs
    trial_args.per_device_train_batch_size = per_device_train_batch_size
    trial_args.warmup_steps = warmup_steps
    trial_args.weight_decay = weight_decay

    # Instantiate a fresh model for this trial
    model = DistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=1419)

    # Train the model
    trainer = Trainer(
        model=model,
        args=trial_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer
    )

    trainer.train()

    # Evaluate the model
    metrics = trainer.evaluate()
    f1 = metrics["eval_f1"]

    # Best value among trials that already finished in this study. The current
    # trial is still running at this point, so it is not included.
    finished_trials = trial.study.get_trials(
        deepcopy=False, states=(optuna.trial.TrialState.COMPLETE,)
    )
    previous_best = max(
        (t.value for t in finished_trials if t.value is not None),
        default=None,
    )

    # Save the model when this is the first completed trial or a new record.
    if previous_best is None or f1 > previous_best:
        max_f1_score = f1
        output_dir = "/content/model_distilbert_optimization/results/best_model"
        os.makedirs(output_dir, exist_ok=True)
        model.save_pretrained(output_dir)
        g_drive_dir = "/content/drive/MyDrive/models/model_distilbert_optimization/results/best_model"
        model.save_pretrained(g_drive_dir)
    else:
        # Keep the global in step with the study, e.g. after a kernel restart.
        max_f1_score = previous_best

    # Return the metric we want to optimize
    return f1

# Save the tokenizer files once, both on the local disk and on Google Drive.
# The second call is the last expression in the cell, so its return value (the
# paths of the written files) is what the cell displays.
tokenizer.save_pretrained("/content/model_distilbert_optimization/results/tokenizer")
tokenizer.save_pretrained("/content/drive/MyDrive/models/model_distilbert_optimization/tokenizer")

('/content/drive/MyDrive/models/model_distilbert_optimization/tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/models/model_distilbert_optimization/tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/models/model_distilbert_optimization/tokenizer/vocab.txt',
 '/content/drive/MyDrive/models/model_distilbert_optimization/tokenizer/added_tokens.json')

In [14]:
# SQLite file on Google Drive that persists the Optuna study across sessions.
study_db_path = "/content/drive/MyDrive/models/model_distilbert_optimization/optuna_study/my_study.db"
# SQLite creates the database file but not its parent folder, so make sure the
# folder exists before the study is created.
os.makedirs(os.path.dirname(study_db_path), exist_ok=True)
# "sqlite:///" followed by an absolute path yields the four-slash URL form.
storage_name = f"sqlite:///{study_db_path}"

In [15]:
# Median pruner with its default settings.
# NOTE(review): `objective` never calls `trial.report()` / `trial.should_prune()`,
# so as written this pruner does not stop any trial early.
pruner = MedianPruner()

In [16]:
# Create the study in the SQLite storage, or reattach to it if a study with
# this name already exists there (load_if_exists=True). Higher objective
# values are better.
study = optuna.create_study(
    study_name="my_study",
    storage=storage_name,
    load_if_exists=True,
    direction="maximize",
    sampler=TPESampler(),
    pruner=pruner,
)

[32m[I 2023-05-10 18:42:45,294][0m A new study created in RDB with name: my_study[0m


In [None]:
num_trials_per_round = 5
num_rounds = 4  # 4*5=20 trials

# Pickled snapshot of the study, refreshed after every round.
study_pickle_path = '/content/drive/MyDrive/models/model_distilbert_optimization/optuna_studies_pickles/my_study.pkl'
# open(..., 'wb') does not create missing folders; create the folder up front so
# a round of training is not lost to a FileNotFoundError at save time.
os.makedirs(os.path.dirname(study_pickle_path), exist_ok=True)

for round_index in range(num_rounds):
    # Run one round of trials. `timeout` is in seconds and applies to this
    # optimize() call only. gc_after_trial=True runs garbage collection after
    # each trial so the previous trial's model and trainer are released before
    # the next one is built.
    study.optimize(
        objective,
        n_trials=num_trials_per_round,
        timeout=36000,
        gc_after_trial=True,
    )
    # save the study object
    with open(study_pickle_path, 'wb') as f:
        pickle.dump(study, f)

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

Epoch,Training Loss,Validation Loss,F1
1,2.1669,1.990059,0.502196
2,1.949,1.763607,0.54979


[32m[I 2023-05-10 19:11:49,240][0m Trial 0 finished with value: 0.5497898814619042 and parameters: {'learning_rate': 4.5086775117231564e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 16, 'warmup_steps': 600, 'weight_decay': 0.01}. Best is trial 0 with value: 0.5497898814619042.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initiali

Epoch,Training Loss,Validation Loss,F1
1,2.4198,2.153507,0.480988


[32m[I 2023-05-10 19:26:20,077][0m Trial 1 finished with value: 0.48098786838153684 and parameters: {'learning_rate': 3.4083835840539955e-05, 'num_train_epochs': 1, 'per_device_train_batch_size': 16, 'warmup_steps': 400, 'weight_decay': 0.06}. Best is trial 0 with value: 0.5497898814619042.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initial

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,F1
1,2.1681,2.041914,0.497137
2,1.8197,1.819763,0.540211


[32m[I 2023-05-10 20:22:04,997][0m Trial 2 finished with value: 0.5402107456319871 and parameters: {'learning_rate': 2.664220601890812e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 8, 'warmup_steps': 0, 'weight_decay': 0.03}. Best is trial 0 with value: 0.5497898814619042.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing

Epoch,Training Loss,Validation Loss,F1
1,2.2911,1.993103,0.508081


[32m[I 2023-05-10 20:50:12,084][0m Trial 3 finished with value: 0.5080808048540352 and parameters: {'learning_rate': 3.7144038724553934e-05, 'num_train_epochs': 1, 'per_device_train_batch_size': 8, 'warmup_steps': 300, 'weight_decay': 0.02}. Best is trial 0 with value: 0.5497898814619042.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializ

Epoch,Training Loss,Validation Loss,F1
1,2.5237,2.296998,0.460155


[32m[I 2023-05-10 21:18:10,791][0m Trial 4 finished with value: 0.4601549970932343 and parameters: {'learning_rate': 1.823365280668549e-05, 'num_train_epochs': 1, 'per_device_train_batch_size': 8, 'warmup_steps': 0, 'weight_decay': 0.03}. Best is trial 0 with value: 0.5497898814619042.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing

Epoch,Training Loss,Validation Loss,F1
1,2.1954,2.015761,0.49968
2,1.6731,1.742617,0.548443
3,1.1157,1.682028,0.573016


[32m[I 2023-05-10 22:40:29,249][0m Trial 5 finished with value: 0.5730164524449647 and parameters: {'learning_rate': 3.765717948740525e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 8, 'warmup_steps': 400, 'weight_decay': 0.1}. Best is trial 5 with value: 0.5730164524449647.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializin

Epoch,Training Loss,Validation Loss,F1
1,2.236,2.050668,0.49694


[32m[I 2023-05-10 22:54:27,524][0m Trial 6 finished with value: 0.4969404095953054 and parameters: {'learning_rate': 4.286457674963664e-05, 'num_train_epochs': 1, 'per_device_train_batch_size': 16, 'warmup_steps': 800, 'weight_decay': 0.01}. Best is trial 5 with value: 0.5730164524449647.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializ

Epoch,Training Loss,Validation Loss,F1
1,2.369,2.113049,0.482173
2,1.9896,1.824644,0.53369
3,1.2428,1.751433,0.555191


[32m[I 2023-05-10 23:36:01,520][0m Trial 7 finished with value: 0.5551911882611065 and parameters: {'learning_rate': 2.739126795196879e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'warmup_steps': 200, 'weight_decay': 0.07}. Best is trial 5 with value: 0.5730164524449647.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializ

Epoch,Training Loss,Validation Loss,F1
1,2.4384,2.092114,0.487159
2,1.7944,1.826274,0.53571
3,1.3192,1.776633,0.554842


[32m[I 2023-05-11 00:59:23,240][0m Trial 8 finished with value: 0.5548415358316745 and parameters: {'learning_rate': 2.021270029064174e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 8, 'warmup_steps': 200, 'weight_decay': 0.0}. Best is trial 5 with value: 0.5730164524449647.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializin

Epoch,Training Loss,Validation Loss,F1
1,2.3394,2.128247,0.484488


[32m[I 2023-05-11 01:26:55,901][0m Trial 9 finished with value: 0.4844880574354322 and parameters: {'learning_rate': 2.4619213257962128e-05, 'num_train_epochs': 1, 'per_device_train_batch_size': 8, 'warmup_steps': 700, 'weight_decay': 0.06}. Best is trial 5 with value: 0.5730164524449647.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializ

Epoch,Training Loss,Validation Loss,F1
1,2.5803,2.31718,0.4533
2,2.057,1.992016,0.508069
3,1.8135,1.910825,0.525768


[32m[I 2023-05-11 02:51:29,944][0m Trial 10 finished with value: 0.5257675044413339 and parameters: {'learning_rate': 1.1731887522798277e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 8, 'warmup_steps': 1000, 'weight_decay': 0.1}. Best is trial 5 with value: 0.5730164524449647.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initiali

Epoch,Training Loss,Validation Loss,F1
1,2.2117,2.055558,0.492383
2,1.9959,1.785553,0.539631
