In [20]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [23]:
student_id = "google/bert_uncased_L-4_H-256_A-4"
teacher_id = "agvidit1/Bert_TG-HS-HX_pretrain"

# name for our repository on the hub
repo_name = "tinybert-TG-HS-HX-parentpretrained"

In [26]:
from transformers import AutoTokenizer
from transformers import BertTokenizer, DistilBertTokenizer

# init tokenizer
teacher_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
student_tokenizer = BertTokenizer.from_pretrained(student_id)

# sample input
sample = "This is a basic example, with different words to test."

# assert results
assert teacher_tokenizer(sample) == student_tokenizer(sample), "Tokenizers haven't created the same output"

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

In [27]:
# dataset_id="glue"
# dataset_config="sst2"

In [28]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import pandas as pd

dataset = load_dataset("agvidit1/Dataset-TG-HS-HX-Processed")
columns_to_remove = ['__index_level_0__']
dataset = dataset.remove_columns(columns_to_remove)


Downloading readme:   0%|          | 0.00/353 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.88M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/35982 [00:00<?, ? examples/s]

In [29]:
shuffled_dataset = dataset.shuffle(seed=42)
total_rows = len(shuffled_dataset['train'])
train_size = int(total_rows * 0.70)
validation_size = int(total_rows * 0.20)
test_size = total_rows - train_size - validation_size
train_dataset = shuffled_dataset['train'].select(range(train_size))
validation_dataset = shuffled_dataset['train'].select(range(train_size, train_size + validation_size))
test_dataset = shuffled_dataset['train'].select(range(train_size + validation_size, total_rows))

split_dataset = {
    'train': train_dataset,
    'validation': validation_dataset,
    'test': test_dataset
}

In [31]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


In [32]:

def process(examples):
    tokenized_inputs = tokenizer(
        examples["text"], truncation=True, max_length=256
    )
    return tokenized_inputs


tokenized_datasets = {split: split_dataset[split].map(process, batched=True) for split in split_dataset}
tokenized_datasets = {split: dataset.rename_column("label", "labels") for split, dataset in tokenized_datasets.items()}
# tokenized_datasets = {split: dataset.rename_column("user_id", "idx") for split, dataset in tokenized_datasets.items()}

tokenized_datasets


Map:   0%|          | 0/25187 [00:00<?, ? examples/s]

Map:   0%|          | 0/7196 [00:00<?, ? examples/s]

Map:   0%|          | 0/3599 [00:00<?, ? examples/s]

{'train': Dataset({
     features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 25187
 }),
 'validation': Dataset({
     features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 7196
 }),
 'test': Dataset({
     features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 3599
 })}

In [33]:
from transformers import TrainingArguments, Trainer
import torch
import torch.nn as nn
import torch.nn.functional as F

class DistillationTrainingArguments(TrainingArguments):
    def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
        super().__init__(*args, **kwargs)

        self.alpha = alpha
        self.temperature = temperature

class DistillationTrainer(Trainer):
    def __init__(self, *args, teacher_model=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher = teacher_model
        # place teacher on same device as student
        self._move_model_to_device(self.teacher,self.model.device)
        self.teacher.eval()

    def compute_loss(self, model, inputs, return_outputs=False):

        # compute student output
        outputs_student = model(**inputs)
        student_loss=outputs_student.loss
        # compute teacher output
        with torch.no_grad():
          outputs_teacher = self.teacher(**inputs)

        # assert size
        assert outputs_student.logits.size() == outputs_teacher.logits.size()

        # Soften probabilities and compute distillation loss
        loss_function = nn.KLDivLoss(reduction="batchmean")
        loss_logits = (loss_function(
            F.log_softmax(outputs_student.logits / self.args.temperature, dim=-1),
            F.softmax(outputs_teacher.logits / self.args.temperature, dim=-1)) * (self.args.temperature ** 2))
        # Return weighted student loss
        loss = self.args.alpha * student_loss + (1. - self.args.alpha) * loss_logits
        return (loss, outputs_student) if return_outputs else loss


In [34]:
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding, BertForSequenceClassification
from huggingface_hub import HfFolder

# create label2id, id2label dicts for nice outputs for the model
labels = list(set(tokenized_datasets["train"]["labels"]))
num_labels = len(labels)
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}
# for i, label in enumerate(labels):
#     label2id[label] = str(i)
#     id2label[str(i)] = label

# define training args
training_args = DistillationTrainingArguments(
    output_dir=repo_name,
    num_train_epochs=7,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    learning_rate=6e-5,
    seed=33,
    # logging & evaluation strategies
    logging_dir=f"{repo_name}/logs",
    logging_strategy="epoch", # to get more information to TB
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="tensorboard",
    # push to hub parameters
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repo_name,
    hub_token=HfFolder.get_token(),
    # distilation parameters
    alpha=0.5,
    temperature=4.0
    )

# define data_collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# define model
teacher_model = BertForSequenceClassification.from_pretrained(
    teacher_id,
    ignore_mismatched_sizes=True,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# define student model
student_model = BertForSequenceClassification.from_pretrained(
    student_id,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at agvidit1/Bert_TG-HS-HX_pretrain and are newly initialized: ['classifier.weight', 'bert.pooler.dense.bias', 'classifier.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


pytorch_model.bin:   0%|          | 0.00/45.1M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-4_H-256_A-4 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
from datasets import load_metric
import numpy as np

# define metrics and metrics function
accuracy_metric = load_metric( "accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    acc = accuracy_metric.compute(predictions=predictions, references=labels)
    return {
        "accuracy": acc["accuracy"],
    }


In [36]:
trainer = DistillationTrainer(
    student_model,
    training_args,
    teacher_model=teacher_model,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [37]:
import torch
torch.cuda.is_available()

True

In [38]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.3017,0.288796,0.793218
2,0.2859,0.281244,0.808644
3,0.2772,0.281716,0.81726
4,0.2707,0.279905,0.823096
5,0.2652,0.280279,0.824347
6,0.2598,0.28197,0.825598
7,0.2578,0.281981,0.825042


TrainOutput(global_step=1379, training_loss=0.2740491738433506, metrics={'train_runtime': 3237.4974, 'train_samples_per_second': 54.458, 'train_steps_per_second': 0.426, 'total_flos': 309665393535204.0, 'train_loss': 0.2740491738433506, 'epoch': 7.0})

In [39]:
pip install optuna

Note: you may need to restart the kernel to use updated packages.


In [40]:
def hp_space(trial):
    return {
        "num_train_epochs": trial.suggest_int("num_train_epochs", 3, 8),
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4 ,log=True),
        "alpha": trial.suggest_float("alpha", 0, 1),
        "temperature": trial.suggest_int("temperature", 2, 30),
        # "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16,32]),
        # "per_device_eval_batch_size": trial.suggest_categorical("per_device_eval_batch_size", [16,32]),
}


In [41]:
def student_init():
    return AutoModelForSequenceClassification.from_pretrained(
        student_id,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id
    )

trainer = DistillationTrainer(
    model_init=student_init,
    args=training_args,
    teacher_model=teacher_model,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
best_run = trainer.hyperparameter_search(
    n_trials=10,
    direction="maximize",
    hp_space=hp_space
)

print(best_run)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-4_H-256_A-4 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2023-12-10 21:44:57,714] A new study created in memory with name: no-name-dea694e1-f36f-42c3-afb7-e28595695974
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-4_H-256_A-4 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6127,0.562056,0.72582
2,0.5562,0.517233,0.752362
3,0.5297,0.498158,0.760145
4,0.5189,0.488543,0.765981
5,0.5117,0.483972,0.767927
6,0.5063,0.482752,0.76751


[I 2023-12-10 22:32:10,610] Trial 0 finished with value: 0.7675097276264592 and parameters: {'num_train_epochs': 6, 'learning_rate': 3.0069416525035787e-06, 'alpha': 0.9772734766942756, 'temperature': 16}. Best is trial 0 with value: 0.7675097276264592.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-4_H-256_A-4 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2825,0.269023,0.752362
2,0.2724,0.265089,0.771818
3,0.2692,0.263475,0.775987
4,0.2681,0.262498,0.778071
5,0.2674,0.262347,0.779461


[I 2023-12-10 23:11:39,311] Trial 1 finished with value: 0.7794608115619789 and parameters: {'num_train_epochs': 5, 'learning_rate': 5.7546531738235265e-06, 'alpha': 0.4394092153370026, 'temperature': 23}. Best is trial 1 with value: 0.7794608115619789.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-4_H-256_A-4 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.564,0.532214,0.698999
2,0.5261,0.496853,0.738883
3,0.5052,0.481561,0.748888
4,0.4975,0.474243,0.754864
5,0.4935,0.472753,0.754308


[I 2023-12-10 23:50:44,718] Trial 2 finished with value: 0.754307948860478 and parameters: {'num_train_epochs': 5, 'learning_rate': 2.09805683059569e-06, 'alpha': 0.8786599620319323, 'temperature': 9}. Best is trial 1 with value: 0.7794608115619789.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-4_H-256_A-4 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2002,0.194488,0.796693
2,0.1946,0.191451,0.808644
3,0.1912,0.191405,0.815314
4,0.189,0.190453,0.818788
5,0.1871,0.190758,0.824625
6,0.1853,0.190897,0.824625
7,0.1843,0.190875,0.823235


[I 2023-12-11 00:45:18,342] Trial 3 finished with value: 0.8232351306281267 and parameters: {'num_train_epochs': 7, 'learning_rate': 5.2898091511494136e-05, 'alpha': 0.3140746400350408, 'temperature': 24}. Best is trial 3 with value: 0.8232351306281267.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-4_H-256_A-4 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3205,0.305639,0.718177
2,0.3065,0.296624,0.749305
3,0.3012,0.292987,0.760283
4,0.2993,0.291159,0.765981
5,0.2978,0.290268,0.769594
6,0.2964,0.289715,0.772235
7,0.2959,0.289577,0.771818
8,0.2961,0.28944,0.772235


[I 2023-12-11 01:47:44,341] Trial 4 finished with value: 0.7722345747637577 and parameters: {'num_train_epochs': 8, 'learning_rate': 2.4373768682613424e-06, 'alpha': 0.4889425582875847, 'temperature': 30}. Best is trial 3 with value: 0.8232351306281267.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-4_H-256_A-4 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3472,0.326423,0.75403
2,0.3304,0.319358,0.77279
3,0.3247,0.316482,0.77821
4,0.3224,0.314489,0.781267
5,0.3204,0.313345,0.784742
6,0.3185,0.312896,0.786409
7,0.318,0.312969,0.786965


[I 2023-12-11 02:42:08,753] Trial 5 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-4_H-256_A-4 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1259,0.12161,0.753613
2,0.1236,0.120967,0.774597
3,0.1229,0.120667,0.78488
4,0.1224,0.12036,0.780434
5,0.1221,0.120237,0.788772
6,0.1218,0.120142,0.789605
7,0.1216,0.120099,0.793496


[I 2023-12-11 03:36:25,831] Trial 6 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-4_H-256_A-4 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4715,0.425386,0.772235
2,0.4326,0.411841,0.783352
3,0.4188,0.406383,0.788772
4,0.4135,0.402778,0.790717
5,0.4098,0.401911,0.791829


[I 2023-12-11 04:15:53,731] Trial 7 finished with value: 0.791828793774319 and parameters: {'num_train_epochs': 5, 'learning_rate': 1.173971182301033e-05, 'alpha': 0.8077469751146494, 'temperature': 8}. Best is trial 3 with value: 0.8232351306281267.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-4_H-256_A-4 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.275,0.26255,0.775292
2,0.2662,0.259816,0.783213
3,0.2633,0.259382,0.784047


[I 2023-12-11 04:39:16,732] Trial 8 finished with value: 0.7840466926070039 and parameters: {'num_train_epochs': 3, 'learning_rate': 1.4180834470145417e-05, 'alpha': 0.43662764362367845, 'temperature': 19}. Best is trial 3 with value: 0.8232351306281267.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-4_H-256_A-4 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3771,0.351038,0.776126
2,0.356,0.343978,0.789883
3,0.3473,0.340755,0.797526
4,0.3428,0.337214,0.803085
5,0.3391,0.336575,0.806003
6,0.3354,0.33524,0.810033
7,0.3337,0.335258,0.809061
8,0.3334,0.33487,0.811006


[I 2023-12-11 05:41:33,745] Trial 9 finished with value: 0.81100611450806 and parameters: {'num_train_epochs': 8, 'learning_rate': 1.3582277085236825e-05, 'alpha': 0.625656755680376, 'temperature': 10}. Best is trial 3 with value: 0.8232351306281267.


BestRun(run_id='3', objective=0.8232351306281267, hyperparameters={'num_train_epochs': 7, 'learning_rate': 5.2898091511494136e-05, 'alpha': 0.3140746400350408, 'temperature': 24}, run_summary=None)


In [42]:
# overwrite initial hyperparameters with from the best_run
for k,v in best_run.hyperparameters.items():
    setattr(training_args, k, v)

# Define a new repository to store our distilled model
best_model_ckpt = "tiny-bert-toxigen-best"
training_args.output_dir = best_model_ckpt


In [43]:
# Create a new Trainer with optimal parameters
optimal_trainer = DistillationTrainer(
    student_model,
    training_args,
    teacher_model=teacher_model,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

optimal_trainer.train()


# save best model, metrics and create model card
optimal_trainer.create_model_card(model_name=training_args.hub_model_id)
optimal_trainer.push_to_hub()


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1819,0.19235,0.822679
2,0.1791,0.192187,0.822262
3,0.1772,0.19502,0.814341
4,0.1761,0.193159,0.82393
5,0.1756,0.193216,0.823374
6,0.1752,0.193865,0.824208
7,0.1759,0.193707,0.822957


'https://huggingface.co/joseph10/tinybert-TG-HS-HX-parentpretrained/tree/main/'