In [1]:
import torch

# Confirm that a CUDA device is visible to PyTorch before any training starts.
cuda_ready = torch.cuda.is_available()
print(cuda_ready)

True


In [2]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install --upgrade accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting accelerate
  Downloading accelerate-0.19.0-py3-none-any.whl (219 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.1/219.1 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.19.0


In [3]:
from datasets import Dataset, Features, Value, ClassLabel, DatasetDict, load_from_disk
from transformers import DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import evaluate
from time import time
import numpy as np

In [4]:
# Directory holding the dataset previously written with `save_to_disk`.
ARROW_CACHE_DIR = "/content/drive/MyDrive/data/arrow_cache"
raw_datasets = load_from_disk(ARROW_CACHE_DIR)

In [5]:
# Show which splits the loaded dataset dictionary contains.
raw_datasets.keys()

dict_keys(['train', 'validation', 'test'])

In [6]:
# Number of examples in the training split.
raw_datasets["train"].num_rows

512940

In [7]:
# Peek at ten consecutive training rows: each has a short `text` string and an
# integer `label` id.
raw_datasets["train"][10000:10010]

{'text': ['brooch box',
  'padre pio prayer for healing',
  'magazines for teenage guys',
  'bbq wedding',
  'phq cards price list',
  'deadpool onesie',
  'harry potter scene it 2nd edition',
  'heated trolley',
  'private reg on car',
  'orthodontist cardiff'],
 'label': [452, 1187, 34, 315, 626, 698, 371, 368, 937, 584]}

In [8]:
# Name of the pretrained checkpoint; passed to `from_pretrained` for both the
# tokenizer and the classification model further down.
checkpoint = "distilbert-base-uncased"

In [9]:
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the ``text`` column of a batch of examples.

    Args:
        example: A batch as supplied by ``Dataset.map(..., batched=True)``;
            only its ``"text"`` entry is read.

    Returns:
        The tokenizer output for the batch, which ``map`` merges into the
        dataset as additional columns.
    """
    # truncation=True caps every sequence at the tokenizer's maximum model
    # length. Without it, a single over-long text would yield a sequence the
    # model cannot embed and crash the forward pass. Short texts are unchanged.
    return tokenizer(example["text"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# No padding is applied during tokenization; the collator pads each batch when
# it is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



In [10]:
# F1 metric object shared by every evaluation pass.
metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Turn raw model outputs into a macro-averaged F1 score.

    Args:
        eval_pred: A pair ``(predictions, labels)``; the predictions are
            per-class scores and the labels are the reference class ids.

    Returns:
        The result of ``metric.compute`` with ``average="macro"``.
    """
    logits, references = eval_pred
    # The highest-scoring class along axis 1 is the predicted label.
    predicted_ids = np.argmax(logits, axis=1)
    scores = metric.compute(
        predictions=predicted_ids,
        references=references,
        average="macro",
    )
    return scores

In [11]:
# Load the pretrained encoder and attach a classification head with 1419
# output classes. The head weights are newly initialised (see the warning
# printed below), so the model must be fine-tuned before use.
model = DistilBertForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=1419,
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

In [12]:
training_args = TrainingArguments(
    # --- Output locations and logging ---
    output_dir="/content/drive/MyDrive/models/model_distilbert_dev_wo_test_split/results",
    logging_dir="/content/drive/MyDrive/models/model_distilbert_dev_wo_test_split/logs",
    logging_steps=10,
    push_to_hub=False,
    # --- Optimisation schedule ---
    num_train_epochs=3,
    learning_rate=2e-5,
    warmup_steps=500,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    seed=42,
    # --- Evaluation and checkpointing (once per epoch) ---
    per_device_eval_batch_size=64,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Select the checkpoint with the highest "f1" once training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

In [13]:
# Wire the model, data, batching, and metric together for training.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

In [14]:
# Run fine-tuning. The returned TrainOutput is displayed as the cell result,
# after the per-epoch table of training loss, validation loss, and F1.
trainer.train()



Epoch,Training Loss,Validation Loss,F1
1,2.5108,2.222072,0.467016
2,2.2224,1.917378,0.519725
3,1.4065,1.843331,0.538795


TrainOutput(global_step=96177, training_loss=2.386570128717082, metrics={'train_runtime': 2518.6787, 'train_samples_per_second': 610.963, 'train_steps_per_second': 38.185, 'total_flos': 3773666146726008.0, 'train_loss': 2.386570128717082, 'epoch': 3.0})

In [15]:
# Write the fine-tuned model and its tokenizer to the same directory so the
# pair can be reloaded together from one path.
output_dir="/content/drive/MyDrive/models/model_distilbert_dev_wo_test_split/results/final_model"
model.save_pretrained(output_dir)
# Last expression: the tuple of tokenizer file paths is shown as the cell output.
tokenizer.save_pretrained(output_dir)

('/content/drive/MyDrive/models/model_distilbert_dev_wo_test_split/results/final_model/tokenizer_config.json',
 '/content/drive/MyDrive/models/model_distilbert_dev_wo_test_split/results/final_model/special_tokens_map.json',
 '/content/drive/MyDrive/models/model_distilbert_dev_wo_test_split/results/final_model/vocab.txt',
 '/content/drive/MyDrive/models/model_distilbert_dev_wo_test_split/results/final_model/added_tokens.json')