In [1]:
!pip install datasets
!pip install transformers[torch]
!pip install evaluate
!pip install -U accelerate
!apt-get install git-lfs

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━[0m [32m440.3/542.1 kB[0m [31m13.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.1 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux20

# 4-Optimization

In this notebook, I fine-tune the pretrained DistilBERT model using a larger portion of the available movie-review dataset. I also try out a few different hyperparameter settings and consider several scoring metrics.

## Import

In [2]:
import numpy as np
import torch

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification,\
    TrainingArguments, Trainer
import accelerate
import evaluate


In [3]:
# Log in to the Hugging Face Hub; notebook_login() renders a login widget as
# the cell output. The training configuration below sets push_to_hub=True.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Display GPU Info

In [4]:
# Display GPU information.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Fri Jun  7 15:43:10 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:00:03.0 Off |                    0 |
| N/A   39C    P8              12W /  72W |      1MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## Instantiate tokenizer, trainer (and training arguments), and classifier.

In [5]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_function(examples):
    """Tokenize the ``"text"`` field of ``examples``.

    Written to be passed to ``Dataset.map``. ``padding="max_length"`` and
    ``truncation=True`` are forwarded to the tokenizer so that
    variable-length texts are padded or truncated by it.

    Args:
        examples: mapping that provides the raw texts under the key ``"text"``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [6]:
# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    # Local directory for the Trainer's output.
    output_dir='./test_trainer',

    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,

    # Evaluate once per epoch.
    eval_strategy='epoch',

    # Publishing to the Hugging Face Hub.
    push_to_hub=True,
    hub_model_id='derek-harnett/movie-review-classifier',
    hub_strategy='end',
)

In [7]:
# The four scoring metrics considered here, each loaded through `evaluate`
# (in this order: accuracy, precision, recall, f1).
accuracy, precision, recall, f1_score = (
    evaluate.load(metric_name)
    for metric_name in ('accuracy', 'precision', 'recall', 'f1')
)

def compute_metrics(eval_pred):
    """Score one evaluation pass with accuracy, precision, recall and F1.

    Args:
        eval_pred: pair ``(logits, labels)`` holding the raw model outputs
            and the reference labels.

    Returns:
        dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'`` and
        ``'f1'``, each holding the value computed by the matching
        module-level metric object.
    """
    logits, labels = eval_pred

    # The predicted class is the index of the largest logit on the last axis.
    predictions = np.argmax(logits, axis=-1)

    # Result key -> metric object; each metric reports its value under the
    # same key that is used in the returned dict.
    scorers = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1_score,
    }
    return {
        key: scorer.compute(predictions=predictions, references=labels)[key]
        for key, scorer in scorers.items()
    }

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [8]:
# Load the movie reviews: the 'imdb' dataset, fetched with load_dataset.
# The cell output reports train, test and unsupervised splits.
ds_movies = load_dataset('imdb')

Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [9]:
# Apply tokenize_function to ds_movies; batched=True is passed to map().
# NOTE(review): the progress bars show the 50,000-example 'unsupervised' split
# being tokenized as well, although the cells below only read "train" and
# "test" - confirm whether it can be skipped.
tokenized_datasets = ds_movies.map(tokenize_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [10]:
# Train and test on a subset of the full dataset.
# The shuffles are seeded so that the same examples are selected on every
# run; an unseeded shuffle() draws a different subset each time.
shuffle_seed = 42
n_samples = 12_500
small_train_dataset = tokenized_datasets["train"].shuffle(seed=shuffle_seed).select(range(n_samples))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=shuffle_seed).select(range(n_samples))

In [11]:
# Load the pretrained 'distilbert-base-uncased' checkpoint for sequence
# classification with num_labels=2. As the log output notes, the classifier
# weights are newly initialized and still have to be trained.
# NOTE(review): presumably the two labels are negative/positive reviews -
# confirm the label order against the dataset.
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Bundle the model, the training configuration, the two data subsets and the
# metric callback into a Trainer.
# NOTE(review): the tokenizer is not passed to the Trainer - confirm whether
# it should be saved/uploaded together with the model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)


In [14]:
# Run the fine-tuning. With eval_strategy='epoch' the output table gets one
# row of validation metrics per epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3135,0.23585,0.91336,0.882589,0.95319,0.916532
2,0.1613,0.259606,0.92032,0.92743,0.91167,0.919483
3,0.0991,0.310477,0.92072,0.913085,0.929625,0.92128


TrainOutput(global_step=2346, training_loss=0.17980049617861643, metrics={'train_runtime': 1349.3332, 'train_samples_per_second': 27.792, 'train_steps_per_second': 1.739, 'total_flos': 4967527449600000.0, 'train_loss': 0.17980049617861643, 'epoch': 3.0})

In [15]:
# Evaluate on the eval_dataset given to the Trainer (small_eval_dataset) and
# display the resulting metrics dict.
trainer.evaluate()

{'eval_loss': 0.31047725677490234,
 'eval_accuracy': 0.92072,
 'eval_precision': 0.9130845536136042,
 'eval_recall': 0.9296248797691568,
 'eval_f1': 0.9212804829613155,
 'eval_runtime': 119.7585,
 'eval_samples_per_second': 104.377,
 'eval_steps_per_second': 6.53,
 'epoch': 3.0}

## Save Model and Tokenizer

For subsequent use in an API, I'll save both the trained model and tokenizer. Afterwards, I'll download them to my PC and use them to set up an API which classifies movie reviews upon receiving a POST request.

In [16]:
# Write the fine-tuned model and the tokenizer into sibling folders below one
# common root, for later use in the API.
# NOTE(review): no visible cell mounts Google Drive - confirm /content/drive
# is mounted before relying on these files.
save_root = '/content/drive/MyDrive/movie-review-classifier'

trainer.save_model(f'{save_root}/model')

# Last expression of the cell: the tuple of written tokenizer files is shown.
tokenizer.save_pretrained(f'{save_root}/tokenizer')

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

events.out.tfevents.1717775159.13ebd0311d0e.2192.0:   0%|          | 0.00/7.32k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

events.out.tfevents.1717776628.13ebd0311d0e.2192.1:   0%|          | 0.00/560 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

('/content/drive/MyDrive/movie-review-classifier/tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/movie-review-classifier/tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/movie-review-classifier/tokenizer/vocab.txt',
 '/content/drive/MyDrive/movie-review-classifier/tokenizer/added_tokens.json',
 '/content/drive/MyDrive/movie-review-classifier/tokenizer/tokenizer.json')

## Push to Hugging Face

In [17]:
# The Trainer was created without the tokenizer, so its push does not include
# the tokenizer files. Upload the tokenizer to the same Hub repo first so the
# published checkpoint can be loaded on its own.
tokenizer.push_to_hub(training_args.hub_model_id)

# Kept as the last expression so the model upload's CommitInfo is displayed.
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/derek-harnett/movie-review-classifier/commit/0c583afac38b0ed414b9fe741afad4493bd687e4', commit_message='End of training', commit_description='', oid='0c583afac38b0ed414b9fe741afad4493bd687e4', pr_url=None, pr_revision=None, pr_num=None)