#### CPU vs GPU

In [1]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, BitsAndBytesConfig, logging, pipeline
import time
import random
from pprint import pprint

def set_seed(seed=42):
    random.seed(seed)
    torch.manual_seed(seed)

set_seed()

logging.set_verbosity_error()

In [2]:
from datasets import load_dataset, Dataset

In [3]:
is_gpu_available = torch.cuda.is_available()

num_of_devices = torch.cuda.device_count()

if torch.cuda.is_available():
    device_name = torch.cuda.get_device_name()

print("Is CUDA available? ", is_gpu_available)
print("Number of CUDA devices: ", num_of_devices)
print("CUDA device name: ", device_name)

Is CUDA available?  True
Number of CUDA devices:  1
CUDA device name:  Tesla T4


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# move this tensor to the available device
tensor = torch.rand(2, 2)
tensor.to(device)

print("Device: ", device)
print("Tensor device: ", tensor.device)

Device:  cuda
Tensor device:  cpu


In [5]:
start_time = time.time()
cpu_tensor = torch.rand(1000, 1000)
cpu_sum = cpu_tensor.sum()
cpu_time = time.time() - start_time
print("CPU time: ", cpu_time)

if torch.cuda.is_available():
    start_time = time.time()
    gpu_tensor = torch.rand(1000, 1000, device=device)
    gpu_sum = gpu_tensor.sum()
    gpu_time = time.time() - start_time
    print("GPU time: ", gpu_time)
    print("Speedup: ", cpu_time / gpu_time)

CPU time:  0.015035867691040039
GPU time:  0.019093990325927734
Speedup:  0.7874659740778663


#### Tuning the model

In [6]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
dataset = load_dataset("stanfordnlp/imdb")
train_df = dataset["train"].to_pandas()

In [8]:
train_df['dataset'] = 'train'

In [9]:
train_dataset = Dataset.from_pandas(train_df)

In [10]:
test_df = dataset["test"].to_pandas()
test_df['dataset'] = 'test'

In [11]:
test_dataset = Dataset.from_pandas(test_df)

In [12]:
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="longest", truncation=True)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [13]:
training_args = TrainingArguments(
    output_dir="./temp_results",
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    learning_rate=1e-4,
    save_strategy="no",
    num_train_epochs=3,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
)


model = AutoModelForSequenceClassification.from_pretrained("prajjwal1/bert-tiny", num_labels=2)

Loading weights:   0%|          | 0/39 [00:00<?, ?it/s]

In [14]:
model.to(device)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset
)

trainer.train()
trainer.evaluate()

{'loss': '0.6182', 'grad_norm': '4.426', 'learning_rate': '9.98e-05', 'epoch': '0.2399'}
{'loss': '0.4005', 'grad_norm': '11.09', 'learning_rate': '9.132e-05', 'epoch': '0.4798'}
{'loss': '0.3428', 'grad_norm': '22.77', 'learning_rate': '8.263e-05', 'epoch': '0.7198'}
{'loss': '0.3363', 'grad_norm': '22.18', 'learning_rate': '7.394e-05', 'epoch': '0.9597'}
{'loss': '0.279', 'grad_norm': '1.759', 'learning_rate': '6.525e-05', 'epoch': '1.2'}
{'loss': '0.26', 'grad_norm': '5.728', 'learning_rate': '5.655e-05', 'epoch': '1.44'}
{'loss': '0.2631', 'grad_norm': '8.658', 'learning_rate': '4.786e-05', 'epoch': '1.679'}
{'loss': '0.2551', 'grad_norm': '21.32', 'learning_rate': '3.917e-05', 'epoch': '1.919'}
{'loss': '0.2107', 'grad_norm': '4.111', 'learning_rate': '3.048e-05', 'epoch': '2.159'}
{'loss': '0.2047', 'grad_norm': '28.01', 'learning_rate': '2.178e-05', 'epoch': '2.399'}
{'loss': '0.1932', 'grad_norm': '47.44', 'learning_rate': '1.309e-05', 'epoch': '2.639'}
{'loss': '0.2043', 'grad

{'eval_loss': 0.4268324077129364,
 'eval_runtime': 38.9209,
 'eval_samples_per_second': 642.328,
 'eval_steps_per_second': 53.544,
 'epoch': 3.0}

In [15]:
model.save_pretrained("./our_finetuned_model")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

In [16]:
model_path = "./our_finetuned_model"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

Loading weights:   0%|          | 0/41 [00:00<?, ?it/s]



#### Using our tuned model

In [17]:
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer)
single_text = "It was a great movie!"
result = classifier(single_text)

print(f"Prediction: {result[0]['label']}, Score: {result[0]['score']}")

Prediction: LABEL_1, Score: 0.9931102991104126


In [18]:
batch_texts = ["Terrible", "Would not recommend"]
results = classifier(batch_texts)

for i, result in enumerate(results):
    print(f"Prediction for text {i+1}: {result['label']}, Score: {result['score']}")


Prediction for text 1: LABEL_0, Score: 0.992004930973053
Prediction for text 2: LABEL_0, Score: 0.9300445318222046


#### Finetuning with LoRA

In [19]:
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training, LoftQConfig


In [20]:
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,  # false in training, true when inferring
    r=16,  # Rank of low-rank matrices
    lora_alpha=32,  # analogous to the learning rate in normal GD
    lora_dropout=0.1  # Dropout rate, helps prevent overfitting
)

In [21]:
model = AutoModelForSequenceClassification.from_pretrained("prajjwal1/bert-tiny", num_labels=2)
lora_model = get_peft_model(model, peft_config)

Loading weights:   0%|          | 0/39 [00:00<?, ?it/s]

In [22]:
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

print_trainable_parameters(lora_model)

trainable params: 16642 || all params: 4402820 || trainable%: 0.37798501869256523


In [23]:
lora_model.to(device)

trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset
)

trainer.train()
trainer.evaluate()

{'loss': '0.6878', 'grad_norm': '0.7834', 'learning_rate': '9.98e-05', 'epoch': '0.2399'}
{'loss': '0.6348', 'grad_norm': '1.525', 'learning_rate': '9.132e-05', 'epoch': '0.4798'}
{'loss': '0.5451', 'grad_norm': '4.293', 'learning_rate': '8.263e-05', 'epoch': '0.7198'}
{'loss': '0.4891', 'grad_norm': '7.75', 'learning_rate': '7.394e-05', 'epoch': '0.9597'}
{'loss': '0.4769', 'grad_norm': '2.882', 'learning_rate': '6.525e-05', 'epoch': '1.2'}
{'loss': '0.4474', 'grad_norm': '5.258', 'learning_rate': '5.655e-05', 'epoch': '1.44'}
{'loss': '0.4355', 'grad_norm': '7.839', 'learning_rate': '4.786e-05', 'epoch': '1.679'}
{'loss': '0.4486', 'grad_norm': '3.235', 'learning_rate': '3.917e-05', 'epoch': '1.919'}
{'loss': '0.4443', 'grad_norm': '9.971', 'learning_rate': '3.048e-05', 'epoch': '2.159'}
{'loss': '0.4439', 'grad_norm': '1.723', 'learning_rate': '2.178e-05', 'epoch': '2.399'}
{'loss': '0.4399', 'grad_norm': '1.885', 'learning_rate': '1.309e-05', 'epoch': '2.639'}
{'loss': '0.4346', 'g

{'eval_loss': 0.4171816408634186,
 'eval_runtime': 39.6438,
 'eval_samples_per_second': 630.616,
 'eval_steps_per_second': 52.568,
 'epoch': 3.0}

In [24]:
model.save_pretrained("./lora_finetunedmodel")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

#### Finetuning with QLoRA

Note that bert-tiny is already ~4MB, so QLoRA gives almost no benefit. QLoRA is designed for 7B+ models, but if you were to use QLoRA, below is an example setup.


load_in_4bit: This parameter, when set to True, will load the model in 4-bit precision.

bnb_4bit_quant_type: This parameter specifies the type of quantization. We will use "nf4" which stands for narrow full precision 4-bit.

bnb_4bit_use_double_quant: This parameter, when set to True, will use double quantization. Double quantization can help to reduce the quantization error.

bnb_4bit_compute_dtype: This parameter specifies the data type for computation. We will use torch.bfloat16 which is a 16-bit floating point format.

In [25]:
config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:

model = AutoModelForSequenceClassification.from_pretrained("prajjwal1/bert-tiny", num_labels=2, quantization_config=config)
loftq_config = LoftQConfig(loftq_bits=4)
model = prepare_model_for_kbit_training(model)

In [31]:
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, # types here: https://github.com/huggingface/peft/blob/v0.7.1/src/peft/utils/peft_types.py#L22
    inference_mode=False,  # false in training, true when inferring
    loftq_config=loftq_config,
    r=16,  # Rank of low-rank matrices
    lora_alpha=32,
    lora_dropout=0.1  # Dropout rate, helps prevent overfitting
)

In [28]:
qlora_model = get_peft_model(model, peft_config)

def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

print_trainable_parameters(qlora_model)

trainable params: 16642 || all params: 4198020 || trainable%: 0.3964249813007084


In [None]:
qlora_model.to(device)

trainer = Trainer(
    model=qlora_model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset
)

trainer.train()
trainer.evaluate()

#### Evaluating Generative Models with Perplexity

In simple terms, perplexity approximates how likely the model would be to predict each token in a given sequence correctly. For instance, GPT-4 might have no problem completing the phrase “Four score and seven years ago” with the entirety of the Gettysburg Address (of which “Four score and seven years ago” are the famous opening lines).

A less capable model like GPT-2 might have difficulty completing that sequence.

We’d say that GPT-4 had a lower perplexity with respect to the Gettysburg Address than GPT-2 did.

Put more technically, perplexity is defined as the exponentiated cross-entropy of the likelihood of a given sequence.