In [None]:
!pip install datasets evaluate transformers peft

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting peft
  Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.

# Fine-tuning Sandbox


In [None]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

### dataset

In [None]:
# # how dataset was generated

# # load imdb data
# imdb_dataset = load_dataset("imdb")

# # define subsample size
# N = 1000
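# # generate indexes for random subsample
# # (note: randint samples WITH replacement and its upper bound is exclusive,
# #  so duplicates are possible and index 24999 is never drawn)
# rand_idx = np.random.randint(24999, size=N)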

# # extract train and test data
# x_train = imdb_dataset['train'][rand_idx]['text']
# y_train = imdb_dataset['train'][rand_idx]['label']

# x_test = imdb_dataset['test'][rand_idx]['text']
# y_test = imdb_dataset['test'][rand_idx]['label']

# # create new dataset
# dataset = DatasetDict({'train':Dataset.from_dict({'label':y_train,'text':x_train}),
#                              'validation':Dataset.from_dict({'label':y_test,'text':x_test})})

In [None]:
# load the pre-built IMDB subsample (train / validation splits) from the Hugging Face Hub
dataset = load_dataset('shawhin/imdb-truncated')
# bare last expression so the notebook renders the DatasetDict summary
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/592 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/836k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/853k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [None]:
# display the fraction of training examples with label=1 (mean of the integer label column)
np.array(dataset['train']['label']).mean()

0.5

### model

In [None]:
# base checkpoint that is fine-tuned below; also reused for the tokenizer and the output directory name
model_checkpoint = 'distilbert-base-uncased'
# model_checkpoint = 'roberta-base' # you can alternatively use roberta-base but this model is bigger thus training will take longer

# define label maps (class id <-> human readable name), stored on the model config
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative":0, "Positive":1}

# generate classification model from model_checkpoint
# (two-class head; the head weights are newly initialised, see the warning printed below)
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# display architecture
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

### preprocess data

In [None]:
# create tokenizer for the same checkpoint as the model
# NOTE(review): add_prefix_space looks intended for the roberta-base alternative — confirm it has no effect for distilbert
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# add pad token if none exists
# (the model's embedding matrix is resized so the newly added token id has an embedding row)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# create tokenize function
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the notebook-level tokenizer.

    Sequences longer than 512 tokens are truncated from the left, i.e. the end
    of each text is kept. Returns the tokenizer output unchanged.
    """
    review_texts = examples["text"]

    # side effect: reconfigures the shared tokenizer so truncation drops leading tokens
    tokenizer.truncation_side = "left"

    # tokenize and truncate in a single call
    return tokenizer(
        review_texts,
        return_tensors="np",
        truncation=True,
        max_length=512,
    )

In [None]:
# tokenize training and validation datasets
# (batched=True passes tokenize_function a batch of examples per call instead of one example)
tokenized_dataset = dataset.map(tokenize_function, batched=True)
# display the resulting splits; input_ids and attention_mask columns are added
tokenized_dataset

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [None]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### evaluation

In [None]:
# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
# define an evaluation function to pass into trainer later
def compute_metrics(p):
    """Compute classification accuracy for the Trainer.

    Parameters
    ----------
    p : pair ``(predictions, labels)``; ``predictions`` holds per-class scores
        with the classes on axis 1, ``labels`` the reference class ids.

    Returns
    -------
    dict
        Flat mapping ``{"accuracy": value}``.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    # accuracy.compute() already returns {"accuracy": value}; wrapping it in another
    # dict produced a nested value that the Trainer could not log as a scalar.
    return accuracy.compute(predictions=predictions, references=labels)

### Apply untrained model to text

In [None]:
# define list of examples
text_list = ["It was good.", "Not a fan, don't recommed.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]

print("Untrained model predictions:")
print("----------------------------")
# inference only: no_grad avoids building an autograd graph for every forward pass
with torch.no_grad():
    for text in text_list:
        # tokenize text (a batch containing a single sequence)
        inputs = tokenizer.encode(text, return_tensors="pt")
        # compute logits
        logits = model(inputs).logits
        # convert logits to label: argmax over the class axis rather than the flattened tensor
        predictions = torch.argmax(logits, dim=-1)

        print(text + " - " + id2label[predictions.tolist()[0]])

Untrained model predictions:
----------------------------
It was good. - Negative
Not a fan, don't recommed. - Negative
Better than the first one. - Negative
This is not worth watching even once. - Negative
This one is a pass. - Negative


### Train model

In [None]:
# LoRA configuration for sequence classification: rank-4 adapters (alpha 32, dropout 0.01)
# attached only to modules named `q_lin` (the attention query projections in the architecture printed above)
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=4,
                        lora_alpha=32,
                        lora_dropout=0.01,
                        target_modules = ['q_lin'])

In [None]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=4, target_modules={'q_lin'}, lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None)

In [None]:
# wrap the base model with the LoRA adapters
# NOTE: rebinds `model`, so re-running this cell would wrap the already wrapped model again
model = get_peft_model(model, peft_config)
# report how many parameters remain trainable
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [None]:
# hyperparameters
lr = 1e-3
batch_size = 4
num_epochs = 10

In [None]:
# define training arguments
# evaluation and checkpointing both happen once per epoch
# NOTE(review): no metric_for_best_model is set, so load_best_model_at_end presumably selects by validation loss — confirm
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)



In [None]:
# create trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
)

# train model (the per-epoch metrics table is rendered as the cell output)
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.416039,{'accuracy': 0.879}
2,0.430500,0.469142,{'accuracy': 0.877}
3,0.430500,0.593736,{'accuracy': 0.882}
4,0.178200,0.646781,{'accuracy': 0.89}
5,0.178200,0.697777,{'accuracy': 0.886}
6,0.052700,0.84429,{'accuracy': 0.883}
7,0.052700,0.92037,{'accuracy': 0.887}
8,0.014400,0.954775,{'accuracy': 0.886}
9,0.014400,1.013871,{'accuracy': 0.881}
10,0.002400,1.001454,{'accuracy': 0.882}


Trainer is attempting to log a value of "{'accuracy': 0.879}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.877}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.882}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.89}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.886}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This in

TrainOutput(global_step=2500, training_loss=0.1356455415248871, metrics={'train_runtime': 466.4212, 'train_samples_per_second': 21.44, 'train_steps_per_second': 5.36, 'total_flos': 1112883852759936.0, 'train_loss': 0.1356455415248871, 'epoch': 10.0})

### Generate prediction

In [None]:
# run inference on the CPU ('mps' can be used instead on Apple-silicon Macs)
device = 'cpu'
model.to(device)
# eval mode disables dropout (including the LoRA dropout) so predictions are deterministic
model.eval()

print("Trained model predictions:")
print("--------------------------")
# inference only: no gradients are needed
with torch.no_grad():
    for text in text_list:
        # tokenize and place the ids on the same device as the model
        inputs = tokenizer.encode(text, return_tensors="pt").to(device)

        logits = model(inputs).logits
        # index of the highest logit along the class axis
        predictions = torch.max(logits,1).indices

        print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
It was good. - Positive
Not a fan, don't recommed. - Negative
Better than the first one. - Positive
This is not worth watching even once. - Negative
This one is a pass. - Negative


### Optional: push model to hub

In [None]:
# option 1: notebook login
from huggingface_hub import notebook_login
notebook_login() # ensure token gives write access

# # option 2: key login
# from huggingface_hub import login
# write_key = 'hf_' # paste token here
# login(write_key)

In [None]:
hf_name = 'shawhin' # your hf username or org name
model_id = hf_name + "/" + model_checkpoint + "-lora-text-classification" # you can name the model whatever you want

In [None]:
model.push_to_hub(model_id) # save model

In [None]:
# Trainer.push_to_hub()'s first positional argument is the commit message, not a repo id,
# so the target repository is selected through hub_model_id instead.
trainer.args.hub_model_id = model_id
trainer.push_to_hub() # save trainer

### Optional: load peft model

In [None]:
# how to load peft model from hub for inference
# 1) read the adapter config to find which base checkpoint the adapter was trained on
config = PeftConfig.from_pretrained(model_id)
# 2) rebuild the base classifier with the same label maps
inference_model = AutoModelForSequenceClassification.from_pretrained(
    config.base_model_name_or_path, num_labels=2, id2label=id2label, label2id=label2id
)
# NOTE: `tokenizer` and `model` are rebound here and replace the objects used for training above
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
# 3) attach the adapter weights stored under model_id
model = PeftModel.from_pretrained(inference_model, model_id)

In [None]:
from torchvision.datasets import MNIST

In [None]:
import torchvision.transforms as transforms
# preprocessing applied to every MNIST image when it is read from the dataset
transform = transforms.Compose([
    transforms.Grayscale(3),  # Convert grayscale to RGB
    transforms.Resize((224, 224)),  # Resize to match ViT input size
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))  # single mean/std value given for the 3-channel image
])


# download (if needed) and open the train / test splits under ./data
train_dataset = MNIST(root='./data', train=True, transform=transform, download=True)
test_dataset = MNIST(root='./data', train=False, transform=transform, download=True)


In [None]:
from torch.utils.data import DataLoader
# NOTE: rebinds the notebook-level `batch_size` (it was 4 for the text-classification run above)
batch_size = 64
# shuffle only the training split; evaluation order stays fixed
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
from transformers import AutoModelForImageClassification
# NOTE: `model_checkpoint` and `model` are rebound; the DistilBERT objects above are replaced
model_checkpoint = 'google/vit-base-patch16-224'
# num_labels=10 with ignore_mismatched_sizes=True swaps the checkpoint's 1000-class head
# for a freshly initialised 10-class head (see the warning printed below)
model = AutoModelForImageClassification.from_pretrained(model_checkpoint, num_labels=10, ignore_mismatched_sizes=True)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
id2label = {i: str(i) for i in range(10)}
label2id = {str(i): i for i in range(10)}
model.config.id2label = id2label
model.config.label2id = label2id

In [None]:
# Modify the classifier head to match the number of MNIST classes
# NOTE(review): the model was already loaded with num_labels=10, so this replaces a head that
# already has 10 outputs — presumably only the initialisation scheme changes; confirm it is needed
model.classifier = torch.nn.Linear(model.config.hidden_size, 10)
torch.nn.init.xavier_uniform_(model.classifier.weight)  # Reinitialize the classifier weights
model.classifier.bias.data.fill_(0.0)  # Reinitialize the classifier biases (the returned tensor is the cell output)

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [None]:
# LoRA configuration for the ViT image classifier.
# task_type is deliberately not set: "SEQ_CLS" selects PEFT's text sequence-classification
# wrapper, whose forward() hands text arguments such as input_ids to the base model,
# which an image model does not accept.
# modules_to_save keeps the classifier head fully trainable and stored with the adapter
# (the SEQ_CLS task type used to add it implicitly).
peft_config = LoraConfig(
    r=4,
    lora_alpha=32,
    lora_dropout=0.01,
    target_modules=['query', 'key', 'value', 'dense'],
    modules_to_save=['classifier'],
)

In [None]:
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 671,242 || all params: 86,477,588 || trainable%: 0.7762


In [None]:
accuracy = evaluate.load("accuracy")

In [None]:
def compute_metrics(p):
    """Compute classification accuracy for the Trainer.

    ``p`` is a pair ``(predictions, labels)`` where ``predictions`` holds
    per-class scores with the classes on axis 1. Returns a flat
    ``{"accuracy": value}`` mapping.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    # accuracy.compute() already returns {"accuracy": value}; wrapping it again nested
    # the value and made `eval_accuracy` a dict instead of a number.
    return accuracy.compute(predictions=predictions, references=labels)

In [None]:
def _collate_image_label_pairs(examples):
    """Turn a list of (image_tensor, label) pairs into the keyword batch the model expects."""
    images, labels = zip(*examples)
    return {
        "pixel_values": torch.stack(list(images)),
        "labels": torch.tensor(labels, dtype=torch.long),
    }


def train_model(model, train_loader, test_loader, num_epochs, lr):
    """Fine-tune ``model`` with the Hugging Face Trainer and print the final accuracy.

    Parameters
    ----------
    model : model to train (updated in place by the Trainer).
    train_loader, test_loader : DataLoaders whose ``.dataset`` yields
        ``(image_tensor, label)`` pairs; their ``.batch_size`` is reused for
        training and evaluation.
    num_epochs : number of training epochs.
    lr : learning rate.
    """
    # Define training arguments
    training_args = TrainingArguments(
        output_dir=model_checkpoint + "-lora-mnist-classification",
        learning_rate=lr,
        per_device_train_batch_size=train_loader.batch_size,
        per_device_eval_batch_size=test_loader.batch_size,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        # inputs are produced by the collator below, so there are no dataset columns to prune
        remove_unused_columns=False,
        # a PEFT-wrapped model may not expose `labels` in its signature; name it explicitly
        # so evaluation still receives the labels and compute_metrics is called
        label_names=["labels"],
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_loader.dataset,
        eval_dataset=test_loader.dataset,
        # the default collator cannot handle (image, label) tuples
        # ("vars() argument must have __dict__ attribute")
        data_collator=_collate_image_label_pairs,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    evaluation = trainer.evaluate()
    eval_accuracy = evaluation['eval_accuracy']
    # tolerate a compute_metrics that returns a nested {"accuracy": {"accuracy": value}}
    if isinstance(eval_accuracy, dict):
        eval_accuracy = eval_accuracy["accuracy"]
    print(f"Accuracy: {eval_accuracy:.4f}")

In [None]:
num_epochs = 5
learning_rate = 1e-3

# Train the model
train_model(model, train_loader, test_loader, num_epochs, learning_rate)



TypeError: vars() argument must have __dict__ attribute

In [None]:
# Test the trained model
print("Trained model predictions:")
print("--------------------------")
# eval mode disables dropout; no_grad skips autograd bookkeeping during inference
model.eval()
# the model may have been moved to an accelerator during training; inputs must live on the same device
device = next(model.parameters()).device
with torch.no_grad():
    for i, (images, labels) in enumerate(test_loader):
        if i >= 5:  # Limiting to first 5 batches for brevity
            break
        # pass the images by keyword so they reach the vision model's pixel_values argument
        outputs = model(pixel_values=images.to(device)).logits
        predictions = torch.argmax(outputs, dim=1)
        print(f"True Labels: {labels.tolist()}")
        print(f"Predictions: {predictions.tolist()}")

In [None]:
!pip install evaluate



In [None]:
# Import necessary libraries
from transformers import (
    AutoModelForImageClassification,
    TrainingArguments,
    Trainer,
    default_data_collator
)
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
from torchvision.datasets import MNIST
from datasets import Dataset as HFDataset, DatasetDict
import numpy as np

# Thin wrapper that exposes (image, label) samples as {"image", "label"} dictionaries
class MNISTDataset(Dataset):
    """Adapter around an indexable dataset of ``(image, label)`` pairs.

    Indexing returns ``{"image": image, "label": label}``; the length is the
    length of the wrapped dataset.
    """

    def __init__(self, mnist_dataset):
        # keep a reference only; nothing is copied or loaded here
        self.mnist_dataset = mnist_dataset

    def __getitem__(self, idx):
        sample = self.mnist_dataset[idx]
        # each wrapped sample must unpack into exactly two values
        image, label = sample
        return dict(image=image, label=label)

    def __len__(self):
        return len(self.mnist_dataset)

# Load MNIST dataset
transform = transforms.Compose([
    transforms.Grayscale(3),  # Convert grayscale to RGB
    transforms.Resize((224, 224)),  # Resize to match ViT input size
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

train_dataset = MNIST(root='./data', train=True, transform=transform, download=True)
test_dataset = MNIST(root='./data', train=False, transform=transform, download=True)




AttributeError: module 'pyarrow.lib' has no attribute 'ListViewType'

In [None]:
!pip install pyarrow==9.0.0

Collecting pyarrow==9.0.0
  Downloading pyarrow-9.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.3/35.3 MB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 16.1.0
    Uninstalling pyarrow-16.1.0:
      Successfully uninstalled pyarrow-16.1.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 9.0.0 which is incompatible.
datasets 2.20.0 requires pyarrow>=15.0.0, but you have pyarrow 9.0.0 which is incompatible.[0m[31m
[0mSuccessfully installed pyarrow-9.0.0


In [None]:
# Convert to custom dataset
# The isinstance guards keep this cell idempotent: re-running it must not wrap an
# already wrapped dataset (the inner item would then be a dict, not an (image, label) pair).
if not isinstance(train_dataset, MNISTDataset):
    train_dataset = MNISTDataset(train_dataset)
if not isinstance(test_dataset, MNISTDataset):
    test_dataset = MNISTDataset(test_dataset)

# Keep the data lazy instead of converting it to a Hugging Face Dataset:
# - HFDataset.from_pandas expects a pandas DataFrame, not a dict of lists;
# - building those lists would run the transform on every image up front and hold
#   all resulting 3x224x224 float tensors in memory at once.
# The wrapped datasets are indexable and sized, which is what the Trainer needs.
hf_train_dataset = train_dataset
hf_test_dataset = test_dataset

# plain dict keyed by split name (DatasetDict is meant for datasets.Dataset values)
# NOTE: the name `datasets` shadows the `datasets` package if it is imported as a module
datasets = {"train": hf_train_dataset, "test": hf_test_dataset}



In [None]:
# Load pre-trained ViT model for image classification with ignored mismatched sizes
model_checkpoint = 'google/vit-base-patch16-224'
model = AutoModelForImageClassification.from_pretrained(
    model_checkpoint,
    num_labels=10,  # Number of classes for MNIST
    ignore_mismatched_sizes=True  # Ignore mismatched sizes for the classifier
)



In [None]:
# Define label maps for MNIST (digits 0-9)
id2label = {i: str(i) for i in range(10)}
label2id = {str(i): i for i in range(10)}
model.config.id2label = id2label
model.config.label2id = label2id

# Modify the classifier head to match the number of MNIST classes
model.classifier = torch.nn.Linear(model.config.hidden_size, 10)
torch.nn.init.xavier_uniform_(model.classifier.weight)  # Reinitialize the classifier weights
model.classifier.bias.data.fill_(0.0)  # Reinitialize the classifier biases



In [None]:
# Apply LoRA fine-tuning
# We will specifically target the linear layers within the ViT's attention and feed-forward networks
# task_type is deliberately not set: "SEQ_CLS" selects PEFT's text sequence-classification
# wrapper, whose forward() hands text arguments such as input_ids to the base model,
# which an image model does not accept.
# modules_to_save keeps the classifier head fully trainable and stored with the adapter
# (the SEQ_CLS task type used to add it implicitly).
peft_config = LoraConfig(
    r=4,
    lora_alpha=32,
    lora_dropout=0.01,
    target_modules=['query', 'key', 'value', 'dense'],
    modules_to_save=['classifier'],
)
# NOTE: rebinds `model`; re-running this cell would wrap the already wrapped model again
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Define accuracy metric




In [None]:
# Load the accuracy metric used by compute_metrics below
accuracy = evaluate.load("accuracy")

# Define a function to compute metrics
def compute_metrics(p):
    """Compute classification accuracy for the Trainer.

    ``p`` is a pair ``(predictions, labels)`` where ``predictions`` holds
    per-class scores with the classes on axis 1. Returns a flat
    ``{"accuracy": value}`` mapping.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    # accuracy.compute() already returns {"accuracy": value}; wrapping it again nested
    # the value and made `eval_accuracy` a dict instead of a number.
    return accuracy.compute(predictions=predictions, references=labels)

In [None]:
def _collate_image_dicts(examples):
    """Turn a list of {"image", "label"} examples into the keyword batch the model expects."""
    # as_tensor accepts tensors as well as nested lists (e.g. rows read back from an Arrow table)
    pixel_values = torch.stack([torch.as_tensor(example["image"]) for example in examples])
    labels = torch.tensor([int(example["label"]) for example in examples], dtype=torch.long)
    return {"pixel_values": pixel_values, "labels": labels}


# Define a function to train and evaluate the model
def train_model(model, datasets, num_epochs, lr, batch_size=None):
    """Fine-tune ``model`` with the Hugging Face Trainer and print the final accuracy.

    Parameters
    ----------
    model : model to train (updated in place by the Trainer).
    datasets : mapping with ``"train"`` and ``"test"`` entries whose items are
        ``{"image": ..., "label": ...}`` dictionaries.
    num_epochs : number of training epochs.
    lr : learning rate.
    batch_size : per-device batch size for training and evaluation. When omitted,
        the notebook-level ``batch_size`` is used if it exists, otherwise 64.
    """
    if batch_size is None:
        # fall back to the notebook-level value so existing calls behave as before,
        # without failing on a fresh kernel where it was never defined
        batch_size = globals().get("batch_size", 64)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=model_checkpoint + "-lora-mnist-classification",
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        eval_strategy="epoch",  # Updated from deprecated `evaluation_strategy`
        save_strategy="epoch",
        load_best_model_at_end=True,
        # keep the "image" field: it is not a model argument name and would otherwise be
        # dropped before the collator can rename it
        remove_unused_columns=False,
        # a PEFT-wrapped model may not expose `labels` in its signature; name it explicitly
        # so evaluation still receives the labels and compute_metrics is called
        label_names=["labels"],
    )

    # Define Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=datasets["train"],
        eval_dataset=datasets["test"],
        # default_data_collator would forward an `image` key; the model expects `pixel_values`
        data_collator=_collate_image_dicts,
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    evaluation = trainer.evaluate()
    eval_accuracy = evaluation['eval_accuracy']
    # tolerate a compute_metrics that returns a nested {"accuracy": {"accuracy": value}}
    if isinstance(eval_accuracy, dict):
        eval_accuracy = eval_accuracy["accuracy"]
    print(f"Accuracy: {eval_accuracy:.4f}")


In [None]:

# Set training parameters
num_epochs = 10
learning_rate = 1e-3

# Train the model
train_model(model, datasets, num_epochs, learning_rate)

# Test the trained model
# NOTE: relies on `test_loader` (batches of (images, labels)) created in an earlier cell
print("Trained model predictions:")
print("--------------------------")
# eval mode disables dropout; no_grad skips autograd bookkeeping during inference
model.eval()
# the model may have been moved to an accelerator during training; inputs must live on the same device
device = next(model.parameters()).device
with torch.no_grad():
    for i, batch in enumerate(test_loader):
        if i >= 5:  # Limiting to first 5 batches for brevity
            break
        images, labels = batch
        # pass the images by keyword so they reach the vision model's pixel_values argument
        outputs = model(pixel_values=images.to(device)).logits
        predictions = torch.argmax(outputs, dim=1)
        print(f"True Labels: {labels.tolist()}")
        print(f"Predictions: {predictions.tolist()}")