## Imports


In [17]:
import torch
import numpy as np
import evaluate

from datasets import load_dataset
from transformers import (
    ViTFeatureExtractor,
    ViTForImageClassification,
    TrainingArguments,
    Trainer,
)

## Loading and preparing the data


In [21]:
# Checkpoint name reused below for the feature extractor and for the model.
vit_base = "google/vit-base-patch16-224-in21k"
# Preprocessor loaded from the same checkpoint as the model.
feature_extractor = ViTFeatureExtractor.from_pretrained(vit_base)

# Dataset identified by "data"; its "train" and "test" splits are used below.
ds = load_dataset("data")


def transform(example_batch):
    """Convert a batch of examples into tensors via the feature extractor.

    Args:
        example_batch: Mapping with an "image" entry (an iterable of images,
            presumably PIL images -- confirm against the dataset) and a
            "label" entry.

    Returns:
        The feature extractor's output, requested as PyTorch tensors, with
        the batch's labels attached under the "label" key.
    """
    images = list(example_batch["image"])
    encoded = feature_extractor(images, return_tensors="pt")

    # Carry the labels along so `collate_fn` can read them per example.
    encoded["label"] = example_batch["label"]
    return encoded


# Flip to True for a quick run on a small slice of each split.
reduce_data_size = False
# Upper bound on the number of rows kept per split when reducing.
reduced_split_size = 50
if reduce_data_size:
    # Clamp to the split length: asking `select` for indices past the end of
    # a split raises an error, which would break this cell on small datasets.
    training_data = ds["train"].select(
        range(min(reduced_split_size, len(ds["train"])))
    )
    ds["train"] = training_data
    test_data = ds["test"].select(
        range(min(reduced_split_size, len(ds["test"])))
    )
    ds["test"] = test_data

# Attach `transform` to the dataset; the last expression shows the row counts.
prepared_ds = ds.with_transform(transform)
prepared_ds.num_rows



## Creating evaluation metrics


In [25]:
# Only accuracy is active. The following loads are left commented out:
#   evaluate.load("f1", average="weighted")
#   evaluate.load("precision", average="weighted")
#   evaluate.load("recall", average="weighted")
accuracy_metric = evaluate.load("accuracy")
clf_metrics = evaluate.combine([accuracy_metric])


def compute_metrics(p):
    """Score one evaluation pass with the combined classification metrics.

    Args:
        p: Object exposing `predictions` (scores whose axis 1 is reduced
            with argmax -- assumed to be the class axis, confirm) and
            `label_ids` (the reference labels).

    Returns:
        Whatever `clf_metrics.compute` returns for the predicted class
        indices and the reference labels.
    """
    predicted_ids = np.argmax(p.predictions, axis=1)
    return clf_metrics.compute(predictions=predicted_ids, references=p.label_ids)


def collate_fn(batch):
    """Merge individual examples into a single batch dictionary.

    Args:
        batch: Sequence of examples, each providing a "pixel_values" tensor
            and a "label" value.

    Returns:
        Dict with "pixel_values" (the per-example tensors stacked along a
        new leading dimension) and "labels" (a tensor built from the
        per-example labels).
    """
    pixel_values = []
    label_values = []
    for example in batch:
        pixel_values.append(example["pixel_values"])
        label_values.append(example["label"])

    # Input examples use the key "label"; the output batch uses "labels".
    return {
        "pixel_values": torch.stack(pixel_values),
        "labels": torch.tensor(label_values),
    }

## Setting the training arguments


In [27]:
class TrainingArgumentsWithMPSSupport(TrainingArguments):
    """TrainingArguments variant that picks the training device explicitly.

    The `device` property is overridden to prefer CUDA, then the MPS
    backend, and finally the CPU.
    """

    @property
    def device(self) -> torch.device:
        """Return the preferred available device: CUDA, then MPS, then CPU."""
        if torch.cuda.is_available():
            return torch.device("cuda")

        # Older PyTorch builds have no `torch.backends.mps` attribute at all;
        # look it up defensively so those builds fall through to the CPU
        # instead of raising AttributeError.
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is not None and mps_backend.is_available():
            return torch.device("mps")

        return torch.device("cpu")


training_args = TrainingArgumentsWithMPSSupport(
    # Output location and reporting.
    output_dir="./logs",
    report_to="all",
    push_to_hub=False,
    # Optimisation settings.
    num_train_epochs=1,
    learning_rate=2e-4,
    per_device_train_batch_size=256,
    per_device_eval_batch_size=256,
    # Evaluate, checkpoint and log on the same 10-step cadence.
    evaluation_strategy="steps",
    eval_steps=10,
    save_steps=10,
    logging_steps=10,
    # Checkpoint retention and final model selection.
    save_total_limit=2,
    load_best_model_at_end=True,
    # Leave dataset columns untouched (presumably so `transform` can still
    # read the "image" column -- confirm).
    remove_unused_columns=False,
)

## Creating the Trainer


In [28]:
# Class names read from the training split's "label" feature.
labels = ds["train"].features["label"].names

# Two-way mapping between stringified class indices and class names.
id2label = {str(index): name for index, name in enumerate(labels)}
label2id = {name: str(index) for index, name in enumerate(labels)}

# Load the checkpoint with a classification head sized to the label set.
model = ViTForImageClassification.from_pretrained(
    vit_base,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

# Wire the model, arguments, data splits and helper callables into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    # Data: transformed splits plus the function that batches their examples.
    train_dataset=prepared_ds["train"],
    eval_dataset=prepared_ds["test"],
    data_collator=collate_fn,
    # Evaluation scoring.
    compute_metrics=compute_metrics,
    # The feature extractor is handed over through the `tokenizer` argument.
    tokenizer=feature_extractor,
)

You are adding a <class 'transformers.integrations.TensorBoardCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback
ProgressCallback


## Training the model


In [29]:
# Run training, then persist the model, the training metrics and the state.
train_results = trainer.train()
train_metrics = train_results.metrics

trainer.save_model()
trainer.log_metrics(split="train", metrics=train_metrics)
trainer.save_metrics(split="train", metrics=train_metrics)
trainer.save_state()

                                               
100%|██████████| 1/1 [03:48<00:00, 228.44s/it]]


{'train_runtime': 228.3823, 'train_samples_per_second': 0.219, 'train_steps_per_second': 0.004, 'train_loss': 3.2275807857513428, 'epoch': 1.0}
***** train metrics *****
  epoch                    =        1.0
  train_loss               =     3.2276
  train_runtime            = 0:03:48.38
  train_samples_per_second =      0.219
  train_steps_per_second   =      0.004


## Evaluating the model


In [None]:
# Evaluate with the Trainer, then log and save the results under "test".
metrics = trainer.evaluate()
trainer.log_metrics(split="test", metrics=metrics)
trainer.save_metrics(split="test", metrics=metrics)