In [None]:
# !pip install accelerate -U
# !pip install datasets

import numpy as np
import os
from transformers import ViTForImageClassification, Trainer, TrainingArguments, ViTFeatureExtractor
from datasets import load_metric
from datasets import Dataset, DatasetDict

# pytorch imports
import torch
from torch.utils.data import DataLoader, dataset, random_split
from torchvision import datasets, transforms

In [4]:
seed = 211
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

In [5]:
# Define basic transforms
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize images to 224x224
    transforms.ToTensor(),  # Convert images to PyTorch tensors
])

In [6]:
data_dir = './drive/MyDrive/deep learn/project/dataset'   # change to your directory
train_dataset = datasets.ImageFolder(os.path.join(data_dir, 'train'), transform=transform)
test_dataset = datasets.ImageFolder(os.path.join(data_dir, 'test'), transform=transform)
# Split dataset into train and validation
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = random_split(train_dataset, [train_size, val_size])
labels = test_dataset.classes

In [7]:
to_pil = transforms.ToPILImage()

# Convert to Hugging Face Dataset format
def convert_to_hf_format(dataset):
    hf_dataset = {"image": [], "labels": []}
    for idx in range(len(dataset)):
        image, label = dataset[idx]
        hf_dataset["image"].append(to_pil(image))
        hf_dataset["labels"].append(label)

    return hf_dataset

In [8]:
hf_train_dataset_dict = convert_to_hf_format(train_dataset)
hf_val_dataset_dict = convert_to_hf_format(val_dataset)
hf_test_dataset_dict = convert_to_hf_format(test_dataset)

ds = DatasetDict({"train": Dataset.from_dict(hf_train_dataset_dict),
                            "validation": Dataset.from_dict(hf_val_dataset_dict),
                            "test": Dataset.from_dict(hf_test_dataset_dict)})

In [9]:
model_name_or_path = 'google/vit-base-patch16-224-in21k'
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name_or_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [10]:
def transform(example_batch):
    # Take a list of PIL images and turn them to pixel values
    inputs = feature_extractor([x for x in example_batch['image']], return_tensors='pt')

    # Don't forget to include the labels!
    inputs['labels'] = example_batch['labels']
    return inputs

prepared_ds = ds.with_transform(transform)

In [11]:
def collate_fn(batch):
  # Converts pixel values to tensor
    return {
        'pixel_values': torch.stack([x['pixel_values'] for x in batch]),
        'labels': torch.tensor([x['labels'] for x in batch])
    }

In [12]:
metric = load_metric("accuracy")

def compute_metrics(p):
    return metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids)

  metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [13]:
model = ViTForImageClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(labels),
    id2label={str(i): c for i, c in enumerate(labels)},
    label2id={c: str(i) for i, c in enumerate(labels)}
)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
training_args = TrainingArguments(
    output_dir="./vit-base-elephants-v5",
    per_device_train_batch_size=16,
    evaluation_strategy="steps",
    num_train_epochs=3,
    fp16=False,
    save_steps=100,
    eval_steps=100,
    logging_steps=10,
    learning_rate=2e-4,
    save_total_limit=2,
    remove_unused_columns=False,
    push_to_hub=False,
    report_to='tensorboard',
    load_best_model_at_end=True,
)

In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=prepared_ds["train"],
    eval_dataset=prepared_ds["validation"],
    tokenizer=feature_extractor,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [16]:
train_results = trainer.train()
trainer.save_model()

Step,Training Loss,Validation Loss,Accuracy
100,0.315,0.697662,0.714286


In [17]:
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

metrics = trainer.evaluate(prepared_ds['validation'])
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

***** train metrics *****
  epoch                    =         3.0
  total_flos               = 145494799GF
  train_loss               =      0.4744
  train_runtime            =  1:13:45.28
  train_samples_per_second =       0.456
  train_steps_per_second   =       0.028


***** eval metrics *****
  epoch                   =        3.0
  eval_accuracy           =     0.7143
  eval_loss               =     0.6977
  eval_runtime            = 0:01:54.44
  eval_samples_per_second =      1.468
  eval_steps_per_second   =      0.183


In [20]:
# copy results to drive to save them
# !cp -r ./vit-base-elephants-v5/ ./drive/MyDrive/ViT

In [22]:
predictions = trainer.predict(prepared_ds["test"])

In [None]:
predictions
