In [None]:
!pip install -qU transformers datasets evaluate accelerate pillow torchvision scikit-learn

# Image Classification

Image classification assigns a label or class to an image.

We will fine-tune a `ViT` model on a subset of the `Food-101` dataset.

## Load Food-101 dataset

In [None]:
from datasets import load_dataset

# Only the first 5000 training images are used to keep the run short.
food = load_dataset('food101', split='train[:5000]')

# Split into train and test sets. The seed is fixed so that the split (and
# therefore every output shown below) is the same on each Restart & Run All.
food = food.train_test_split(test_size=0.2, seed=42)

In [4]:
# Peek at the first training example to see the raw columns.
food['train'][0]

{'image': <PIL.Image.Image image mode=RGB size=512x512>, 'label': 53}

* `image` is a PIL image of the food item
* `label` is the label class of the food item

We need to create two dictionaries: `label2id`, which maps each label name to its class id, and `id2label`, which maps the class id back to the label name.

In [6]:
# Class names, ordered by their integer class id.
labels = food['train'].features['label'].names

# label2id: label name -> integer class id (the id is an int, not a string).
label2id = {label: i for i, label in enumerate(labels)}

# id2label: class id -> label name. Keys are kept as strings because the lookup
# below uses str(...); the loaded model config is later indexed with an int id,
# so it presumably normalizes these keys to ints.
id2label = {str(i): label for i, label in enumerate(labels)}

In [7]:
# Look up the label name for class id 53; keys are strings, hence str(...).
id2label[str(53)]

'hamburger'

## Preprocess

We need to load a ViT image processor to process the image into a tensor:

In [8]:
from transformers import AutoImageProcessor

# Checkpoint name; reused further down when the model itself is loaded.
checkpoint = 'google/vit-base-patch16-224-in21k'
# Load the image processor that belongs to this checkpoint.
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


To make the model more robust against overfitting, we can apply some transformations to the images by using torchvision's `transforms` module.

In [9]:
from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor

# Target crop size: a single int when the processor specifies `shortest_edge`,
# otherwise an explicit (height, width) pair.
if 'shortest_edge' in image_processor.size:
    size = image_processor.size['shortest_edge']
else:
    size = (image_processor.size['height'], image_processor.size['width'])

# Normalize with the per-channel mean/std stored on the image processor.
normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)

# Augmentation pipeline: random resized crop -> tensor -> normalization.
_transforms = Compose([RandomResizedCrop(size), ToTensor(), normalize])

Then we create a preprocessing function to apply the transforms and return the `pixel_values` - the inputs to the model - of the image:

In [10]:
def transforms(examples):
    """Turn a batch of images into model inputs.

    Every image in ``examples['image']`` is converted to RGB and passed
    through the module-level ``_transforms`` pipeline. The results are stored
    under ``examples['pixel_values']`` and the ``image`` entry is removed.

    The batch is modified in place and also returned.
    """
    pixel_values = []
    for picture in examples['image']:
        rgb_picture = picture.convert('RGB')
        pixel_values.append(_transforms(rgb_picture))

    examples['pixel_values'] = pixel_values
    del examples['image']
    return examples

To apply the preprocessing function over the entire dataset, use the `with_transform` method. The transforms are applied on the fly when we load an element of the dataset:

In [11]:
# Attach the preprocessing function; it runs on the fly when rows are read.
# NOTE(review): `food` holds both the train and test splits, so the random-crop
# augmentation presumably also applies to food['test'] during evaluation —
# confirm that is intended.
food = food.with_transform(transforms)

In [12]:
# Same index as before; the example now carries `pixel_values` instead of `image`.
food['train'][0]

{'label': 53,
 'pixel_values': tensor([[[ 0.6549,  0.6471,  0.5294,  ...,  0.7882,  0.8353,  0.7647],
          [ 0.6078,  0.5765,  0.5373,  ...,  0.7725,  0.7647,  0.8510],
          [ 0.4980,  0.5216,  0.4824,  ...,  0.7725,  0.7647,  0.8039],
          ...,
          [-0.0431, -0.0196, -0.0275,  ...,  0.2863,  0.2235,  0.2471],
          [-0.0745, -0.0118,  0.0196,  ...,  0.1686,  0.1843,  0.1529],
          [-0.1059, -0.0667,  0.0118,  ...,  0.1608,  0.1765,  0.3255]],
 
         [[ 0.6157,  0.6078,  0.4980,  ...,  0.6784,  0.7412,  0.6706],
          [ 0.5608,  0.5451,  0.5059,  ...,  0.6549,  0.6627,  0.7490],
          [ 0.4353,  0.4745,  0.4510,  ...,  0.6627,  0.6549,  0.7020],
          ...,
          [-0.0902, -0.0902, -0.1137,  ...,  0.2627,  0.1922,  0.2078],
          [-0.1059, -0.0745, -0.0745,  ...,  0.1294,  0.1451,  0.1216],
          [-0.1137, -0.1059, -0.0588,  ...,  0.1216,  0.1294,  0.2863]],
 
         [[ 0.4431,  0.4353,  0.2941,  ...,  0.4980,  0.5765,  0.5059]

Now we can create a batch of examples using `DefaultDataCollator`. Unlike other data collators in the Transformers library, `DefaultDataCollator` does not apply additional preprocessing such as padding.

In [13]:
from transformers import DefaultDataCollator

# Collator handed to the Trainer below; it batches examples without padding.
data_collator = DefaultDataCollator()

## Evaluate

For the image classification task, we load the accuracy metric from the Evaluate library:

In [14]:
import evaluate

# Metric object; `compute_metrics` below calls `accuracy.compute(...)` on it.
accuracy = evaluate.load('accuracy')

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Then we create a function that passes our predictions and labels to compute the accuracy:

In [15]:
import numpy as np

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    ``eval_pred`` unpacks into ``(predictions, labels)``. The predicted class
    for each row is the index of the largest value along axis 1 of the
    predictions. Returns whatever ``accuracy.compute`` returns.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

## Train

We start training by loading ViT with `AutoModelForImageClassification`, specifying the number of expected labels and the label mappings:

In [16]:
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer

# Load the pretrained checkpoint for image classification with one output per
# label. The label mappings are passed along so they end up on the model config
# (read back later via `model.config.id2label`).
model = AutoModelForImageClassification.from_pretrained(
    checkpoint,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In the training arguments, we set `remove_unused_columns=False`. Removing unused columns would drop the `image` column, and without the `image` column we cannot create `pixel_values`.

In [17]:
training_args = TrainingArguments(
    # Output location; pushing to the Hub stays off.
    output_dir='my_awesome_food_model',
    push_to_hub=False,
    # Keep the raw `image` column so the on-the-fly transform can read it.
    remove_unused_columns=False,
    # Optimisation schedule.
    learning_rate=5e-5,
    warmup_ratio=0.1,
    num_train_epochs=3,
    # Batching: 16 samples per device, gradients accumulated over 4 steps.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,
    # Evaluate and checkpoint once per epoch; at the end, reload the best
    # checkpoint as judged by `accuracy`.
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    # Log every 10 steps.
    logging_steps=10,
)

In [None]:
# Wire the model, data, collator and metric function into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=food['train'],
    eval_dataset=food['test'],
    processing_class=image_processor,
    compute_metrics=compute_metrics
)

train_result = trainer.train()

# Training only writes `checkpoint-*` subfolders inside `output_dir`, and
# push_to_hub is off. Save the final model (the best one, because
# load_best_model_at_end=True) to the top level of `output_dir`, so the
# inference cells can load 'my_awesome_food_model' directly. The image
# processor passed as `processing_class` is expected to be saved alongside it.
trainer.save_model()

# Keep the training summary as the cell's displayed output.
train_result

## Inference

In [None]:
# A few validation images to try the fine-tuned model on.
ds = load_dataset('food101', split='validation[:10]')
# Index the row first and the column second, so only this one image is decoded
# (`ds['image'][0]` goes through the whole column to fetch a single item).
image = ds[0]['image']

We can try our finetuned model for inference by using the `pipeline`:

In [None]:
from transformers import pipeline

# Build an image-classification pipeline from the local directory that was used
# as `output_dir` during training.
classifier = pipeline(
    'image-classification',
    model='my_awesome_food_model'
)

In [None]:
# Run the pipeline on the sample image loaded above.
classifier(image)

Or we can manually do the inference:

In [None]:
from transformers import AutoImageProcessor
import torch

# Reload the image processor from the training output directory (this rebinds
# the `image_processor` name used earlier in the notebook).
image_processor = AutoImageProcessor.from_pretrained('my_awesome_food_model')
# Preprocess the sample image into PyTorch tensors ('pt').
inputs = image_processor(image, return_tensors='pt')

In [None]:
from transformers import AutoModelForImageClassification

# Reload the fine-tuned model from the training output directory (this rebinds
# the `model` name used during training).
model = AutoModelForImageClassification.from_pretrained('my_awesome_food_model')

In [None]:
import torch

# Forward pass without gradient tracking; only the logits are needed.
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

# Index of the highest-scoring class, translated to its label name.
predicted_label = logits.argmax(-1).item()
model.config.id2label[predicted_label]