<a href="https://colab.research.google.com/github/kisakiwata/CV_huggingface/blob/main/ImageClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up the environment 

In [27]:
from google.colab import files

# Open Colab's upload dialog; the result is indexed by uploaded file name.
uploaded = files.upload()

# Report every uploaded file together with the length of its content.
for file_name, payload in uploaded.items():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=file_name, length=len(payload)))

Saving requirements.txt to requirements.txt
User uploaded file "requirements.txt" with length 127 bytes


In [28]:
!pip install -r /content/requirements.txt

#installing all the required packages

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting accelerate
  Downloading accelerate-0.18.0-py3-none-any.whl (215 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.3/215.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-ml-py3
  Downloading nvidia-ml-py3-7.352.0.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pynvml
  Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: nvidia-ml-py3
  Building wheel for nvidia-ml-py3 (setup.py) ... [?25l[?25hdone
  Created wheel for nvidia-ml-py3: filename=nvidia_ml_py3-7.352.0-py3-none-any.whl size=19188 sha256=e58a8fdf8332cbeee84e067764347c951b8205c63ae5ea7120e1a15db7359c90
  Stored in directory: /root/.cache/pip/wheels/f6/d8/b0/15cfd7805d39250ac29318105f09b17

# To do:
- Try different models: EfficientNet, ResNet, ...
- Try other people's models trained on food datasets on Hugging Face


# GPU efficiency

In [29]:
from pynvml import *

def print_gpu_utilization(device_index=0):
    """Print the amount of memory currently in use on one GPU, in whole MB.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0, the
            device that was previously hard-coded, so existing calls without
            arguments behave exactly as before.
    """
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(device_index)
    info = nvmlDeviceGetMemoryInfo(handle)
    # Floor-divide the reported usage by 1024**2 to express it in MB.
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    """Print training time, throughput and current GPU memory usage.

    Args:
        result: Object whose ``metrics`` mapping contains the keys
            ``'train_runtime'`` and ``'train_samples_per_second'``.
    """
    stats = result.metrics
    elapsed = stats['train_runtime']
    print(f"Time: {elapsed:.2f}")
    rate = stats['train_samples_per_second']
    print(f"Samples/second: {rate:.2f}")
    print_gpu_utilization()

# HuggingFace

In [35]:
from huggingface_hub import login
from datasets import load_dataset
from transformers import AutoImageProcessor, DefaultDataCollator, AutoModelForImageClassification, \
TrainingArguments, Trainer, pipeline, AutoImageProcessor
from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor
import torch
import evaluate 
import numpy as np
import logging

In [4]:
# Create and configure the notebook's main logger.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# Do not forward records to the root logger: when the root logger also has a
# handler, every message is printed a second time in the "INFO:__main__:..." form.
logger.propagate = False
# Attach the console handler only once, so re-running this cell does not stack
# handlers and multiply the output.
if not logger.handlers:
    # Console handler that lets everything from DEBUG upwards through.
    handler = logging.StreamHandler()
    handler.setLevel(logging.DEBUG)
    # Timestamped format: time - logger name - level - message.
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)


In [5]:
# Log in to Hugging Face. No token is passed in code, so it is not stored in the notebook.
# NOTE(review): presumably required for the later push_to_hub calls — confirm.
login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [6]:
# Load the first 5,000 training examples of Food-101 and hold out 20% of them
# for evaluation. The fixed seed keeps the train/test membership identical
# across kernel restarts, so runs can be compared with each other.
food = load_dataset("food101", split="train[:5000]").train_test_split(
    test_size=0.2, seed=42
)

logger.info(food["train"][0])

# Map each class name to its index (stored as a string) and back again;
# both dictionaries are handed to the model when it is created.
labels = food["train"].features["label"].names
label2id = {label: str(i) for i, label in enumerate(labels)}
id2label = {str(i): label for i, label in enumerate(labels)}

logger.info(id2label[str(79)])


Downloading builder script:   0%|          | 0.00/6.21k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/5.56k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

Downloading and preparing dataset food101/default to /root/.cache/huggingface/datasets/food101/default/0.0.0/7cebe41a80fb2da3f08fcbef769c8874073a86346f7fb96dc0847d4dfc318295...


Downloading data:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.47M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/489k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/75750 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/25250 [00:00<?, ? examples/s]

2023-04-19 21:55:40,021 - __main__ - INFO - {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x512 at 0x7F8958A4F310>, 'label': 6}
INFO:__main__:{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x512 at 0x7F8958A4F310>, 'label': 6}
2023-04-19 21:55:40,030 - __main__ - INFO - prime_rib
INFO:__main__:prime_rib


Dataset food101 downloaded and prepared to /root/.cache/huggingface/datasets/food101/default/0.0.0/7cebe41a80fb2da3f08fcbef769c8874073a86346f7fb96dc0847d4dfc318295. Subsequent calls will reuse this data.


In [7]:
# Load the image processor that belongs to the pretrained ViT checkpoint.
# Its image_mean, image_std and size attributes are read when the transforms are built.
checkpoint = "google/vit-base-patch16-224-in21k"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

Downloading (…)rocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

In [8]:
# Build the preprocessing pipeline: take a random crop resized to the model's
# input size, convert it to a tensor, then normalize it with the processor's
# per-channel mean and standard deviation.
normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)

# The processor describes its target size either as a single "shortest_edge"
# value or as separate "height" and "width" entries.
_processor_size = image_processor.size
if "shortest_edge" in _processor_size:
    size = _processor_size["shortest_edge"]
else:
    size = (_processor_size["height"], _processor_size["width"])

_transforms = Compose([RandomResizedCrop(size), ToTensor(), normalize])


In [9]:
# Preprocessing function: runs the transform pipeline on every image of a batch
# and stores the results as pixel_values, the inputs to the model.
def transforms(examples):
    """Replace the ``"image"`` entry of a batch with transformed ``"pixel_values"``.

    Each image is converted to RGB and passed through ``_transforms``. The
    batch is modified in place and also returned.
    """
    pixel_values = []
    for img in examples["image"]:
        pixel_values.append(_transforms(img.convert("RGB")))
    examples["pixel_values"] = pixel_values
    del examples["image"]
    return examples


In [10]:
# Apply the preprocessing function to the whole dataset with the 🤗 Datasets
# with_transform method. The transforms are applied on the fly when an element
# of the dataset is loaded.

food = food.with_transform(transforms)

In [11]:
# Create the collator that assembles examples into batches. Unlike other data
# collators in 🤗 Transformers, the DefaultDataCollator does not apply additional
# preprocessing such as padding.
data_collator = DefaultDataCollator()


In [12]:
# Evaluate: load the "accuracy" metric; compute_metrics uses it to score predictions.
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """Score a batch of predictions with the ``accuracy`` metric.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predicted class of each
            row is the index of its largest value along axis 1.

    Returns:
        Whatever ``accuracy.compute`` returns for those class ids and labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [13]:
# Train: load the pretrained checkpoint for image classification, with one output
# per Food-101 label and the label/id mappings built earlier.
model = AutoModelForImageClassification.from_pretrained(
    checkpoint,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

# Set hyperparameters.
training_args = TrainingArguments(
    output_dir="my_awesome_food_model",  # the same path is loaded again for inference
    remove_unused_columns=False,  # keep the "image" column: transforms() reads it to build pixel_values
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,  # with batch size 16: 16 x 4 = 64 samples per optimizer step per device
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,  # the best checkpoint is chosen by the metric named below
    metric_for_best_model="accuracy",  # matches the key produced by compute_metrics
    push_to_hub=True,  # NOTE(review): presumably relies on the earlier login() — confirm
)


Downloading pytorch_model.bin:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Wire the model, hyperparameters, collator, datasets and metric function
# into a Trainer, then fine-tune the model.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=food["train"],
    eval_dataset=food["test"],
    tokenizer=image_processor,  # the image processor is supplied through the `tokenizer` argument
    compute_metrics=compute_metrics,
)

trainer.train()

Cloning https://huggingface.co/Kisax/my_awesome_food_model into local empty directory.


Download file pytorch_model.bin:   0%|          | 8.00k/328M [00:00<?, ?B/s]

Download file training_args.bin: 100%|##########| 3.50k/3.50k [00:00<?, ?B/s]

Clean file training_args.bin:  29%|##8       | 1.00k/3.50k [00:00<?, ?B/s]

Clean file pytorch_model.bin:   0%|          | 1.00k/328M [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss,Accuracy
0,2.7286,2.561893,0.828
2,1.8685,1.815649,0.866
2,1.6392,1.650978,0.879


TrainOutput(global_step=186, training_loss=2.4558806778282247, metrics={'train_runtime': 601.9922, 'train_samples_per_second': 19.934, 'train_steps_per_second': 0.309, 'total_flos': 9.232831524962304e+17, 'train_loss': 2.4558806778282247, 'epoch': 2.98})

In [18]:
# Upload the trained model to the Hugging Face Hub.
trainer.push_to_hub()

To https://huggingface.co/Kisax/my_awesome_food_model
   ffb9b6a..f0456a2  main -> main

   ffb9b6a..f0456a2  main -> main

To https://huggingface.co/Kisax/my_awesome_food_model
   f0456a2..f9753d8  main -> main

   f0456a2..f9753d8  main -> main



'https://huggingface.co/Kisax/my_awesome_food_model/commit/f0456a2707721decc05c7e121bb83f1bd11404eb'

In [16]:
# Inference: take one image from the first 10 validation examples.
# Index the row first and the column second so that only this single image is
# decoded; ds["image"][0] would decode every image in the slice before picking one.
ds = load_dataset("food101", split="validation[:10]")
image = ds[0]["image"]



# Testing out the new model by passing in a test image

In [33]:
# Build an image-classification pipeline from "my_awesome_food_model" and
# classify the sample image; the cell output lists labels with their scores.
classifier = pipeline("image-classification", model="my_awesome_food_model")
classifier(image)

[{'score': 0.31078436970710754, 'label': 'beignets'},
 {'score': 0.01526002585887909, 'label': 'chicken_wings'},
 {'score': 0.014720004051923752, 'label': 'bruschetta'},
 {'score': 0.014033169485628605, 'label': 'ramen'},
 {'score': 0.013834455981850624, 'label': 'hamburger'}]

In [31]:
# Check how much GPU memory is occupied after training and inference.
print_gpu_utilization()

GPU memory occupied: 5075 MB.


# Load an image processor to preprocess the image and return the input as PyTorch tensors:

In [36]:
# Load the image processor stored with "my_awesome_food_model" and turn the
# sample image into PyTorch tensors (return_tensors="pt").
image_processor = AutoImageProcessor.from_pretrained("my_awesome_food_model")
inputs = image_processor(image, return_tensors="pt")

In [37]:
# Load the fine-tuned model and run a forward pass on the preprocessed image.
model = AutoModelForImageClassification.from_pretrained("my_awesome_food_model")
# Gradients are not needed for inference, so disable their tracking.
with torch.no_grad():
    logits = model(**inputs).logits

In [38]:
# Get the predicted label with the highest score, and use the model's id2label
# mapping to convert it to a class name. argmax(-1) picks the index along the
# last axis and .item() turns it into a plain Python number for the lookup.
predicted_label = logits.argmax(-1).item()
model.config.id2label[predicted_label]

'beignets'