<a href="https://colab.research.google.com/github/j0rdan0/AI-notebooks/blob/main/StreetFoodViT_pynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!curl -L -o /root/popular-street-foods.zip https://www.kaggle.com/api/v1/datasets/download/nikolasgegenava/popular-street-foods
!unzip /root/popular-street-foods.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 18.6M  100 18.6M    0     0  6960k      0  0:00:02  0:00:02 --:--:-- 10.3M
Archive:  /root/popular-street-foods.zip
  inflating: popular_street_foods/dataset/arepas/0001.jpg  
  inflating: popular_street_foods/dataset/arepas/0002.jpg  
  inflating: popular_street_foods/dataset/arepas/0003.jpg  
  inflating: popular_street_foods/dataset/arepas/0004.jpg  
  inflating: popular_street_foods/dataset/arepas/0005.jpg  
  inflating: popular_street_foods/dataset/arepas/0006.jpg  
  inflating: popular_street_foods/dataset/arepas/0007.jpg  
  inflating: popular_street_foods/dataset/arepas/0008.jpg  
  inflating: popular_street_foods/dataset/arepas/0009.jpg  
  inflating: popular_street_foods/dataset/arepas/0010.jpg  
  inflating: popular_street_foods/dataset/a

In [23]:
from torchvision import datasets, transforms
from torch.utils.data import random_split
from transformers import AutoImageProcessor


#dataset to use https://www.kaggle.com/datasets/nikolasgegenava/popular-street-foods/data

class DictWrapper(torch.utils.data.Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __getitem__(self, idx):
        img, label = self.dataset[idx]
        return {
            "pixel_values": img,
            "labels": label
        }

    def __len__(self):
        return len(self.dataset)


def preprocess_dataset(dataset_path,model_name):

  img_processor = AutoImageProcessor.from_pretrained(model_name)
  normalize = transforms.Normalize(mean=img_processor.image_mean, std=img_processor.image_std)

# create dataset from on disk images
  transform_rules = transforms.Compose([transforms.Resize((224,224)),transforms.ToTensor(),normalize])
  dataset = datasets.ImageFolder(root=dataset_path,transform=transform_rules)

  label2id = dataset.class_to_idx
  id2label = {k:v for v,k in label2id.items()}

  train_size = int(0.9 * len(dataset))
  test_size = len(dataset)-train_size

  train_dataset,test_dataset = random_split(dataset,[train_size,test_size])
  return DictWrapper(train_dataset),DictWrapper(test_dataset),label2id,id2label


In [10]:
from torch.utils.data import DataLoader

def create_dataloaders(train_dataset,test_dataset)
  train_dl = DataLoader(train_dataset,batch_size=32,shuffle=True)
  test_dl = DataLoader(test_dataset,batch_size=32,shuffle=False)
  return train_dl,test_dl


115

In [None]:
import matplotlib.pyplot as plt

classes = {k:v for v,k in dataset.class_to_idx.items()}

for img,label in dl:
  i = img[0].permute(1, 2, 0)
  i = i.numpy()
  plt.imshow(i)
  print(classes[int(label[0])])
  break

In [24]:
# preprocess dataset

dataset_path = "/content/popular_street_foods/dataset"
model_name = "google/vit-base-patch16-224"

x_train,x_test,label2id,id2label = preprocess_dataset(dataset_path,model_name)


Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


In [19]:
def generate_label2_id(ds):
  label2id = ds.class_to_idx
  id2label = {k:v for v,k in label2id.items()}
  return label2id,id2label

In [None]:
#!pip install evaluate

import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)



In [None]:

import transformers
import torch
model_name = "google/vit-base-patch16-224"
labels_len = 20

img_processor = transformers.AutoImageProcessor.from_pretrained(model_name)
model = transformers.AutoModelForImageClassification.from_pretrained(model_name,device_map="auto",num_labels=labels_len,id2label=id2label,label2id=label2id,ignore_mismatched_sizes=True)


In [32]:
training_args = transformers.TrainingArguments(
    output_dir="vit-base-patch16-street-food",
    eval_strategy="epoch",
    learning_rate=5e-5,
    weight_decay=0.01,
    push_to_hub=True,
    per_device_train_batch_size=16,
    num_train_epochs=5,
    warmup_ratio=0.1,
    logging_strategy="epoch",
    report_to="none",
    save_strategy="epoch",
    remove_unused_columns=False,
    load_best_model_at_end=True
)



In [33]:
import transformers

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset= x_train,
    eval_dataset= x_test,
    processing_class = img_processor,
    compute_metrics= compute_metrics
)

In [35]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0164,0.45501,0.904891
2,0.0075,0.499138,0.904891
3,0.0018,0.469077,0.915761
4,0.0004,0.471352,0.918478
5,0.0003,0.473651,0.918478


TrainOutput(global_step=1035, training_loss=0.005293577074428687, metrics={'train_runtime': 663.1165, 'train_samples_per_second': 24.928, 'train_steps_per_second': 1.561, 'total_flos': 1.2811492406277734e+18, 'train_loss': 0.005293577074428687, 'epoch': 5.0})

In [36]:
def validation_acc():
  metrics = trainer.evaluate(eval_dataset=x_test)
  print(metrics)
validation_acc()

{'eval_loss': 0.4550101161003113, 'eval_accuracy': 0.904891304347826, 'eval_runtime': 4.3179, 'eval_samples_per_second': 85.227, 'eval_steps_per_second': 10.653, 'epoch': 5.0}


In [34]:
from huggingface_hub import create_repo
create_repo("j0rdan0/vision-models")

RepoUrl('https://huggingface.co/j0rdan0/vision-models', endpoint='https://huggingface.co', repo_type='model', repo_id='j0rdan0/vision-models')

In [None]:
 # push model and image processor to HF hub

 model.push_to_hub("j0rdan0/vision-models",commit_message="ViT finetuned on street food")
 img_processor.push_to_hub("j0rdan0/vision-models",commit_message="adding preprocessor as well")