In [None]:
!pip install transformers datasets

In [None]:
from google.colab import drive

# Mount Google Drive so that the dataset archive and the checkpoint copies
# under /content/drive/MyDrive are reachable from this runtime.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Dataset

## Download Dataset

In [None]:
cp /content/drive/MyDrive/stanford_dogs/images.tar .

In [None]:
import tarfile

# Unpack the archive into the current working directory.
with tarfile.open(name='images.tar', mode='r') as archive:
    archive.extractall(path='.')

## Dataset from Scratch

In [None]:
!wget http://vision.stanford.edu/aditya86/ImageNetDogs/images.tar
!wget http://vision.stanford.edu/aditya86/ImageNetDogs/annotation.tar

--2023-04-02 15:41:12--  http://vision.stanford.edu/aditya86/ImageNetDogs/images.tar
Resolving vision.stanford.edu (vision.stanford.edu)... 171.64.68.10
Connecting to vision.stanford.edu (vision.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 793579520 (757M) [application/x-tar]
Saving to: ‘images.tar’


2023-04-02 15:42:04 (14.9 MB/s) - ‘images.tar’ saved [793579520/793579520]

--2023-04-02 15:42:04--  http://vision.stanford.edu/aditya86/ImageNetDogs/annotation.tar
Resolving vision.stanford.edu (vision.stanford.edu)... 171.64.68.10
Connecting to vision.stanford.edu (vision.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 21852160 (21M) [application/x-tar]
Saving to: ‘annotation.tar’


2023-04-02 15:42:13 (2.21 MB/s) - ‘annotation.tar’ saved [21852160/21852160]



In [None]:
import tarfile

# Unpack the raw photos and their annotations into the working directory.
for archive_name in ('images.tar', 'annotation.tar'):
    with tarfile.open(archive_name, 'r') as archive:
        archive.extractall()

In [None]:
import os

# Each entry of Images/ is one breed directory; count the files inside all of them.
breed_list = os.listdir("Images/")
num_classes = len(breed_list)
num_total_images = sum(len(os.listdir(f"Images/{breed}")) for breed in breed_list)

print(f'{num_classes} breeds       {num_total_images} images')

120 breeds       20580 images


In [None]:
# Show one directory name to illustrate the "<id>-<breed name>" naming scheme.
print(breed_list[0])

n02104029-kuvasz


In [None]:
import xml.etree.ElementTree as ET
from PIL import Image

# Crop every photo to its annotated bounding box and store the result under
# images/<breed>/, mirroring the layout of the raw Images/ directory.

# exist_ok=True keeps this cell re-runnable: a bare os.mkdir raises
# FileExistsError as soon as images/ already exists (a second run, or after
# unpacking an archive that already contains images/).
os.makedirs('images', exist_ok=True)
for breed in breed_list:
    os.makedirs('images/' + breed, exist_ok=True)
print('Created {} folders to store cropped images of the different breeds.'.format(len(os.listdir('images'))))

for breed in os.listdir('images'):
    for file_name in os.listdir('Annotation/{}'.format(breed)):
        # Annotation files have no extension; the matching photo is <file_name>.jpg.
        tree = ET.parse('Annotation/{}/{}'.format(breed, file_name))
        # Only the first annotated object is used, even when several are listed.
        bndbox = tree.getroot().findall('object')[0].find('bndbox')
        xmin, ymin, xmax, ymax = (
            int(bndbox.find(tag).text) for tag in ('xmin', 'ymin', 'xmax', 'ymax')
        )
        # The context manager closes the source file promptly instead of leaving
        # one open handle per photo for the garbage collector to clean up.
        with Image.open('Images/{}/{}.jpg'.format(breed, file_name)) as img:
            cropped = img.crop((xmin, ymin, xmax, ymax)).convert('RGB')
        # No resizing here: the ViT feature extractor resizes to 224x224 later.
        cropped.save('images/' + breed + '/' + file_name + '.jpg')

Created 120 folders to store cropped images of the different breeds.


### Compress into images.tar

In [None]:
# Bundle the cropped images/ tree into a single uncompressed tar archive.
dir_path, tar_file_name = "images", "images.tar"

with tarfile.open(name=tar_file_name, mode='w') as archive:
    archive.add(dir_path, arcname='images')

In [None]:
cp images.tar /content/drive/MyDrive/stanford_dogs

# Hugging Face Datasets

In [None]:
from datasets import load_dataset

# Build a dataset from the images/<breed>/ folder tree; the "imagefolder"
# loader derives each example's class label from its directory name.
dataset = load_dataset(path="imagefolder", data_dir="images/", split="train")
dataset

Resolving data files:   0%|          | 0/20580 [00:00<?, ?it/s]

Downloading and preparing dataset imagefolder/default to /root/.cache/huggingface/datasets/imagefolder/default-34889be5012450c3/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f...


Downloading data files:   0%|          | 0/20580 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset imagefolder downloaded and prepared to /root/.cache/huggingface/datasets/imagefolder/default-34889be5012450c3/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f. Subsequent calls will reuse this data.


Dataset({
    features: ['image', 'label'],
    num_rows: 20580
})

In [None]:
# Optional smoke-test mode: keep only 1 of 100 shards (about 1% of the images)
# for a quick end-to-end check. It is off by default because the split sizes
# and metrics recorded further down come from the full 20,580-image dataset,
# and because re-running an unconditional shard would shrink `dataset` again
# on every execution of this cell.
USE_SMALL_SUBSET = False

if USE_SMALL_SUBSET:
    dataset = dataset.shard(num_shards=100, index=0)
dataset

In [None]:
# Hold out 10% of the examples as a test split; the fixed seed makes the
# split reproducible across runs.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 18522
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 2058
    })
})

## Loading ViT Feature Extractor

In [None]:
from transformers import ViTFeatureExtractor

# ViTFeatureExtractor is deprecated in transformers; ViTImageProcessor is its
# drop-in replacement and reads the same preprocessing config (resize to
# 224x224, rescale, normalize). The variable name is kept so later cells work unchanged.
from transformers import ViTImageProcessor

model_name_or_path = 'google/vit-base-patch16-224-in21k'
feature_extractor = ViTImageProcessor.from_pretrained(model_name_or_path)
feature_extractor

Downloading (…)rocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



ViTFeatureExtractor {
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_processor_type": "ViTFeatureExtractor",
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 224,
    "width": 224
  }
}

In [None]:
def transform(example_batch):
    """Turn a batch of raw examples into model inputs.

    Runs the module-level `feature_extractor` over the batch's 'image' entries
    (requesting PyTorch tensors) and carries the batch's 'label' entry over
    unchanged under the same key.
    """
    images = list(example_batch['image'])
    encoded = feature_extractor(images, return_tensors='pt')
    encoded['label'] = example_batch['label']
    return encoded

In [None]:
# Attach `transform` to the dataset so it is applied on the fly when examples
# are accessed; the stored columns stay ['image', 'label'], as the output shows.
prepared_ds = dataset.with_transform(transform)
prepared_ds

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 18522
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 2058
    })
})

In [None]:
# Inspect the ClassLabel feature that maps integer label ids to breed names.
# Note: `labels` is rebound to the plain list of names in the model-loading cell.
labels = dataset['train'].features['label']
labels

ClassLabel(names=['n02085620-Chihuahua', 'n02085782-Japanese_spaniel', 'n02085936-Maltese_dog', 'n02086079-Pekinese', 'n02086240-Shih-Tzu', 'n02086646-Blenheim_spaniel', 'n02086910-papillon', 'n02087046-toy_terrier', 'n02087394-Rhodesian_ridgeback', 'n02088094-Afghan_hound', 'n02088238-basset', 'n02088364-beagle', 'n02088466-bloodhound', 'n02088632-bluetick', 'n02089078-black-and-tan_coonhound', 'n02089867-Walker_hound', 'n02089973-English_foxhound', 'n02090379-redbone', 'n02090622-borzoi', 'n02090721-Irish_wolfhound', 'n02091032-Italian_greyhound', 'n02091134-whippet', 'n02091244-Ibizan_hound', 'n02091467-Norwegian_elkhound', 'n02091635-otterhound', 'n02091831-Saluki', 'n02092002-Scottish_deerhound', 'n02092339-Weimaraner', 'n02093256-Staffordshire_bullterrier', 'n02093428-American_Staffordshire_terrier', 'n02093647-Bedlington_terrier', 'n02093754-Border_terrier', 'n02093859-Kerry_blue_terrier', 'n02093991-Irish_terrier', 'n02094114-Norfolk_terrier', 'n02094258-Norwich_terrier', 'n020

# Training and Evaluation

## Define data collator

- `torch.stack`: joins a sequence of equally shaped tensors along a new dimension (here, the batch dimension)

In [None]:
import torch

def collate_fn(batch):
    """Merge a list of transformed examples into one batch dict.

    Each example must provide a 'pixel_values' tensor and a 'label' value.
    Returns a dict with the tensors stacked along a new leading dimension
    under 'pixel_values' and the labels as a tensor under 'labels'.
    """
    pixel_values = []
    targets = []
    for example in batch:
        pixel_values.append(example['pixel_values'])
        targets.append(example['label'])
    return {
        'pixel_values': torch.stack(pixel_values),
        'labels': torch.tensor(targets),
    }

## Define an evaluation metric

- `load_metric` from the `datasets` library

In [None]:
import numpy as np
from datasets import load_metric

# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package; consider migrating.
metric = load_metric("accuracy")


def compute_metrics(p):
    """Compute accuracy for one evaluation pass.

    `p` must expose `predictions` (per-class scores, reduced with an arg-max
    over axis 1) and `label_ids` (the reference labels). Returns whatever
    `metric.compute` returns for those predictions and references.
    """
    predicted_ids = np.argmax(p.predictions, axis=1)
    return metric.compute(predictions=predicted_ids, references=p.label_ids)

  metric = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

# Load Model

- num_labels
- id2label, label2id

In [None]:
from transformers import ViTForImageClassification

# Class names in label-id order.
labels = dataset['train'].features['label'].names

# Two-way mapping between (stringified) label ids and class names, stored in
# the model config.
id2label = {}
label2id = {}
for idx, class_name in enumerate(labels):
    id2label[str(idx)] = class_name
    label2id[class_name] = str(idx)

# Load the pretrained ViT weights with a classification head sized for our classes.
model = ViTForImageClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# TrainingArguments & Trainer

In [None]:
import os
import shutil
import datetime

# Name this run's output directory after the moment the cell was executed,
# e.g. ./2023-04-03_021039
now = datetime.datetime.now()
train_start = f"{now:%Y-%m-%d_%H%M%S}"
OUTPUT_DIR = './' + train_start

def save_cpt2gdrive():
    """Copy the most recently created checkpoint folder to Google Drive.

    Looks inside ``OUTPUT_DIR`` for entries whose name starts with
    ``checkpoint-``, picks the one with the newest ctime and copies it to
    ``/content/drive/MyDrive/stanford_dogs/<OUTPUT_DIR>/<checkpoint name>``.

    Returns None. Does nothing when no checkpoint folder exists yet.
    """
    folder_prefix = "checkpoint-"
    folder_names = [
        os.path.join(OUTPUT_DIR, name)
        for name in os.listdir(OUTPUT_DIR)
        if name.startswith(folder_prefix)
    ]
    if not folder_names:
        # max() raises ValueError on an empty sequence; nothing to back up yet.
        return
    latest_folder = max(folder_names, key=os.path.getctime)

    dst_path = f"/content/drive/MyDrive/stanford_dogs/{latest_folder}"
    # dirs_exist_ok (Python 3.8+): without it copytree raises FileExistsError
    # when the same checkpoint is copied a second time.
    shutil.copytree(latest_folder, dst_path, dirs_exist_ok=True)

In [None]:
from transformers import TrainingArguments

# Fine-tuning hyper-parameters. Evaluation, logging and checkpoint saving are
# all configured to happen once per epoch.
training_args = TrainingArguments(
  output_dir=OUTPUT_DIR,
  per_device_train_batch_size=64,
  per_device_eval_batch_size=32,
  learning_rate=2e-5,
  num_train_epochs=15,
  fp16=True,
  evaluation_strategy="epoch",
  logging_strategy="epoch",
  save_strategy="epoch",
  weight_decay=0.01, 
  remove_unused_columns=False,  # keep the raw 'image' column: `transform` reads it when examples are accessed
  save_total_limit=2,
  load_best_model_at_end=True,
  report_to='tensorboard',
  push_to_hub=False,
)

In [None]:
from transformers import TrainerCallback

class TrainerCallbacks(TrainerCallback):
    """Trainer callback that backs up checkpoints to Google Drive."""

    def on_save(self, args, state, control, **kwargs):
        """Call `save_cpt2gdrive()` when the Trainer fires its save event.

        The `args`, `state`, `control` and extra keyword arguments are ignored.
        """
        save_cpt2gdrive()

In [None]:
from transformers import Trainer

# Wire the model, data, metric and checkpoint-backup callback into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args, # TrainingArguments
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=prepared_ds["train"],
    eval_dataset=prepared_ds["test"],
    tokenizer=feature_extractor,  # saved together with the model, so the output dir can later be loaded for prediction
    callbacks=[TrainerCallbacks]
)

# Train 🚀

In [None]:
# Run fine-tuning, then persist the final model, the training metrics and the
# trainer state for this run.
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)  # prints the "train metrics" summary
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()



Epoch,Training Loss,Validation Loss,Accuracy
1,4.309,3.851336,0.740039
2,3.4565,3.190349,0.802235
3,2.831,2.684331,0.828474
4,2.3152,2.274307,0.837707
5,1.882,1.934014,0.848397
6,1.517,1.667857,0.848397
7,1.2236,1.456297,0.854227
8,0.9877,1.283973,0.856171
9,0.8036,1.161724,0.855685
10,0.6633,1.070105,0.850826


***** train metrics *****
  epoch                    =          15.0
  total_flos               = 20072207886GF
  train_loss               =        1.4847
  train_runtime            =    1:31:20.65
  train_samples_per_second =        50.693
  train_steps_per_second   =         0.794


# Evaluate 📊

In [None]:
# Evaluate on the held-out test split, then print and persist the metrics.
metrics = trainer.evaluate(prepared_ds['test'])
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

***** eval metrics *****
  epoch                   =       15.0
  eval_accuracy           =     0.8484
  eval_loss               =     0.9161
  eval_runtime            = 0:00:21.75
  eval_samples_per_second =      94.61
  eval_steps_per_second   =      2.988


# Predict

In [None]:
from transformers import AutoFeatureExtractor, AutoModelForImageClassification

# Load from this run's output directory instead of a hard-coded timestamped
# path: "/content/2023-04-03_021039/" only exists for the one session that
# created it, so the cell failed on every fresh run.
# To predict with a model from an earlier run, point `model_dir` at that folder.
model_dir = OUTPUT_DIR
extractor = AutoFeatureExtractor.from_pretrained(model_dir)
model = AutoModelForImageClassification.from_pretrained(model_dir)



In [None]:
from PIL import Image

# Example image to classify; 'ex1.jpg' is read from the current working directory.
img = Image.open('ex1.jpg')
img

In [None]:
import torch

# Force three colour channels, matching the RGB conversion applied to the
# training images; an arbitrary example file may be grayscale or carry alpha.
inputs = extractor(images=img.convert("RGB"), return_tensors="pt")['pixel_values']

# Inference only: disable autograd so no computation graph is built or kept.
with torch.no_grad():
    outputs = model(inputs)