In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
from datasets import load_dataset

dataset = load_dataset("imagefolder", data_dir = "imageFolder")


In [3]:
labels = dataset["train"].features["label"].names
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

In [32]:
from transformers import AutoImageProcessor

checkpoint = "facebook/convnextv2-tiny-1k-224"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

In [33]:
from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor

normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
size = (
    image_processor.size["shortest_edge"]
    if "shortest_edge" in image_processor.size
    else (image_processor.size["height"], image_processor.size["width"])
)
_transforms = Compose([RandomResizedCrop(size), ToTensor(), normalize])

def transforms(examples):
    examples["pixel_values"] = [_transforms(img.convert("RGB")) for img in examples["image"]]
    del examples["image"]
    return examples

from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

import evaluate

accuracy = evaluate.load("accuracy")

import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [34]:
dataset = dataset.with_transform(transforms)

# Load model

In [None]:
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer

model = AutoModelForImageClassification.from_pretrained(
    checkpoint,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

# Training

In [45]:
training_args = TrainingArguments(
    output_dir="finetuned_convnext",
    remove_unused_columns=False,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
)

In [46]:
trainer.train()

  0%|          | 0/150 [00:00<?, ?it/s]

{'loss': 0.1732, 'grad_norm': 2.4868667125701904, 'learning_rate': 3.3333333333333335e-05, 'epoch': 0.66}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.22389957308769226, 'eval_accuracy': 0.9230769230769231, 'eval_runtime': 0.9046, 'eval_samples_per_second': 57.485, 'eval_steps_per_second': 4.422, 'epoch': 0.98}
{'loss': 0.1295, 'grad_norm': 7.544907569885254, 'learning_rate': 4.814814814814815e-05, 'epoch': 1.31}
{'loss': 0.1138, 'grad_norm': 5.31884765625, 'learning_rate': 4.4444444444444447e-05, 'epoch': 1.97}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.08274884521961212, 'eval_accuracy': 0.9615384615384616, 'eval_runtime': 0.8627, 'eval_samples_per_second': 60.279, 'eval_steps_per_second': 4.637, 'epoch': 1.97}
{'loss': 0.0902, 'grad_norm': 3.876699924468994, 'learning_rate': 4.074074074074074e-05, 'epoch': 2.62}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.057396210730075836, 'eval_accuracy': 0.9807692307692307, 'eval_runtime': 1.0193, 'eval_samples_per_second': 51.017, 'eval_steps_per_second': 3.924, 'epoch': 2.95}
{'loss': 0.1006, 'grad_norm': 9.02840805053711, 'learning_rate': 3.7037037037037037e-05, 'epoch': 3.28}
{'loss': 0.0753, 'grad_norm': 1.084635615348816, 'learning_rate': 3.3333333333333335e-05, 'epoch': 3.93}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.11841147392988205, 'eval_accuracy': 0.9615384615384616, 'eval_runtime': 1.111, 'eval_samples_per_second': 46.803, 'eval_steps_per_second': 3.6, 'epoch': 4.0}
{'loss': 0.1, 'grad_norm': 5.156762599945068, 'learning_rate': 2.962962962962963e-05, 'epoch': 4.59}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.07569532096385956, 'eval_accuracy': 0.9615384615384616, 'eval_runtime': 1.0312, 'eval_samples_per_second': 50.426, 'eval_steps_per_second': 3.879, 'epoch': 4.98}
{'loss': 0.0642, 'grad_norm': 5.219565391540527, 'learning_rate': 2.5925925925925925e-05, 'epoch': 5.25}
{'loss': 0.0638, 'grad_norm': 4.011533260345459, 'learning_rate': 2.2222222222222223e-05, 'epoch': 5.9}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.04036417976021767, 'eval_accuracy': 1.0, 'eval_runtime': 1.108, 'eval_samples_per_second': 46.931, 'eval_steps_per_second': 3.61, 'epoch': 5.97}
{'loss': 0.0474, 'grad_norm': 3.725167751312256, 'learning_rate': 1.8518518518518518e-05, 'epoch': 6.56}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.025937672704458237, 'eval_accuracy': 1.0, 'eval_runtime': 1.1958, 'eval_samples_per_second': 43.486, 'eval_steps_per_second': 3.345, 'epoch': 6.95}
{'loss': 0.0703, 'grad_norm': 12.790877342224121, 'learning_rate': 1.4814814814814815e-05, 'epoch': 7.21}
{'loss': 0.0289, 'grad_norm': 3.509174346923828, 'learning_rate': 1.1111111111111112e-05, 'epoch': 7.87}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.04577340930700302, 'eval_accuracy': 1.0, 'eval_runtime': 1.0562, 'eval_samples_per_second': 49.234, 'eval_steps_per_second': 3.787, 'epoch': 8.0}
{'loss': 0.0496, 'grad_norm': 0.8930109739303589, 'learning_rate': 7.4074074074074075e-06, 'epoch': 8.52}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.03661813586950302, 'eval_accuracy': 0.9807692307692307, 'eval_runtime': 1.0691, 'eval_samples_per_second': 48.639, 'eval_steps_per_second': 3.741, 'epoch': 8.98}
{'loss': 0.0468, 'grad_norm': 2.068697214126587, 'learning_rate': 3.7037037037037037e-06, 'epoch': 9.18}
{'loss': 0.0584, 'grad_norm': 9.718114852905273, 'learning_rate': 0.0, 'epoch': 9.84}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.04591220244765282, 'eval_accuracy': 0.9615384615384616, 'eval_runtime': 1.126, 'eval_samples_per_second': 46.182, 'eval_steps_per_second': 3.552, 'epoch': 9.84}
{'train_runtime': 385.0277, 'train_samples_per_second': 25.297, 'train_steps_per_second': 0.39, 'train_loss': 0.08079533378283182, 'epoch': 9.84}


TrainOutput(global_step=150, training_loss=0.08079533378283182, metrics={'train_runtime': 385.0277, 'train_samples_per_second': 25.297, 'train_steps_per_second': 0.39, 'train_loss': 0.08079533378283182, 'epoch': 9.84})

In [47]:
trainer.model.push_to_hub(repo_id="henry-heppe/img_class_convnext", private=True)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/111M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/henry-heppe/img_class_convnext/commit/9404d3197fa6a9002b407f332ff89b282f49410a', commit_message='Upload ConvNextV2ForImageClassification', commit_description='', oid='9404d3197fa6a9002b407f332ff89b282f49410a', pr_url=None, pr_revision=None, pr_num=None)

# Evaluate finetuned model on extended test set

In [None]:
from datasets import load_dataset

dataset = load_dataset("imagefolder", data_dir = "imageTestSet")

labels = dataset["train"].features["label"].names
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

from transformers import AutoImageProcessor

checkpoint = "henry-heppe/finetuned_convnextv2"
image_processor = AutoImageProcessor.from_pretrained("facebook/convnextv2-tiny-1k-224")

In [10]:
from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor

normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
size = (
    image_processor.size["shortest_edge"]
    if "shortest_edge" in image_processor.size
    else (image_processor.size["height"], image_processor.size["width"])
)
_transforms = Compose([RandomResizedCrop(size), ToTensor(), normalize])

def transforms(examples):
    examples["pixel_values"] = [_transforms(img.convert("RGB")) for img in examples["image"]]
    del examples["image"]
    return examples

from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

import evaluate

accuracy = evaluate.load("accuracy")

import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

dataset = dataset.with_transform(transforms)

In [11]:
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer

model = AutoModelForImageClassification.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id
)

In [None]:
training_args = TrainingArguments(
    output_dir="finetuned_convnext",
    remove_unused_columns=False,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["train"],
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
)

In [13]:
trainer.evaluate()

  0%|          | 0/12 [00:00<?, ?it/s]

{'eval_loss': 0.19674691557884216,
 'eval_accuracy': 0.9175824175824175,
 'eval_runtime': 4.0916,
 'eval_samples_per_second': 44.481,
 'eval_steps_per_second': 2.933}