In [1]:
# Environment setup (Colab): install dependencies, mount Google Drive, and
# unpack the training archive into ./data when it is not already present.
!pip install tensorboard
!pip install -qqq accelerate==0.28.0
!pip install -qqq transformers==4.48.3
!pip install -qqq datasets==3.6.0

# NOTE(review): this upgrade runs after the accelerate==0.28.0 pin above, so it
# presumably replaces the pinned version -- confirm which one is intended.
!pip install -U accelerate

import os
from google.colab import drive

drive.mount('/content/drive')

# Archive on the mounted Drive and the local directory it is extracted into.
zip_file = "/content/drive/MyDrive/ai_term/train.zip"
extract_dir = "data"
train_data_labels_path = os.path.join(extract_dir, "train", "train_labels.csv") 

# Extract only when the target directory or the labels CSV is missing, so that
# re-running this cell does not unzip the archive again.
if not os.path.exists(extract_dir) or not os.path.exists(train_data_labels_path):
    print(f"{extract_dir} 디렉토리 또는 필요한 파일({train_data_labels_path})이 존재하지 않습니다. 압축 해제합니다.")
    if not os.path.exists(extract_dir):
        os.makedirs(extract_dir)
    # IPython substitutes {zip_file} and {extract_dir} with the Python values.
    !unzip -q {zip_file} -d {extract_dir}
else:
    print(f"{extract_dir} 디렉토리와 필요한 파일이 이미 존재합니다. 압축 해제를 건너뜁니다.")

# Labels CSV and image directory (with trailing slash) inside the extracted data.
# The next cell reassigns both names to "./data/..." literals.
train_data_labels = train_data_labels_path
train_image_path = os.path.join(extract_dir, "train", "images") + "/"

zsh:1: no matches found: https://docs.googㅁle.com/uc?id=1e7P8XjrkPSKzIrmjt-zi5ndKxpuG07X8
mkdir: data: File exists
unzip:  cannot find or open train.zip, train.zip.zip or train.zip.ZIP.


In [4]:
# Image folder (trailing slash required: file names are appended directly)
# and the labels CSV inside the extracted training data.
train_image_path = "./data/train/images/"
train_data_labels = "./data/train/train_labels.csv"

# Output location; the name suggests model artifacts, but none of the visible
# cells reference it.
model_output_path = "./output"

# Class names for the two prediction tasks.
# NOTE(review): presumably the list position matches the integer label stored
# in the CSV -- confirm against the dataset documentation.
style_labels = [
    "pencil color",
    "oil painting",
    "water color",
]
fruit_labels = [
    "apple",
    "asian pear",
    "banana",
    "cherry",
    "grape",
    "pineapple",
]

In [5]:
from transformers import AutoImageProcessor
from transformers import Trainer, TrainingArguments
from transformers import ViTModel, PreTrainedModel, ViTConfig
import transformers as tf

from transformers.modeling_outputs import SequenceClassifierOutput

from datasets import Dataset, load_dataset
from datasets.features import ClassLabel, Image


import torch.nn as nn
import torch


from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.model_selection import train_test_split


import matplotlib.pyplot as plt


import numpy as np


import pandas as pd



In [6]:
import os
import random

def set_seed(seed_value=42):
    """Seed every random number generator this notebook uses.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and CUDA), exports
    ``PYTHONHASHSEED``, and switches cuDNN to deterministic mode with
    auto-tuning disabled.

    Args:
        seed_value: Integer seed applied to all generators. Defaults to 42.
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    # this presumably only affects subprocesses -- confirm if hashing matters.
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    random.seed(seed_value)
    np.random.seed(seed_value)

    torch_seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in torch_seeders:
        seed_fn(seed_value)

    # Trade cuDNN speed for run-to-run repeatability.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

set_seed()

In [9]:
def load_data(csv_path, image_dir=None):
    """Load the labels CSV into a ``datasets.Dataset`` with an image column.

    Args:
        csv_path: Path to a CSV file that has a ``file_name`` column.
        image_dir: Directory holding the image files. When omitted, the
            module-level ``train_image_path`` is used.

    Returns:
        A ``Dataset`` containing the CSV columns plus an ``image`` column
        (built from ``image_dir`` + ``file_name``) cast to the ``datasets``
        ``Image`` feature.
    """
    # Imported locally under an alias: the evaluation cell runs
    # `from PIL import Image`, which rebinds the global name `Image` to the PIL
    # module. Relying on the global would make a re-run of this function fail
    # with "'module' object is not callable".
    from datasets.features import Image as DatasetImage

    if image_dir is None:
        image_dir = train_image_path

    df = pd.read_csv(csv_path)

    # os.path.join yields the same paths as plain concatenation when image_dir
    # ends with a separator, and still works when it does not.
    df['image'] = [os.path.join(image_dir, name) for name in df['file_name']]

    ds = Dataset.from_pandas(df)
    ds = ds.cast_column("image", DatasetImage())

    return ds

# Raw labels table, kept for inspection in the next cell.
df = pd.read_csv(train_data_labels)

dataset = load_data(train_data_labels)

In [10]:
# Show the first CSV rows and the Dataset summary, separated by a blank line.
print(df.head(), dataset, sep="\n\n")

  file_name  style  fruit
0     0.jpg      0      0
1     1.jpg      0      0
2     2.jpg      0      0
3     3.jpg      0      0
4     4.jpg      0      0

Dataset({
    features: ['file_name', 'style', 'fruit', 'image'],
    num_rows: 7200
})


In [11]:
# Image processor matching the ViT checkpoint that the model is built on.
feature_extractor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")

def preprocess_images(examples):
    """Add a ``pixel_values`` entry to a batch of examples.

    Each entry of ``examples["image"]`` is converted to RGB and passed through
    ``feature_extractor`` on its own; the first ``pixel_values`` item of each
    result is kept.

    Args:
        examples: Batch mapping that contains an ``image`` list.

    Returns:
        The same mapping with ``pixel_values`` added.
    """
    encoded = []
    for picture in examples["image"]:
        encoded.append(feature_extractor(picture.convert("RGB")))

    pixel_values = []
    for features in encoded:
        pixel_values.append(features['pixel_values'][0])

    examples['pixel_values'] = pixel_values
    return examples

dataset = dataset.map(preprocess_images, batched=True)
# Expose only the columns used downstream, returned as torch tensors.
dataset.set_format(type='torch', columns=['image', 'pixel_values', 'fruit', 'style'])

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


Map:   0%|          | 0/7200 [00:00<?, ? examples/s]

In [12]:
# First split: hold out 30% of all rows as the test set.
train_val_dataset = dataset.train_test_split(test_size=0.3, seed=42)
test = train_val_dataset["test"]

# Second split: 20% of the remaining 70% becomes the validation set.
# `train_val` ends up bound to the result of this second split.
train_val = train_val_dataset["train"].train_test_split(test_size=0.2, seed=42)
train, val = train_val["train"], train_val["test"]

In [13]:
class MultiTaskViTConfig(ViTConfig):
    """``ViTConfig`` extended with the output sizes of the two classifier heads.

    Args:
        num_fruit: Number of fruit classes (default 6).
        num_style: Number of style classes (default 3).
        **kwargs: Forwarded unchanged to ``ViTConfig``.
    """

    def __init__(self, num_fruit=6, num_style=3, **kwargs):
        super().__init__(**kwargs)
        # Stored on the config so the model can read the head sizes from it.
        self.num_fruit, self.num_style = num_fruit, num_style

class MultiTaskViT(PreTrainedModel):
    """ViT backbone with two linear heads: one for fruit, one for style."""

    config_class = MultiTaskViTConfig

    def __init__(self, config):
        """Build the backbone and both classifier heads.

        The backbone is always loaded from the
        ``google/vit-base-patch16-224-in21k`` checkpoint; head sizes come from
        ``config.num_fruit`` and ``config.num_style``.
        """
        super().__init__(config)

        self.vit = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")

        # Both heads consume the backbone's pooled output.
        feature_dim = self.vit.config.hidden_size
        self.fruit_classifier = nn.Linear(feature_dim, config.num_fruit)
        self.style_classifier = nn.Linear(feature_dim, config.num_style)

    def forward(self, pixel_values, fruit=None, style=None, **kwargs):
        """Run the backbone and both heads.

        Args:
            pixel_values: Image batch passed to the ViT backbone.
            fruit: Optional fruit targets.
            style: Optional style targets.
            **kwargs: Accepted and ignored.

        Returns:
            ``SequenceClassifierOutput`` whose ``logits`` is the tuple
            ``(fruit_logits, style_logits)``. ``loss`` is the sum of the two
            cross-entropy losses when both targets are given, otherwise None.
        """
        pooled = self.vit(pixel_values=pixel_values).pooler_output

        fruit_logits = self.fruit_classifier(pooled)
        style_logits = self.style_classifier(pooled)

        if fruit is None or style is None:
            total_loss = None
        else:
            criterion = nn.CrossEntropyLoss()
            # Unweighted sum: both tasks contribute equally to the objective.
            total_loss = criterion(fruit_logits, fruit) + criterion(style_logits, style)

        return SequenceClassifierOutput(
            loss=total_loss,
            logits=(fruit_logits, style_logits),
        )

In [14]:
# Fine-tuning hyperparameters consumed when TrainingArguments is built.
train_epoch, batch_size = 15, 128
learning_rate, weight_decay = 1e-4, 0.01

In [None]:
# Head sizes derived from the data: number of distinct integer labels per task.
num_fruit = len({int(label) for label in dataset["fruit"]})
num_style = len({int(label) for label in dataset["style"]})

print("num_fruit: ", num_fruit)
print("num_style: ", num_style)

# Pass the computed sizes to the config explicitly. Previously they were only
# printed and the config silently fell back to its defaults (6 and 3), so a
# dataset with different class counts would get wrongly sized heads.
config = MultiTaskViTConfig.from_pretrained(
    "google/vit-base-patch16-224-in21k",
    num_fruit=num_fruit,
    num_style=num_style,
)

model = MultiTaskViT(config=config)

num_fruit:  6
num_style:  3


In [285]:
# Trainer configuration; numeric values come from the hyperparameter cell.
training_args = TrainingArguments(
    # Output and logging destinations.
    output_dir="./vit_fruit_classification",
    logging_dir='./logs',
    report_to=["tensorboard"],
    # Optimisation settings.
    num_train_epochs=train_epoch,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate once per epoch.
    eval_strategy="epoch",
    # The model's forward() receives its targets as `fruit` and `style`.
    label_names=["fruit", "style"],
)

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for both tasks.

    Args:
        eval_pred: Object whose ``predictions`` unpacks into
            ``(fruit_logits, style_logits)`` and whose ``label_ids`` unpacks
            into ``(fruit targets, style targets)``.

    Returns:
        dict with keys ``<task>_acc``, ``<task>_precision``, ``<task>_recall``
        and ``<task>_f1`` for ``task`` in ``fruit`` and ``style``.
    """
    fruit_logits, style_logits = eval_pred.predictions
    fruit_targets, style_targets = eval_pred.label_ids

    per_task = (
        ("fruit", fruit_logits, fruit_targets),
        ("style", style_logits, style_targets),
    )

    metrics = {}
    for task, logits, targets in per_task:
        # Predicted class = index of the largest logit.
        predicted = np.argmax(logits, axis=-1)
        precision, recall, f1, _ = precision_recall_fscore_support(
            targets, predicted, average='weighted'
        )
        metrics[f"{task}_acc"] = accuracy_score(targets, predicted)
        metrics[f"{task}_precision"] = precision
        metrics[f"{task}_recall"] = recall
        metrics[f"{task}_f1"] = f1

    return metrics

# Wire the model, the train/validation splits and the metric function together.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train,
    eval_dataset=val,
)

In [286]:
# Fine-tune the model; evaluation runs once per epoch (eval_strategy="epoch").
trainer.train()

Could not estimate the number of tokens of the input, floating-point operations will not be computed


Epoch,Training Loss,Validation Loss,Fruit Acc,Fruit Precision,Fruit Recall,Fruit F1,Style Acc,Style Precision,Style Recall,Style F1
1,No log,0.129347,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,No log,0.031596,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,No log,0.022059,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


TrainOutput(global_step=15, training_loss=0.4414660135904948, metrics={'train_runtime': 7.7772, 'train_samples_per_second': 30.859, 'train_steps_per_second': 1.929, 'total_flos': 0.0, 'train_loss': 0.4414660135904948, 'epoch': 3.0})

In [None]:
# Save the fine-tuned model and the image processor into the same directory so
# both can be reloaded from "./vit_fruit_cls".
# NOTE(review): the recorded output of this cell is a RuntimeError saying
# ./vit_fruit_cls cannot be opened; the cause is not visible here -- verify the
# cell succeeds on a fresh run.
model.save_pretrained("./vit_fruit_cls")

feature_extractor.save_pretrained('./vit_fruit_cls')

RuntimeError: File ./vit_fruit_cls cannot be opened.

In [16]:
# Legacy import, kept only so any later use of the name still resolves.
from transformers import AutoFeatureExtractor

# Pick the best available accelerator: CUDA first, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

print(f"Using device: {device}")

# Reload the saved artifacts. AutoImageProcessor is the same entry point that
# created the processor before training; the feature-extractor API it replaces
# is deprecated for vision models.
extractor = AutoImageProcessor.from_pretrained('./vit_fruit_cls')
# eval() switches the reloaded model to inference mode.
model = MultiTaskViT.from_pretrained("./vit_fruit_cls").to(device).eval()



In [None]:
# Debug check: shape of the `image` column of the test split.
# NOTE(review): the recorded output is a 1-D tensor of integers rather than a
# shape, so it looks stale -- confirm or remove this cell.
test['image'].shape

tensor([0, 1, 3,  ..., 5, 1, 1])

In [23]:
from PIL import Image
import torchvision.transforms.functional as TF
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm


# Per-sample ground truth and predictions collected over the test split.
# NOTE(review): fruit_labels/style_labels rebind names that an earlier cell
# assigned to the class-name lists -- confirm nothing later needs those lists.
fruit_labels, style_labels = [], []
fruit_preds, style_preds = [], []

for item in tqdm(test):
    # Turn the stored image tensor back into a PIL image and preprocess it
    # into a single-item batch on the target device.
    image = TF.to_pil_image(item['image'])
    inputs = feature_extractor(images=image, return_tensors="pt")
    pixel_values = inputs["pixel_values"].to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(pixel_values=pixel_values)

    fruit_logits, style_logits = outputs.logits
    fruit_pred = fruit_logits.argmax(dim=-1).item()
    style_pred = style_logits.argmax(dim=-1).item()

    fruit_preds.append(fruit_pred)
    fruit_labels.append(item['fruit'].item())
    style_preds.append(style_pred)
    style_labels.append(item['style'].item())

def report(task_name, y_true, y_pred):
    """Print accuracy and macro-averaged precision/recall/F1 for one task.

    Args:
        task_name: Label printed in the ``[...]`` header line.
        y_true: Ground-truth class ids.
        y_pred: Predicted class ids.
    """
    macro_metrics = (
        ("Precision", precision_score),
        ("Recall   ", recall_score),
        ("F1 Score ", f1_score),
    )

    print(f"[{task_name}]")
    print(f"  Accuracy : {accuracy_score(y_true, y_pred):.4f}")
    # Each score is computed right before its line is printed.
    for label, metric_fn in macro_metrics:
        score = metric_fn(y_true, y_pred, average='macro')
        print(f"  {label}: {score:.4f}")
    # Blank line separating consecutive task reports.
    print()

report("Fruit", fruit_labels, fruit_preds)
report("Style", style_labels, style_preds)

100%|██████████| 1440/1440 [00:49<00:00, 29.38it/s]

[Fruit]
  Accuracy : 0.9993
  Precision: 0.9993
  Recall   : 0.9993
  F1 Score : 0.9993

[Style]
  Accuracy : 1.0000
  Precision: 1.0000
  Recall   : 1.0000
  F1 Score : 1.0000




