In [96]:
!pip install -q torch torchvision transformers tqdm scikit-learn matplotlib ipywidgets timm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, models, transforms
from torchvision.transforms import Compose, RandomHorizontalFlip, RandomRotation, Resize, CenterCrop, ToTensor, Normalize, RandomResizedCrop
from torchvision.models import efficientnet_v2_s, EfficientNet_V2_S_Weights
from transformers import ViTForImageClassification, ViTImageProcessor, get_linear_schedule_with_warmup
from tqdm import tqdm
from sklearn.metrics import f1_score, precision_score, recall_score
import matplotlib.pyplot as plt
import os

from torch.amp import GradScaler, autocast

In [3]:
torch.manual_seed(42)

data_root = './data'

full_train_dataset = datasets.Food101(
    root=data_root,
    split='train',
    download=False
)

Downloading https://data.vision.ee.ethz.ch/cvl/food-101.tar.gz to ./data/food-101.tar.gz


100%|██████████| 5.00G/5.00G [08:01<00:00, 10.4MB/s]   


Extracting ./data/food-101.tar.gz to ./data


In [4]:
train_size = int(0.8 * len(full_train_dataset))  # Training: 80%
val_size = len(full_train_dataset) - train_size  # Validation: 20%

# Dataset Split
train_dataset, val_dataset = random_split(full_train_dataset, [train_size, val_size])

# Load Test Dataset
test_dataset = datasets.Food101(
    root=data_root,
    split='test',
    download=False
)

In [5]:
feature_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')

class ViTTransform:
    def __init__(self, feature_processor, augment=False):
        self.feature_processor = feature_processor
        if augment:
            self.augment = Compose([
                RandomHorizontalFlip(),
                RandomRotation(10),
            ])
        else:
            self.augment = None

    def __call__(self, image):
        if self.augment:
            image = self.augment(image)
        encoding = self.feature_processor(images=image, return_tensors="pt")
        return encoding['pixel_values'].squeeze()

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

In [6]:
full_train_dataset.transform = ViTTransform(feature_processor, augment=True)
test_dataset.transform = ViTTransform(feature_processor, augment=False)

In [7]:
# Configure DataLoader
batch_size = 64
num_workers = os.cpu_count()
epochs = 10  
patience = 3 

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=True
)

validation_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
    pin_memory=True
)

test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
    pin_memory=True
)

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Set CUDA Optimization
torch.backends.cudnn.benchmark = True

# Initialize ViT Model
model = ViTForImageClassification.from_pretrained(
    'google/vit-base-patch16-224',
    num_labels=101,  # Food101 클래스 수로 설정
    ignore_mismatched_sizes=True
)

# Set ID and Label Mapping (from Original Dataset)
model.config.id2label = {str(i): label for i, label in enumerate(full_train_dataset.classes)}
model.config.label2id = {label: i for i, label in enumerate(full_train_dataset.classes)}

# Freeze Parameters: Fix all parameters of the model
for param in model.parameters():
    param.requires_grad = False

# Set only the last classifier layer to be trainable
for param in model.classifier.parameters():
    param.requires_grad = True

# Move the model to the device
model.to(device)

Using device: cuda


config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([101]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([101, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fe

In [9]:
optimizer = optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=5e-5)
criterion = nn.CrossEntropyLoss()

num_training_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,  # Number of warm-up steps
    num_training_steps=num_training_steps
)

scaler = GradScaler('cuda')

best_val_loss = float('inf')
epochs_no_improve = 0
early_stop = False

In [10]:
for epoch in range(epochs):
    if early_stop:
        print("Early stopping triggered.")
        break

    # Training steps
    model.train()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    all_preds = []
    all_labels = []

    loop = tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs} [Training]', leave=False)
    for batch in loop:
        inputs, labels = batch  # Batch Unpacking
        inputs = inputs.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        optimizer.zero_grad()

        with autocast('cuda'):  # Use modified autocast
            outputs = model(inputs)
            loss = criterion(outputs.logits, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        # Update statistics
        running_loss += loss.item() * inputs.size(0)
        _, preds = torch.max(outputs.logits, 1)
        correct_predictions += torch.sum(preds == labels)
        total_predictions += labels.size(0)

        # Save predictions and actual labels
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        loop.set_postfix(loss=loss.item())

    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_acc = correct_predictions.double() / total_predictions
    f1 = f1_score(all_labels, all_preds, average='weighted')
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    print(f'Epoch {epoch+1}/{epochs} [Training] - Loss: {epoch_loss:.4f} | Acc: {epoch_acc:.4f} | F1: {f1:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f}')

    # Validation step
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0

    val_preds = []
    val_labels = []

    with torch.no_grad():
        loop = tqdm(validation_loader, desc=f'Epoch {epoch+1}/{epochs} [Validation]', leave=False)
        for batch in loop:
            inputs, labels = batch  # Batch Unpacking
            inputs = inputs.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            with autocast('cuda'):  # Use modified autocast
                outputs = model(inputs)
                loss = criterion(outputs.logits, labels)

            val_loss += loss.item() * inputs.size(0)
            _, preds = torch.max(outputs.logits, 1)
            val_correct += torch.sum(preds == labels)
            val_total += labels.size(0)

            # Save predictions and actual labels
            val_preds.extend(preds.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    val_epoch_loss = val_loss / len(validation_loader.dataset)
    val_epoch_acc = val_correct.double() / val_total
    val_f1 = f1_score(val_labels, val_preds, average='weighted')
    val_precision = precision_score(val_labels, val_preds, average='weighted')
    val_recall = recall_score(val_labels, val_preds, average='weighted')
    print(f'Epoch {epoch+1}/{epochs} [Validation] - Loss: {val_epoch_loss:.4f} | Acc: {val_epoch_acc:.4f} | F1: {val_f1:.4f} | Precision: {val_precision:.4f} | Recall: {val_recall:.4f}')

    # Early Stopping check
    if val_epoch_loss < best_val_loss:
        best_val_loss = val_epoch_loss
        epochs_no_improve = 0
        # Save model
        torch.save(model.state_dict(), 'ViT_best_model_state.bin')
        print(f'Validation loss improved. Saving model...')
    else:
        epochs_no_improve += 1
        print(f'No improvement in validation loss for {epochs_no_improve} epoch(s).')
        if epochs_no_improve >= patience:
            print(f'Early stopping triggered after {patience} epochs with no improvement.')
            early_stop = True

# Load model after training completion
model.load_state_dict(torch.load('ViT_best_model_state.bin'))

# Save the model and feature processor (changed to ViTImageProcessor)
model.save_pretrained('./vit-food101')
feature_processor.save_pretrained('./vit-food101')

print("Training complete and model saved.")

                                                                                   

Epoch 1/10 [Training] - Loss: 2.8362 | Acc: 0.5006 | F1: 0.5000 | Precision: 0.5269 | Recall: 0.5006


                                                                          

Epoch 1/10 [Validation] - Loss: 1.7542 | Acc: 0.6931 | F1: 0.6865 | Precision: 0.6947 | Recall: 0.6931
Validation loss improved. Saving model...


                                                                                    

Epoch 2/10 [Training] - Loss: 1.4293 | Acc: 0.7245 | F1: 0.7216 | Precision: 0.7240 | Recall: 0.7245


                                                                          

Epoch 2/10 [Validation] - Loss: 1.2461 | Acc: 0.7363 | F1: 0.7338 | Precision: 0.7376 | Recall: 0.7363
Validation loss improved. Saving model...


                                                                                    

Epoch 3/10 [Training] - Loss: 1.1284 | Acc: 0.7527 | F1: 0.7513 | Precision: 0.7529 | Recall: 0.7527


                                                                          

Epoch 3/10 [Validation] - Loss: 1.0830 | Acc: 0.7513 | F1: 0.7500 | Precision: 0.7527 | Recall: 0.7513
Validation loss improved. Saving model...


                                                                                    

Epoch 4/10 [Training] - Loss: 1.0058 | Acc: 0.7661 | F1: 0.7653 | Precision: 0.7665 | Recall: 0.7661


                                                                          

Epoch 4/10 [Validation] - Loss: 1.0004 | Acc: 0.7628 | F1: 0.7615 | Precision: 0.7638 | Recall: 0.7628
Validation loss improved. Saving model...


                                                                                    

Epoch 5/10 [Training] - Loss: 0.9421 | Acc: 0.7719 | F1: 0.7712 | Precision: 0.7721 | Recall: 0.7719


                                                                          

Epoch 5/10 [Validation] - Loss: 0.9567 | Acc: 0.7686 | F1: 0.7681 | Precision: 0.7702 | Recall: 0.7686
Validation loss improved. Saving model...


                                                                                    

Epoch 6/10 [Training] - Loss: 0.9019 | Acc: 0.7794 | F1: 0.7788 | Precision: 0.7795 | Recall: 0.7794


                                                                          

Epoch 6/10 [Validation] - Loss: 0.9238 | Acc: 0.7716 | F1: 0.7707 | Precision: 0.7724 | Recall: 0.7716
Validation loss improved. Saving model...


                                                                                    

Epoch 7/10 [Training] - Loss: 0.8729 | Acc: 0.7818 | F1: 0.7812 | Precision: 0.7821 | Recall: 0.7818


                                                                          

Epoch 7/10 [Validation] - Loss: 0.9040 | Acc: 0.7758 | F1: 0.7752 | Precision: 0.7771 | Recall: 0.7758
Validation loss improved. Saving model...


                                                                                    

Epoch 8/10 [Training] - Loss: 0.8569 | Acc: 0.7852 | F1: 0.7847 | Precision: 0.7854 | Recall: 0.7852


                                                                          

Epoch 8/10 [Validation] - Loss: 0.8976 | Acc: 0.7758 | F1: 0.7752 | Precision: 0.7771 | Recall: 0.7758
Validation loss improved. Saving model...


                                                                                    

Epoch 9/10 [Training] - Loss: 0.8445 | Acc: 0.7880 | F1: 0.7875 | Precision: 0.7881 | Recall: 0.7880


                                                                          

Epoch 9/10 [Validation] - Loss: 0.8913 | Acc: 0.7782 | F1: 0.7771 | Precision: 0.7788 | Recall: 0.7782
Validation loss improved. Saving model...


                                                                                     

Epoch 10/10 [Training] - Loss: 0.8392 | Acc: 0.7891 | F1: 0.7885 | Precision: 0.7891 | Recall: 0.7891


  model.load_state_dict(torch.load('ViT_best_model_state.bin'))


Epoch 10/10 [Validation] - Loss: 0.8929 | Acc: 0.7745 | F1: 0.7735 | Precision: 0.7751 | Recall: 0.7745
No improvement in validation loss for 1 epoch(s).
Training complete and model saved.


In [11]:
# Load the model and Feature Extractor using from_pretrained
model = ViTForImageClassification.from_pretrained('./vit-food101')
feature_processor = ViTImageProcessor.from_pretrained('./vit-food101')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move to device
model.to(device)

# Switch to evaluation mode
model.eval()

criterion = nn.CrossEntropyLoss()

test_loss = 0.0
test_correct = 0
test_total = 0

test_preds = []
test_labels = []

with torch.no_grad():
    loop = tqdm(test_loader, desc='[Test Evaluation]', leave=False)
    for batch in loop:
        inputs, labels = batch
        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = model(inputs)
        loss = criterion(outputs.logits, labels)

        test_loss += loss.item() * inputs.size(0)
        _, preds = torch.max(outputs.logits, 1)
        test_correct += torch.sum(preds == labels)
        test_total += labels.size(0)

        # Save predictions and actual labels
        test_preds.extend(preds.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

test_epoch_loss = test_loss / len(test_dataset)
test_epoch_acc = test_correct.double() / test_total
test_f1 = f1_score(test_labels, test_preds, average='weighted')
test_precision = precision_score(test_labels, test_preds, average='weighted')
test_recall = recall_score(test_labels, test_preds, average='weighted')
print(f'[Test Evaluation] - Loss: {test_epoch_loss:.4f} | Acc: {test_epoch_acc:.4f} | F1: {test_f1:.4f} | Precision: {test_precision:.4f} | Recall: {test_recall:.4f}')

                                                                    

[Test Evaluation] - Loss: 0.6902 | Acc: 0.8282 | F1: 0.8279 | Precision: 0.8293 | Recall: 0.8282




In [35]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader, random_split
import time
import copy
import os

from torchvision.models import efficientnet_v2_s, EfficientNet_V2_S_Weights
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score

In [36]:
# Data root directory
data_root = './data'

# Define transforms
train_transforms = transforms.Compose([
    transforms.RandomResizedCrop(384),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

val_test_transforms = transforms.Compose([
    transforms.Resize(384),
    transforms.CenterCrop(384),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

# Load the datasets with ImageFolder
full_train_dataset = datasets.Food101(
    root=data_root,
    split='train',
    transform=train_transforms,
    download=False
)

# Split into training and validation
train_size = int(0.8 * len(full_train_dataset))
val_size = len(full_train_dataset) - train_size
train_dataset, val_dataset = random_split(full_train_dataset, [train_size, val_size])

# Apply validation transforms to the validation dataset
val_dataset.dataset.transform = val_test_transforms

# Load test dataset
test_dataset = datasets.Food101(
    root=data_root,
    split='test',
    transform=val_test_transforms,
    download=False
)

# Create data loaders
batch_size = 64

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

# Class names
class_names = full_train_dataset.classes
num_classes = len(class_names)
print(f'Number of classes: {num_classes}')

Number of classes: 101


In [52]:
# Load the pretrained EfficientNet_V2_S model
model = models.efficientnet_v2_s(weights=models.EfficientNet_V2_S_Weights.DEFAULT)

# Modify the classifier to match the number of classes in Food101
# The classifier is typically the last layer
num_ftrs = model.classifier[1].in_features
model.classifier[1] = nn.Linear(num_ftrs, num_classes)

# Move the model to the device
model = model.to(device)

# Freeze all layers
for param in model.parameters():
    param.requires_grad = False

# Retrieve the layers from the model
layers = list(model.features.children())

for layer in layers[-2:]:
    for param in layer.parameters():
        param.requires_grad = True
for param in model.classifier.parameters():
    param.requires_grad = True

trainable_params = [name for name, param in model.named_parameters() if param.requires_grad]
print(f'Trainable parameters: {trainable_params}')

Trainable parameters: ['features.6.0.block.0.0.weight', 'features.6.0.block.0.1.weight', 'features.6.0.block.0.1.bias', 'features.6.0.block.1.0.weight', 'features.6.0.block.1.1.weight', 'features.6.0.block.1.1.bias', 'features.6.0.block.2.fc1.weight', 'features.6.0.block.2.fc1.bias', 'features.6.0.block.2.fc2.weight', 'features.6.0.block.2.fc2.bias', 'features.6.0.block.3.0.weight', 'features.6.0.block.3.1.weight', 'features.6.0.block.3.1.bias', 'features.6.1.block.0.0.weight', 'features.6.1.block.0.1.weight', 'features.6.1.block.0.1.bias', 'features.6.1.block.1.0.weight', 'features.6.1.block.1.1.weight', 'features.6.1.block.1.1.bias', 'features.6.1.block.2.fc1.weight', 'features.6.1.block.2.fc1.bias', 'features.6.1.block.2.fc2.weight', 'features.6.1.block.2.fc2.bias', 'features.6.1.block.3.0.weight', 'features.6.1.block.3.1.weight', 'features.6.1.block.3.1.bias', 'features.6.2.block.0.0.weight', 'features.6.2.block.0.1.weight', 'features.6.2.block.0.1.bias', 'features.6.2.block.1.0.we

In [53]:
# Define loss function
criterion = nn.CrossEntropyLoss()

# Collect only the parameters that require gradients
trainable_parameters = [param for param in model.parameters() if param.requires_grad]

# Define optimizer
optimizer = optim.Adam(trainable_parameters, lr=5e-5)

# Define a learning rate scheduler
scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

In [54]:
def train_model(
    model, 
    criterion, 
    optimizer, 
    scheduler, 
    train_loader, 
    val_loader, 
    device, 
    num_epochs=10, 
    patience=3
):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    best_val_loss = float('inf')
    epochs_no_improve = 0
    early_stop = False

    scaler = GradScaler()  # Initialize scaler for mixed precision training

    for epoch in range(num_epochs):
        if early_stop:
            print("Early stopping triggered.")
            break

        print(f'Epoch {epoch +1}/{num_epochs}')
        print('-' * 10)

        # Training step
        model.train()
        running_loss = 0.0
        correct_predictions = 0
        total_predictions = 0

        all_preds = []
        all_labels = []

        loop = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs} [Training]', leave=False)
        for batch in loop:
            inputs, labels = batch  # Batch unpacking
            inputs = inputs.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            optimizer.zero_grad()

            with autocast('cuda'):  # Use mixed precision
                outputs = model(inputs)
                loss = criterion(outputs, labels)  # EfficientNet returns outputs, not outputs.logits

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            # Update statistics
            running_loss += loss.item() * inputs.size(0)
            _, preds = torch.max(outputs, 1)
            correct_predictions += torch.sum(preds == labels)
            total_predictions += labels.size(0)

            # Save predictions and actual labels
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

            loop.set_postfix(loss=loss.item())

        epoch_loss = running_loss / len(train_loader.dataset)
        epoch_acc = correct_predictions.double() / total_predictions
        f1 = f1_score(all_labels, all_preds, average='weighted')
        precision = precision_score(all_labels, all_preds, average='weighted')
        recall = recall_score(all_labels, all_preds, average='weighted')
        print(f'[Training] Loss: {epoch_loss:.4f} | Acc: {epoch_acc:.4f} | F1: {f1:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f}')

        # Validation step
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0

        val_preds = []
        val_labels = []

        with torch.no_grad():
            loop = tqdm(val_loader, desc=f'Epoch {epoch+1}/{num_epochs} [Validation]', leave=False)
            for batch in loop:
                inputs, labels = batch  # Batch unpacking
                inputs = inputs.to(device, non_blocking=True)
                labels = labels.to(device, non_blocking=True)

                with autocast('cuda'):  # Use mixed precision
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)  # EfficientNet returns outputs, not outputs.logits

                val_loss += loss.item() * inputs.size(0)
                _, preds = torch.max(outputs, 1)
                val_correct += torch.sum(preds == labels)
                val_total += labels.size(0)

                # Save predictions and actual labels
                val_preds.extend(preds.cpu().numpy())
                val_labels.extend(labels.cpu().numpy())

        val_epoch_loss = val_loss / len(val_loader.dataset)
        val_epoch_acc = val_correct.double() / val_total
        val_f1 = f1_score(val_labels, val_preds, average='weighted')
        val_precision = precision_score(val_labels, val_preds, average='weighted')
        val_recall = recall_score(val_labels, val_preds, average='weighted')
        print(f'[Validation] Loss: {val_epoch_loss:.4f} | Acc: {val_epoch_acc:.4f} | F1: {val_f1:.4f} | Precision: {val_precision:.4f} | Recall: {val_recall:.4f}')

        # Scheduler step (typically after validation epoch)
        scheduler.step()

        # Early stopping check
        if val_epoch_loss < best_val_loss:
            best_val_loss = val_epoch_loss
            epochs_no_improve = 0
            # Save model
            torch.save(model.state_dict(), 'best_model_state.pth')
            print(f'Validation loss improved. Saving model...')
        else:
            epochs_no_improve += 1
            print(f'Validation loss has not improved for {epochs_no_improve} epoch(s).')
            if epochs_no_improve >= patience:
                print(f'Early stopping triggered after {patience} epochs with no improvement.')
                early_stop = True

        print()

    time_elapsed = time.time() - since
    print(f'Training completed in: {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f'Best validation accuracy: {best_acc:.4f}')

    # Load best model weights
    model.load_state_dict(torch.load('best_model_state.pth', weights_only=True))
    return model

In [55]:
# Device setting
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Run training
model = train_model(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    train_loader=train_loader,
    val_loader=val_loader,
    device=device,
    num_epochs=10,
    patience=3
)

Epoch 1/10
----------


                                                                                    

[Training] Loss: 2.3790 | Acc: 0.4872 | F1: 0.4800 | Precision: 0.4911 | Recall: 0.4872


                                                                          

[Validation] Loss: 1.1013 | Acc: 0.7185 | F1: 0.7144 | Precision: 0.7178 | Recall: 0.7185
검증 손실이 개선되었습니다. 모델을 저장합니다...

Epoch 2/10
----------


                                                                                    

[Training] Loss: 0.9743 | Acc: 0.7519 | F1: 0.7508 | Precision: 0.7506 | Recall: 0.7519


                                                                          

[Validation] Loss: 0.8306 | Acc: 0.7804 | F1: 0.7792 | Precision: 0.7836 | Recall: 0.7804
검증 손실이 개선되었습니다. 모델을 저장합니다...

Epoch 3/10
----------


                                                                                    

[Training] Loss: 0.6752 | Acc: 0.8222 | F1: 0.8218 | Precision: 0.8219 | Recall: 0.8222


                                                                          

[Validation] Loss: 0.7568 | Acc: 0.7993 | F1: 0.7983 | Precision: 0.8003 | Recall: 0.7993
검증 손실이 개선되었습니다. 모델을 저장합니다...

Epoch 4/10
----------


                                                                                    

[Training] Loss: 0.4886 | Acc: 0.8705 | F1: 0.8704 | Precision: 0.8706 | Recall: 0.8705


                                                                          

[Validation] Loss: 0.7452 | Acc: 0.8059 | F1: 0.8050 | Precision: 0.8072 | Recall: 0.8059
검증 손실이 개선되었습니다. 모델을 저장합니다...

Epoch 5/10
----------


                                                                                     

[Training] Loss: 0.3619 | Acc: 0.9033 | F1: 0.9033 | Precision: 0.9034 | Recall: 0.9033


                                                                          

[Validation] Loss: 0.7420 | Acc: 0.8139 | F1: 0.8134 | Precision: 0.8154 | Recall: 0.8139
검증 손실이 개선되었습니다. 모델을 저장합니다...

Epoch 6/10
----------


                                                                                     

[Training] Loss: 0.2747 | Acc: 0.9265 | F1: 0.9264 | Precision: 0.9265 | Recall: 0.9265


                                                                          

[Validation] Loss: 0.7652 | Acc: 0.8125 | F1: 0.8112 | Precision: 0.8140 | Recall: 0.8125
검증 손실이 1 에폭 동안 개선되지 않았습니다.

Epoch 7/10
----------


                                                                                     

[Training] Loss: 0.2060 | Acc: 0.9440 | F1: 0.9440 | Precision: 0.9441 | Recall: 0.9440


                                                                          

[Validation] Loss: 0.7823 | Acc: 0.8150 | F1: 0.8148 | Precision: 0.8173 | Recall: 0.8150
검증 손실이 2 에폭 동안 개선되지 않았습니다.

Epoch 8/10
----------


                                                                                     

[Training] Loss: 0.1462 | Acc: 0.9618 | F1: 0.9617 | Precision: 0.9618 | Recall: 0.9618


                                                                          

[Validation] Loss: 0.7710 | Acc: 0.8224 | F1: 0.8225 | Precision: 0.8239 | Recall: 0.8224
검증 손실이 3 에폭 동안 개선되지 않았습니다.

Epoch 9/10
----------


                                                                                     

[Training] Loss: 0.1236 | Acc: 0.9684 | F1: 0.9684 | Precision: 0.9684 | Recall: 0.9684


                                                                          

[Validation] Loss: 0.7886 | Acc: 0.8236 | F1: 0.8232 | Precision: 0.8243 | Recall: 0.8236
검증 손실이 4 에폭 동안 개선되지 않았습니다.

Epoch 10/10
----------


                                                                                      

[Training] Loss: 0.1181 | Acc: 0.9704 | F1: 0.9704 | Precision: 0.9705 | Recall: 0.9704


                                                                           

[Validation] Loss: 0.7935 | Acc: 0.8236 | F1: 0.8234 | Precision: 0.8249 | Recall: 0.8236
검증 손실이 5 에폭 동안 개선되지 않았습니다.
5 에폭 동안 개선이 없어서 조기 종료를 트리거합니다.

학습 완료까지 걸린 시간: 38분 38초
최고 검증 정확도: 0.0000


In [56]:
print(model)
# Save model
torch.save(model.state_dict(), './efficientnet_food101.pth')

# Load model
model.load_state_dict(torch.load('./efficientnet_food101.pth', weights_only=True))
model.to(device)

print("Model saved.")

EfficientNet(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      (2): SiLU(inplace=True)
    )
    (1): Sequential(
      (0): FusedMBConv(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
            (1): BatchNorm2d(24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
            (2): SiLU(inplace=True)
          )
        )
        (stochastic_depth): StochasticDepth(p=0.0, mode=row)
      )
      (1): FusedMBConv(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
            (1): BatchNorm2d(24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  

In [63]:
# Device setting
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model.load_state_dict(torch.load('./efficientnet_food101.pth', weights_only=True))
model.to(device)

# Model loading function
def load_model(model_type, model_path, num_classes=101):
    if model_type == 'vit':
        model = ViTForImageClassification.from_pretrained(model_path)
    elif model_type == 'efficientnet':
        model = EfficientNet.from_name('efficientnet-b2')  # Adjust according to the EfficientNet version used
        num_features = model._fc.in_features
        model._fc = nn.Linear(num_features, num_classes)
    else:
        raise ValueError("Unsupported model type. Choose 'vit' or 'efficientnet'.")
    
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()
    return model

# Model initialization
efficientnet_model = model

def evaluate_model(model, loader, model_type='vit'):
    criterion = nn.CrossEntropyLoss()
    
    test_loss = 0.0
    test_correct = 0
    test_total = 0
    
    test_preds = []
    test_labels = []
    
    with torch.no_grad():
        loop = tqdm(loader, desc='[Test Evaluation]', leave=False)
        for batch in loop:
            inputs, labels = batch
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            
            # Extract logits based on model type
            if model_type == 'vit':
                logits = outputs.logits
            elif model_type == 'efficientnet':
                logits = outputs  # EfficientNetV2 returns a Tensor directly
            else:
                raise ValueError("Unsupported model type. Choose 'vit' or 'efficientnet'.")

            loss = criterion(logits, labels)

            test_loss += loss.item() * inputs.size(0)
            _, preds = torch.max(logits, 1)
            test_correct += torch.sum(preds == labels)
            test_total += labels.size(0)

            # Save predictions and actual labels
            test_preds.extend(preds.cpu().numpy())
            test_labels.extend(labels.cpu().numpy())
    
    test_epoch_loss = test_loss / len(loader.dataset)
    test_epoch_acc = test_correct.double() / test_total
    test_f1 = f1_score(test_labels, test_preds, average='weighted')
    test_precision = precision_score(test_labels, test_preds, average='weighted')
    test_recall = recall_score(test_labels, test_preds, average='weighted')
    return test_epoch_loss, test_epoch_acc, test_f1, test_precision, test_recall

# Evaluating EfficientNetV2 model
print("Evaluating EfficientNetV2 Model...")
eff_loss, eff_acc, eff_f1, eff_precision, eff_recall = evaluate_model(efficientnet_model, test_loader, model_type='efficientnet')
print(f'[EfficientNetV2 Test Evaluation] - Loss: {eff_loss:.4f} | Acc: {eff_acc:.4f} | F1: {eff_f1:.4f} | Precision: {eff_precision:.4f} | Recall: {eff_recall:.4f}')

Evaluating EfficientNetV2 Model...


                                                                    

[EfficientNetV2 Test Evaluation] - Loss: 0.5197 | Acc: 0.8613 | F1: 0.8610 | Precision: 0.8621 | Recall: 0.8613




In [65]:
import torch
import torch.nn as nn
from transformers import ViTForImageClassification
from torchvision.models import efficientnet_v2_s, EfficientNet_V2_S_Weights
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import os

In [66]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_classes = 101  # Number of classes in the Food101 dataset

In [98]:
vit_model = ViTForImageClassification.from_pretrained('./vit-food101')
vit_model.to(device)
vit_model.eval()
for param in vit_model.parameters():
    param.requires_grad = False

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fe

In [78]:
efficientnet_model.eval()
for param in efficientnet_model.parameters():
    param.requires_grad = False

In [79]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')
vit_model.to(device)
efficientnet_model.to(device)

Using device: cuda


EfficientNet(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      (2): SiLU(inplace=True)
    )
    (1): Sequential(
      (0): FusedMBConv(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
            (1): BatchNorm2d(24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
            (2): SiLU(inplace=True)
          )
        )
        (stochastic_depth): StochasticDepth(p=0.0, mode=row)
      )
      (1): FusedMBConv(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
            (1): BatchNorm2d(24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  

In [100]:
import torch
import torch.nn as nn
from transformers import ViTForImageClassification, ViTImageProcessor
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader, Subset
from tqdm import tqdm
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import os

# Ensure reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Data root directory
data_root = './data'

# Number of classes
num_classes = 101

# ViT Image Processor
feature_processor = ViTImageProcessor.from_pretrained('./vit-food101')

# ViT transforms
class ViTTransform:
    def __init__(self, feature_processor, augment=False):
        self.feature_processor = feature_processor
        if augment:
            self.augment = transforms.Compose([
                transforms.RandomHorizontalFlip(),
                transforms.RandomRotation(10),
            ])
        else:
            self.augment = None

    def __call__(self, image):
        if self.augment:
            image = self.augment(image)
        encoding = self.feature_processor(images=image, return_tensors="pt")
        return encoding['pixel_values'].squeeze()

# EfficientNet transforms
efficientnet_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(384),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ]),
    'val_test': transforms.Compose([
        transforms.Resize(384),
        transforms.CenterCrop(384),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ]),
}

# Load the full training datasets
full_train_dataset_vit = datasets.Food101(
    root=data_root,
    split='train',
    transform=ViTTransform(feature_processor, augment=True),
    download=False
)

full_train_dataset_eff = datasets.Food101(
    root=data_root,
    split='train',
    transform=efficientnet_transforms['train'],
    download=False
)

# Generate consistent train/validation indices
num_samples = len(full_train_dataset_vit)
indices = list(range(num_samples))
random_seed = 42

train_indices, val_indices = train_test_split(
    indices, test_size=0.2, random_state=random_seed)

# Create Subsets using the same indices for both datasets
train_dataset_vit = Subset(full_train_dataset_vit, train_indices)
val_dataset_vit = Subset(full_train_dataset_vit, val_indices)

train_dataset_eff = Subset(full_train_dataset_eff, train_indices)
val_dataset_eff = Subset(full_train_dataset_eff, val_indices)

# Apply validation transforms to the validation datasets
val_dataset_vit.dataset.transform = ViTTransform(feature_processor, augment=False)
val_dataset_eff.dataset.transform = efficientnet_transforms['val_test']

# Load test datasets
test_dataset_vit = datasets.Food101(
    root=data_root,
    split='test',
    transform=ViTTransform(feature_processor, augment=False),
    download=False
)

test_dataset_eff = datasets.Food101(
    root=data_root,
    split='test',
    transform=efficientnet_transforms['val_test'],
    download=False
)

# Data loaders
batch_size = 64
num_workers = os.cpu_count()

validation_loader_vit = DataLoader(
    val_dataset_vit,
    batch_size=batch_size,
    shuffle=False,  # Important: Do not shuffle to maintain order
    num_workers=num_workers,
    pin_memory=True
)

test_loader_vit = DataLoader(
    test_dataset_vit,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
    pin_memory=True
)

validation_loader_eff = DataLoader(
    val_dataset_eff,
    batch_size=batch_size,
    shuffle=False,  # Important: Do not shuffle to maintain order
    num_workers=num_workers,
    pin_memory=True
)

test_loader_eff = DataLoader(
    test_dataset_eff,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
    pin_memory=True
)

# Load the trained ViT model
vit_model = ViTForImageClassification.from_pretrained('./vit-food101')
vit_model.to(device)
vit_model.eval()
for param in vit_model.parameters():
    param.requires_grad = False

# Load the trained EfficientNet model
efficientnet_model = models.efficientnet_v2_s(weights=None)
num_ftrs = efficientnet_model.classifier[1].in_features
efficientnet_model.classifier[1] = nn.Linear(num_ftrs, num_classes)
efficientnet_model.load_state_dict(torch.load('./efficientnet_food101.pth', map_location=device))
efficientnet_model.to(device)
efficientnet_model.eval()
for param in efficientnet_model.parameters():
    param.requires_grad = False

# Function to get outputs from a model
def get_model_outputs(model, loader, model_type='vit'):
    outputs_list = []
    labels_list = []

    with torch.no_grad():
        loop = tqdm(loader, desc=f'[{model_type.upper()} Evaluation]', leave=False)
        for batch in loop:
            inputs, labels = batch
            inputs = inputs.to(device)
            labels = labels.to(device)

            if model_type == 'vit':
                outputs = model(inputs)
                logits = outputs.logits
            elif model_type == 'efficientnet':
                logits = model(inputs)
            else:
                raise ValueError("Invalid model type. Choose 'vit' or 'efficientnet'.")

            probabilities = torch.softmax(logits, dim=1)
            outputs_list.append(probabilities.cpu())
            labels_list.append(labels.cpu())

    outputs_tensor = torch.cat(outputs_list)
    labels_tensor = torch.cat(labels_list)
    return outputs_tensor, labels_tensor

# Get outputs for validation set
vit_val_outputs, val_labels_vit = get_model_outputs(vit_model, validation_loader_vit, 'vit')
eff_val_outputs, val_labels_eff = get_model_outputs(efficientnet_model, validation_loader_eff, 'efficientnet')

# Ensure labels are the same
assert torch.equal(val_labels_vit, val_labels_eff), "Validation labels do not match!"
val_labels = val_labels_vit  # Use one of them since they are equal

# Concatenate model outputs
ensemble_val_features = torch.cat([vit_val_outputs, eff_val_outputs], dim=1)

# Define the meta-classifier
class MetaClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(MetaClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 512)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Initialize meta-classifier
input_dim = num_classes * 2  # Since we have outputs from two models
meta_model = MetaClassifier(input_dim=input_dim, num_classes=num_classes)
meta_model.to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(meta_model.parameters(), lr=1e-3)

# Create DataLoader for the meta-classifier
val_dataset_meta = torch.utils.data.TensorDataset(ensemble_val_features, val_labels)
val_loader_meta = torch.utils.data.DataLoader(val_dataset_meta, batch_size=batch_size, shuffle=True)

# Train the meta-classifier
num_epochs = 5
for epoch in range(num_epochs):
    meta_model.train()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    loop = tqdm(val_loader_meta, desc=f'Epoch {epoch+1}/{num_epochs} [Meta Training]', leave=False)
    for inputs, labels in loop:
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = meta_model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * inputs.size(0)
        _, preds = torch.max(outputs, 1)
        correct_predictions += torch.sum(preds == labels)
        total_predictions += labels.size(0)

        loop.set_postfix(loss=loss.item())

    epoch_loss = running_loss / len(val_dataset_meta)
    epoch_acc = correct_predictions.double() / total_predictions
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Acc: {epoch_acc:.4f}')

# Get outputs for test set
vit_test_outputs, test_labels_vit = get_model_outputs(vit_model, test_loader_vit, 'vit')
eff_test_outputs, test_labels_eff = get_model_outputs(efficientnet_model, test_loader_eff, 'efficientnet')

# Ensure labels are the same
assert torch.equal(test_labels_vit, test_labels_eff), "Test labels do not match!"
test_labels = test_labels_vit  # Use one of them since they are equal

# Concatenate model outputs for test set
ensemble_test_features = torch.cat([vit_test_outputs, eff_test_outputs], dim=1)

# Create DataLoader for the test set
test_dataset_meta = torch.utils.data.TensorDataset(ensemble_test_features, test_labels)
test_loader_meta = torch.utils.data.DataLoader(test_dataset_meta, batch_size=batch_size, shuffle=False)

# Evaluate the meta-classifier on the test set
meta_model.eval()
test_loss = 0.0
test_correct = 0
test_total = 0

test_preds = []
test_labels_list = []

with torch.no_grad():
    for inputs, labels in tqdm(test_loader_meta, desc='[Meta Test Evaluation]', leave=False):
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = meta_model(inputs)
        loss = criterion(outputs, labels)
        test_loss += loss.item() * inputs.size(0)
        _, preds = torch.max(outputs, 1)
        test_correct += torch.sum(preds == labels)
        test_total += labels.size(0)
        test_preds.extend(preds.cpu().numpy())
        test_labels_list.extend(labels.cpu().numpy())

test_epoch_loss = test_loss / len(test_dataset_meta)
test_epoch_acc = test_correct.double() / test_total
test_f1 = f1_score(test_labels_list, test_preds, average='weighted')
test_precision = precision_score(test_labels_list, test_preds, average='weighted')
test_recall = recall_score(test_labels_list, test_preds, average='weighted')
print(f'[Ensemble Test Evaluation] - Loss: {test_epoch_loss:.4f} | Acc: {test_epoch_acc:.4f} | F1: {test_f1:.4f} | Precision: {test_precision:.4f} | Recall: {test_recall:.4f}')

Using device: cuda


  efficientnet_model.load_state_dict(torch.load('./efficientnet_food101.pth', map_location=device))
                                                                                          

Epoch 1/5, Loss: 1.7137, Acc: 0.8868


                                                                                          

Epoch 2/5, Loss: 0.2946, Acc: 0.9389


                                                                                          

Epoch 3/5, Loss: 0.2677, Acc: 0.9409


                                                                                          

Epoch 4/5, Loss: 0.2495, Acc: 0.9426


                                                                                          

Epoch 5/5, Loss: 0.2358, Acc: 0.9420


                                                                            

[Ensemble Test Evaluation] - Loss: 0.5561 | Acc: 0.8805 | F1: 0.8806 | Precision: 0.8815 | Recall: 0.8805
