In [1]:
!pip install torchinfo

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [2]:
import timm
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split, ConcatDataset
import numpy as np
from tqdm import tqdm

import itertools
from torchinfo import summary

In [3]:
ViT = timm.create_model('vit_base_patch16_224', pretrained=False)

In [4]:
model = ViT

In [5]:
print(model)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

batch_size = 32
learning_rate = 0.001
num_epochs = 50

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (patch_drop): Identity()
  (norm_pre): Identity()
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity(

In [6]:
print(summary(model, input_size=(32, 3, 224, 224)))

Layer (type:depth-idx)                   Output Shape              Param #
VisionTransformer                        [32, 1000]                152,064
├─PatchEmbed: 1-1                        [32, 196, 768]            --
│    └─Conv2d: 2-1                       [32, 768, 14, 14]         590,592
│    └─Identity: 2-2                     [32, 196, 768]            --
├─Dropout: 1-2                           [32, 197, 768]            --
├─Identity: 1-3                          [32, 197, 768]            --
├─Identity: 1-4                          [32, 197, 768]            --
├─Sequential: 1-5                        [32, 197, 768]            --
│    └─Block: 2-3                        [32, 197, 768]            --
│    │    └─LayerNorm: 3-1               [32, 197, 768]            1,536
│    │    └─Attention: 3-2               [32, 197, 768]            2,362,368
│    │    └─Identity: 3-3                [32, 197, 768]            --
│    │    └─Identity: 3-4                [32, 197, 768]          

  x = F.scaled_dot_product_attention(


In [15]:
# extract the .tar files of training dataset
import os
import tarfile

# Paths
train_tar_dir = "./data/train/"
output_dir = "./data/train"

os.makedirs(output_dir, exist_ok=True)

# Extract each tar file into a subdirectory
for tar_file in os.listdir(train_tar_dir):
    if tar_file.endswith(".tar"):
        tar_path = os.path.join(train_tar_dir, tar_file)
        wnid = os.path.splitext(tar_file)[0]
        wnid_dir = os.path.join(output_dir, wnid)
        os.makedirs(wnid_dir, exist_ok=True)
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=wnid_dir)

  tar.extractall(path=wnid_dir)


In [44]:
import os
import tarfile
import pickle

# Paths
train_tar_dir = "./data/train/"  # .tar 파일 경로
output_dir = "./data/train_org/"  # 디렉토리가 생성될 경로
meta_file = "./data/data.pkl"  # WNID 매핑 정보 파일

# Load metadata
with open(meta_file, "rb") as f:
    metadata = pickle.load(f)

wnid_to_name = metadata[0]  # WNID to readable name mapping
expected_wnids = set(wnid_to_name.keys())  # pkl 파일의 WNID 키

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Extract each tar file and verify WNID
actual_wnids = set()
for tar_file in os.listdir(train_tar_dir):
    if tar_file.endswith(".tar"):
        tar_path = os.path.join(train_tar_dir, tar_file)
        wnid = os.path.splitext(tar_file)[0]  # Extract WNID from tar file name
        actual_wnids.add(wnid)  # Track processed WNIDs

        if wnid in expected_wnids:
            wnid_dir = os.path.join(output_dir, wnid)
            os.makedirs(wnid_dir, exist_ok=True)
            with tarfile.open(tar_path) as tar:
                tar.extractall(path=wnid_dir)
        else:
            print(f"Warning: WNID {wnid} not found in metadata.")

# Find missing WNIDs
missing_wnids = expected_wnids - actual_wnids
if missing_wnids:
    print(f"Missing directories for the following WNIDs: {missing_wnids}")
else:
    print("All WNIDs processed successfully.")

  tar.extractall(path=wnid_dir)


All WNIDs processed successfully.


In [45]:
# Example: Mapping ILSVRC2012_ID to WNIDs
wnid_to_ilsvrc2012_id = {i + 1: wnid for i, wnid in enumerate(wnid_to_name.keys())}

# Inverse mapping for reverse lookup
ilsvrc2012_id_to_wnid = {v: k for k, v in wnid_to_ilsvrc2012_id.items()}

In [53]:
# Paths
val_dir = "./data/val/ILSVRC2012_img_val/"  # Directory containing the validation images
output_val_dir = "./data/val_org/"  # Output directory for organized validation images
meta_file = "./data/data.pkl"  # WNID mapping metadata

# Load metadata
with open(meta_file, "rb") as f:
    metadata = pickle.load(f)

wnid_to_name = metadata[0]  # WNID to readable name mapping
wnid_list = metadata[1]  # List of WNIDs for the validation set

# Ensure output directory exists
os.makedirs(output_val_dir, exist_ok=True)

# We don't have explicit image-to-WNID mappings, so we assume the order of WNIDs in metadata[1] corresponds to images in the validation set.
# For this, we'll need to know how many images are in the validation set to correctly assign WNIDs.

# Validate we have enough WNIDs to match the number of validation images
num_images = len(os.listdir(val_dir))
num_wnids = len(wnid_list)

if num_images != num_wnids:
    print(f"Warning: Number of images ({num_images}) doesn't match the number of WNIDs ({num_wnids}) in the metadata.")
else:
    print(f"Proceeding with {num_images} validation images and {num_wnids} WNIDs.")

# Create the WNID directories and move the validation images
missing_wnids = set()
for i, wnid in enumerate(wnid_list):
    if wnid in wnid_to_name:
        wnid_dir = os.path.join(output_val_dir, wnid)
        os.makedirs(wnid_dir, exist_ok=True)

        # Assume the images are named in the order of metadata[1], so the i-th image corresponds to the i-th WNID
        image_filename = f"ILSVRC2012_val_{str(i).zfill(8)}.JPEG"
        src_image_path = os.path.join(val_dir, image_filename)
        dst_image_path = os.path.join(wnid_dir, image_filename)

        if os.path.exists(src_image_path):
            shutil.move(src_image_path, dst_image_path)
        else:
            continue
            #print(f"Warning: Image {image_filename} not found in {val_dir}")
    else:
        missing_wnids.add(wnid)

# Check for any missing WNIDs
if missing_wnids:
    print(f"Warning: The following WNIDs are missing in metadata: {missing_wnids}")
else:
    print("All validation images have been successfully reorganized.")

print("Validation data reorganization complete.")

Proceeding with 50000 validation images and 50000 WNIDs.
All validation images have been successfully reorganized.
Validation data reorganization complete.


In [54]:
from torchvision.datasets import ImageFolder
from torchvision import transforms
from torch.utils.data import DataLoader

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load datasets
train_dataset = ImageFolder(root="./data/train_org", transform=transform)
val_dataset = ImageFolder(root="./data/val_org", transform=transform)

# Create DataLoaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

In [55]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [56]:
def train(model, train_loader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for inputs, labels in tqdm(train_loader, desc="Training"):
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    epoch_loss = running_loss / len(train_loader)
    accuracy = 100 * correct / total
    print(f"Train Loss: {epoch_loss:.4f}, Train Accuracy: {accuracy:.2f}%")

In [57]:
def evaluate(model, data_loader, criterion, device, phase="Validation"):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in tqdm(data_loader, desc=f"{phase}"):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    epoch_loss = running_loss / len(data_loader)
    accuracy = 100 * correct / total
    print(f"{phase} Loss: {epoch_loss:.4f}, {phase} Accuracy: {accuracy:.2f}%")

In [58]:
def measure_inference_time(model, data_loader, device):
    model.eval()
    times = []

    with torch.no_grad():
        for inputs, _ in data_loader:
            inputs = inputs.to(device)
            start_time = torch.cuda.Event(enable_timing=True)
            end_time = torch.cuda.Event(enable_timing=True)

            start_time.record()
            _ = model(inputs)  # inference 수행
            end_time.record()

            # 시간 측정
            torch.cuda.synchronize()  # CUDA에서 모든 커널이 완료될 때까지 대기
            elapsed_time = start_time.elapsed_time(end_time)  # 밀리초 단위로 반환
            times.append(elapsed_time)

    # 통계량 계산
    times_np = np.array(times)
    total_inferences = len(times_np)
    avg_time = np.mean(times_np)
    std_dev = np.std(times_np)
    max_time = np.max(times_np)
    min_time = np.min(times_np)

    # 결과 출력
    print(f"Inference Time Measurement Results:")
    print(f"Total Inferences: {total_inferences}")
    print(f"Average Time: {avg_time:.2f} ms")
    print(f"Standard Deviation: {std_dev:.2f} ms")
    print(f"Maximum Time: {max_time:.2f} ms")
    print(f"Minimum Time: {min_time:.2f} ms")

    return times

In [None]:
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")
    train(model, train_loader, criterion, optimizer, device)
    evaluate(model, val_loader, criterion, device, phase="Validation")


Epoch 1/50


Training:   1%|▍                                                                 | 139/20019 [02:54<6:27:50,  1.17s/it]

In [None]:
print("\nFinal Test Evaluation")
evaluate(model, test_loader, criterion, device, phase="Test")

In [None]:
times = measure_inference_time(model, test_loader, device)