# Data Preparation 

In [1]:
import os
import cv2
import random
import shutil
from pathlib import Path

# === Config ===
image_dir = 'dataset/images'
label_dir = 'dataset/labels'
output_root = 'dataset_vit'
train_split = 0.8
resize_dim = (224, 224)  # Vision Transformer input size

# === Setup ===
train_dir = os.path.join(output_root, 'train')
val_dir = os.path.join(output_root, 'val')
for split_dir in [train_dir, val_dir]:
    # Start every run from empty split folders. Crops are written as
    # <class>_<n>.jpg, so files left over from an earlier run with a different
    # split would otherwise survive next to the new ones and could put the
    # same crop in both train and val.
    if os.path.isdir(split_dir):
        shutil.rmtree(split_dir)
    os.makedirs(split_dir, exist_ok=True)

# === Gather Crops ===
crops = []  # Will store (image_crop, class_id) tuples

# Iterate in sorted order so the sequence of crops does not depend on the
# arbitrary order in which the filesystem lists the label directory.
for label_file in sorted(os.listdir(label_dir)):
    if not label_file.endswith(".txt"):
        continue

    # Swap only the final extension; str.replace would also rewrite a ".txt"
    # that happens to occur earlier in the file name.
    image_file = os.path.splitext(label_file)[0] + ".jpg"
    image_path = os.path.join(image_dir, image_file)
    label_path = os.path.join(label_dir, label_file)

    # Load image
    img = cv2.imread(image_path)
    if img is None:
        print(f"Warning: image not found: {image_path}")
        continue
    h, w = img.shape[:2]

    # Parse YOLO label: one "class cx cy bw bh" row per box, with the box
    # centre and size normalized to the image dimensions.
    with open(label_path, "r") as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) != 5:
                continue
            try:
                class_id, cx, cy, bw, bh = map(float, parts)
            except ValueError:
                # Five tokens but not all numeric: skip this row instead of
                # aborting the whole run.
                print(f"Warning: skipping malformed label in {label_path}: {line.strip()}")
                continue

            # Convert normalized coordinates to pixel values
            x1 = int((cx - bw/2) * w)
            y1 = int((cy - bh/2) * h)
            x2 = int((cx + bw/2) * w)
            y2 = int((cy + bh/2) * h)

            # Clamp every edge into [0, w] / [0, h]. The far edges need the
            # lower bound too: a negative x2/y2 would otherwise be read by
            # NumPy as an index from the end and produce a wrong, non-empty crop.
            x1, x2 = min(max(0, x1), w), min(max(0, x2), w)
            y1, y2 = min(max(0, y1), h), min(max(0, y2), h)
            crop = img[y1:y2, x1:x2]
            if crop.size == 0:
                continue

            # Resize for ViT
            crop_resized = cv2.resize(crop, resize_dim)

            # Store for later split
            crops.append((crop_resized, str(int(class_id))))

# === Shuffle and Split ===
# Shuffle with a dedicated, fixed-seed generator so the same input order always
# yields the same train/val membership, without touching the global `random`
# state used elsewhere.
split_seed = 42
random.Random(split_seed).shuffle(crops)
split_idx = int(len(crops) * train_split)
train_crops = crops[:split_idx]
val_crops = crops[split_idx:]

# === Save Crops ===
def save_crops(crop_list, base_dir):
    """Write crops into one sub-folder per class under ``base_dir``.

    Args:
        crop_list: Iterable of ``(image, class_id)`` pairs. ``class_id`` is a
            string used as both the sub-folder name and the file-name prefix.
        base_dir: Directory under which the per-class folders are created.

    Files are named ``<class_id>_<n>.jpg``, where ``n`` counts from 1 within
    each class for this call. A crop that cannot be written is reported with a
    warning and the remaining crops are still processed.
    """
    counter = {}
    for img, class_id in crop_list:
        class_dir = os.path.join(base_dir, class_id)
        os.makedirs(class_dir, exist_ok=True)

        counter[class_id] = counter.get(class_id, 0) + 1
        filename = f"{class_id}_{counter[class_id]}.jpg"
        out_path = os.path.join(class_dir, filename)
        # cv2.imwrite reports failure through its return value, not an
        # exception, so the result has to be checked explicitly.
        if not cv2.imwrite(out_path, img):
            print(f"Warning: failed to write crop: {out_path}")

# Write each split into its own directory tree, then report how many crops
# went into each.
for split_crops, target_dir in ((train_crops, train_dir), (val_crops, val_dir)):
    save_crops(split_crops, target_dir)

print(f"✅ Done! Saved {len(train_crops)} training and {len(val_crops)} validation crops.")


✅ Done! Saved 167 training and 42 validation crops.


# Transformer Config

In [1]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Define transforms (ViT expects 224x224 and normalized input)
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5]*3, std=[0.5]*3)  # or use ImageNet mean/std if using ImageNet-pretrained ViT
])

train_dataset = datasets.ImageFolder('dataset_vit/train', transform=transform)
val_dataset = datasets.ImageFolder('dataset_vit/val', transform=transform)

# Each ImageFolder derives its class list from the sub-folders of its own root.
# If a class is missing from one split, the integer labels of the two datasets
# no longer refer to the same classes and every metric computed on val is
# silently wrong, so fail loudly here instead.
assert train_dataset.classes == val_dataset.classes, (
    f"train/val class folders differ: {train_dataset.classes} vs {val_dataset.classes}"
)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)


In [2]:
import timm
import torch.nn as nn

# Start from the pretrained 'vit_base_patch16_224' model, then replace its
# classification head with a fresh linear layer that has one output per class
# folder found in the training set.
model = timm.create_model('vit_base_patch16_224', pretrained=True)
num_classes = len(train_dataset.classes)
head_in_features = model.head.in_features
model.head = nn.Linear(head_in_features, num_classes)


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [3]:
import torch

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)

# Cross-entropy objective; AdamW updates every parameter of the model.
learning_rate = 3e-4
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)


In [4]:
from tqdm import tqdm

# Train for a fixed number of passes over the training loader and print the
# mean per-batch loss after each pass.
epochs = 10
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch_images, batch_labels in tqdm(train_loader):
        batch_images = batch_images.to(device)
        batch_labels = batch_labels.to(device)

        # One optimisation step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        batch_loss = criterion(model(batch_images), batch_labels)
        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")


100%|██████████| 6/6 [01:16<00:00, 12.75s/it]


Epoch 1, Loss: 2.0277


100%|██████████| 6/6 [01:09<00:00, 11.66s/it]


Epoch 2, Loss: 1.4282


100%|██████████| 6/6 [01:09<00:00, 11.52s/it]


Epoch 3, Loss: 0.9288


100%|██████████| 6/6 [01:07<00:00, 11.24s/it]


Epoch 4, Loss: 0.3004


100%|██████████| 6/6 [01:05<00:00, 10.95s/it]


Epoch 5, Loss: 0.1433


100%|██████████| 6/6 [01:09<00:00, 11.55s/it]


Epoch 6, Loss: 0.1324


100%|██████████| 6/6 [01:09<00:00, 11.56s/it]


Epoch 7, Loss: 0.0157


100%|██████████| 6/6 [01:06<00:00, 11.16s/it]


Epoch 8, Loss: 0.0025


100%|██████████| 6/6 [01:09<00:00, 11.51s/it]


Epoch 9, Loss: 0.0001


100%|██████████| 6/6 [01:07<00:00, 11.29s/it]

Epoch 10, Loss: 0.0000





In [5]:
# Top-1 accuracy over the validation loader, with the model in eval mode and
# gradient tracking switched off.
model.eval()
correct, total = 0, 0

with torch.no_grad():
    for batch_images, batch_labels in val_loader:
        batch_images = batch_images.to(device)
        batch_labels = batch_labels.to(device)
        predicted = model(batch_images).argmax(dim=1)
        correct += predicted.eq(batch_labels).sum().item()
        total += batch_labels.shape[0]

print(f"Validation Accuracy: {100 * correct / total:.2f}%")


Validation Accuracy: 100.00%


In [6]:
# Persist the model's state dict (its parameters and buffers) to disk.
checkpoint_path = 'vit_currency_classifier.pth'
torch.save(model.state_dict(), checkpoint_path)


In [8]:
from sklearn.metrics import classification_report, average_precision_score
import numpy as np

# Run the validation loader once and keep, per sample, the true label, the
# arg-max prediction and the full softmax probability vector.
model.eval()
all_labels, all_preds, all_probs = [], [], []

with torch.no_grad():
    for batch_images, batch_labels in val_loader:
        logits = model(batch_images.to(device))
        batch_probs = torch.softmax(logits, dim=1)
        batch_preds = batch_probs.argmax(dim=1)

        all_labels.extend(batch_labels.cpu().numpy())
        all_preds.extend(batch_preds.cpu().numpy())
        all_probs.extend(batch_probs.cpu().numpy())  # needed for mAP


In [9]:
from sklearn.metrics import classification_report

# Pass the full set of class indices explicitly. Without `labels`, the report
# infers the classes from the values present in all_labels/all_preds and raises
# if that count differs from len(target_names), e.g. when a class never
# appears in the validation data.
report_labels = list(range(len(val_dataset.classes)))
print(classification_report(
    all_labels,
    all_preds,
    labels=report_labels,
    target_names=val_dataset.classes,
))


              precision    recall  f1-score   support

          10       1.00      1.00      1.00        17
          20       1.00      1.00      1.00        12
          50       1.00      1.00      1.00        13

    accuracy                           1.00        42
   macro avg       1.00      1.00      1.00        42
weighted avg       1.00      1.00      1.00        42



In [10]:
from sklearn.preprocessing import label_binarize

# Turn the integer labels into an indicator matrix with one column per class,
# so it lines up column-for-column with the softmax scores.
# NOTE(review): with exactly two classes label_binarize is documented to return
# a single column; confirm the shapes still match if the class count changes.
n_classes = len(val_dataset.classes)
y_true_bin = label_binarize(all_labels, classes=list(range(n_classes)))
y_score = np.array(all_probs)

# Macro average: average precision is computed per class, then averaged.
mAP = average_precision_score(
    y_true_bin,
    y_score,
    average='macro',
)
print(f"Mean Average Precision (mAP): {mAP:.4f}")


Mean Average Precision (mAP): 1.0000


In [3]:
import cv2
import os

# === CONFIG ===
# Built with os.path.join so the path resolves on any OS; a literal backslash
# is only a directory separator on Windows.
video_path = os.path.join('testing_video', 'NZD10_resized_video_640x640.mp4')
output_dir = 'cropped_frames'
crop_size = (224, 224)  # Resize for ViT input
os.makedirs(output_dir, exist_ok=True)

# === Open Video ===
cap = cv2.VideoCapture(video_path)
# VideoCapture does not raise on a missing or unreadable file; without this
# check the cell would simply finish without showing or saving anything.
if not cap.isOpened():
    raise OSError(f"Could not open video: {video_path}")
frame_idx = 0  # index used in saved file names
crop_idx = 0   # number of crops already saved for the current frame_idx

# Mouse-selection state shared with the draw_rectangle callback.
roi = None          # (x_start, y_start, x_end, y_end) or None when nothing is selected
drawing = False     # True while the left button is held down
ix, iy = -1, -1     # anchor point of the current drag

def draw_rectangle(event, x, y, flags, param):
    """Mouse callback that tracks a click-and-drag rectangle.

    Updates the module-level state:
      * left button down  -> remember the anchor in ``ix, iy`` and start a drag
      * move while dragging -> ``roi = (ix, iy, x, y)`` (live preview)
      * left button up while dragging -> finalise ``roi`` and end the drag

    A button-up that arrives without a preceding button-down is ignored, so it
    cannot create an ROI from a stale (or the initial -1, -1) anchor point.

    Args:
        event: OpenCV mouse event code.
        x, y: Pointer position reported with the event.
        flags, param: Unused; present because the callback signature requires them.
    """
    global ix, iy, roi, drawing

    if event == cv2.EVENT_LBUTTONDOWN:
        drawing = True
        ix, iy = x, y

    elif event == cv2.EVENT_MOUSEMOVE and drawing:
        roi = (ix, iy, x, y)

    elif event == cv2.EVENT_LBUTTONUP and drawing:
        drawing = False
        roi = (ix, iy, x, y)

# === Main Loop ===
# Controls: drag with the left mouse button to mark an ROI, 'c' to save a crop
# of it, 'n' to step to the next frame, 'q' to quit.
try:
    cv2.namedWindow("Frame")
    cv2.setMouseCallback("Frame", draw_rectangle)

    # A new frame is read only when 'n' is pressed. That way frame_idx (used
    # in the file name) identifies the frame a crop was taken from, and an ROI
    # stays on the frame it was drawn on instead of the video running on
    # underneath it.
    ret, frame = cap.read()
    while ret:
        temp_frame = frame.copy()
        if roi:
            x1, y1, x2, y2 = roi
            cv2.rectangle(temp_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

        cv2.imshow("Frame", temp_frame)
        # A short wait keeps the window responsive without spinning the CPU
        # while the same frame is shown. Masking to the low byte keeps the
        # comparison with ord(...) valid if the key code carries extra high bits.
        key = cv2.waitKey(20) & 0xFF

        if key == ord('c') and roi:
            # Crop and resize ROI
            frame_h, frame_w = frame.shape[:2]
            x1, y1, x2, y2 = roi
            # Mouse events can report positions outside the frame (for example
            # when dragging past the window edge); clamp so a negative value is
            # not interpreted by NumPy as an index from the end.
            x1, x2 = sorted(min(max(v, 0), frame_w) for v in (x1, x2))
            y1, y2 = sorted(min(max(v, 0), frame_h) for v in (y1, y2))
            cropped = frame[y1:y2, x1:x2]
            if cropped.size > 0:
                resized = cv2.resize(cropped, crop_size)
                filename = f"crop_{frame_idx:04d}_{crop_idx:02d}.jpg"
                cv2.imwrite(os.path.join(output_dir, filename), resized)
                print(f"✅ Saved: {filename}")
                crop_idx += 1
            roi = None

        elif key == ord('n'):
            # Next frame
            ret, frame = cap.read()
            frame_idx += 1
            crop_idx = 0
            roi = None

        elif key == ord('q'):
            break
finally:
    # Always release the capture and close the window, even if the loop raises.
    cap.release()
    cv2.destroyAllWindows()
