In [2]:
import os
import yaml
import time
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image, ImageDraw
from tqdm import tqdm

# -------------------------------
# Config Loader
# -------------------------------
class Config:
    def __init__(self, config_path="CNN_Object_Detection/configs/config.yml"):
        with open(config_path, "r") as file:
            self.config = yaml.safe_load(file)

    def get(self, key, default=None):
        keys = key.split(".")
        value = self.config
        for k in keys:
            value = value.get(k, {})
            if not isinstance(value, dict):
                return value
        return default

# -------------------------------
# CNN 모델
# -------------------------------
class CNN(nn.Module):
    def __init__(self, input_shape=(1, 640, 640), num_classes=1):
        super(CNN, self).__init__()
        self.input_shape = input_shape
        self.features = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, 3, padding=1), nn.BatchNorm2d(32), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, padding=1), nn.BatchNorm2d(64), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(64, 128, 3, padding=1), nn.BatchNorm2d(128), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(128, 256, 3, padding=1), nn.BatchNorm2d(256), nn.ReLU(), nn.AdaptiveAvgPool2d(1)
        )
        self.classifier = nn.Sequential(
            nn.Linear(256, 128), nn.ReLU(), nn.Dropout(0.3), nn.Linear(128, num_classes)
        )
        self.bbox_regressor = nn.Sequential(
            nn.Linear(256, 128), nn.ReLU(), nn.Linear(128, 4)
        )

    def forward(self, x):
        if x.ndim != 4 or tuple(x.shape[1:]) != self.input_shape:
            raise ValueError(f"Expected input shape [B, {self.input_shape}], but got {x.shape}")
        x = self.features(x)
        x = x.view(x.size(0), -1)
        class_logits = self.classifier(x)
        bbox = torch.sigmoid(self.bbox_regressor(x))
        return class_logits, bbox

# -------------------------------
# YOLO Dataset
# -------------------------------
class YoloFolderDataset(Dataset):
    def __init__(self, img_dir, label_dir, input_shape):
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.input_shape = input_shape
        self.image_files = sorted(os.listdir(img_dir))
        self.transform = transforms.Compose([
            transforms.Resize(input_shape[1:]),
            transforms.Grayscale(num_output_channels=1),
            transforms.ToTensor()
        ])

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        img_file = self.image_files[idx]
        label_file = img_file.replace(".jpg", ".txt").replace(".png", ".txt")
        img_path = os.path.join(self.img_dir, img_file)
        label_path = os.path.join(self.label_dir, label_file)

        img = Image.open(img_path).convert("RGB")
        img_tensor = self.transform(img)

        with open(label_path, "r") as f:
            parts = f.readline().strip().split()
            class_id = int(parts[0])

            # YOLO polygon 형식 -> AABB로 변환
            poly_xys = [float(x) for x in parts[3:]]  # x1 y1 x2 y2 ...
            xs = poly_xys[::2]
            ys = poly_xys[1::2]

            xmin = min(xs)
            xmax = max(xs)
            ymin = min(ys)
            ymax = max(ys)

            x_center = (xmin + xmax) / 2
            y_center = (ymin + ymax) / 2
            width = xmax - xmin
            height = ymax - ymin

            bbox = torch.tensor([x_center, y_center, width, height], dtype=torch.float32)

        return img_tensor, torch.tensor(class_id), bbox, img.copy(), img_file

# -------------------------------
# Custom collate_fn
# -------------------------------
def collate_fn(batch):
    images, classes, bboxes, orig_imgs, img_names = zip(*batch)
    return (
        torch.stack(images),
        torch.tensor(classes),
        torch.stack(bboxes),
        orig_imgs,
        img_names
    )

# -------------------------------
# Training
# -------------------------------
def train(model, loader, optimizer, criterion_cls, criterion_bbox, device):
    model.train()
    total_loss = 0
    for img, cls, bbox, _, _ in loader:
        img, cls, bbox = img.to(device), cls.to(device), bbox.to(device)
        optimizer.zero_grad()
        out_cls, out_bbox = model(img)
        loss_cls = criterion_cls(out_cls, cls)
        loss_bbox = criterion_bbox(out_bbox, bbox)
        loss = loss_cls + loss_bbox * 10.0  # bbox loss에 가중치 부여
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

# -------------------------------
# Inference + Visualization
# -------------------------------
def infer_and_visualize(model, loader, device, output_dir="predicted", num_samples=5):
    os.makedirs(output_dir, exist_ok=True)
    model.eval()
    with torch.no_grad():
        for i, (img, cls, bbox, orig_imgs, img_names) in enumerate(loader):
            if i >= num_samples:
                break
            img = img.to(device)
            out_cls, out_bbox = model(img)
            pred_class = torch.argmax(out_cls, dim=1).item()
            pred_bbox = out_bbox[0].cpu().numpy()

            print(f"[{img_names[0]}] ➤ Predicted bbox: {pred_bbox}")

            orig_img = orig_imgs[0]
            img_name = img_names[0]
            draw = ImageDraw.Draw(orig_img)
            w, h = orig_img.size
            cx, cy, bw, bh = pred_bbox
            x1 = int((cx - bw / 2) * w)
            y1 = int((cy - bh / 2) * h)
            x2 = int((cx + bw / 2) * w)
            y2 = int((cy + bh / 2) * h)
            draw.rectangle([x1, y1, x2, y2], outline="red", width=3)
            draw.text((x1, y1), f"Class {pred_class}", fill="red")
            orig_img.save(os.path.join(output_dir, img_name))
            print(f"Saved: {os.path.join(output_dir, img_name)}")

In [3]:
# -------------------------------
# Main
# -------------------------------
config = Config("CNN_Object_Detection/configs/config.yml")
input_shape = tuple(config.get("model.input_shape"))
num_classes = config.get("model.num_classes")
batch_size = config.get("train.batch_size")
epochs = config.get("train.epochs")
lr = config.get("train.learning_rate")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNN(input_shape=input_shape, num_classes=num_classes).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion_cls = nn.CrossEntropyLoss()
criterion_bbox = nn.SmoothL1Loss()

base_path = "/home/kar/Projects/CNN_Object_Detection/Tank-detection-using-YOLO-8"
train_ds = YoloFolderDataset(f"{base_path}/train/images", f"{base_path}/train/labels", input_shape)
test_ds  = YoloFolderDataset(f"{base_path}/test/images",  f"{base_path}/test/labels",  input_shape)
valid_ds  = YoloFolderDataset(f"{base_path}/valid/images",  f"{base_path}/valid/labels",  input_shape)

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_loader  = DataLoader(test_ds, batch_size=1, shuffle=True, collate_fn=collate_fn)
valid_loader  = DataLoader(valid_ds, batch_size=1, shuffle=True, collate_fn=collate_fn)

In [4]:
from sklearn.metrics import (
    classification_report,
    precision_recall_fscore_support,
    confusion_matrix,
)
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import torch


def evaluate_model(model, loader, device="cuda" if torch.cuda.is_available() else "cpu", class_names=None, plot_confusion=False):
    model.eval()
    model = model.to(device)
    print(device)
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for img, cls, _, _, _ in loader:
            img = img.to(device)
            cls = cls.to(device)
            out_cls, _ = model(img)
            preds = torch.argmax(out_cls, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(cls.cpu().numpy())

    unique_labels = sorted(list(set(all_labels)))
    if class_names is None:
        class_names = [f"class_{i}" for i in unique_labels]

    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average="weighted", zero_division=0
    )
    report = classification_report(
        all_labels, all_preds, target_names=class_names, zero_division=0
    )

    print("\nClassification Evaluation")
    print(f"Precision : {precision:.4f}")
    print(f"Recall    : {recall:.4f}")
    print(f"F1-Score  : {f1:.4f}")
    print("\n" + report)

    # confusion matrix
    if plot_confusion:
        cm = confusion_matrix(all_labels, all_preds, labels=unique_labels)
        plt.figure(figsize=(6, 5))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                    xticklabels=class_names, yticklabels=class_names)
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.title("Confusion Matrix")
        plt.tight_layout()
        plt.show()


In [5]:
def save_model(model, path="model.pth"):
    torch.save(model.state_dict(), path)
    print(f"모델 저장 완료: {path}")

def load_model(model_class, path, input_shape, num_classes, device):
    model = model_class(input_shape=input_shape, num_classes=num_classes)
    model.load_state_dict(torch.load(path, map_location=device))
    model.to(device)
    model.eval()
    print(f"모델 로드 완료: {path}")
    return model

### Train and save model

In [6]:
for epoch in tqdm(range(epochs), desc="Training"):
    loss = train(model, train_loader, optimizer, criterion_cls, criterion_bbox, device)
    print(f"[Epoch {epoch+1}] Train Loss: {loss:.4f}")

save_model(model, "model.pth")
torch.save({
    "model_state": model.state_dict(),
    "optimizer_state": optimizer.state_dict()
}, "checkpoint.pth")

Training:   0%|          | 0/10 [00:03<?, ?it/s]


KeyboardInterrupt: 

### Load and infer model

In [7]:
model = load_model(CNN, "model.pth", input_shape, num_classes, device)
checkpoint = torch.load("checkpoint.pth", map_location=device)
model.load_state_dict(checkpoint["model_state"])
optimizer.load_state_dict(checkpoint["optimizer_state"])

print("\nInference + Visualization...")
# infer_and_visualize(model, test_loader, device, output_dir="predicted", num_samples=20)
evaluate_model(model, valid_loader, device)

모델 로드 완료: model.pth

Inference + Visualization...
cuda

Classification Evaluation
Precision : 1.0000
Recall    : 1.0000
F1-Score  : 1.0000

              precision    recall  f1-score   support

     class_0       1.00      1.00      1.00        83

    accuracy                           1.00        83
   macro avg       1.00      1.00      1.00        83
weighted avg       1.00      1.00      1.00        83



In [8]:
import time

def measure_inference_speed(model, loader, device, warmup=5, num_batches=50):
    model.eval()
    total_time = 0
    total_images = 0

    # GPU warm-up
    with torch.no_grad():
        for i, (img, _, _, _, _) in enumerate(loader):
            if i >= warmup:
                break
            img = img.to(device)
            _ = model(img)

    # Inference
    with torch.no_grad():
        for i, (img, _, _, _, _) in enumerate(loader):
            if i >= num_batches:
                break
            img = img.to(device)

            start_time = time.time()
            _ = model(img)
            end_time = time.time()

            total_time += (end_time - start_time)
            total_images += img.size(0)

    fps = total_images / total_time if total_time > 0 else 0
    print(f"Inference Speed: {fps:.2f} FPS ({1000/fps:.2f} ms/frame)")
    return fps

In [9]:
base_path = "/home/kar/Projects/CNN_Object_Detection/Tank-detection-using-YOLO-8"
test_ds  = YoloFolderDataset(f"{base_path}/test/images",  f"{base_path}/test/labels",  input_shape)
test_loader = DataLoader(test_ds, batch_size=1, shuffle=False, collate_fn=collate_fn)

measure_inference_speed(model, test_loader, device, warmup=5, num_batches=50)

Inference Speed: 1088.81 FPS (0.92 ms/frame)


1088.8130876542632

In [10]:
odel = model.to("cpu")
dummy_input = torch.randn(1, 1, 640, 640)

torch.onnx.export(
    model,
    dummy_input,
    "improved_cnn.onnx",
    input_names=["input"],
    output_names=["class_logits", "bbox"],
    dynamic_axes={"input": {0: "batch_size"}},
    opset_version=11
)
print("✅ ONNX export 완료!")


✅ ONNX export 완료!


  if x.ndim != 4 or tuple(x.shape[1:]) != self.input_shape:


In [None]:
# import time
# import onnxruntime as ort
# import numpy as np

def measure_inference_speed_onnx(session, loader, warmup=5, num_batches=50):
    total_time = 0
    total_images = 0
    input_name = session.get_inputs()[0].name

    # warm-up
    for i, (img, _, _, _, _) in enumerate(loader):
        if i >= warmup:
            break
        session.run(None, {input_name: img.numpy()})

    # measure
    for i, (img, _, _, _, _) in enumerate(loader):
        if i >= num_batches:
            break

        start = time.time()
        session.run(None, {input_name: img.numpy()})
        end = time.time()

        total_time += (end - start)
        total_images += img.size(0)

    fps = total_images / total_time
    print(f"⚡ Inference Speed (ONNX): {fps:.2f} FPS ({1000/fps:.2f} ms/frame)")
    return fps



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/kar/Projects/CNN_Object_Detection/venv/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/kar/Projects/CNN_Object_Detection/venv/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
 

AttributeError: _ARRAY_API not found

ImportError: 

In [None]:
from sklearn.metrics import classification_report, precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

def evaluate_model_onnx(session, loader, class_names=None, plot_confusion=False):
    input_name = session.get_inputs()[0].name
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for img, cls, _, _, _ in loader:
            logits, _ = session.run(None, {input_name: img.numpy()})
            preds = np.argmax(logits, axis=1)
            all_preds.extend(preds)
            all_labels.extend(cls.numpy())

    unique_labels = sorted(list(set(all_labels)))
    if class_names is None:
        class_names = [f"class_{i}" for i in unique_labels]

    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average="weighted", zero_division=0)
    report = classification_report(all_labels, all_preds, target_names=class_names, zero_division=0)

    print("\n📊 ONNX Classification Evaluation")
    print(f"Precision : {precision:.4f}")
    print(f"Recall    : {recall:.4f}")
    print(f"F1-Score  : {f1:.4f}")
    print("\n" + report)

    if plot_confusion:
        cm = confusion_matrix(all_labels, all_preds, labels=unique_labels)
        plt.figure(figsize=(6, 5))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                    xticklabels=class_names, yticklabels=class_names)
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.title("🧩 ONNX Confusion Matrix")
        plt.tight_layout()
        plt.show()


In [None]:
from onnxruntime.quantization import quantize_dynamic, QuantType
import onnxruntime as ort

# GPU 세션
session = ort.InferenceSession("improved_cnn.onnx", providers=["CUDAExecutionProvider"])

import onnxruntime as ort
print(ort.get_available_providers())

# 평가
evaluate_model_onnx(session, test_loader, class_names=["tank"])
measure_inference_speed_onnx(session, test_loader)

  """


['AzureExecutionProvider', 'CPUExecutionProvider']

📊 ONNX Classification Evaluation
Precision : 1.0000
Recall    : 1.0000
F1-Score  : 1.0000

              precision    recall  f1-score   support

        tank       1.00      1.00      1.00        50

    accuracy                           1.00        50
   macro avg       1.00      1.00      1.00        50
weighted avg       1.00      1.00      1.00        50

⚡ Inference Speed (ONNX): 43.15 FPS (23.18 ms/frame)


43.14508803984285

In [None]:
from onnxruntime.quantization import quantize_dynamic, QuantType
import onnxruntime as ort

# GPU 세션
session = ort.InferenceSession("improved_cnn.onnx", providers=["CUDAExecutionProvider"])

import onnxruntime as ort
print(ort.get_available_providers())

# 평가
evaluate_model_onnx(session, test_loader, class_names=["tank"])
measure_inference_speed_onnx(session, test_loader)

  """


['AzureExecutionProvider', 'CPUExecutionProvider']

📊 ONNX Classification Evaluation
Precision : 1.0000
Recall    : 1.0000
F1-Score  : 1.0000

              precision    recall  f1-score   support

        tank       1.00      1.00      1.00        50

    accuracy                           1.00        50
   macro avg       1.00      1.00      1.00        50
weighted avg       1.00      1.00      1.00        50

⚡ Inference Speed (ONNX): 40.28 FPS (24.83 ms/frame)


40.28092806524384