### Config

In [165]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import random

In [166]:
import yaml

class Config:
    """
    Loads configuration from the YAML file and provides access to constants.
    """

    def __init__(self, config_path="CNN_Object_Detection/configs/config.yml"):
        with open(config_path, "r") as file:
            self.config = yaml.safe_load(file)

    def get(self, key, default=None):
        """
        Retrieve a value from the YAML config file.

        Args:
            key (str): Configuration key in the format "section.key".
            default: Default value if the key is not found.

        Returns:
            Value from the config file or the default value.
        """
        keys = key.split(".")
        value = self.config
        for k in keys:
            value = value.get(k, {})
            if not isinstance(value, dict):
                return value
        return default

config = Config()

In [167]:
input_shape = tuple(config.get("model.input_shape"))
num_classes = config.get("model.num_classes")
batch_size = config.get("model.batch_size")
epochs = config.get("model.epochs")
lr = config.get("model.learning_rate")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


### Model

In [168]:
# CNN Model for 640x640 input images
class CNN(nn.Module):
    def __init__(self, input_shape, num_classes):
        super(CNN, self).__init__()
        # (Channels, H, W)
        self.input_shape = input_shape

        self.features = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2), # 320x320

            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2), # 160x160

            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d(1), # 1x1x64
        )

        # Fully connected heads
        self.classifier = nn.Linear(64, num_classes)
        self.bbox_regressor = nn.Linear(64, 4) # [x_center, y_center, width, height]

    def forward(self, x):
        assert x.shape[1:] == self.input_shape, \
            f"Expected input shape (B, {self.input_shape}), but got {x.shape}"
        
        x = self.features(x)
        x = x.view(x.size(0), -1)  # Flatten; (batch_size, 64)

        class_logits = self.classifier(x)
        bbox = torch.sigmoid(self.bbox_regressor(x))  # Normalization

        return class_logits, bbox

### Test

In [169]:
model = CNN(input_shape=input_shape, num_classes=10)
image = torch.randn(1, 1, 640, 640)  # Grayscale image [Batch, Channel, Height, Width]
# image = torch.randn(8, 3, 256, 256) # Colour image
class_logits, bbox = model(image)

In [170]:
class_logits, bbox

(tensor([[-0.1875,  0.1363, -0.0763, -0.0438,  0.1508,  0.0284, -0.0679, -0.0269, -0.0474,  0.0433]], grad_fn=<AddmmBackward0>),
 tensor([[0.5166, 0.4641, 0.5034, 0.5351]], grad_fn=<SigmoidBackward0>))

### Dataset

In [171]:
class ImageDataset(Dataset):
    def __init__(self, num_samples, input_shape, num_classes):
        self.num_samples = num_samples
        self.input_shape = input_shape
        self.num_classes = num_classes

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        image = torch.rand(self.input_shape)
        label = torch.randint(0, self.num_classes, (1,)).item()
        bbox = torch.rand(4)  # [x_center, y_center, width, height]
        return image, label, bbox

### Train

In [172]:
def train(model, dataloader, optimizer, criterion_cls, criterion_bbox):
    model.train()
    total_loss = 0
    for images, labels, bboxes in dataloader:
        images, labels, bboxes = images.to(device), labels.to(device), bboxes.to(device)

        optimizer.zero_grad()
        outputs, pred_bbox = model(images)

        loss_cls = criterion_cls(outputs, labels)
        loss_bbox = criterion_bbox(pred_bbox, bboxes)
        loss = loss_cls + loss_bbox
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

In [173]:
def evaluate(model, dataloader, criterion_cls, criterion_bbox):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels, bboxes in dataloader:
            images, labels, bboxes = images.to(device), labels.to(device), bboxes.to(device)
            outputs, pred_bbox = model(images)

            loss_cls = criterion_cls(outputs, labels)
            loss_bbox = criterion_bbox(pred_bbox, bboxes)
            loss = loss_cls + loss_bbox
            total_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
    acc = 100. * correct / total
    return total_loss / len(dataloader), acc

In [174]:
model = CNN(input_shape=input_shape, num_classes=num_classes).to(device)

train_data = ImageDataset(100, input_shape, num_classes)
test_data = ImageDataset(20, input_shape, num_classes)

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size)

optimizer = optim.Adam(model.parameters(), lr=lr)
criterion_cls = nn.CrossEntropyLoss()
criterion_bbox = nn.SmoothL1Loss()

# Train loop
for epoch in range(epochs):
    train_loss = train(model, train_loader, optimizer, criterion_cls, criterion_bbox)
    val_loss, acc = evaluate(model, test_loader, criterion_cls, criterion_bbox)
    print(f"Epoch {epoch+1}/{epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Acc: {acc:.2f}%")

Epoch 1/5 | Train Loss: 2.3590 | Val Loss: 2.3586 | Acc: 5.00%
Epoch 2/5 | Train Loss: 2.3412 | Val Loss: 2.3574 | Acc: 10.00%
Epoch 3/5 | Train Loss: 2.3419 | Val Loss: 2.3598 | Acc: 20.00%
Epoch 4/5 | Train Loss: 2.3473 | Val Loss: 2.3477 | Acc: 10.00%
Epoch 5/5 | Train Loss: 2.3466 | Val Loss: 2.3417 | Acc: 15.00%


### Infer

In [175]:
def infer(model, image):
    model.eval()
    with torch.no_grad():
        logits, bbox = model(image.unsqueeze(0).to(device))  # Add batch dim
        probs = torch.softmax(logits, dim=1)
        pred_class = torch.argmax(probs)
        return pred_class.item(), probs[0].cpu().numpy(), bbox[0].cpu().numpy()

In [176]:
# Inference test
sample_image, _, _ = test_data[random.randint(0, len(test_data)-1)]
pred_class, probs, bbox = infer(model, sample_image)
print("\n[Inference Result]")
print("Predicted class:", pred_class)
print("Class probabilities:", probs)
print("Bounding box:", bbox)


[Inference Result]
Predicted class: 6
Class probabilities: [   0.099593     0.10057     0.10417    0.094595     0.10301    0.098473     0.11019    0.091384    0.093159     0.10486]
Bounding box: [    0.51103     0.49356     0.50759     0.53333]


### Load Roboflow Dataset

In [177]:
# Ultralytics(YOLOv8) 설치
# !pip install ultralytics roboflow

from roboflow import Roboflow
rf = Roboflow(api_key="Ew4WmTF6TrvDqghB1sEe")
project = rf.workspace("tank-project").project("tank-detection-using-yolo")
version = project.version(8)
dataset = version.download("yolov8")
import os

base_path = dataset.location  # 다운로드된 데이터셋 경로

train_count = len(os.listdir(os.path.join(base_path, "train", "images")))
valid_count = len(os.listdir(os.path.join(base_path, "valid", "images")))
test_count = len(os.listdir(os.path.join(base_path, "test", "images")))

print("Train set size:", train_count)
print("Validation set size:", valid_count)
print("Test set size:", test_count)

loading Roboflow workspace...
loading Roboflow project...
Train set size: 586
Validation set size: 83
Test set size: 50


In [178]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    def __init__(self, input_shape=(1, 640, 640), num_classes=1):
        """
        - input_shape: (C, H, W) — e.g. (1, 640, 640)
        - num_classes: 분류할 클래스 수
        """
        super(CNN, self).__init__()
        self.input_shape = input_shape  # For input validation

        self.features = nn.Sequential(
            nn.Conv2d(in_channels=input_shape[0], out_channels=16, kernel_size=3, stride=1, padding=1),  # [B, 16, H, W]
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # ↓ [B, 16, H/2, W/2]

            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),  # ↓ [B, 32, H/4, W/4]

            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d(1),  # → [B, 64, 1, 1]
        )

        # Classification head
        self.classifier = nn.Linear(64, num_classes)

        # Bounding box head: [x_center, y_center, width, height]
        self.bbox_regressor = nn.Linear(64, 4)

    def forward(self, x):
        # Debug shape check
        assert x.shape[1:] == self.input_shape, \
            f"Expected input shape (B, {self.input_shape}), but got {x.shape}"

        x = self.features(x)  # [B, 64, 1, 1]
        x = x.view(x.size(0), -1)  # [B, 64]

        class_logits = self.classifier(x)         # [B, num_classes]
        bbox = torch.sigmoid(self.bbox_regressor(x))  # [B, 4] in range [0, 1]

        return class_logits, bbox


In [179]:
from torch.utils.data import Dataset
from torchvision import transforms
from PIL import Image
import os
import torch

class YoloFolderDataset(Dataset):
    def __init__(self, img_dir, label_dir, input_shape):
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.input_shape = input_shape  # e.g. (1, 640, 640)
        self.image_files = sorted(os.listdir(img_dir))

        # ⚠️ [1, H, W] 형태로 강제 변환되도록 구성
        self.transform = transforms.Compose([
            transforms.Resize(input_shape[1:]),   # (H, W)
            transforms.Grayscale(num_output_channels=1),  # 흑백 채널 1개
            transforms.ToTensor(),               # → shape: [1, H, W], float32, [0, 1] 범위
        ])

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        img_file = self.image_files[idx]
        label_file = img_file.replace(".jpg", ".txt").replace(".png", ".txt")

        img_path = os.path.join(self.img_dir, img_file)
        label_path = os.path.join(self.label_dir, label_file)

        # ✅ 이미지 로드 및 Tensor 변환
        img = Image.open(img_path)
        img_tensor = self.transform(img)  # → [1, H, W]

        # ✅ 라벨 로드
        with open(label_path, "r") as f:
            line = f.readline().strip().split()
            class_id = int(line[0])
            bbox = torch.tensor([float(x) for x in line[1:]], dtype=torch.float32)

        class_tensor = torch.tensor(class_id, dtype=torch.long)
        return img_tensor, class_tensor, bbox


In [180]:
base_path = "Tank-detection-using-YOLO-8"

train_dataset = YoloFolderDataset(
    img_dir=os.path.join(base_path, "train/images"),
    label_dir=os.path.join(base_path, "train/labels"),
    input_shape=(1, 640, 640)
)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)

In [181]:
def train(model, dataloader, optimizer, criterion_cls, criterion_bbox, device):
    model.train()
    total_loss = 0
    for images, labels, bboxes in dataloader:
        images, labels, bboxes = images.to(device), labels.to(device), bboxes.to(device)

        optimizer.zero_grad()
        outputs, pred_bbox = model(images)

        loss_cls = criterion_cls(outputs, labels)
        loss_bbox = criterion_bbox(pred_bbox, bboxes)
        loss = loss_cls + loss_bbox
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

In [182]:
import time

def infer_and_measure(model, loader, device, num_batches=10):
    model.eval()
    total_time = 0
    total_samples = 0

    with torch.no_grad():
        for i, (img, cls, bbox) in enumerate(loader):
            if i >= num_batches: break
            img = img.to(device)
            start = time.time()
            out_cls, out_bbox = model(img)
            end = time.time()
            total_time += (end - start)
            total_samples += img.size(0)

            pred = torch.argmax(torch.softmax(out_cls, dim=1), dim=1)
            print(f"[Sample {i}] Class: {pred[0].item()}  BBox: {out_bbox[0].cpu().numpy()}")

    fps = total_samples / total_time
    print(f"\n평균 FPS: {fps:.2f} Hz")

In [None]:
if __name__ == "__main__":
    input_shape = tuple(config.get("model.input_shape"))
    num_classes = config.get("model.num_classes")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 경로 설정
    base_path = "/home/kar/Projects/CNN_Object_Detection/Tank-detection-using-YOLO-8"  # Roboflow 다운로드 위치
    model = CNN(input_shape, num_classes).to(device)

    train_ds = YoloFolderDataset(f"{base_path}/train/images", f"{base_path}/train/labels", input_shape)
    test_ds = YoloFolderDataset(f"{base_path}/test/images", f"{base_path}/test/labels", input_shape)

    train_loader = DataLoader(train_ds, batch_size=config.get("batch_size"), shuffle=True)
    test_loader = DataLoader(test_ds, batch_size=1, shuffle=False)

    optimizer = torch.optim.Adam(model.parameters(), lr=config.get("model.learning_rate"))
    criterion_cls = nn.CrossEntropyLoss()
    criterion_bbox = nn.SmoothL1Loss()

    for epoch in range(config.get("model.epochs")):
        loss = train(model, train_loader, optimizer, criterion_cls, criterion_bbox, device)
        print(f"[Epoch {epoch+1}] Train Loss: {loss:.4f}")

    print("\n추론 및 FPS 측정 중...")
    infer_and_measure(model, test_loader, device)

AssertionError: Expected input shape (B, (1, 640, 640)), but got torch.Size([1, 640, 640])