## Faster R-CNN
- 데이터셋 다운로드만 한 상태로 loader 부분 코드 수정해서 실습 진행하면 되지만, 오래 걸려서 따로 해볼 것

In [3]:
# 필요한 파일들 다운로드
!wget http://images.cocodataset.org/zips/train2017.zip
!wget http://images.cocodataset.org/zips/val2017.zip
!wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip

--2025-05-22 10:48:28--  http://images.cocodataset.org/zips/train2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 16.15.185.235, 3.5.28.57, 52.216.60.193, ...
Connecting to images.cocodataset.org (images.cocodataset.org)|16.15.185.235|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19336861798 (18G) [application/zip]
Saving to: ‘train2017.zip’


2025-05-22 10:54:51 (48.2 MB/s) - ‘train2017.zip’ saved [19336861798/19336861798]

--2025-05-22 10:54:51--  http://images.cocodataset.org/zips/val2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 16.15.200.222, 52.217.124.41, 16.15.177.183, ...
Connecting to images.cocodataset.org (images.cocodataset.org)|16.15.200.222|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 815585330 (778M) [application/zip]
Saving to: ‘val2017.zip’


2025-05-22 10:55:06 (52.9 MB/s) - ‘val2017.zip’ saved [815585330/815585330]

--2025-05-22 10:55:06--  http://images.cocodataset.or

In [4]:
import zipfile
import os

# 경로 설정
dataset_dir = '/content/drive/MyDrive/cbnu/datasets/12/coco'
train_dir = os.path.join(dataset_dir, 'train2017')
val_dir = os.path.join(dataset_dir, 'val2017')
annotations_dir = os.path.join(dataset_dir, 'annotations')

# 디렉터리 생성
os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)
os.makedirs(annotations_dir, exist_ok=True)

# 압축 해제
with zipfile.ZipFile('train2017.zip', 'r') as zip_ref:
    zip_ref.extractall(train_dir)

with zipfile.ZipFile('val2017.zip', 'r') as zip_ref:
    zip_ref.extractall(val_dir)

with zipfile.ZipFile('annotations_trainval2017.zip', 'r') as zip_ref:
    zip_ref.extractall(annotations_dir)


In [5]:
import os
import torch
from PIL import Image
from pycocotools.coco import COCO
from torch.utils.data import Dataset


class COCODataset(Dataset):
    def __init__(self, root, train, transform=None):
        super().__init__()
        directory = "train2017" if train else "val2017"
        annotations = os.path.join(root, "annotations", f"instances_{directory}.json")

        self.coco = COCO(annotations)
        self.image_path = os.path.join(root, directory)
        self.transform = transform

        self.categories = self._get_categories()
        self.data = self._load_data()

    def _get_categories(self):
        categories = {0: "background"}
        for category in self.coco.cats.values():
            categories[category["id"]] = category["name"]
        return categories

    def _load_data(self):
        data = []
        for _id in self.coco.imgs:
            file_name = self.coco.loadImgs(_id)[0]["file_name"]
            image_path = os.path.join(self.image_path, file_name)
            image = Image.open(image_path).convert("RGB")

            boxes = []
            labels = []
            anns = self.coco.loadAnns(self.coco.getAnnIds(_id))
            for ann in anns:
                x, y, w, h = ann["bbox"]

                boxes.append([x, y, x + w, y + h])
                labels.append(ann["category_id"])

            target = {
            "image_id": torch.LongTensor([_id]),
                "boxes": torch.FloatTensor(boxes),
                "labels": torch.LongTensor(labels)
            }
            data.append([image, target])
        return data

    def __getitem__(self, index):
        image, target = self.data[index]
        if self.transform:
            image = self.transform(image)
        return image, target

    def __len__(self):
        return len(self.data)

In [7]:
from torchvision import transforms
from torch.utils.data import DataLoader


def collator(batch):
    return tuple(zip(*batch))

transform = transforms.Compose(
    [
        transforms.PILToTensor(),
        transforms.ConvertImageDtype(dtype=torch.float)
    ]
)

# COCO 데이터셋 경로 수정
dataset_root = "/content/drive/MyDrive/cbnu/datasets/12/coco"

# train과 test 데이터셋 경로를 dataset_root로 수정하여 인스턴스화
train_dataset = COCODataset(dataset_root, train=True, transform=transform)
test_dataset = COCODataset(dataset_root, train=False, transform=transform)

# DataLoader 설정
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collator)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=True, drop_last=True, collate_fn=collator)

loading annotations into memory...
Done (t=34.71s)
creating index...
index created!


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/cbnu/datasets/12/coco/train2017/000000391895.jpg'

In [None]:
from torchvision import models
from torchvision import ops
from torchvision.models.detection import rpn
from torchvision.models.detection import FasterRCNN



backbone = models.vgg(weights="VGG16_Weights.IMAGENET1K_V1").features
backbone.out_channels = 512

anchor_generator = rpn.AnchorGenerator(
    sizes=((32, 64, 128, 256, 512),),
    aspect_ratios=((0.5, 1.0, 2.0),)
)
roi_pooler = ops.MultiScaleRoIAlign(
    featmap_names=["0"],
    output_size=(7, 7),
    sampling_ratio=2
)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = FasterRCNN(
    backbone=backbone,
    num_classes=3,
    rpn_anchor_generator=anchor_generator,
    box_roi_pool=roi_pooler
).to(device)

In [None]:
from torch import optim


params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.SGD(params, lr=0.001, momentum=0.9, weight_decay=0.0005)
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

In [None]:
for epoch in range(1):
    cost = 0.0
    for idx, (images, targets) in enumerate(train_dataloader):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]


        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        cost += losses

    lr_scheduler.step()
    cost = cost / len(train_dataloader)
    print(f"Epoch : {epoch+1:4d}, Cost : {cost:.3f}")

In [None]:
import numpy as np
from PIL import Image
from matplotlib import pyplot as plt
from torchvision.transforms.functional import to_pil_image


def draw_bbox(ax, box, text, color):
    ax.add_patch(
        plt.Rectangle(
            xy=(box[0], box[1]),
            width=box[2] - box[0],
            height=box[3] - box[1],
            fill=False,
            edgecolor=color,
            linewidth=2,
        )
    )
    ax.annotate(
        text=text,
        xy=(box[0] - 5, box[1] - 5),
        color=color,
        weight="bold",
        fontsize=13,
    )

threshold = 0.5
categories = test_dataset.categories
with torch.no_grad():
    model.eval()
    for images, targets in test_dataloader:
        images = [image.to(device) for image in images]
        outputs = model(images)

        boxes = outputs[0]["boxes"].to("cpu").numpy()
        labels = outputs[0]["labels"].to("cpu").numpy()
        scores = outputs[0]["scores"].to("cpu").numpy()

        boxes = boxes[scores >= threshold].astype(np.int32)
        labels = labels[scores >= threshold]
        scores = scores[scores >= threshold]

        fig = plt.figure(figsize=(8, 8))
        ax = fig.add_subplot(1, 1, 1)
        plt.imshow(to_pil_image(images[0]))

        for box, label, score in zip(boxes, labels, scores):
            draw_bbox(ax, box, f"{categories[label]} - {score:.4f}", "red")

        tboxes = targets[0]["boxes"].numpy()
        tlabels = targets[0]["labels"].numpy()
        for box, label in zip(tboxes, tlabels):
            draw_bbox(ax, box, f"{categories[label]}", "blue")

        plt.show()

## YOLOv8
- ultralytics 다운로드 필요

In [8]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.142-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.8.0->ultralytics)
  Downloading n

In [7]:
from ultralytics import YOLO


model = YOLO("yolov8n-pose.pt")

In [2]:
import cv2
from google.colab.patches import cv2_imshow


capture = cv2.VideoCapture("/content/drive/MyDrive/cbnu/datasets/12/woman.mp4")
while cv2.waitKey(10) < 0:
    if capture.get(cv2.CAP_PROP_POS_FRAMES) == capture.get(cv2.CAP_PROP_FRAME_COUNT):
        capture.set(cv2.CAP_PROP_POS_FRAMES, 0)

    ret, frame = capture.read()
    #cv2_imshow(frame)

capture.release()
cv2.destroyAllWindows()

Output hidden; open in https://colab.research.google.com to view.

In [3]:
import torch


def predict(frame, iou=0.7, conf=0.25):
    results = model(
        source=frame,
        device="0" if torch.cuda.is_available() else "cpu",
        iou=0.7,
        conf=0.25,
        verbose=False,
    )
    result = results[0]
    return result

In [4]:
def draw_boxes(result, frame):
    for boxes in result.boxes:
        x1, y1, x2, y2, score, classes = boxes.data.squeeze().cpu().numpy()
        cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 0, 255), 1)
    return frame

In [4]:
import cv2
from google.colab.patches import cv2_imshow


capture = cv2.VideoCapture("/content/drive/MyDrive/cbnu/datasets/12/woman.mp4")
while cv2.waitKey(10) < 0:
    if capture.get(cv2.CAP_PROP_POS_FRAMES) == capture.get(cv2.CAP_PROP_FRAME_COUNT):
        capture.set(cv2.CAP_PROP_POS_FRAMES, 0)

    ret, frame = capture.read()
    result = predict(frame)
    frame = draw_boxes(result, frame)

    #cv2_imshow(frame)

capture.release()
cv2.destroyAllWindows()

Output hidden; open in https://colab.research.google.com to view.

In [5]:
from ultralytics.utils.plotting import Annotator


def draw_keypoints(result, frame):
    annotator = Annotator(frame, line_width=1)
    for kps in result.keypoints:
        kps = kps.data.squeeze()
        annotator.kpts(kps)

        nkps = kps.cpu().numpy()
        # nkps[:,2] = 1
        # annotator.kpts(nkps)
        for idx, (x, y, score) in enumerate(nkps):
            if score > 0.5:
                cv2.circle(frame, (int(x), int(y)), 3, (0, 0, 255), cv2.FILLED)
                cv2.putText(frame, str(idx), (int(x), int(y)), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 255), 1)

    return frame

In [8]:
import cv2
from google.colab.patches import cv2_imshow

capture = cv2.VideoCapture("/content/drive/MyDrive/cbnu/datasets/12/woman.mp4")
while cv2.waitKey(10) < 0:
    if capture.get(cv2.CAP_PROP_POS_FRAMES) == capture.get(cv2.CAP_PROP_FRAME_COUNT):
        capture.set(cv2.CAP_PROP_POS_FRAMES, 0)

    ret, frame = capture.read()
    result = predict(frame)
    frame = draw_keypoints(result, frame)
    cv2_imshow(frame)

capture.release()
cv2.destroyAllWindows()

Output hidden; open in https://colab.research.google.com to view.

## Mask R-CNN
- COCO 데이터셋 경로 수정해서 실습 해야 함

In [4]:
import os
import torch
import numpy as np
from PIL import Image
from pycocotools.coco import COCO
from torch.utils.data import Dataset
from pycocotools import mask as maskUtils


class COCODataset(Dataset):
    def __init__(self, root, train, transform=None):
        super().__init__()
        directory = "train" if train else "val"
        annotations = os.path.join("/content/drive/MyDrive/cbnu/datasets/12/coco", "annotations", f"{directory}_annotations.json")

        self.coco = COCO(annotations)
        self.iamge_path = os.path.join(root, directory)
        self.transform = transform

        self.categories = self._get_categories()
        self.data = self._load_data()

    def _get_categories(self):
        categories = {0: "background"}
        for category in self.coco.cats.values():
            categories[category["id"]] = category["name"]
        return categories

    def _load_data(self):
        data = []
        for _id in self.coco.imgs:
            file_name = self.coco.loadImgs(_id)[0]["file_name"]
            image_path = os.path.join(self.iamge_path, file_name)
            image = Image.open(image_path).convert("RGB")
            width, height = image.size

            boxes = []
            labels = []
            masks = []
            anns = self.coco.loadAnns(self.coco.getAnnIds(_id))
            for ann in anns:
                x, y, w, h = ann["bbox"]
                segmentations = ann["segmentation"]
                try:
                    mask = self._polygon_to_mask(segmentations, width, height)
                except Exception as e:
                    pass

                boxes.append([x, y, x + w, y + h])
                labels.append(ann["category_id"])
                masks.append(mask)

            target = {
            "image_id": torch.LongTensor([_id]),
                "boxes": torch.FloatTensor(boxes),
                "labels": torch.LongTensor(labels),
                "masks": torch.FloatTensor(masks)
            }
            data.append([image, target])
        return data

    def _polygon_to_mask(self, segmentations, width, height):
        binary_mask = []
        for seg in segmentations:
            rles = maskUtils.frPyObjects([seg], height, width)
            binary_mask.append(maskUtils.decode(rles))

        combined_mask = np.sum(binary_mask, axis=0).squeeze()
        return combined_mask

    def __getitem__(self, index):
        image, target = self.data[index]
        if self.transform:
            image = self.transform(image)
        return image, target

    def __len__(self):
        return len(self.data)

In [5]:
from torchvision import transforms
from torch.utils.data import DataLoader


def collator(batch):
    return tuple(zip(*batch))

transform = transforms.Compose(
    [
        transforms.PILToTensor(),
        transforms.ConvertImageDtype(dtype=torch.float)
    ]
)

# COCO 데이터셋 경로 수정
dataset_root = "/content/drive/MyDrive/cbnu/datasets/12/coco"

# train과 test 데이터셋 경로를 dataset_root로 수정하여 인스턴스화
train_dataset = COCODataset(dataset_root, train=True, transform=transform)
test_dataset = COCODataset(dataset_root, train=False, transform=transform)

train_dataloader = DataLoader(
    train_dataset, batch_size=4, shuffle=True, drop_last=True, collate_fn=collator
)
test_dataloader = DataLoader(
    test_dataset, batch_size=1, shuffle=True, drop_last=True, collate_fn=collator
)

loading annotations into memory...


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/cbnu/datasets/12/coco/annotations/train_annotations.json'

In [None]:
from torchvision.models.detection import maskrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor


num_classes = 3
hidden_layer = 256
device = "cuda" if torch.cuda.is_available() else "cpu"
model = maskrcnn_resnet50_fpn(weights="DEFAULT")

model.roi_heads.box_predictor = FastRCNNPredictor(
    in_channels=model.roi_heads.box_predictor.cls_score.in_features,
    num_classes=num_classes
)
model.roi_heads.mask_predictor = MaskRCNNPredictor(
    in_channels=model.roi_heads.mask_predictor.conv5_mask.in_channels,
    dim_reduced=hidden_layer,
    num_classes=num_classes
)
model.to(device)

In [None]:
from torch import optim


params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

In [None]:
for epoch in range(1):
    cost = 0.0
    for idx, (images, targets) in enumerate(train_dataloader):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        cost += losses

    lr_scheduler.step()
    cost = cost / len(train_dataloader)
    print(f"Epoch : {epoch+1:4d}, Cost : {cost:.3f}")

In [None]:
import numpy as np
from matplotlib import pyplot as plt
from torchvision.transforms.functional import to_pil_image


def draw_bbox(ax, box, text, color, mask):
    ax.add_patch(
        plt.Rectangle(
            xy=(box[0], box[1]),
            width=box[2] - box[0],
            height=box[3] - box[1],
            fill=False,
            edgecolor=color,
            linewidth=2,
        )
    )
    ax.annotate(
        text=text,
        xy=(box[0] - 5, box[1] - 5),
        color=color,
        weight="bold",
        fontsize=13,
    )

    mask = np.ma.masked_where(mask == 0, mask)
    mask_color = {"blue": "Blues", "red" : "Reds"}

    cmap = plt.cm.get_cmap(mask_color.get(color, "Greens"))
    norm = plt.Normalize(vmin=0, vmax=1)
    rgba = cmap(norm(mask))
    ax.imshow(rgba, interpolation="nearest", alpha=0.3)

threshold = 0.5
categories = test_dataset.categories

with torch.no_grad():
    model.eval()
    for images, targets in test_dataloader:
        images = [image.to(device) for image in images]
        outputs = model(images)

        boxes = outputs[0]["boxes"].to("cpu").numpy()
        labels = outputs[0]["labels"].to("cpu").numpy()
        scores = outputs[0]["scores"].to("cpu").numpy()

        boxes = boxes[scores >= threshold].astype(np.int32)
        labels = labels[scores >= threshold]
        scores = scores[scores >= threshold]

        fig = plt.figure(figsize=(8, 8))
        ax = fig.add_subplot(1, 1, 1)
        plt.imshow(to_pil_image(images[0]))

        masks = outputs[0]["masks"].squeeze(1).to("cpu").numpy()
        masks[masks >= threshold] = 1.0
        masks[masks < threshold] = 0.0

        for box, mask, label, score in zip(boxes, masks, labels, scores):
            draw_bbox(ax, box, f"{categories[label]} - {score:.4f}", "red", mask)

        tboxes = targets[0]["boxes"].numpy()
        tmask = targets[0]["masks"].numpy()
        tlabels = targets[0]["labels"].numpy()

        for box, mask, label in zip(tboxes, tmask, tlabels):
            draw_bbox(ax, box, f"{categories[label]}", "blue", mask)

        plt.show()

In [None]:
import numpy as np
from pycocotools.cocoeval import COCOeval


with torch.no_grad():
    model.eval()
    coco_detections = []
    for images, targets in test_dataloader:
        images = [img.to(device) for img in images]
        outputs = model(images)

        for i in range(len(targets)):
            image_id = targets[i]["image_id"].data.cpu().numpy().tolist()[0]
            boxes = outputs[i]["boxes"].data.cpu().numpy()
            boxes[:, 2] = boxes[:, 2] - boxes[:, 0]
            boxes[:, 3] = boxes[:, 3] - boxes[:, 1]
            scores = outputs[i]["scores"].data.cpu().numpy()
            labels = outputs[i]["labels"].data.cpu().numpy()
            masks = outputs[i]["masks"].squeeze(1).data.cpu().numpy()

            for instance_id in range(len(boxes)):
                segmentation_mask = masks[instance_id]
                binary_mask = segmentation_mask > 0.5
                binary_mask = binary_mask.astype(np.uint8)
                binary_mask_encoded = maskUtils.encode(
                    np.asfortranarray(binary_mask)
                )

                prediction = {
                    "image_id": int(image_id),
                    "category_id": int(labels[instance_id]),
                    "bbox": [round(coord, 2) for coord in boxes[instance_id]],
                    "score": float(scores[instance_id]),
                    "segmentation": binary_mask_encoded
                }
                coco_detections.append(prediction)

    coco_gt = test_dataloader.dataset.coco
    coco_dt = coco_gt.loadRes(coco_detections)
    coco_evaluator = COCOeval(coco_gt, coco_dt, iouType="segm")
    coco_evaluator.evaluate()
    coco_evaluator.accumulate()
    coco_evaluator.summarize()