# Library Import

In [2]:
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
import numpy as np
import cv2
import os

import albumentations as A
from albumentations.pytorch import ToTensorV2

import torch
# library that includes the Faster R-CNN model
import torchvision

from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

from torch.utils.data import DataLoader, Dataset
import pandas as pd
from tqdm import tqdm

import mlflow
import mlflow.pytorch
from mlflow.models import infer_signature

# Dataset Creation

In [3]:
# Sanity check: read the training annotations and look at the labels of image 0.
annotation = '../../dataset/train.json'
coco = COCO(annotation)

# Resolve image 0 and fetch every annotation attached to it.
image_id = coco.getImgIds(imgIds=0)
image_info = coco.loadImgs(image_id)[0]
ann_ids = coco.getAnnIds(imgIds=image_info['id'])
anns = coco.loadAnns(ann_ids)

# Category ids are shifted by one before being turned into an int64 tensor.
labels = torch.as_tensor(
    np.array([ann['category_id'] + 1 for ann in anns]),
    dtype=torch.int64,
)

labels[:5]

loading annotations into memory...
Done (t=0.08s)
creating index...
index created!


tensor([1])

In [4]:
class CustomDataset(Dataset):
    '''
    COCO-format object detection dataset for torchvision's Faster R-CNN.

      annotation: path to the COCO annotation json file
      data_dir: folder path where the data (images) exists
      transforms: data transform (resize, crop, Totensor, etc,,,), called with
                  the keyword arguments image / bboxes / labels

    Each item is (image, target, image_id). ``target`` is a dict with
    ``boxes`` (float32 tensor of shape (N, 4) in x_min, y_min, x_max, y_max
    order), ``labels``, ``image_id``, ``area`` and ``iscrowd``.

    The dataset index is passed straight to the COCO API as the image id, so
    the annotation file must use image ids 0 .. len(dataset) - 1.
    '''

    def __init__(self, annotation, data_dir, transforms=None):
        super().__init__()
        self.data_dir = data_dir
        # load the coco annotation (coco API)
        self.coco = COCO(annotation)
        self.predictions = {
            "images": self.coco.dataset["images"].copy(),
            "categories": self.coco.dataset["categories"].copy(),
            "annotations": None
        }
        self.transforms = transforms

    def __getitem__(self, index: int):
        """Load one image and its annotations.

        Raises:
            FileNotFoundError: if the image file cannot be read.
        """
        # The index doubles as the COCO image id (see class docstring).
        image_id = self.coco.getImgIds(imgIds=index)
        image_info = self.coco.loadImgs(image_id)[0]

        image_path = os.path.join(self.data_dir, image_info['file_name'])
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None instead of raising for a missing or
            # unreadable file; fail here with a message naming the path.
            raise FileNotFoundError(f"Could not read image: {image_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
        image /= 255.0

        ann_ids = self.coco.getAnnIds(imgIds=image_info['id'])
        anns = self.coco.loadAnns(ann_ids)

        # reshape keeps the (N, 4) layout even when the image has no
        # annotations, so the column arithmetic below never fails.
        boxes = np.array([x['bbox'] for x in anns], dtype=np.float64).reshape(-1, 4)

        # COCO (x_min, y_min, width, height) -> (x_min, y_min, x_max, y_max)
        boxes[:, 2] = boxes[:, 0] + boxes[:, 2]
        boxes[:, 3] = boxes[:, 1] + boxes[:, 3]

        # torchvision faster_rcnn treats label=0 as background,
        # so shift class_id to 1~10
        labels = np.array([x['category_id'] + 1 for x in anns])
        labels = torch.as_tensor(labels, dtype=torch.int64)

        areas = np.array([x['area'] for x in anns])
        areas = torch.as_tensor(areas, dtype=torch.float32)

        is_crowds = np.array([x['iscrowd'] for x in anns])
        is_crowds = torch.as_tensor(is_crowds, dtype=torch.int64)

        # Boxes are always stored as a tensor so every target entry supports
        # .to(device), with or without a transform.
        target = {'boxes': torch.as_tensor(boxes, dtype=torch.float32),
                  'labels': labels,
                  'image_id': torch.tensor([index]),
                  'area': areas,
                  'iscrowd': is_crowds}

        # transform
        if self.transforms:
            sample = {
                'image': image,
                'bboxes': boxes,
                'labels': labels
            }
            sample = self.transforms(**sample)
            image = sample['image']
            # reshape: an empty box list would otherwise become shape (0,)
            target['boxes'] = torch.tensor(sample['bboxes'], dtype=torch.float32).reshape(-1, 4)

        return image, target, image_id

    def __len__(self) -> int:
        return len(self.coco.getImgIds())

In [5]:
def get_train_transform():
    """Build the training augmentation pipeline.

    Resizes to 1024x1024, flips with probability 0.5 and converts the image
    to a tensor. Boxes are in pascal_voc format and tied to the 'labels' field.
    """
    steps = [
        A.Resize(1024, 1024),
        A.Flip(p=0.5),
        ToTensorV2(p=1.0),
    ]
    box_config = {'format': 'pascal_voc', 'label_fields': ['labels']}
    return A.Compose(steps, bbox_params=box_config)


def get_valid_transform():
    """Build the validation pipeline: tensor conversion only.

    Boxes are in pascal_voc format and tied to the 'labels' field.
    """
    steps = [ToTensorV2(p=1.0)]
    box_config = {'format': 'pascal_voc', 'label_fields': ['labels']}
    return A.Compose(steps, bbox_params=box_config)

# Util Functions

In [6]:
class Averager:
    """Running mean of the values passed to ``send`` since the last reset."""

    def __init__(self):
        self.current_total = 0.0
        self.iterations = 0.0

    def send(self, value):
        """Add ``value`` to the running total and count one more sample."""
        self.current_total += value
        self.iterations += 1

    @property
    def value(self):
        """Mean of the values sent so far, or ``0`` when nothing was sent."""
        if self.iterations != 0:
            return 1.0 * self.current_total / self.iterations
        return 0

    def reset(self):
        """Drop everything accumulated so far."""
        self.current_total, self.iterations = 0.0, 0.0


def collate_fn(batch):
    """Transpose a batch of samples into one tuple per sample field.

    ``[(a1, b1), (a2, b2)]`` becomes ``((a1, a2), (b1, b2))``; nothing is
    stacked.
    """
    columns = zip(*batch)
    return tuple(columns)

# Trainer

In [7]:
def train_fn(num_epochs, train_data_loader, optimizer, model, device):
    """Train a detection model and log per-epoch mean losses to MLflow.

    Args:
        num_epochs: number of passes over ``train_data_loader``.
        train_data_loader: iterable yielding ``(images, targets, image_ids)``
            batches; every value in each target dict must support ``.to(device)``.
        optimizer: optimizer already bound to the model parameters.
        model: model whose training-mode forward returns a dict with the keys
            ``loss_classifier``, ``loss_box_reg``, ``loss_objectness`` and
            ``loss_rpn_box_reg``.
        device: device the images and targets are moved to.

    Returns:
        The same ``model`` object after training.

    Side effects:
        Whenever the mean epoch loss improves, writes the state dict to
        ``./checkpoints`` and logs both the model and that file to the active
        MLflow run.
    """
    # inf instead of a magic 1000 so the first epoch always counts as "best"
    best_loss = float('inf')
    total_loss_hist = Averager()
    # MLflow metric name -> (key in the model's loss dict, running average)
    component_hists = {
        "loss_cls": ("loss_classifier", Averager()),
        "loss_box_reg": ("loss_box_reg", Averager()),
        "loss_rpn_cls": ("loss_objectness", Averager()),
        "loss_rpn_loc": ("loss_rpn_box_reg", Averager()),
    }

    checkpoint_dir = './checkpoints'
    os.makedirs(checkpoint_dir, exist_ok=True)

    # The loss dict is only returned in training mode; do not rely on the
    # caller having left the model in that mode.
    model.train()

    for epoch in range(num_epochs):
        total_loss_hist.reset()
        for _, hist in component_hists.values():
            hist.reset()

        for images, targets, image_ids in tqdm(train_data_loader):

            # move the batch to the device for gpu computation
            images = [image.float().to(device) for image in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            # calculate loss
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            total_loss_hist.send(losses.item())
            for loss_key, hist in component_hists.values():
                hist.send(loss_dict[loss_key].item())

            # backward
            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

        epoch_loss = total_loss_hist.value

        print(f"Epoch #{epoch+1} loss: {epoch_loss}")

        metrics = {"total_loss": epoch_loss}
        for metric_name, (_, hist) in component_hists.items():
            metrics[metric_name] = hist.value
        mlflow.log_metrics(metrics, step=epoch)

        if epoch_loss < best_loss:
            best_loss = epoch_loss

            save_path = os.path.join(checkpoint_dir, f'faster_rcnn_torchvision_{epoch+1}.pth')
            torch.save(model.state_dict(), save_path)

            # save the model to MLflow
            mlflow.pytorch.log_model(model, f'faster_rcnn_torchvision_{epoch+1}.pth')
            mlflow.log_artifact(save_path)

    return model

# Main

In [8]:
def main(
    num_epochs=5,
    batch_size=16,
    learning_rate=0.005,
    momentum=0.9,
    weight_decay=0.0005,
    annotation='/data/ephemeral/home/workspace/dataset/train.json',
    data_dir='/data/ephemeral/home/workspace/dataset',
    tracking_uri="http://localhost:30280",
    experiment_name="Faster_RCNN_COCO",
):
    """Train Faster R-CNN on the COCO-format dataset and track the run in MLflow.

    Called without arguments it uses the original hard-coded configuration.

    Args:
        num_epochs, batch_size, learning_rate, momentum, weight_decay:
            training hyper-parameters (also logged as MLflow params).
        annotation: path to the COCO annotation json.
        data_dir: folder containing the images referenced by the annotations.
        tracking_uri: MLflow tracking server URI.
        experiment_name: MLflow experiment the run is recorded under.

    Any exception raised while the run is active is recorded on that run as
    the ``error`` param and printed; it is not propagated to the caller.
    """
    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment(experiment_name)

    try:
        with mlflow.start_run():
            try:
                # load the dataset
                train_dataset = CustomDataset(annotation, data_dir, get_train_transform())
                train_data_loader = DataLoader(
                    train_dataset,
                    batch_size=batch_size,
                    # reshuffle every epoch so SGD does not see the samples
                    # in the same fixed order each time
                    shuffle=True,
                    num_workers=0,
                    collate_fn=collate_fn
                )

                device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
                print(f"Using device: {device}")

                # load the torchvision model
                model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
                num_classes = 11  # number of classes = 10 + background
                # get number of input features for the classifier
                in_features = model.roi_heads.box_predictor.cls_score.in_features
                model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
                model.to(device)

                # optimizer
                params = [p for p in model.parameters() if p.requires_grad]
                optimizer = torch.optim.SGD(params, lr=learning_rate, momentum=momentum,
                                            weight_decay=weight_decay)

                # log the hyper-parameters to MLflow
                mlflow.log_params({
                    "num_epochs": num_epochs,
                    "batch_size": batch_size,
                    "learning_rate": learning_rate,
                    "momentum": momentum,
                    "weight_decay": weight_decay
                })

                # training
                trained_model = train_fn(num_epochs, train_data_loader, optimizer, model, device)

                # log the final result
                final_dir = './final_models'
                os.makedirs(final_dir, exist_ok=True)
                final_model_path = os.path.join(final_dir, "final_model.pth")
                torch.save(trained_model.state_dict(), final_model_path)
                mlflow.log_artifact(final_model_path)

                # model signature
                trained_model.eval()

                # One image is enough to infer the signature; take it from the
                # dataset directly instead of loading a whole batch.
                sample_input = train_dataset[0][0].unsqueeze(0).to(device)
                with torch.no_grad():
                    sample_output = trained_model(sample_input)

                # build the signature
                input_sample = sample_input.cpu().numpy()
                output_sample = {k: v.cpu().numpy() for k, v in sample_output[0].items()}
                signature = infer_signature(input_sample, output_sample)

                # log the final model
                mlflow.pytorch.log_model(trained_model, "final_model", signature=signature)
            except Exception as e:
                # Record the failure while this run is still active; logging
                # after the `with` block has closed it would implicitly start
                # a second run. Truncated because MLflow limits param value
                # length. Re-raising lets the `with` block mark the run failed.
                mlflow.log_param("error", str(e)[:250])
                raise
    except Exception as e:
        print(f"An error occured: {e}")
    finally:
        # No-op when the `with` block already ended the run.
        mlflow.end_run()
        

In [9]:
# Start training only when this code runs as the main program (e.g. as a
# notebook cell or script), not when it is imported.
if __name__ == '__main__':
    main()

loading annotations into memory...
Done (t=0.22s)
creating index...
index created!
Using device: cuda


100%|██████████| 306/306 [04:32<00:00,  1.12it/s]


Epoch #1 loss: 0.6454067307652211


100%|██████████| 306/306 [04:30<00:00,  1.13it/s]


Epoch #2 loss: 0.5096143619492163


100%|██████████| 306/306 [04:31<00:00,  1.13it/s]


Epoch #3 loss: 0.47500742285274994


100%|██████████| 306/306 [04:31<00:00,  1.13it/s]


Epoch #4 loss: 0.4526448973448448


100%|██████████| 306/306 [04:31<00:00,  1.13it/s]


Epoch #5 loss: 0.4319207645417039


2024/10/10 03:54:27 INFO mlflow.tracking._tracking_service.client: 🏃 View run delightful-robin-384 at: http://localhost:30280/#/experiments/383093648340972743/runs/1cb7a64f7c534a6597d2dec47d3ab095.
2024/10/10 03:54:27 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:30280/#/experiments/383093648340972743.
