In [1]:
# Clone the official YOLACT repository
!git clone https://github.com/dbolya/yolact.git
%cd yolact

# Install required packages
!pip install torch torchvision
!pip install opencv-python Pillow pycocotools matplotlib


fatal: destination path 'yolact' already exists and is not an empty directory.


/home/user/Downloads/Robotic/T5.2/onebook/yolact

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
import torch
from data.config import Config

# Describe the custom dataset (COCO-format annotations).
# Paths are relative to the yolact/ checkout the notebook changed into.
dataset_base = Config({
    'name': 'My Dataset',

    # Training images and annotations
    'train_images': '../my_dataset/train',
    'train_info':   '../my_dataset/train/_annotations.coco.json',

    # Validation images and annotations
    'valid_images': '../my_dataset/valid',
    'valid_info':   '../my_dataset/valid/_annotations.coco.json',

    # Class names, in category-id order, defined directly inside the config.
    'class_names': ('bench', 'chair', 'couch', 'dining table', 'laptop', 'person'),

    # YOLACT's COCO annotation transform reads `cfg.dataset.label_map`; upstream's
    # own dataset_base defines it, so a config built from scratch must define it too.
    # None selects the default mapping of category ids 1..len(class_names).
    # NOTE(review): if the annotation file uses other category ids (e.g. starting
    # at 0), supply an explicit {annotation_id: class_index_starting_at_1} dict.
    'label_map': None,

    'has_gt': True,
})

In [5]:
# The base YOLACT configuration
from data.config import yolact_base_config as base_config

# Create the custom model config: a copy of the base YOLACT config with the
# keys below overridden.
yolact_my_config = base_config.copy({
    'name': 'yolact_custom',
    
    # Use the custom dataset config defined in the earlier cell
    'dataset': dataset_base,
    
    # Number of dataset classes plus 1 for the background class.
    # With the six class names defined in dataset_base this is 6 + 1 = 7.
    'num_classes': len(dataset_base.class_names) + 1,

    # Total number of training iterations. Lower values train faster but give a less accurate model.
    # The default is 800000. 
    # 'max_iter': 10000,
    'max_iter': 200,
    
    # Iterations at which the learning rate is decayed; scaled down to fit the new max_iter
    # 'lr_steps': (5000, 8000, 9000),
    'lr_steps': (120, 160, 180),

    # Do not use an FPN (Feature Pyramid Network) for a very small dataset
    # 'fpn': None,
})

# Activate the custom config.
# The YOLACT modules read the shared `cfg` object that lives in data.config, so
# that object has to be updated in place. Binding a notebook-local `cfg` to
# yolact_my_config and replacing it with itself changes nothing the library sees,
# and training would silently run with the default config.
from data.config import cfg

def set_cfg(config_name:Config):
    """
    Make `config_name` the active YOLACT configuration.

    Args:
        config_name: the Config object to activate. The parameter name is kept
            for compatibility with upstream's set_cfg, but a Config object
            (not a string) is expected here.

    The shared `cfg` object is updated in place via `cfg.replace`, so every
    module that already imported it observes the new settings.
    """
    cfg.replace(config_name)

set_cfg(yolact_my_config)
print("Configuration is set.")

Configuration is set.


### For CPU environment

In [6]:
# This cell contains all the necessary functions and classes from the original scripts,
# adapted to run in a notebook and on a CPU.

# Imports from the YOLACT project
from data import *
from utils.augmentations import SSDAugmentation, BaseTransform
from utils.functions import MovingAverage, SavePath
from utils.logger import Log
from utils import timer
from layers.modules import MultiBoxLoss
from yolact import Yolact
import os
import sys
import time
import math, random
from pathlib import Path
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
import torch.backends.cudnn as cudnn
import torch.nn.init as init
import torch.utils.data as data
import numpy as np
import datetime
import eval as eval_script # Import the original eval script

# A simple class to mimic the command-line arguments
class Args:
    """Plain attribute bag standing in for the argparse namespace of train.py."""

    def __init__(self):
        # --- Device and data loading ---
        self.cuda = False # The key setting for CPU-only
        self.batch_size = 4 # Lowered for CPU training on a notebook
        self.num_workers = 0 # Use 0 for workers on Windows/Jupyter, or 2-4 on Linux
        self.batch_alloc = None
        self.autoscale = False # Disable autoscale since we are setting params manually

        # --- Optimizer hyper-parameters ---
        self.lr = 1e-3
        self.momentum = 0.9
        self.decay = 5e-4
        self.gamma = 0.1

        # --- Starting point ---
        self.resume = None
        self.start_iter = -1
        self.config = None # We set this manually above
        self.dataset = None

        # --- Checkpointing ---
        self.save_folder = 'weights/'
        self.save_interval = 2000
        self.keep_latest = True
        self.keep_latest_interval = 10000
        self.interrupt = True

        # --- Validation ---
        self.validation_size = 5000
        self.validation_epoch = 2

        # --- Logging ---
        self.log = True
        self.log_folder = 'logs/'
        self.log_gpu = False

args = Args()

# Learning rate currently in effect; set_lr() keeps this module-level value in sync
cur_lr = args.lr

# Choose the default tensor type: CUDA float tensors only when CUDA was both
# requested and is actually available, plain CPU float tensors otherwise
torch.set_default_tensor_type(
    'torch.cuda.FloatTensor' if (args.cuda and torch.cuda.is_available()) else 'torch.FloatTensor'
)

# The loss function wrapper from train.py
class NetLoss(nn.Module):
    """
    Bundle a network and its loss criterion into one module, so that a single
    forward call runs the network and returns the losses.
    """

    def __init__(self, net:Yolact, criterion:MultiBoxLoss):
        super().__init__()
        self.net = net
        self.criterion = criterion

    def forward(self, images, targets, masks, num_crowds):
        """Run the network on `images` and return whatever the criterion computes."""
        predictions = self.net(images)
        return self.criterion(self.net, predictions, targets, masks, num_crowds)

# Function to set learning rate
def set_lr(optimizer, new_lr):
    """
    Apply `new_lr` to every parameter group of `optimizer` and record it in the
    module-level `cur_lr`.
    """
    global cur_lr
    for group in optimizer.param_groups:
        group['lr'] = new_lr
    cur_lr = new_lr

# This is the corrected setup_eval function from train.py
def setup_eval():
    """
    Configure the eval script for validation runs: no progress bar, at most
    args.validation_size images, and CUDA switched off when training on CPU.
    """
    cli_flags = ['--no_bar', '--max_images=' + str(args.validation_size)]
    cpu_flags = [] if args.cuda else ['--cuda=False']
    eval_script.parse_args(cli_flags + cpu_flags)

# This is the corrected compute_validation_map function from train.py
def compute_validation_map(epoch, iteration, yolact_net, dataset, log:Log=None):
    """
    Evaluate `yolact_net` on `dataset` without gradients, optionally log the
    result, and put the network back into training mode.

    Args:
        epoch, iteration: current training position, forwarded to the logger.
        yolact_net: the network to evaluate.
        dataset: the validation dataset handed to the eval script.
        log: optional logger; when given, the validation info and the elapsed
            wall-clock time are recorded under the 'val' key.
    """
    with torch.no_grad():
        yolact_net.eval()

        t_begin = time.time()
        print()
        print("Computing validation mAP (this may take a while)...", flush=True)

        # Goes through eval_script.evaluate, i.e. whatever is currently installed there
        val_info = eval_script.evaluate(yolact_net, dataset, train_mode=True)

        seconds_taken = time.time() - t_begin
        if log is not None:
            log.log('val', val_info, elapsed=seconds_taken, epoch=epoch, iter=iteration)

        yolact_net.train()
        
print("Helper functions and classes are defined.")

RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [None]:
# This cell monkey-patches the loaded eval_script module to make it CPU-compatible.

# --- Fix 1: prep_display ---
# The original function has hardcoded .cuda() calls. We replace it with our fixed version.

# Grab the original functions from the module so we can still call them
# Handles to the eval script's own helpers, used by fixed_prep_display below
original_postprocess = eval_script.postprocess
original_undo_image_transformation = eval_script.undo_image_transformation

def fixed_prep_display(dets_out, img, h, w, undo_transform=True, class_color=False, mask_alpha=0.45, fps_str=''):
    """
    Render detections on top of an image and return the result as a uint8 numpy array.

    Tensors are moved to the GPU only when eval_script.args.cuda is set, so the
    function also works on CPU-only machines.

    Args:
        dets_out: raw network output, passed straight to the eval script's postprocess.
        img: the input image. With undo_transform=True it is handed to
            undo_image_transformation; otherwise it is treated as an array with
            values in 0-255 whose shape provides h and w.
        h, w: image height and width (overwritten from img.shape when undo_transform is False).
        undo_transform: whether to undo the network's input transformation first.
        class_color: pick colors per class instead of per detection index.
        mask_alpha: opacity used when blending masks over the image.
        fps_str: accepted for signature compatibility; not used in this function.

    Returns:
        The annotated image, scaled to 0-255 and converted to bytes.

    NOTE(review): `cv2` and `COLORS` are not imported explicitly in this notebook;
    presumably they arrive through `from data import *` — confirm.
    """
    if undo_transform:
        img_numpy = original_undo_image_transformation(img, w, h)
        display_tensor = torch.Tensor(img_numpy)
        if eval_script.args.cuda:
            display_tensor = display_tensor.cuda()
    else:
        display_tensor = torch.Tensor(img) / 255.0
        h, w, _ = img.shape
        if eval_script.args.cuda:
            display_tensor = display_tensor.cuda()

    with timer.env('Postprocess'):
        # Force rescore_bbox on for this postprocess call, then restore the previous value
        save = cfg.rescore_bbox
        cfg.rescore_bbox = True
        t = original_postprocess(dets_out, w, h, visualize_lincomb = eval_script.args.display_lincomb,
                                                 crop_masks        = eval_script.args.crop,
                                                 score_threshold   = eval_script.args.score_threshold)
        cfg.rescore_bbox = save

    with timer.env('Copy'):
        # t[1] holds the scores: keep the indices of the top_k highest-scoring detections
        idx = t[1].argsort(0, descending=True)[:eval_script.args.top_k]
        
        if cfg.eval_mask_branch:
            # Masks stay as tensors; they are blended on display_tensor's device below
            masks = t[3][idx]
        classes, scores, boxes = [x[idx].cpu().numpy() for x in t[:3]]

    # Scores are sorted in descending order, so stop at the first one below the threshold
    num_dets_to_consider = min(eval_script.args.top_k, classes.shape[0])
    for j in range(num_dets_to_consider):
        if scores[j] < eval_script.args.score_threshold:
            num_dets_to_consider = j
            break
    
    def get_color(j, on_gpu=None):
        # Color for detection j: indexed by its class when class_color is set, else by j.
        # When on_gpu (a device) is given, return a float tensor in [0, 1] on that device;
        # otherwise return the raw COLORS entry.
        color_idx = (classes[j] * 5 if class_color else j * 5) % len(COLORS)
        color = COLORS[color_idx]
        if on_gpu is not None:
            color = torch.Tensor(color).to(on_gpu).float() / 255.
        return color

    if eval_script.args.display_masks and cfg.eval_mask_branch and num_dets_to_consider > 0:
        # Add a trailing channel axis so the masks broadcast against the 3-channel colors
        masks = masks[:num_dets_to_consider, :, :, None]
        colors = torch.cat([get_color(j, on_gpu=display_tensor.device).view(1, 1, 1, 3) for j in range(num_dets_to_consider)], dim=0)
        masks_color = masks.repeat(1, 1, 1, 3) * colors * mask_alpha
        # 1 - alpha inside a mask, 1 outside it
        inv_alph_masks = masks * (-mask_alpha) + 1
        
        # Composite the masks in order: each later mask's color is attenuated by the
        # cumulative product of the inverse-alpha masks of the masks before it
        masks_color_summand = masks_color[0]
        if num_dets_to_consider > 1:
            inv_alph_cumul = inv_alph_masks[:(num_dets_to_consider-1)].cumprod(dim=0)
            masks_color_cumul = masks_color[1:] * inv_alph_cumul
            # In-place add: this also writes into masks_color[0]
            masks_color_summand += masks_color_cumul.sum(dim=0)

        display_tensor = display_tensor * inv_alph_masks.prod(dim=0) + masks_color_summand
    
    img_numpy = (display_tensor * 255).byte().cpu().numpy()
    
    # Only bounding boxes are drawn below. display_text gates the loop but no text is
    # rendered, and `score` is assigned without being used.
    # (The original drawing code from eval.py can be copied here if needed for display)
    if num_dets_to_consider > 0:
        if eval_script.args.display_text or eval_script.args.display_bboxes:
            # Iterate in reverse so the highest-scoring boxes are drawn last (on top)
            for j in reversed(range(num_dets_to_consider)):
                x1, y1, x2, y2 = boxes[j, :]
                color = get_color(j)
                score = scores[j]
                if eval_script.args.display_bboxes:
                    cv2.rectangle(img_numpy, (x1, y1), (x2, y2), color, 1)

    return img_numpy


# --- Fix 2: prep_metrics ---
# Keep a reference to the module's current prep_metrics so the wrapper below can delegate to it.
# NOTE(review): fixed_evaluate installs the wrapper into eval_script.prep_metrics, so re-running
# this cell after an evaluation captures the wrapper itself here; restart the kernel before re-running.
original_prep_metrics = eval_script.prep_metrics

def _to_cpu(obj):
    """Return `obj` with every tensor nested inside dicts/lists/tuples moved to the CPU."""
    if torch.is_tensor(obj):
        return obj.cpu()
    if isinstance(obj, dict):
        return {key: _to_cpu(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [_to_cpu(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(_to_cpu(item) for item in obj)
    # Anything else (None, numbers, the network object, ...) is passed through untouched
    return obj


def fixed_prep_metrics(ap_data, dets, img, gt, gt_masks, h, w, num_crowd, image_id, detections:"eval_script.Detections"=None):
    """
    CPU-safe wrapper around the eval script's prep_metrics.

    Moves every tensor found in `dets` to the CPU and then delegates to the
    original function with all other arguments unchanged.

    `dets` may be a list of bare tensors or a list of (nested) dicts holding
    tensors next to non-tensor entries, so the conversion walks containers
    instead of calling `.cpu()` on each element directly.

    The `detections` annotation is a string because `Detections` is defined in
    the eval script and is not a name in the notebook namespace; evaluating it
    at definition time would raise NameError.

    Returns:
        Whatever the original prep_metrics returns.
    """
    dets = _to_cpu(dets)
    return original_prep_metrics(ap_data, dets, img, gt, gt_masks, h, w, num_crowd, image_id, detections)

# --- Fix 3: The main evaluate function ---
# Resolve the eval script's own evaluate, even when this cell is re-run.
# After the first run eval_script.evaluate is the wrapper defined below; capturing
# that wrapper as "original" would make it call itself without end. The wrapper
# therefore records what it wraps in `_original`, and that is unwrapped here.
original_evaluate = getattr(eval_script.evaluate, '_original', eval_script.evaluate)

def fixed_evaluate(net:Yolact, dataset, train_mode=False):
    """
    Device-aware wrapper around the eval script's evaluate.

    Puts the network in eval mode, moves it to the GPU when args.cuda is set and
    to the CPU otherwise, installs the patched prep_display / prep_metrics into
    the eval script, and then delegates to the original evaluate.

    Args:
        net: the YOLACT network to evaluate.
        dataset: dataset forwarded unchanged to the original evaluate.
        train_mode: forwarded unchanged to the original evaluate.

    Returns:
        Whatever the original evaluate returns.
    """
    net.eval()
    net = net.cuda() if args.cuda else net.cpu()

    # The original evaluate looks these helpers up on the module at call time,
    # so installing them here makes it use the patched versions.
    eval_script.prep_display = fixed_prep_display
    eval_script.prep_metrics = fixed_prep_metrics

    return original_evaluate(net, dataset, train_mode)

# Remember what was wrapped so that a later re-run of this cell can unwrap it
fixed_evaluate._original = original_evaluate


# --- Apply the Patches ---
# Route eval_script.evaluate and eval_script.prep_display through the wrappers defined above.
# prep_metrics is not patched here: fixed_evaluate installs it each time it is called.
eval_script.evaluate = fixed_evaluate
eval_script.prep_display = fixed_prep_display

print("Monkey-patches for eval.py have been applied.")

In [None]:
def train():
    """
    Train YOLACT on the dataset of the active `cfg`, on the CPU.

    All options come from the notebook-level `args`; model and dataset settings
    come from `cfg`. Progress is printed every 10 iterations, weights are saved
    every `args.save_interval` iterations and once more when training ends, and
    validation mAP is computed every `args.validation_epoch` epochs (skipping
    epoch 0).

    Interrupting the kernel stops training; when `args.interrupt` is set, the
    current weights are saved to an '*_interrupt' checkpoint first.

    Raises:
        NotImplementedError: if `args.cuda` is set; only the CPU path is implemented.
        ValueError: if the training dataset contains no images.
    """
    if args.cuda:
        # Only the CPU forward pass exists below. Fail before any expensive setup
        # rather than crash mid-loop on an undefined `losses`.
        raise NotImplementedError('This notebook only implements CPU training; set args.cuda = False.')

    # makedirs also copes with nested save folders and with an existing one
    os.makedirs(args.save_folder, exist_ok=True)

    dataset = COCODetection(image_path=cfg.dataset.train_images,
                            info_file=cfg.dataset.train_info,
                            transform=SSDAugmentation())
    if len(dataset) == 0:
        raise ValueError('No training images found for ' + str(cfg.dataset.train_info))

    val_dataset = None
    if args.validation_epoch > 0:
        setup_eval()
        val_dataset = COCODetection(image_path=cfg.dataset.valid_images,
                                    info_file=cfg.dataset.valid_info,
                                    transform=BaseTransform())

    yolact_net = Yolact()
    net = yolact_net
    net.train()

    log = Log(cfg.name, args.log_folder) if args.log else None

    if args.resume is not None:
        print('Resuming training, loading {}...'.format(args.resume))
        yolact_net.load_weights(args.resume)
    else:
        print('Initializing weights...')
        yolact_net.init_weights(backbone_path=args.save_folder + cfg.backbone.path)

    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                          weight_decay=args.decay)
    criterion = MultiBoxLoss(num_classes=cfg.num_classes,
                             pos_threshold=cfg.positive_iou_threshold,
                             neg_threshold=cfg.negative_iou_threshold,
                             negpos_ratio=cfg.ohem_negpos_ratio)

    # From here on `net` returns losses; `yolact_net` stays the bare model for
    # saving weights and for validation.
    net = NetLoss(net, criterion)

    # Initialization pass, as in upstream train.py: when batch norm is meant to be
    # trained, freeze it only for one dummy forward pass (so the all-zero input
    # cannot disturb the running statistics) and re-enable it afterwards. Freezing
    # without the matching unfreeze would leave batch norm frozen for the whole run.
    if not cfg.freeze_bn:
        yolact_net.freeze_bn()
    yolact_net(torch.zeros(1, 3, cfg.max_size, cfg.max_size))
    if not cfg.freeze_bn:
        yolact_net.freeze_bn(True)

    data_loader = data.DataLoader(dataset, args.batch_size,
                                  num_workers=args.num_workers,
                                  shuffle=True, collate_fn=detection_collate,
                                  pin_memory=False) # Pin memory is for GPU

    iteration = max(args.start_iter, 0)
    last_time = time.time()
    # Number of batches the loader really yields per epoch. This is at least 1 for
    # a non-empty dataset, whereas len(dataset) // batch_size is 0 (and divides by
    # zero below) whenever the dataset is smaller than one batch.
    epoch_size = len(data_loader)
    num_epochs = math.ceil(cfg.max_iter / epoch_size)
    step_index = 0
    # Pre-bind so the save calls after the loop work even when it never runs
    epoch = 0

    def save_path(epoch, iteration):
        return SavePath(cfg.name, epoch, iteration).get_path(root=args.save_folder)

    time_avg = MovingAverage()
    loss_types = ['B', 'C', 'M', 'P', 'D', 'E', 'S', 'I']
    loss_avgs  = { k: MovingAverage(100) for k in loss_types }

    print('Begin training!')
    print()

    try:
        for epoch in range(num_epochs):
            for datum in data_loader:
                if iteration >= cfg.max_iter:
                    break

                # Apply every learning-rate decay step that has been reached
                while step_index < len(cfg.lr_steps) and iteration >= cfg.lr_steps[step_index]:
                    step_index += 1
                    set_lr(optimizer, args.lr * (args.gamma ** step_index))

                optimizer.zero_grad()

                # The collate function yields a list of image tensors; stack them into one batch
                images, (targets, masks, num_crowds) = datum
                images = torch.stack(images, 0)

                losses = net(images, targets, masks, num_crowds)

                losses = { k: v.mean() for k, v in losses.items() }
                loss = sum([losses[k] for k in losses])

                loss.backward()
                optimizer.step()

                # Logging
                cur_time  = time.time()
                elapsed   = cur_time - last_time
                last_time = cur_time
                time_avg.add(elapsed)
                for k in losses:
                    loss_avgs[k].add(losses[k].item())

                if iteration % 10 == 0:
                    eta_str = str(datetime.timedelta(seconds=(cfg.max_iter-iteration) * time_avg.get_avg())).split('.')[0]
                    total = sum([loss_avgs[k].get_avg() for k in losses])
                    loss_labels = sum([[k, loss_avgs[k].get_avg()] for k in loss_types if k in losses], [])
                    print(('[%3d] %7d ||' + (' %s: %.3f |' * len(losses)) + ' T: %.3f || ETA: %s || timer: %.3f')
                            % tuple([epoch, iteration] + loss_labels + [total, eta_str, elapsed]), flush=True)

                iteration += 1

                if iteration % args.save_interval == 0 and iteration != 0:
                    print('Saving state, iter:', iteration)
                    yolact_net.save_weights(save_path(epoch, iteration))

            if args.validation_epoch > 0:
                if epoch % args.validation_epoch == 0 and epoch > 0:
                    compute_validation_map(epoch, iteration, yolact_net, val_dataset, log)

            # The inner `break` only leaves the batch loop. Stop the epoch loop too,
            # otherwise leftover epochs would spin up the loader and may re-run validation.
            if iteration >= cfg.max_iter:
                break

    except KeyboardInterrupt:
        if args.interrupt:
            print('Stopping early. Saving network...')
            yolact_net.save_weights(save_path(epoch, repr(iteration) + '_interrupt'))
        # An interrupted run is not a finished one: skip the final checkpoint and message
        return

    yolact_net.save_weights(save_path(epoch, iteration))
    print("Training finished.")

# Start the training process
train()