In [1]:
import warnings
warnings.filterwarnings('ignore')

import torchvision
import torchvision.transforms as transforms
import torch
import torch.nn as nn
import albumentations as A
import cv2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('mps:0' if torch.backends.mps.is_available() else 'cpu')

IMAGE_SIZE = 416
BATCH_SIZE = 10
data_dir = '../data/IndoorObjectsDetection'

learning_rate = 0.001
epochs = 20

In [3]:
import yaml
import os
import random

data_config = open( data_dir + '/data.yaml')

data_info = yaml.load(data_config, Loader=yaml.FullLoader)

train_data_path = data_info['train_data_path']
val_data_path = data_info['val_data_path']
test_data_path = data_info['test_data_path']

train_labels_path = data_info['train_labels_path']
val_labels_path = data_info['val_labels_path']
test_labels_path = data_info['test_labels_path']

target_list = data_info['names']
target_dict = dict(zip(range(len(target_list)), target_list))

target_dict

{0: 'door',
 1: 'cabinetDoor',
 2: 'refrigeratorDoor',
 3: 'window',
 4: 'chair',
 5: 'table',
 6: 'cabinet',
 7: 'couch',
 8: 'openedDoor',
 9: 'pole'}

In [4]:
resnet18 = torchvision.models.resnet18(pretrained = False)
layers = [m for m in resnet18.children()]

# 마지막 2층인 average pooling & fully connected layer 은 back bone으로 사용하지 않음
test_net = nn.Sequential(*layers[:-2]) 

temp_x = torch.randn(1,3,IMAGE_SIZE,IMAGE_SIZE)
temp_y = test_net(temp_x)


print(type(temp_x))
print(temp_x.shape)
print(temp_y.shape)

<class 'torch.Tensor'>
torch.Size([1, 3, 416, 416])
torch.Size([1, 512, 13, 13])


In [5]:
"""
class YOLOv1_RESNET(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        
        self.num_classes = num_classes
        self.num_bboxes = 2
        self.grid_size = 7
        
        resnet18 = torchvision.models.resnet18(pretrained = False)
        layers = [m for m in resnet18.children()]
        
        self.backbone = nn.Sequential(*layers[:-2])
        
        self.neck = nn.Sequential(
            nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=1, padding=0, bias=False),
            nn.BatchNorm2d(1024),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(1024),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(1024),
            nn.ReLU(inplace=True)
        )
        
        self.head = nn.Sequential(
            nn.Conv2d(in_channels=1024, out_channels=5*self.num_bboxes+self.num_classes, kernel_size=1, padding=0, bias=False),
            nn.AdaptiveAvgPool2d(output_size=(self.grid_size, self.grid_size))
        )
        
    def forward(self, x):
        out = self.backbone(x)
        out = self.neck(out)
        out = self.head(out)
        return out
"""

'\nclass YOLOv1_RESNET(nn.Module):\n    def __init__(self, num_classes):\n        super().__init__()\n        \n        self.num_classes = num_classes\n        self.num_bboxes = 2\n        self.grid_size = 7\n        \n        resnet18 = torchvision.models.resnet18(pretrained = False)\n        layers = [m for m in resnet18.children()]\n        \n        self.backbone = nn.Sequential(*layers[:-2])\n        \n        self.neck = nn.Sequential(\n            nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=1, padding=0, bias=False),\n            nn.BatchNorm2d(1024),\n            nn.ReLU(inplace=True),\n            nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, padding=1, bias=False),\n            nn.BatchNorm2d(1024),\n            nn.ReLU(inplace=True),\n            nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, padding=1, bias=False),\n            nn.BatchNorm2d(1024),\n            nn.ReLU(inplace=True)\n        )\n        \n        self.head = nn.S

In [54]:
"""
NUM_CLASSES = len(target_list)
model = YOLOv1_RESNET(num_classes = NUM_CLASSES)

model
"""

'\nNUM_CLASSES = len(target_list)\nmodel = YOLOv1_RESNET(num_classes = NUM_CLASSES)\n\nmodel\n'

In [7]:
class Detection_dataset():
    def __init__(self, data_dir, phase, transform=None):
        self.data_dir = data_dir
        self.phase = phase
        self.image_files = []
        self.transform = transform
        
        for fn in os.listdir(os.path.join(self.data_dir, phase, 'images')):
            bboxes, class_ids = self.get_label(fn)
                                
            if(fn.endswith("jpg") and bboxes.size != 0 and class_ids.size != 0):
                self.image_files.append(fn)
        
    def __len__(self):
        return len(self.image_files)
    
    def __getitem__(self, index):
        filename, image = self.get_image(index)
        bboxes, class_ids = self.get_label(filename)
        
        if self.transform: 
            transformed_data = self.transform(image=image, bboxes=bboxes, class_ids=class_ids)
            image = transformed_data['image']
            bboxes = np.array(transformed_data['bboxes'])
            class_ids = np.array(transformed_data['class_ids'])
        else:
            #transform 을 하지 않을경우 reshape to (C,W,H)
            image = torch.Tensor(image).permute(2,0,1)
        
        target = np.concatenate((bboxes, class_ids[:, np.newaxis]), axis=1)    
        return image, target, filename
    
    def get_image(self, index):
        filename = self.image_files[index]
        image_path = os.path.join(self.data_dir, self.phase, 'images', filename)
        image = cv2.imread(image_path)
        return filename, image
    
    
    def get_label(self, filename):
        image_id = filename.split('.')[0]
        label_file_path = os.path.join(self.data_dir, self.phase, 'labels') + '/' + image_id + '.txt'
        try:
            bbox_df = pd.read_csv(label_file_path, sep=' ', header=None)

            # width or height 가 0이면 제거 
            bbox_df = bbox_df[(bbox_df[3] != 0) & (bbox_df[4] != 0)]
            
            bboxes = np.asarray(bbox_df[[1,2,3,4]])
            class_ids = np.asarray(bbox_df[0])
            
        except Exception as e:
            bboxes = np.array([])
            class_ids = np.array([])
            
            
        return bboxes, class_ids

In [8]:
import albumentations as A
from albumentations.pytorch import ToTensorV2

"""
    when you use yolo format bbox param
    need to add logic in albumentations/augmentations/bbox_utils.py - check_bbox() method
    to make bbox boundery in [0,1]
    
    -------------------
    bbox=list(bbox)
    
    for i in range(4):
      if (bbox[i]<0) :
        bbox[i]=0
      elif (bbox[i]>1) :
        bbox[i]=1
    
    bbox=tuple(bbox)
    --------------------
"""

#mean=(0.485, 0.456, 0.406),std=(0.229, 0.224, 0.225) -> imageNet 데이터셋에 기반한 계산된 수치 
transform = A.Compose([
        A.Resize(height=IMAGE_SIZE, width=IMAGE_SIZE),
        # A.Normalize(mean=(0.485, 0.456, 0.406),std=(0.229, 0.224, 0.225)),
        ToTensorV2(),
    ],
    bbox_params=A.BboxParams(format='yolo', label_fields=['class_ids']),
)

transform

Compose([
  Resize(p=1.0, height=416, width=416, interpolation=1),
  Normalize(p=1.0, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), max_pixel_value=255.0, normalization='standard'),
  ToTensorV2(p=1.0, transpose_mask=False),
], p=1.0, bbox_params={'format': 'yolo', 'label_fields': ['class_ids'], 'min_area': 0.0, 'min_visibility': 0.0, 'min_width': 0.0, 'min_height': 0.0, 'check_each_transform': True, 'clip': False}, keypoint_params=None, additional_targets={}, is_check_shapes=True)

In [9]:
from torchvision.utils import make_grid
from my_util import set_bounding_boxes, set_bounding_box, get_random_color_dict
from ipywidgets import interact

transformed_train_dataset = Detection_dataset(data_dir='../data/IndoorObjectsDetection', phase="train", transform=transform)

@interact(index=(0, len(transformed_train_dataset)-1))
def show_transformed_image(index=0):
    img, target, filename = transformed_train_dataset[index]
    
    np_image = make_grid(img, normalize=True).permute(1,2,0).numpy()
    np_image_unit8 = (np_image*255).astype(np.uint8)
    
    res = set_bounding_boxes(np_image_unit8, target[:,0:4], 'yolo', target[:,4].astype(int), target_dict, get_random_color_dict(target_dict))
    plt.imshow(res)

interactive(children=(IntSlider(value=0, description='index', max=860), Output()), _dom_classes=('widget-inter…

In [10]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    image_list = []
    target_list = []
    filename_list = []
    
    for a,b,c in batch:
        image_list.append(a)
        target_list.append(b)
        filename_list.append(c)
        
    return torch.stack(image_list, dim=0), target_list, filename_list

def train_valid_dataloader(data_dir, batch_size=4, transform=None):
    dataloaders = {}
    
    train_dataset = Detection_dataset(data_dir=data_dir, phase="train", transform=transform)
    dataloaders["train"] = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    
    valid_dataset = Detection_dataset(data_dir=data_dir, phase="valid", transform=transform)    
    dataloaders["val"] = DataLoader(valid_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)
    
    return dataloaders

dataloaders = train_valid_dataloader(data_dir, BATCH_SIZE, transform)

In [12]:
'''
pip install --trusted-host pypi.python.org --trusted-host pypi.org --trusted-host files.pythonhosted.org tqdm
'''

'\npip install --trusted-host pypi.python.org --trusted-host pypi.org --trusted-host files.pythonhosted.org tqdm\n'

# **YOLO_V3 Model** 

In [13]:
class BasicConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1):
        super().__init__()

        self.conv = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding=(kernel_size-1)//2, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.LeakyReLU(0.1, inplace=True)
        )

    def forward(self, x):
        return self.conv(x)

In [14]:
#Backbone
class ResidualBlock(nn.Module):
    def __init__(self, channels):
        super().__init__()

        self.residual = nn.Sequential(
            BasicConvBlock(channels, channels//2, kernel_size=1, stride=1),
            BasicConvBlock(channels//2, channels, kernel_size=3, stride=1)
        )

    def forward(self, x):
        return self.residual(x) + x

class DarkNet53(nn.Module):
    def __init__(self):
        super().__init__()

        self.conv1 = BasicConvBlock(3, 32, 3, 1)
        self.block1 = nn.Sequential(
            BasicConvBlock(32, 64, 3, 2),
            ResidualBlock(64)
        )
        self.block2 = nn.Sequential(
            BasicConvBlock(64, 128, 3, 2),
            nn.Sequential(*[ResidualBlock(128) for _ in range(2)])
        )
        self.block3 = nn.Sequential(
            BasicConvBlock(128, 256, 3, 2),
            nn.Sequential(*[ResidualBlock(256) for _ in range(8)])
        )
        self.block4 = nn.Sequential(
            BasicConvBlock(256, 512, 3, 2),
            nn.Sequential(*[ResidualBlock(512) for _ in range(8)])
        )
        self.block5 = nn.Sequential(
            BasicConvBlock(512, 1024, 3, 2),
            nn.Sequential(*[ResidualBlock(1024) for _ in range(4)])
        )
        
    def forward(self, x):
        x = self.conv1(x)
        x = self.block1(x)
        x = self.block2(x)
        feature_map1 = self.block3(x)
        feature_map2 = self.block4(feature_map1)
        feature_map3 = self.block5(feature_map2)

        return feature_map1, feature_map2, feature_map3

In [15]:
#Neck : FPN top-down
class FPN_featureBlock(nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()

        self.conv = nn.Sequential(
            BasicConvBlock(in_channels, out_channels, 1),
            BasicConvBlock(out_channels, out_channels*2, 3),
            BasicConvBlock(out_channels*2, out_channels, 1),
            BasicConvBlock(out_channels, out_channels*2, 3),
            BasicConvBlock(out_channels*2, out_channels, 1)
        )

    def forward(self, x):
        return self.conv(x)
        
class UpSampling(nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()

        self.upsample = nn.Sequential(
            BasicConvBlock(in_channels, out_channels, 1),
            nn.Upsample(scale_factor = 2)
        )
    
    def forward(self, x):
        return self.upsample(x)

In [16]:
#Head
class DetectionLayer(nn.Module):
    def __init__(self, in_channels, anchors, num_classes, img_size=IMAGE_SIZE):
        super().__init__()

        self.pred = nn.Sequential(
            BasicConvBlock(in_channels, in_channels*2, 3),
            nn.Conv2d(in_channels*2, (num_classes+5)*3, 1)
        )
        self.anchors = anchors
        self.num_classes = num_classes
        self.img_size = img_size

    def forward(self, x):
        batch_size = x.size(0)
        grid_size = x.size(2)
        
        output = self.pred(x)
        output = output.view(batch_size, 3, self.num_classes+5, grid_size, grid_size) 
        output = output.permute(0, 1, 3, 4, 2)

        output = output.contiguous()

        # 추가 처리 필요
        
        return output

In [17]:
class Yolov3(nn.Module):
    def __init__(self, num_classes):
        super().__init__()

        self.num_classes = num_classes

        self.darknet53 = DarkNet53()

        self.fpn_feature_block1 = FPN_featureBlock(1024, 512)
        self.detectionlayer1 = DetectionLayer(512, num_classes)
        self.upsampling1 = UpSampling(512, 256)

        self.fpn_feature_block2 = FPN_featureBlock(512+256, 256)
        self.detectionlayer2 = DetectionLayer(256, num_classes)
        self.upsampling2 = UpSampling(256, 128)
        
        self.fpn_feature_block3 = FPN_featureBlock(256+128, 128)
        self.detectionlayer3 = DetectionLayer(128, num_classes)

    def forward(self, x):
        
        self.feature1, self.feature2, self.feature3 = self.darknet53(x)
        
        x = self.fpn_feature_block1(self.feature3)
        output1 = self.detectionlayer1(x)
        x = self.upsampling1(x)

        x = self.fpn_feature_block2(torch.cat([x, self.feature2], dim=1))
        output2 = self.detectionlayer2(x)
        x = self.upsampling2(x)

        x = self.fpn_feature_block3(torch.cat([x, self.feature1], dim=1))
        output3 = self.detectionlayer3(x)

        return output1, output2, output3

In [18]:
x = torch.randn((1, 3, 416, 416))
model = Yolov3(num_classes = 20).to(device)
out = model(x)
print(out[0].shape) # torch.Size([1, 3, 13, 13, 25])
print(out[1].shape) # torch.Size([1, 3, 26, 26, 25])
print(out[2].shape) # torch.Size([1, 3, 52, 52, 25]) 

torch.Size([1, 3, 13, 13, 25])
torch.Size([1, 3, 26, 26, 25])
torch.Size([1, 3, 52, 52, 25])


In [19]:
from torchinfo import summary

summary(model, input_size = (1, 3, 416, 416), device = device)

Layer (type:depth-idx)                                       Output Shape              Param #
Yolov3                                                       [1, 3, 13, 13, 25]        --
├─DarkNet53: 1-1                                             [1, 256, 52, 52]          --
│    └─BasicConvBlock: 2-1                                   [1, 32, 416, 416]         --
│    │    └─Sequential: 3-1                                  [1, 32, 416, 416]         928
│    └─Sequential: 2-2                                       [1, 64, 208, 208]         --
│    │    └─BasicConvBlock: 3-2                              [1, 64, 208, 208]         18,560
│    │    └─ResidualBlock: 3-3                               [1, 64, 208, 208]         20,672
│    └─Sequential: 2-3                                       [1, 128, 104, 104]        --
│    │    └─BasicConvBlock: 3-4                              [1, 128, 104, 104]        73,984
│    │    └─Sequential: 3-5                                  [1, 128, 104, 104]   

In [20]:
model

Yolov3(
  (darknet53): DarkNet53(
    (conv1): BasicConvBlock(
      (conv): Sequential(
        (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): LeakyReLU(negative_slope=0.1, inplace=True)
      )
    )
    (block1): Sequential(
      (0): BasicConvBlock(
        (conv): Sequential(
          (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
          (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): LeakyReLU(negative_slope=0.1, inplace=True)
        )
      )
      (1): ResidualBlock(
        (residual): Sequential(
          (0): BasicConvBlock(
            (conv): Sequential(
              (0): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          

# **YOLO_V3 Loss** 

In [71]:
#IMAGE_SIZE
# Anchor boxes for each feature map scaled between 0 and 1 
# 3 feature maps at 3 different scales based on YOLOv3 paper 
ANCHORS = [ 
    [(0.28, 0.22), (0.38, 0.48), (0.9, 0.78)], 
    [(0.07, 0.15), (0.15, 0.11), (0.14, 0.29)], 
    [(0.02, 0.03), (0.04, 0.07), (0.08, 0.06)], 
] 

GRID_SIZE = [IMAGE_SIZE // 32, IMAGE_SIZE // 16, IMAGE_SIZE // 8] # [13, 26, 52]

scaled_anchors = ( 
    torch.tensor(ANCHORS) * torch.tensor(GRID_SIZE).unsqueeze(1).unsqueeze(1).repeat(1,3,2) 
)

scaled_anchors

tensor([[[ 3.6400,  2.8600],
         [ 4.9400,  6.2400],
         [11.7000, 10.1400]],

        [[ 1.8200,  3.9000],
         [ 3.9000,  2.8600],
         [ 3.6400,  7.5400]],

        [[ 1.0400,  1.5600],
         [ 2.0800,  3.6400],
         [ 4.1600,  3.1200]]])

In [22]:
# Defining a function to calculate Intersection over Union (IoU) 
def iou(box1, box2):
    # IoU score for prediction and label 
    # box1 (prediction) and box2 (label) are both in [x, y, width, height] format 
      
    # Box coordinates of prediction 
    b1_x1 = box1[..., 0:1] - box1[..., 2:3] / 2
    b1_y1 = box1[..., 1:2] - box1[..., 3:4] / 2
    b1_x2 = box1[..., 0:1] + box1[..., 2:3] / 2
    b1_y2 = box1[..., 1:2] + box1[..., 3:4] / 2

    # Box coordinates of ground truth 
    b2_x1 = box2[..., 0:1] - box2[..., 2:3] / 2
    b2_y1 = box2[..., 1:2] - box2[..., 3:4] / 2
    b2_x2 = box2[..., 0:1] + box2[..., 2:3] / 2
    b2_y2 = box2[..., 1:2] + box2[..., 3:4] / 2

    # Get the coordinates of the intersection rectangle 
    x1 = torch.max(b1_x1, b2_x1) 
    y1 = torch.max(b1_y1, b2_y1) 
    x2 = torch.min(b1_x2, b2_x2) 
    y2 = torch.min(b1_y2, b2_y2) 
    # Make sure the intersection is at least 0 
    intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0) 

    # Calculate the union area 
    box1_area = abs((b1_x2 - b1_x1) * (b1_y2 - b1_y1)) 
    box2_area = abs((b2_x2 - b2_x1) * (b2_y2 - b2_y1)) 
    union = box1_area + box2_area - intersection 

    # Calculate the IoU score 
    epsilon = 1e-6
    iou_score = intersection / (union + epsilon) 

    # Return IoU score 
    return iou_score 

def iouOnWH(box1, box2):  
    # box1:bbox WH, box2:anchors WH
    # Calculate intersection area 
    intersection_area = torch.min(box1[..., 0], box2[..., 0]) * torch.min(box1[..., 1], box2[..., 1]) 

    # Calculate union area 
    box1_area = box1[0] * box1[1] 
    box2_area = box2[..., 0] * box2[..., 1] 
    union_area = box1_area + box2_area - intersection_area 

    # Calculate IoU score 
    iou_score = intersection_area / union_area 

    # Return IoU score 
    return iou_score

class YOLOLoss(nn.Module): 
    def __init__(self): 
        super().__init__() 
        self.mse = nn.MSELoss() 
        self.bce = nn.BCEWithLogitsLoss() 
        self.cross_entropy = nn.CrossEntropyLoss() 
        self.sigmoid = nn.Sigmoid() 
      
    def forward(self, pred, target, anchors): 
        # Identifying which cells in target have objects  
        # and which have no objects 
        obj = target[..., 0] == 1
        no_obj = target[..., 0] == 0
  
        # Calculating No object loss 
        no_object_loss = self.bce( 
            (pred[..., 0:1][no_obj]), (target[..., 0:1][no_obj]), 
        ) 
  
          
        # Reshaping anchors to match predictions 
        anchors = anchors.reshape(1, 3, 1, 1, 2) 
        # Box prediction confidence 
        box_preds = torch.cat([self.sigmoid(pred[..., 1:3]), 
                               torch.exp(pred[..., 3:5]) * anchors 
                            ],dim=-1) 
        # Calculating intersection over union for prediction and target 
        ious = iou(box_preds[obj], target[..., 1:5][obj]).detach() 
        # Calculating Object loss 
        object_loss = self.mse(self.sigmoid(pred[..., 0:1][obj]), 
                               ious * target[..., 0:1][obj]) 
  
          
        # Predicted box coordinates 
        pred[..., 1:3] = self.sigmoid(pred[..., 1:3]) 
        # Target box coordinates 
        target[..., 3:5] = torch.log(1e-6 + target[..., 3:5] / anchors) 
        # Calculating box coordinate loss 
        box_loss = self.mse(pred[..., 1:5][obj], 
                            target[..., 1:5][obj]) 
  
          
        # Claculating class loss 
        class_loss = self.cross_entropy((pred[..., 5:][obj]), 
                                   target[..., 5][obj].long()) 
  
        # Total loss 
        return ( 
            box_loss 
            + object_loss 
            + no_object_loss 
            + class_loss 
        )

In [27]:
from tqdm import tqdm

# Define the train function to train the model 
def training_loop(loader, model, optimizer, loss_fn, scaler, scaled_anchors): 
    # Creating a progress bar 
    progress_bar = tqdm(loader, leave=True) 
  
    # Initializing a list to store the losses 
    losses = [] 
  
    # Iterating over the training data 
    for _, (x, y, filename) in enumerate(progress_bar): 
        x = x.to(device) 

        y0, y1, y2 = ( 
            y[0].to(device), 
            y[1].to(device), 
            y[2].to(device), 
        ) 
  
        with torch.cuda.amp.autocast(): 
            # Getting the model predictions 
            feature_map1, feature_map2, feature_map3 = model(x) 
            # Calculating the loss at each scale 
            loss = ( 
                  loss_fn(feature_map1, y0, scaled_anchors[0]) 
                + loss_fn(feature_map2, y1, scaled_anchors[1]) 
                + loss_fn(feature_map3, y2, scaled_anchors[2]) 
            ) 
  
        # Add the loss to the list 
        losses.append(loss.item()) 
  
        # Reset gradients 
        optimizer.zero_grad() 
  
        # Backpropagate the loss 
        scaler.scale(loss).backward() 
  
        # Optimization step 
        scaler.step(optimizer) 
  
        # Update the scaler for next iteration 
        scaler.update() 
  
        # update progress bar with loss 
        mean_loss = sum(losses) / len(losses) 
        progress_bar.set_postfix(loss=mean_loss)

In [28]:
import torch.optim as optim

# Defining the optimizer 
optimizer = optim.Adam(model.parameters(), lr=learning_rate) 
  
# Defining the loss function 
loss_fn = YOLOLoss() 
  
# Defining the scaler for mixed precision training 
scaler = torch.cuda.amp.GradScaler() 

# Training the model 
for e in range(1, epochs+1): 
    print("Epoch:", e) 
    training_loop(dataloaders["train"], model, optimizer, loss_fn, scaler, scaled_anchors) 
  
    # Saving the model 
    if save_model: 
        save_checkpoint(model, optimizer, filename=f"checkpoint.pth.tar")

Epoch: 1


  0%|                                                                                           | 0/87 [00:01<?, ?it/s]

x ->  tensor([[[[ 1.0673,  1.0844,  1.0844,  ...,  0.8447,  0.8618,  0.8618],
          [ 1.0844,  1.0844,  1.0844,  ...,  0.8447,  0.8447,  0.8447],
          [ 1.0844,  1.0844,  1.1015,  ...,  0.8447,  0.8276,  0.8276],
          ...,
          [ 0.9817,  0.9646,  0.9646,  ..., -1.7583, -1.6384, -1.2788],
          [ 0.9817,  0.9988,  0.9817,  ..., -1.6727, -1.6555, -1.5699],
          [ 0.9817,  0.9988,  0.9817,  ..., -1.2617, -1.2617, -1.2103]],

         [[ 1.0630,  1.0805,  1.0805,  ...,  1.1155,  1.1331,  1.1331],
          [ 1.0805,  1.0805,  1.0805,  ...,  1.1155,  1.1155,  1.1155],
          [ 1.0630,  1.0630,  1.0630,  ...,  1.1155,  1.0980,  1.0980],
          ...,
          [ 0.8704,  0.8529,  0.8704,  ..., -1.6856, -1.5280, -1.1604],
          [ 0.8880,  0.8880,  0.8704,  ..., -1.5630, -1.5455, -1.4755],
          [ 0.8880,  0.8880,  0.8704,  ..., -1.1429, -1.1429, -1.0903]],

         [[ 0.7054,  0.7228,  0.7228,  ...,  1.3851,  1.4025,  1.4025],
          [ 0.7228,  0.7




AttributeError: 'numpy.ndarray' object has no attribute 'to'