In [1]:
!nvidia-smi

Mon Mar  6 19:32:20 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:02:00.0 Off |                  N/A |
| 43%   72C    P2   141W / 250W |   7620MiB / 11264MiB |     95%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:04:00.0 Off |                  N/A |
| 23%   33C    P0    59W / 250W |      2MiB / 11264MiB |      0%      Default |
|       

In [5]:
#Torch Modules
import torch
import torchvision
import torch.nn as nn
from torchvision.datasets import VOCDetection
from torchvision import ops
import torch.utils as tu
from torchvision import transforms
from torch.nn.utils.rnn import pad_sequence

#Other modules
import os, random
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET

#Image processing modules
from PIL import Image, ImageDraw, ImageFont
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from Model.model_utils.pascal_loader import VOCDetDataset, getClassDicts
import utils

In [3]:
#from Model.AnchorGen import AnchorGenerator
#from Model.Backbone_VGG16 import BackboneNetwork
#from Model.rpn import RegionProposalNetwork
from Model.one_stage_detector import OneStageDetector

train_loader = dl.getCordTorchDatasetLoader("params_cord_initial.yaml", split='train')

In [6]:
config = utils.load_config_file("params_cord_initial.yaml")

In [7]:
test_img_path = './Data/VOC2007_TEST/JPEGImages/'
test_ann_path = './Data/VOC2007_TEST/Annotations/'

In [8]:
voc_test_dataset = VOCDetDataset(test_img_path, test_ann_path, getClassDicts()[0], (config['IMG_WIDTH'], config['IMG_HEIGHT'],3))
voc_test_loader = tu.data.DataLoader(voc_test_dataset, batch_size=16, shuffle=True)

In [9]:
for img_batch, bboxes_batch, classes_batch in voc_test_loader:
    img_data = img_batch
    bboxes_data = bboxes_batch
    classes_data = classes_batch
    break

In [10]:
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

In [11]:
device.index

1

In [12]:
img_width, img_height, img_depth = config["IMG_WIDTH"], config["IMG_HEIGHT"], config["CHANNELS"]

In [13]:
model = OneStageDetector((img_depth, img_width, img_height),
                 conf_score_weight = config['CONF_LOSS_WEIGHT'], bbox_weight = config['BBOX_LOSS_WEIGHT'],
                 pos_anchor_thresh = config['POS_ANCHOR_THRESH'], neg_anchor_thresh = config['NEG_ANCHOR_THRESH'], anc_ratio=config['ANCHOR_RATIO'], 
                 anchor_scales=config['ANCHOR_SCALES'], anchor_ratios = config['ANCHOR_RATIOS'], stride=config['STRIDE'],device=device)



In [15]:
state_dict = torch.load("Saved_Models/OSD_E_200_B_64_256x256_Loss_Checkpoint.pt", map_location=device)

In [22]:
state_dict

OrderedDict([('backbone.bbNet.0.weight',
              tensor([[[[-5.6231e-01,  1.3631e-01,  5.2348e-01],
                        [-5.9124e-01,  3.5021e-01,  7.6029e-01],
                        [-6.9825e-01, -5.4408e-02,  4.7869e-01]],
              
                       [[ 1.6659e-01,  2.2956e-03, -8.8970e-02],
                        [ 3.5687e-02, -7.7853e-02, -2.6787e-01],
                        [ 1.2411e-01, -1.8036e-01, -1.3986e-01]],
              
                       [[ 3.0220e-01, -1.7562e-01, -4.3751e-01],
                        [ 4.6675e-01, -9.2461e-02, -4.9700e-01],
                        [ 6.2376e-01,  9.3478e-03, -2.8785e-01]]],
              
              
                      [[[ 2.3218e-01,  1.2583e-01,  1.8560e-01],
                        [-4.2861e-01, -2.4458e-01,  2.4503e-01],
                        [-2.5136e-01,  1.4074e-01, -6.9487e-03]],
              
                       [[-1.4049e-01, -2.1950e-01,  1.4996e-01],
                        [-8.4113e-

In [16]:
model.load_state_dict(state_dict)
model.eval()

OneStageDetector(
  (backbone): BackboneNetwork(
    (bbNet): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU(inplace=True)
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (6): ReLU(inplace=True)
      (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (8): ReLU(inplace=True)
      (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (11): ReLU(inplace=True)
      (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (13): ReLU(inplace=True)
      (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (15): ReLU(inpla

In [18]:
props, confs = model.inference(img_batch)

In [21]:
confs

tensor([[[  4.9186],
         [  5.8456],
         [  5.9417],
         ...,
         [ -7.7152],
         [ -7.9731],
         [ -7.1450]],

        [[  5.6485],
         [  6.5904],
         [  6.8486],
         ...,
         [ -7.5435],
         [ -7.9952],
         [ -7.3721]],

        [[  4.2533],
         [  4.3744],
         [  4.0316],
         ...,
         [ -9.0504],
         [ -9.2635],
         [ -8.2458]],

        ...,

        [[  5.3930],
         [  6.3817],
         [  6.8254],
         ...,
         [-12.5802],
         [-12.0086],
         [ -9.6217]],

        [[  4.6199],
         [  4.8601],
         [  4.6273],
         ...,
         [-10.3349],
         [-10.5345],
         [ -9.3215]],

        [[  5.5806],
         [  6.7898],
         [  7.4167],
         ...,
         [-10.0146],
         [ -9.8356],
         [ -8.2106]]], device='cuda:1')

In [23]:
class OSDTrainer():
    def __init__(self, rank, config_name=None):
        assert config_name != None, "YAML Configuration File Name Under Configs Folder Is Needed"
        
        self.config = utils.load_config_file(config_name)
        self.rank = rank
        #self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        #Defining training dataset and model
        
        self.cls2idx, self.idx2cls = getClassDicts()
        self.img_width, self.img_height, self.img_depth = self.config["IMG_WIDTH"], self.config["IMG_HEIGHT"], self.config["CHANNELS"]
        
        self.train_dataset = VOCDetDataset(self.config['PASCAL_TRAIN_IMG_PATH'], self.config['PASCAL_TRAIN_ANN_PATH'],
                                                       cls_dict=self.cls2idx, 
                                                       target_size=(self.img_width, self.img_height, self.img_depth))
        
        self.train_loader = torch.utils.data.DataLoader(self.train_dataset, batch_size=self.config['BATCH'], shuffle=True)
        
        self.model = OneStageDetector((self.img_depth, self.img_width, self.img_height),
                 conf_score_weight = self.config['CONF_LOSS_WEIGHT'], bbox_weight = self.config['BBOX_LOSS_WEIGHT'],
                 pos_anchor_thresh = self.config['POS_ANCHOR_THRESH'], neg_anchor_thresh = self.config['NEG_ANCHOR_THRESH'], anc_ratio=self.config['ANCHOR_RATIO'], 
                 anchor_scales=self.config['ANCHOR_SCALES'], anchor_ratios = self.config['ANCHOR_RATIOS'], stride=self.config['STRIDE'],device=self.rank)
        
        #Defining Optimizer
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.config["L_RATE"])
        
        #Defining the Model in terms of DistributedDataParallel
        self.model = DDP(self.model, device_ids=[self.rank], find_unused_parameters=True)
        
    
    def getModel():
        return self.model
    
    def printOutput(self, eCount, eLoss, time):

        print("(GPU {0}, Epoch {1}, Duration(s) {2}) ==> Mean Loss :: {3}".format(self.rank,eCount, str(datetime.timedelta(seconds = time)), eLoss), flush=True)
    
    
    def single_batch_run(self, images, bboxes, conf_scores):
        self.optimizer.zero_grad()
        
        model_loss = self.model(images, bboxes, conf_scores)
        
        model_loss.backward()
        
        self.optimizer.step()
        
        return model_loss
    
    def checkpointSaver(self, loss, prev_loss, epoch, final=False, duration=0):
        if self.rank == 0:
            save_dict = {
                    "epoch" : epoch +1,
                    "loss" : loss,
                    "model_dict" : self.model.module.state_dict(),
                    "optimizer_dict" : self.optimizer.state_dict()
                }
            
            #Saving final model
            if final:
                print("Saving Final Model ==> Trained Epochs : {0} | Final Loss : {1} | Training Duration : {2}".format(
                    epoch+1, loss, str(datetime.timedelta(seconds=duration))
                ), flush=True)
                
                torch.save(save_dict, self.config['SAVE_PATH_CHECKPOINT'] + f"last_checkpoint.pt")
                
                print("Model Saved", flush=True)
                return 0
            #Saving Checkpoint
            if ((epoch+1) % self.config['SAVE_ENTRY']) == 0:
                print("Saving Model At Epoch {0}...".format(epoch+1), flush=True)
                torch.save(save_dict, self.config['SAVE_PATH_CHECKPOINT'] + f"checkpoint_{epoch+1}.pt")
                print("Model Saved", flush=True)
            #Saving best model
            if (loss < prev_loss):
                print("Saving Best Model", flush=True)
                torch.save(save_dict, self.config['SAVE_PATH_BEST'] + "best_model.py")
                print("Best Model Saved", flush=True)
                return loss
        
        return prev_loss
            
    
    def trainModel(self):
        model_train_time_start = time.time()
        model_final_loss = 0
        for epoch in range(self.config['EPOCHS']):
            
            print(f"=====(GPU {self.rank}, Epoch {epoch})=====", flush=True)
            #Starting epoch timer
            start_time = time.time()
            #Setting min_loss to a reasonably hight amount
            prev_min_loss = 1e6
            #Storing epcoh total loss without storing history
            epoch_total_loss = 0.0
            num_batches = 0.0
            #One epoch run
            for imgs, bboxes, classes in self.train_loader:
                #Starting timer for each batch
                batch_timer = time.time()
                #Transferring data to same device
                imgs = imgs.to(self.rank)
                bboxes = bboxes.to(self.rank)
                classes = classes.to(self.rank)
                
                #Get loss for one parse
                eLoss = self.single_batch_run(imgs, bboxes, classes)
                epoch_total_loss += float(eLoss)
                num_batches += float(1)
                
                print(f"Batch {num_batches - 1} ==> Batch Loss :: {eLoss} | Duration :: {str(datetime.timedelta(seconds=time.time() - batch_timer))}", flush=True)
            
            epoch_total_loss /= num_batches
            
            #Ending epoch timer
            self.printOutput(epoch, epoch_total_loss, time.time() - start_time)
            
            prev_min_loss = self.checkpointSaver(epoch_total_loss, prev_min_loss, epoch)
            model_final_loss = epoch_total_loss
                
        #Finished Training
        self.checkpointSaver(model_final_loss, 0, self.config['EPOCHS'], final=True, duration=time.time() - model_train_time_start)
        print("Finished Training", flush=True)


SyntaxError: invalid syntax (902769574.py, line 71)