In [1]:
import torch
import torch.utils.data as data
import pynvml

import cv2
import numpy as np

import matplotlib.pyplot as plt


import os
import sys

from tqdm import tqdm
from IPython.display import clear_output


sys.path.append('/home/meribejayson/Desktop/Projects/SharkCNN/training_models/YOLO3D/yolo_dataloaders')

from train_dataset import SharkYOLODataset

In [2]:
torch.manual_seed(12)

if not torch.cuda.is_available():
    raise Exception("Couldn't find CUDA")

device = torch.device("cuda")

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)

megaset_path = "/home/meribejayson/Desktop/Projects/SharkCNN/datasets-reduced/megaset/"
megaset_train_images_path = "/home/meribejayson/Desktop/Projects/SharkCNN/datasets-reduced/megaset/train/images/"
megaset_train_labels_path = "/home/meribejayson/Desktop/Projects/SharkCNN/datasets-reduced/megaset/train/labels/"

image_width = 1920
image_height = 1080

In [4]:
shark_dataset = SharkYOLODataset(30)
data_loader = data.DataLoader(shark_dataset, batch_size=500_000, num_workers=1)

In [5]:
# state_dict = torch.load("./train-1/ann_weights_train_1.tar")

In [6]:
model = ANN(85)
# model.load_state_dict(state_dict)
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=0.0001)

In [7]:
LARGE_NUM = 2e120
target_loss_change = 1e-6
exps_in_iter = (image_height * image_width * 2)
kappa = 1 / 323
kappa_inv = 323
coef = (1 + kappa) / 2

def train_model(model, optimizer, data_loader):
    model.train()
    last_average_loss = LARGE_NUM
    curr_average_loss = 0
    curr_iter = 1

    while(np.abs(curr_average_loss - last_average_loss) > target_loss_change):
        
        total_iter_avg_loss = 0
        total_points = 0

        for point in data_loader:
            data_inputs = point[:, :-1].to(device).float()
            data_labels = point[:, -1].to(device).float()

            preds = model(data_inputs).squeeze(dim=1)

            weights = torch.ones_like(data_labels)
            weights[data_labels == 1.0] = kappa_inv
            weights *= coef
            
            loss_module = nn.BCELoss(weight=weights)
            loss = loss_module(preds, data_labels.float())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_iter_avg_loss += loss.item() * data_inputs.size(0)
            total_points += data_inputs.size(0)
 
        last_average_loss = curr_average_loss
        curr_average_loss = total_iter_avg_loss / total_points
        
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        clear_output(wait=True)
        print(f'Current iteration: {curr_iter - 1}, Average Loss: {last_average_loss}')
        print(f'Current iteration: {curr_iter}, Average Loss: {curr_average_loss}')
        print(f"CPU Usage: {psutil.cpu_percent()}% GPU memory usage: {int(info.used / info.total)}% \n")

        print("Current Parameters:")
        for name, param in model.named_parameters():
            if param.requires_grad:
                print(name, param.data)

        curr_iter += 1

In [8]:
train_model(model, optimizer, data_loader)

Current iteration: 45, Average Loss: 0.27442852685078517
Current iteration: 46, Average Loss: 0.2717382101698514
CPU Usage: 3.8% GPU memory usage: 0% 

Current Parameters:
linear1.weight tensor([[ 0.0490,  0.2394, -0.5217,  ..., -0.0095, -0.0756, -0.0362],
        [-0.0345, -0.0076,  0.2410,  ..., -0.0865, -0.0619, -0.0473],
        [-0.0220,  0.0895,  0.0922,  ...,  0.0495, -0.1137,  0.1581],
        ...,
        [-0.6443,  0.6687,  0.6992,  ...,  0.0747,  0.0101, -0.0342],
        [ 0.0294,  0.1260, -0.2077,  ..., -0.0171, -0.1179, -0.0024],
        [ 0.0500, -0.0952,  0.2169,  ...,  0.0110, -0.1195, -0.0551]],
       device='cuda:0')
linear1.bias tensor([ 0.3492,  0.1381,  0.0276, -0.9464,  0.3556, -0.0162, -0.3628, -0.1456,
        -0.2543,  0.0064, -0.1276,  0.0140,  0.0838,  0.2611,  0.1023, -0.2456,
         0.1899, -0.2152,  0.0683, -0.0017,  0.6944,  0.0526,  0.1186, -0.6829,
        -0.0296,  0.3092, -0.2002, -0.2000, -0.1548,  0.1092, -0.0898, -0.4122,
         0.1425, -0.35

KeyboardInterrupt: 

In [9]:
pynvml.nvmlShutdown()
state_dict = model.state_dict()
print(state_dict)
torch.save(state_dict, "ann_weights_train_4.tar")

OrderedDict([('linear1.weight', tensor([[ 0.0490,  0.2394, -0.5217,  ..., -0.0095, -0.0756, -0.0362],
        [-0.0345, -0.0076,  0.2410,  ..., -0.0865, -0.0619, -0.0473],
        [-0.0220,  0.0895,  0.0922,  ...,  0.0495, -0.1137,  0.1581],
        ...,
        [-0.6443,  0.6687,  0.6992,  ...,  0.0747,  0.0101, -0.0342],
        [ 0.0294,  0.1260, -0.2077,  ..., -0.0171, -0.1179, -0.0024],
        [ 0.0500, -0.0952,  0.2169,  ...,  0.0110, -0.1195, -0.0551]],
       device='cuda:0')), ('linear1.bias', tensor([ 0.3492,  0.1381,  0.0276, -0.9464,  0.3556, -0.0162, -0.3628, -0.1456,
        -0.2543,  0.0064, -0.1276,  0.0140,  0.0838,  0.2611,  0.1023, -0.2456,
         0.1899, -0.2152,  0.0683, -0.0017,  0.6944,  0.0526,  0.1186, -0.6829,
        -0.0296,  0.3092, -0.2002, -0.2000, -0.1548,  0.1092, -0.0898, -0.4122,
         0.1425, -0.3569,  0.2583,  0.1021,  0.2713,  0.1865,  0.3710,  0.6602,
         0.1535,  0.3393], device='cuda:0')), ('linear2.weight', tensor([[ 4.5717e-02, -1.4