# Importation

In [2]:
# ! pip install kaggle
# ! mkdir ~/.kaggle
# ! cp kaggle.json ~/.kaggle/
# ! chmod 600 ~/.kaggle/kaggle.json
# ! kaggle datasets download -d enzodurand/boudingboxonlyhanddataset
# ! unzip boudingboxonlyhanddataset.zip

In [3]:
import os
import copy
import cv2
# import wandb
import numpy as np
import pandas as pd
from tqdm import tqdm
from time import time
from sklearn import preprocessing
from matplotlib import pyplot as plt

import torchvision
from torchvision import models, transforms
from torchvision.io import read_image

import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import Dataset

# !pip uninstall albumentations
# !pip install albumentations==0.4.6
import albumentations as A
from albumentations.pytorch import ToTensorV2

# GPU/TPU setup

In [4]:
## TPU
# !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# !python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev
# import torch_xla
# import torch_xla.core.xla_model as xm
# device = xm.xla_device()
# torch.set_default_tensor_type('torch.FloatTensor')

## GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

## Weight and biases
# wandb.login()

cuda:0


In [5]:
!nvidia-smi

Mon Apr 25 19:02:54 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 511.65       Driver Version: 511.65       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   67C    P8     5W /  N/A |      0MiB /  6144MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Global variables 

In [6]:
INPUT_SIZE = 400
N_CLASS = 4
WHERE = "home"

In [7]:
if WHERE=="colab":
    PATH_LABELS = "/content/index_label_bbox.csv"
    PATH_IMG = "/content/output/output"
    PATH_LABELS_VALID = "/content/index_label_bbox_validation.csv"
    PATH_IMG_VALID = "/content/output_validation/output_validation"
    BATCH_SIZE = 32
elif WHERE=="kaggle":
    PATH_LABELS = "../input/boudingboxonlyhanddataset/index_label_bbox.csv"
    PATH_IMG = "../input/boudingboxonlyhanddataset/output/output"
    PATH_LABELS_VALID = "../input/boudingboxonlyhanddataset/index_label_bbox_validation.csv"
    PATH_IMG_VALID = "../input/boudingboxonlyhanddataset/output_validation/output_validation"
    BATCH_SIZE = 64
elif WHERE=="home":
    PATH_LABELS = "../../../data_labels/bounding_box_model/done/index_label_bbox.csv"
    PATH_IMG = "../../../data_labels/bounding_box_model/done/output"
    PATH_LABELS_VALID = "../../../data_labels/bounding_box_model/done_validation/index_label_bbox_validation.csv"
    PATH_IMG_VALID = "../../../data_labels/bounding_box_model/done_validation/output_validation"
    BATCH_SIZE = 4

# Data functions

In [8]:
class HandGestureDataset(Dataset):
    def __init__(self, annotations_file, img_dir, transform=None):
        self.img_labels = pd.read_csv(annotations_file)
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        return len(self.img_labels)

    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, os.listdir(self.img_dir)[idx])
        image = read_image(img_path)
        path = str("output/"+os.listdir(self.img_dir)[idx]).split("/")[0]
        line = self.img_labels["index"] == str(path+"/"+os.listdir(self.img_dir)[idx])
        x, y, x_end, y_end = self.img_labels.loc[line]["x"].item(),\
                                self.img_labels.loc[line]["y"].item(),\
                                self.img_labels.loc[line]["x_end"].item(),\
                                self.img_labels.loc[line]["y_end"].item()
        x, y, x_end, y_end = x/INPUT_SIZE, y/INPUT_SIZE, x_end/INPUT_SIZE, y_end/INPUT_SIZE
        
        image = image/255
        
#         image = image.permute(1,2,0)
#         if self.transform:
#             transformed = self.transform(image=np.array(image), bboxes=[[x,y,x_end,y_end]])
#         transformed_image = transformed['image']
#         transformed_bboxes = transformed['bboxes']
#         return transformed_image, transformed_bboxes

        if self.transform:
            transformed = self.transform(image)
        label = [x, y, x_end, y_end]
        return {"image":image, "label":label}

In [9]:
def draw_predictions(image, preds):
    startX, startY, endX, endY = preds
    # scale the predicted bounding box coordinates based on the image
    # dimensions    
    startX = int(startX * INPUT_SIZE)
    startY = int(startY * INPUT_SIZE)
    endX = int(endX * INPUT_SIZE)
    endY = int(endY * INPUT_SIZE)
#     print(startX, startY, endX, endY)
    # draw the predicted bounding box on the image
    image = image.numpy().copy()
    cv2.rectangle(image, (startX, startY), (endX, endY), (0, 255, 0), 2)
    # show the output image
    plt.imshow(image)
    plt.show()

def prepare_data_vgg(data_type):
    ## Parameters fitting vgg/imagenet
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]

    transformVGGTrainAlbu = A.Compose([
        A.VerticalFlip(p=0.3),
        A.HorizontalFlip(p=0.5),
        A.Blur(p=0.3, blur_limit=5),
        A.RandomBrightnessContrast(p=0.3),
        A.RandomGamma(p=0.3),
        A.ChannelShuffle(p=0.3),
        A.Rotate(p=0.5, limit=60),
#         A.Downscale(p=0.3, scale_min=0.6, scale_max=0.9),
#         A.ShiftScaleRotate(p=0.3),
#         A.ElasticTransform(p=0.3, border_mode=cv2.BORDER_REFLECT_101, alpha_affine=40),
#         A.RGBShift(r_shift_limit=0.3, g_shift_limit=0.3, b_shift_limit=30, p=0.3),
#         A.Normalize(mean=mean, std=std),
        A.Resize(INPUT_SIZE, INPUT_SIZE, p=1),
        ToTensorV2(),
    ], bbox_params=A.BboxParams(format='albumentations', label_fields=""))
    transformVGGValidAlbu = A.Compose([
#         A.Normalize(mean=mean, std=std),
        A.Resize(INPUT_SIZE, INPUT_SIZE, p=1),
        ToTensorV2(),
    ], bbox_params=A.BboxParams(format='albumentations', label_fields=""))
    
    transformVGGTrain = torchvision.transforms.Compose([
        torchvision.transforms.ToPILImage(),
        torchvision.transforms.Resize(size=(INPUT_SIZE, INPUT_SIZE)),
        torchvision.transforms.ToTensor(),
    ])
    transformVGGValid = torchvision.transforms.Compose([
        torchvision.transforms.ToPILImage(),
        torchvision.transforms.Resize(size=(INPUT_SIZE, INPUT_SIZE)),
        torchvision.transforms.ToTensor(),
    ])

    if data_type == "custom":
        ## Custom dataset
#         VGG_dataset_train = HandGestureDataset(PATH_LABELS, PATH_IMG, transformVGGTrainAlbu)
#         VGG_dataset_valid = HandGestureDataset(PATH_LABELS_VALID, PATH_IMG_VALID, transformVGGValidAlbu)
        VGG_dataset_train = HandGestureDataset(PATH_LABELS, PATH_IMG, transformVGGTrain)
        VGG_dataset_valid = HandGestureDataset(PATH_LABELS_VALID, PATH_IMG_VALID, transformVGGValid)
        VGG_trainloader = torch.utils.data.DataLoader(VGG_dataset_train, batch_size=BATCH_SIZE, pin_memory=True, shuffle=True)
        VGG_validloader = torch.utils.data.DataLoader(VGG_dataset_valid, batch_size=BATCH_SIZE, pin_memory=True, shuffle=True)

    return VGG_trainloader, VGG_validloader

# Loading data into pytorch dataset and dataloader objects

In [10]:
VGG_trainloader, VGG_validloader = prepare_data_vgg("custom")

In [11]:
# for img, bbox in VGG_trainloader:
#     res = []
#     for e in bbox:
#         res_ = []
#         for elt in e:
#             res_.append(elt.numpy())
#         res.append(np.array(res_))
#     res = np.array(res).T.squeeze()
#     cpt = 0
#     for i, l in zip(img, res):
#         draw_predictions(i.permute(1,2,0), l)

# for img, bbox in VGG_validloader:
#     res = []
#     for e in bbox:
#         res_ = []
#         for elt in e:
#             res_.append(elt.numpy())
#         res.append(np.array(res_))
#     res = np.array(res).T.squeeze()
#     cpt = 0
#     for i, l in zip(img, res):
#         draw_predictions(i.permute(1,2,0), l)

In [12]:
# for item in VGG_trainloader:
#     x, y = item["image"], item["label"]
#     base_img = item["image"]
#     x = item["image"].to(device)
#     res = []
#     for e in y:
#         res.append(np.array(e))
#     res = np.array(res).T
#     y = torch.as_tensor(res)
#     y = y.to(torch.float32)
#     y = y.to(device)
#     for i in range(4):
#         draw_predictions(base_img[i].permute(1,2,0), y.cpu()[i])

# Model functions

In [13]:
# def train(model, epochs, train_loader, valid_loader, learning_rate, patience, feature_extract=False):
#     ## Early stopping variables
#     es = EarlyStopping(patience=patience)
#     terminate_training = False
#     best_model_wts = copy.deepcopy(model.state_dict())
#     best_loss = np.inf
#     model = model.to(device)
#     ## Training only the parameters where we require gradient since we are fine-tuning
#     params_to_update = model.parameters()
#     print("params to learn:")
#     if feature_extract:
#         params_to_update = []
#         for name,param in model.named_parameters():
#             if param.requires_grad == True:
#                 params_to_update.append(param)
#                 print("\t", name)
#     else:
#         for name,param in model.named_parameters():
#             if param.requires_grad == True:
#                 print("\t", name)
                
#     ## Setting up our optimizer
#     optim = torch.optim.Adam(params_to_update, lr=learning_rate)

#     ## Setting up our loss function
#     loss = nn.MSELoss()

#     ## Running the train loop
#     print(f"running {model.name}")
#     for epoch in range(epochs):
#         cumloss, count = 0, 0
#         model.train()
#         for x,y in train_loader:
#             optim.zero_grad()
#             x = x.to(device)
#             x = x.float()
#             res = []
#             for e in y:
#                 res_ = []
#                 for elt in e:
#                     res_.append(elt.numpy())
#                 res.append(np.array(res_))
#             res = np.array(res).T.squeeze()
# #             print("/"*20)
# #             print(res)
# #             print("/"*20)
#             y = torch.as_tensor(res)
#             y = y.to(torch.float32)
#             y = y.to(device)
#             yhat = model(x)
#             l = loss(yhat, y)
#             l.backward()
#             # xm.optimizer_step(optim, barrier=True)
#             optim.step()
#             cumloss += l * len(x)
#             count += len(x)
#         print("epoch :", epoch, end="")
#         loss_ = cumloss.cpu().item()/count
# #         wandb.log({'train_loss': loss_})
#         print(", train_loss: ", loss_, end="")
#         if epoch % 1 == 0:
#             model.eval()
#             with torch.no_grad():
#                 valid_cumloss, count = 0, 0
#                 for x,y in valid_loader:
#                     x = x.to(device)
#                     x = x.float()
#                     res = []
#                     for e in y:
#                         res_ = []
#                         for elt in e:
#                             res_.append(elt.numpy())
#                         res.append(np.array(res_))
#                     res = np.array(res).T.squeeze()
# #                     print("ù"*20)
# #                     print(res)
# #                     print("ù"*20)
#                     y = torch.as_tensor(res)
#                     y = y.to(torch.float32)
#                     y = y.to(device)
#                     yhat = model(x)
#                     valid_cumloss += loss(yhat,y) * len(x)
#                     count += len(x)
#                 valid_loss_ = valid_cumloss.cpu().item()/count
# #                 wandb.log({'valid_loss': valid_loss_})
#                 print(", valid_loss: ", valid_loss_)
#                 ## Early stopping
#                 if valid_cumloss/count < best_loss:
#                     best_loss = valid_cumloss/count
#                     best_model_wts = copy.deepcopy(model.state_dict())
#                 if es.step(valid_cumloss.cpu().item()/count):
#                     terminate_training = True
#                     break
#         if terminate_training:
#             break
#     print('Best val loss: {:4f}'.format(best_loss))
#     ## Returns the best model
#     model.load_state_dict(best_model_wts)
#     return model

# def set_parameter_requires_grad(model, feature_extract):
#     if feature_extract:
#         for name,p in model.named_parameters():
#             if "features" in name:
#                 p.requires_grad = False    
#             else:
#                 p.requires_grad = True  

In [14]:
def train(model, epochs, train_loader, valid_loader, learning_rate, patience, feature_extract=False):
    ## Early stopping variables
    es = EarlyStopping(patience=patience)
    terminate_training = False
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = np.inf
    model = model.to(device)
    ## Training only the parameters where we require gradient since we are fine-tuning
    params_to_update = model.parameters()
    print("params to learn:")
    if feature_extract:
        params_to_update = []
        for name,param in model.named_parameters():
            if param.requires_grad == True:
                params_to_update.append(param)
                print("\t", name)
    else:
        for name,param in model.named_parameters():
            if param.requires_grad == True:
                print("\t", name)
                
    ## Setting up our optimizer
    optim = torch.optim.Adam(params_to_update, lr=learning_rate)

    ## Setting up our loss function
    loss = nn.MSELoss()

    ## Running the train loop
    print(f"running {model.name}")
    for epoch in range(epochs):
        cumloss, count = 0, 0
        model.train()
        for item in train_loader:
            x, y = item["image"], item["label"]
            x = x.to(device)
            res = []
            for e in y:
                res.append(np.array(e))
            res = np.array(res).T
            y = torch.as_tensor(res)
            y = y.to(torch.float32)
            y = y.to(device)
#             print(x.shape)
#             print(x)
            yhat = model(x)
            l = loss(yhat, y)
            l.backward()
            # xm.optimizer_step(optim, barrier=True)
            optim.step()
            cumloss += l * len(x)
            count += len(x)
        print("epoch :", epoch, end="")
        loss_ = cumloss.cpu().item()/count
#         wandb.log({'train_loss': loss_})
        print(", train_loss: ", loss_, end="")
        if epoch % 1 == 0:
            model.eval()
            with torch.no_grad():
                valid_cumloss, count = 0, 0
                for item in valid_loader:
                    x, y = item["image"], item["label"]
                    x = x.to(device)
                    res = []
                    for e in y:
                        res.append(np.array(e))
                    res = np.array(res).T
                    y = torch.as_tensor(res)
                    y = y.to(torch.float32)
                    y = y.to(device)
                    yhat = model(x)
                    valid_cumloss += loss(yhat,y) * len(x)
                    count += len(x)
                valid_loss_ = valid_cumloss.cpu().item()/count
#                 wandb.log({'valid_loss': valid_loss_})
                print(", valid_loss: ", valid_loss_)
                ## Early stopping
                if valid_cumloss/count < best_loss:
                    best_loss = valid_cumloss/count
                    best_model_wts = copy.deepcopy(model.state_dict())
                if es.step(valid_cumloss.cpu().item()/count):
                    terminate_training = True
                    break
        if terminate_training:
            break
    print('Best val loss: {:4f}'.format(best_loss))
    ## Returns the best model
    model.load_state_dict(best_model_wts)
    return model

def set_parameter_requires_grad(model, feature_extract):
    if feature_extract:
        for name,p in model.named_parameters():
            if "features" in name:
                p.requires_grad = False    
            else:
                p.requires_grad = True  

# Loading the model and modifying the classifier part

In [15]:
## Loading vgg16 model pretrained on imagenet
vgg = models.vgg16(pretrained=True)

vgg.classifier = nn.Sequential(nn.Linear(25088, 4096), 
                               nn.ReLU(), 
                            #    nn.Dropout(0.5),        
                               nn.Linear(4096, 1024), 
                               nn.ReLU(), 
                            #    nn.Dropout(0.5),        
                               nn.Linear(1024, 256),
                               nn.ReLU(), 
                            #    nn.Dropout(0.5),        
                               nn.Linear(256, N_CLASS),
                               nn.Sigmoid())

print(vgg.eval())

## Sets all the requires grad of the classifier layers to True
set_parameter_requires_grad(vgg, True)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

# Implementing early stopping

In [16]:
class EarlyStopping(object):
    def __init__(self, mode='min', min_delta=0, patience=10, percentage=False):
        self.mode = mode
        self.min_delta = min_delta
        self.patience = patience
        self.best = None
        self.num_bad_epochs = 0
        self.is_better = None
        self._init_is_better(mode, min_delta, percentage)
        if patience == 0:
            self.is_better = lambda a, b: True
            self.step = lambda a: False

    def step(self, metrics):
        if self.best is None:
            self.best = metrics
            return False
        if np.isnan(metrics):
            return True
        if self.is_better(metrics, self.best):
            self.num_bad_epochs = 0
            self.best = metrics
        else:
            self.num_bad_epochs += 1
        if self.num_bad_epochs >= self.patience:
            return True
        return False

    def _init_is_better(self, mode, min_delta, percentage):
        if mode not in {'min', 'max'}:
            raise ValueError('mode ' + mode + ' is unknown!')
        if not percentage:
            if mode == 'min':
                self.is_better = lambda a, best: a < best - min_delta
            if mode == 'max':
                self.is_better = lambda a, best: a > best + min_delta
        else:
            if mode == 'min':
                self.is_better = lambda a, best: a < best - (
                            best * min_delta / 100)
            if mode == 'max':
                self.is_better = lambda a, best: a > best + (
                            best * min_delta / 100)

# Training only the modified parts of the classifier

In [17]:
# os.environ['WANDB_NOTEBOOK_NAME'] = '4096_5e-6'
# wandb.init(project="jetson-autonomous-driving")

In [18]:
print(len(VGG_trainloader))
print(len(VGG_validloader))

## Fine-tuning the model on our data
vgg.name = "VGG"

best_model = train(model=vgg, 
                   epochs=1000, 
                   train_loader=VGG_trainloader, 
                   valid_loader=VGG_validloader, 
                   learning_rate=5e-5,
                   patience=20) ## metric for earlystopping : val_loss 

536
151
params to learn:
	 classifier.0.weight
	 classifier.0.bias
	 classifier.2.weight
	 classifier.2.bias
	 classifier.4.weight
	 classifier.4.bias
	 classifier.6.weight
	 classifier.6.bias
running VGG
epoch : 0, train_loss:  0.043847247735777896, valid_loss:  0.041951097127211054
epoch : 1, train_loss:  0.023530193229219808, valid_loss:  0.03046403612409319
epoch : 2, train_loss:  0.017276680291588627, valid_loss:  0.037674102276266615
epoch : 3, train_loss:  0.014552805850754923, valid_loss:  0.027346851817793227
epoch : 4, train_loss:  0.011532006868675573, valid_loss:  0.02592500420503838
epoch : 5, train_loss:  0.009839233177811352, valid_loss:  0.026187801677919304
epoch : 6, train_loss:  0.008495748932681867, valid_loss:  0.029846137544245418
epoch : 7, train_loss:  0.0077325185733055, valid_loss:  0.024242180922498736
epoch : 8, train_loss:  0.007385311286840866, valid_loss:  0.027686660867988867
epoch : 9, train_loss:  0.006651598126140993, valid_loss:  0.02585089642344123


# Checking predictions

In [None]:
# with torch.no_grad():
#     for item in VGG_trainloader:
#         x, y = item["image"], item["label"]
#         base_img = item["image"]
#         x = item["image"].to(device)
#         res = []
#         for e in y:
#             res.append(np.array(e))
#         res = np.array(res).T
#         y = torch.as_tensor(res)
#         y = y.to(torch.float32)
#         y = y.to(device)
#         yhat = best_model(x)
#         for i in range(4):
#             draw_predictions(base_img[i].permute(1,2,0), yhat.cpu()[i])

with torch.no_grad():
    for x, y in VGG_validloader:
        x = x.to(device)
        x = x.float()
        yhat = best_model(x)
        for i in range(4):
            draw_predictions(x[i].cpu().permute(1,2,0), yhat.cpu()[i])

# Saving the model in .pth and .onnx extension

In [None]:
PATH = "./"
torch.save(best_model.state_dict(), os.path.join(PATH,"boundingbox_vgg_last.pth"))
# from google.colab import files
# files.download(os.path.join(PATH,"boundingbox_vgg_last.pth"))

In [None]:
# del vgg
# del best_model

In [None]:
# model = models.vgg16(pretrained=True)
# model.classifier[0] = nn.Linear(25088, 8192)
# model.classifier[3] = nn.Linear(8192, 1024)
# model.classifier[6] = nn.Linear(1024, N_CLASS)
# model.load_state_dict(torch.load(os.path.join(PATH,"vgg.pth"), map_location='cpu'))
# model.eval() 

# dummy_input = torch.randn(BATCH_SIZE, 3, INPUT_SIZE, INPUT_SIZE)  
# torch.onnx.export(model,   
#                   dummy_input, 
#                   "vgg.onnx",
#                   export_params=True,
#                   do_constant_folding=True, 
#                   input_names = ['modelInput'],
#                   output_names = ['modelOutput'])