[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/johnpolsh/inf721-tpfinal/blob/main/colab/Object_detection_model.ipynb)
## Setup
### Download dependencies

In [None]:
!pip install pandas matplotlib numpy wget zipfile torch==2.0.1 torchinfo

* Note: use the following line if trying to run this on a local machine. Sometimes torch seems to be unable to detect the CUDA device.

In [None]:
# Pulls the CUDA 11.8 builds from the PyTorch wheel index given by --index-url.
!pip install torch==2.0.1+cu118 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

### Default imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import torch
import wget
import zipfile

# NOTE: if a CUDA device is available, the line below will evaluate to '2.0.1+cu118'
torch.__version__ 

### Select back-end device

In [None]:
# Pick the compute back-end: CUDA when torch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Register the choice as torch's default device for the rest of the notebook.
torch.set_default_device(device)
print(f"Using {device} as default device")

## New Dataset
### Setting up

In [None]:
dataset_zip_path = "utensils.zip"
dataset_src_url = "https://homepages.inf.ed.ac.uk/rbf/UTENSILS/raw.zip"
# Download the archive only once; later runs reuse the local copy.
if not os.path.isfile(dataset_zip_path):
    wget.download(dataset_src_url, dataset_zip_path)

dataset_root_base_path = "dataset/utensils"
# Extract only when the target directory does not exist yet.
if not os.path.isdir(dataset_root_base_path):
    with zipfile.ZipFile(dataset_zip_path, 'r') as zip_ref:
        zip_ref.extractall(dataset_root_base_path)

# Keep the path free of shell quoting: it is handed to ImageFolder later, where
# literal quote characters would name a directory that does not exist.
dataset_root_path = os.path.join(dataset_root_base_path, "RAW IMAGES")

# Show what was extracted (ImageFolder treats each sub-directory as a class).
print("\n".join(sorted(os.listdir(dataset_root_path))))

### Calculate dataset normalization mean/std

In [None]:
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import ImageFolder

# Load the images without transforms, purely to measure channel statistics.
our_dataset = ImageFolder(root=dataset_root_path)

dataset_len = len(our_dataset)
make_tensor = transforms.ToTensor()

# Running sums of the per-image channel mean/std. Explicit float arrays replace
# the former 1-tuples, which only worked through NumPy's reflected addition.
norm_mean = np.zeros(3)
norm_std = np.zeros(3)
for img, _ in our_dataset:
    # ToTensor yields (C, H, W); move the channels last.
    img = make_tensor(img).numpy().transpose((1, 2, 0))
    # Flatten the pixels to (H * W, 3). Unlike np.resize, reshape never repeats
    # or drops values and fails loudly if the image is not 3-channel.
    img = img.reshape(-1, 3)
    norm_mean += img.mean(0)
    norm_std += img.std(0)

# Average of the per-image statistics (every image carries the same weight).
norm_mean /= dataset_len
norm_std /= dataset_len
print(f"Dataset normalization mean: {norm_mean}, std: {norm_std}")

### Torch dataset

In [None]:
from torch.utils.data import DataLoader

# Training-time augmentation pipeline: resize, random geometric and colour
# perturbations, then tensor conversion and normalization with the statistics
# measured on this dataset.
transforms_train = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomRotation(degrees=(0, 80)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.ColorJitter(brightness=.4, contrast=.5, hue=.2),
    transforms.GaussianBlur(kernel_size=(3, 7), sigma=(0.2, 4)),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std)
])
our_dataset = ImageFolder(root=dataset_root_path, transform=transforms_train)
# Class names in label-index order. Later cells look labels up in `classes`,
# which was never defined before this point.
classes = our_dataset.classes

batch_size = 64
# The shuffling generator is created on the selected device; presumably this is
# needed because torch.set_default_device changed the default (TODO confirm).
train_dataloader = DataLoader(our_dataset, batch_size=batch_size, shuffle=True, generator=torch.Generator(device=device))

print(f"Training dataset has {len(our_dataset)} examples")
print(f"Training dataloader has {len(train_dataloader)} batches")
print("Using transforms:")
print(transforms_train)

### Visualizing samples

In [None]:
from random import randint

def _to_displayable(img):
    """Convert a dataset item into an (H, W, C) array that ``plt.imshow`` accepts."""
    if torch.is_tensor(img):
        # Transformed items are normalized (C, H, W) tensors: move the channels
        # last and undo Normalize so the colours land back in [0, 1].
        img = img.detach().cpu().numpy().transpose((1, 2, 0))
        return np.clip(norm_std * img + norm_mean, 0, 1)
    # Items from a dataset without transforms convert directly.
    return np.array(img)


def sample():
    """Return a random ``(image, label)`` pair from ``our_dataset``, ready to plot."""
    img, lbl = our_dataset[randint(0, len(our_dataset) - 1)]
    return (_to_displayable(img), lbl)


plt.figure(figsize=(16, 9))
# Show four random samples in a 2x2 grid, each titled with its class name.
for position in range(1, 5):
    plt.subplot(2, 2, position)
    img, lbl = sample()
    plt.imshow(img)
    plt.title(our_dataset.classes[lbl])
plt.show()

### Sanity check

In [None]:
from torchvision.utils import make_grid

# Pull a single batch from the training loader.
iterator = iter(train_dataloader)
images, labels = next(iterator)

plt.figure(figsize=(16, 9))
# Tile the batch into one image, move the channels last and undo Normalize.
img_grid = make_grid(images)
img = img_grid.cpu().numpy().transpose((1, 2, 0))
img = norm_std * img + norm_mean
img = np.clip(img, 0, 1)
plt.imshow(img)
plt.show()
# Iterate over the labels actually returned: a batch can hold fewer than
# batch_size items, so indexing up to batch_size could run out of range.
print([our_dataset.classes[int(lbl)] for lbl in labels])

## Model
### Our architecture definition

In [None]:
from torch import nn

def _make_divisible(v, divisor, min_value=None):
    """
    This function is taken from the original tf repo.
    It ensures that all layers have a channel number that is divisible by 8
    It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    :param v:
    :param divisor:
    :param min_value:
    :return:
    """
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    # Make sure that round down does not go down by more than 10%.
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v


# dw: depthwise stage
class DepthWiseConvolution(nn.Sequential):
    """3x3 depthwise convolution followed by BatchNorm2d and ReLU6.

    ``groups=in_fts`` gives every input channel its own filter, so the channel
    count is unchanged; ``stride`` controls spatial down-sampling.
    """

    def __init__(self, in_fts, stride = 1):
        depthwise = nn.Conv2d(
            in_fts,
            in_fts,
            kernel_size=(3, 3),
            stride=stride,
            padding=(1, 1),
            groups=in_fts,
            bias=False,
        )
        super().__init__(depthwise, nn.BatchNorm2d(in_fts), nn.ReLU6(inplace=True))


# pw: pointwise stage
class PointWiseConvolution(nn.Sequential):
    """1x1 convolution from ``in_fts`` to ``out_fts`` channels, then BatchNorm2d and ReLU6."""

    def __init__(self,in_fts,out_fts):
        pointwise = nn.Conv2d(in_fts, out_fts, kernel_size=(1, 1), bias=False)
        super().__init__(pointwise, nn.BatchNorm2d(out_fts), nn.ReLU6(inplace=True))


class ConvBNReLU(nn.Sequential):
    """Convolution -> normalization -> ReLU6.

    :param in_planes: input channels.
    :param out_planes: output channels.
    :param kernel_size: square kernel size; padding is ``(kernel_size - 1) // 2``.
    :param stride: convolution stride.
    :param groups: convolution groups.
    :param norm_layer: callable building the normalization layer from the
        channel count; ``nn.BatchNorm2d`` when ``None``.
    """

    def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1, norm_layer=None):
        make_norm = nn.BatchNorm2d if norm_layer is None else norm_layer
        pad = (kernel_size - 1) // 2
        conv = nn.Conv2d(in_planes, out_planes, kernel_size, stride, pad, groups=groups, bias=False)
        super().__init__(conv, make_norm(out_planes), nn.ReLU6(inplace=True))



class Bottleneck(nn.Module):
    """Bottleneck block: optional 1x1 expansion, 3x3 depthwise conv, linear 1x1 projection.

    :param inp: input channels.
    :param oup: output channels.
    :param stride: stride of the depthwise convolution.
    :param expand_ratio: hidden width is ``round(inp * expand_ratio)``; the
        expansion stage is skipped when the ratio is 1.
    :param norm_layer: accepted but not used; every normalization layer built
        here is ``nn.BatchNorm2d``.
    """

    def __init__(self,inp, oup, stride, expand_ratio, norm_layer=nn.BatchNorm2d):
        super().__init__()
        self.stride = stride
        # The skip connection needs matching shapes: no down-sampling and the
        # same number of channels in and out.
        self.use_res_connect = stride == 1 and inp == oup

        hidden_dim = int(round(inp * expand_ratio))

        # pw: widen the channels first, unless the ratio is 1.
        stages = [PointWiseConvolution(inp, hidden_dim)] if expand_ratio != 1 else []
        # dw
        stages.append(DepthWiseConvolution(hidden_dim, stride))
        # pw-linear: project to oup channels with no activation afterwards.
        stages.append(nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False))
        stages.append(nn.BatchNorm2d(oup))

        self.conv = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the block, adding the input back when the residual path is enabled."""
        out = self.conv(x)
        return x + out if self.use_res_connect else out


class OurObjectDetectionNet(nn.Module):
    """Convolutional classifier: stem conv, bottleneck stages, 1x1 head conv, linear classifier.

    :param bottleneckLayerDetail: iterable of ``(t, c, n, s)`` rows: expansion
        ratio, output channels, number of blocks and stride of the first block.
    :param inp: channels of the input image.
    :param num_classes: size of the classifier output.
    :param width_mult: multiplier applied to every channel count.
    :param round_nearest: channel counts are rounded to a multiple of this.
    """

    def __init__(self, bottleneckLayerDetail, inp = 3, num_classes=len(classes), width_mult=1.0, round_nearest=8):
        super().__init__()

        self.out = None

        stem_channels = _make_divisible(32 * width_mult, round_nearest)
        self.last_channel = _make_divisible(1280 * width_mult, round_nearest)

        # first layer: strided stem convolution
        layers = [ConvBNReLU(inp, stem_channels, stride=2)]

        # bottleneck stages: only the first block of a stage uses its stride
        current = stem_channels
        for t, c, n, s in bottleneckLayerDetail:
            stage_channels = _make_divisible(c * width_mult, round_nearest)
            for repeat in range(n):
                block_stride = s if repeat == 0 else 1
                layers.append(Bottleneck(current, stage_channels, stride=block_stride, expand_ratio=t))
                current = stage_channels

        # last layer: 1x1 convolution up to last_channel
        layers.append(ConvBNReLU(current, self.last_channel, kernel_size=1))

        self.features = nn.Sequential(*layers)

        # classifier head
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(self.last_channel, num_classes),
        )

    def __forward_impl(self, x):
        feature_map = self.features(x)
        # Global-average-pool to 1x1, then flatten to (batch, channels).
        pooled = nn.functional.adaptive_avg_pool2d(feature_map, 1)
        flat = pooled.reshape(feature_map.shape[0], -1)
        return self.classifier(flat)

    def forward(self, x):
        """Return the classifier outputs for a batch of images."""
        return self.__forward_impl(x)


### Model declaration

In [None]:
from torchinfo import summary

# One row per bottleneck stage, consumed by OurObjectDetectionNet:
#   t = expansion ratio, c = output channels,
#   n = number of blocks in the stage, s = stride of the stage's first block
bottleneckLayerDetail = [
    # t, c, n, s
    [1, 16, 1, 1],
    [6, 24, 2, 2],
    [6, 32, 3, 2],
    [6, 64, 4, 2],
    [6, 96, 3, 1],
    [6, 160, 3, 2],
    [6, 320, 1, 1],
]

our_model = OurObjectDetectionNet(bottleneckLayerDetail)

# Layer-by-layer report for one 3x224x224 input.
summary_columns = ("input_size", "output_size", "num_params", "kernel_size", "mult_adds")
summary(our_model, (1, 3, 224, 224), col_names=summary_columns)

### MobileNet V2

In [None]:
from torchvision.models import mobilenet_v2, MobileNet_V2_Weights

# torchvision's MobileNetV2 loaded with the IMAGENET1K_V2 weights. The training
# cells below optimize our_model, not this network.
mobilenet_model = mobilenet_v2(weights=MobileNet_V2_Weights.IMAGENET1K_V2)

## Training
### Loss function

In [None]:
# Classification loss handed to train() further down.
loss_fn = nn.CrossEntropyLoss()

### Optimizer

In [None]:
# Adam over our_model's parameters, starting at a learning rate of 0.05.
optimizer = torch.optim.Adam(our_model.parameters(), lr=0.05)
# Halves the learning rate every 10 scheduler steps; train_one_epoch steps the
# scheduler once per epoch.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

### Training loop

In [None]:
from sklearn.metrics import accuracy_score

def train_one_epoch(model, batch_i, loss_function, optim, sched=None, dataloader=None):
    """
    Run one training pass over the data.

    :param model: network to train; it is switched to training mode.
    :param batch_i: print the running average loss every ``batch_i`` batches.
    :param loss_function: callable ``(outputs, targets) -> loss``.
    :param optim: optimizer, stepped once per batch.
    :param sched: optional LR scheduler, stepped once after the last batch.
    :param dataloader: iterable of ``(inputs, targets)`` batches; defaults to
        the module-level ``train_dataloader``.
    :return: ``(mean batch loss, accuracy over all seen samples)``.
    :raises ValueError: if the dataloader yields no batches.
    """
    if dataloader is None:
        dataloader = train_dataloader

    model.train()
    # The answer cannot change mid-epoch, so ask once instead of per batch.
    use_cuda = torch.cuda.is_available()
    total_loss = .0
    run_loss = .0
    n_batches = 0
    predictions = []
    targets = []
    for n_batches, (x_t, y_t) in enumerate(dataloader, start=1):
        x_t = x_t.to(device)
        y_t = y_t.to(device)

        optim.zero_grad()

        if use_cuda:
            y_hat = nn.parallel.data_parallel(model, x_t)
        else:
            y_hat = model(x_t)

        loss = loss_function(y_hat, y_t)
        # Read the scalar once; it feeds both the epoch and the running total.
        batch_loss = loss.item()
        total_loss += batch_loss
        run_loss += batch_loss

        # The predicted class is the index of the largest output per sample.
        _, predicted = torch.max(y_hat, 1)
        predictions.extend(predicted.cpu().numpy())
        targets.extend(y_t.cpu().numpy())

        if n_batches % batch_i == 0:
            print(f"\tbatch {n_batches} avg loss {run_loss / batch_i}")
            run_loss = .0

        loss.backward()
        optim.step()

    if n_batches == 0:
        # Previously this surfaced as an unbound loop variable at the return.
        raise ValueError("dataloader yielded no batches")

    if sched is not None:
        sched.step()

    return (total_loss / n_batches, accuracy_score(targets, predictions))

# Per-epoch training metrics, appended to by train() and plotted afterwards.
acc_history_train = []
loss_history_train = []


def train(n_epochs, model, model_save_dir, loss_function, optim, sched=None):
    """
    Train ``model`` for ``n_epochs`` epochs and checkpoint the best epoch.

    :param n_epochs: number of passes over the training data.
    :param model: network to train.
    :param model_save_dir: path passed to ``torch.save`` for the state dict.
    :param loss_function: loss callable forwarded to ``train_one_epoch``.
    :param optim: optimizer forwarded to ``train_one_epoch``.
    :param sched: optional LR scheduler forwarded to ``train_one_epoch``.
    """
    # Start from infinity so the first finite epoch loss is always saved, no
    # matter how large it is.
    best_loss = float("inf")
    for epoch in range(n_epochs):
        print(f"========================================================\nEPOCH {epoch + 1}")
        avg_loss, accuracy = train_one_epoch(model, 5, loss_function, optim, sched)
        loss_history_train.append(avg_loss)
        acc_history_train.append(accuracy)
        print(f"avg train loss: {avg_loss}, train accuracy: {accuracy * 100:.2f}")

        # Keep only the weights of the epoch with the lowest average train loss.
        if avg_loss < best_loss:
            best_loss = avg_loss
            torch.save(model.state_dict(), model_save_dir)

    print("\nTraining finished")

### Training

In [None]:
# Train our_model for 100 epochs; the state dict of the epoch with the lowest
# average training loss is written to "our_model.pth".
train(100, our_model, "our_model.pth", loss_fn, optimizer, scheduler)

### Visualize model improvement

In [None]:
plt.figure(figsize=(16,9))

# Left panel: training accuracy per epoch.
plt.subplot(1, 2, 1)
plt.plot(acc_history_train)
plt.xlabel('epochs')
plt.ylabel('accuracy')
plt.title('train accuracy')

# Right panel: average training loss per epoch.
plt.subplot(1, 2, 2)
plt.plot(loss_history_train)
plt.xlabel('epochs')
plt.ylabel('loss')
plt.title('train loss')
plt.show()

### Convert model for mobile

In [None]:
import wget
import os.path

# Fetch the conversion helper script once; later runs reuse the local copy.
convert_script_path = "convert.py"
convert_script_url = "https://raw.githubusercontent.com/johnpolsh/inf721-tpfinal/main/colab/convert.py"
if not os.path.isfile(convert_script_path):
    wget.download(convert_script_url, convert_script_path)

In [None]:
from convert import convert_for_mobile

# NOTE(review): this passes the ImageNet-pretrained mobilenet_model, not the
# our_model trained above, to the converter. Confirm that is intended.
convert_for_mobile(mobilenet_model, "object_detection")