In [None]:
import torch
import torch.nn as nn
import torchvision.models
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F

import albumentations as A
from albumentations.pytorch import ToTensorV2

from tqdm import tqdm
from PIL import Image
from PIL.Image import open as open_image
import cv2
import matplotlib.pyplot as plt
import numpy as np

import os
from os.path import join as path_join

from time import time

  check_for_updates()


### Get the data

In [None]:
if not os.path.exists("data.zip"):
    import gdown
    url = 'https://drive.google.com/uc?id=10f1H2T-5W-BiqabHHtlZ4ASs19TZmg8R'
    output = 'data.zip'
    gdown.download(url, output, quiet=False)
    del gdown
    del url
    del output
    !unzip data.zip

Downloading...
From (original): https://drive.google.com/uc?id=10f1H2T-5W-BiqabHHtlZ4ASs19TZmg8R
From (redirected): https://drive.google.com/uc?id=10f1H2T-5W-BiqabHHtlZ4ASs19TZmg8R&confirm=t&uuid=66079dc6-f93d-4551-9e3d-52bf70f163c8
To: /content/data.zip
100%|██████████| 979M/979M [00:04<00:00, 212MB/s]


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: data/train/images/064.Ring_billed_Gull/Ring_Billed_Gull_0098_51410.jpg  
  inflating: data/train/images/064.Ring_billed_Gull/Ring_Billed_Gull_0115_51891.jpg  
  inflating: data/train/images/064.Ring_billed_Gull/Ring_Billed_Gull_0106_52729.jpg  
  inflating: data/train/images/064.Ring_billed_Gull/Ring_Billed_Gull_0056_51523.jpg  
  inflating: data/train/images/064.Ring_billed_Gull/Ring_Billed_Gull_0113_51525.jpg  
  inflating: data/train/images/064.Ring_billed_Gull/Ring_Billed_Gull_0009_51301.jpg  
  inflating: data/train/images/064.Ring_billed_Gull/Ring_Billed_Gull_0117_51363.jpg  
  inflating: data/train/images/064.Ring_billed_Gull/Ring_Billed_Gull_0104_52614.jpg  
  inflating: data/train/images/064.Ring_billed_Gull/Ring_Billed_Gull_0108_51108.jpg  
  inflating: data/train/images/064.Ring_billed_Gull/Ring_Billed_Gull_0029_52613.jpg  
  inflating: data/train/images/064.Ring_billed_Gull/Ring_Billed_Gull_0119_5

In [None]:
def set_random_seed(seed: int):
    torch.backends.cudnn.deterministic = True
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    if "random" in globals():
        random.seed(seed)

set_random_seed(1)

### Utilities (0.5 point)

Complete dataset to load prepared images and masks. Don't forget to use augmentations.

Some of the images are 1 channels, so use `gray2rgb`.

In [None]:
# for class_name in os.listdir("data/train/images"):
#     class_images_dir: str = path_join("data/train/images", class_name)
#     class_gt_masks_dir: str = path_join("data/train/gt", class_name)
#     assert os.path.exists(class_gt_masks_dir), f"no dir with ground truth masks {class_gt_masks_dir} for the class {class_name}"
#     for fname in os.listdir(class_images_dir):
#         gt_mask_file_path = path_join(class_gt_masks_dir, BirdsDataset._gt_mask_filename_from_img_filename(fname))
#         assert os.path.exists(gt_mask_file_path), \
#             f"no ground truth mask {gt_mask_file_path} for the file {fname} with class {class_name}"

#         mask = np.array(open_image(gt_mask_file_path))
#         if mask.ndim != 2:
#             assert len(mask.shape) == 3 and mask.shape[-1] == 2, mask.shape
#             print(gt_mask_file_path)
#             print(path_join(class_images_dir, fname))
#             # print(mask)
#             # display(Image.fromarray(mask[:,:,0]))
#             # display(Image.fromarray(mask[:,:,1]))
#             assert mask[:,:,1].ndim == 2
#             assert 255 in np.unique(mask[:,:,1])
#             # print(mask)
#             # assert False

#         # not_gt_mask = mask != WHITE_PIXEL_VALUE
#         # if not not np.all(not_gt_mask):
#         #     # print(f"{np.unique(mask)}\n{mask.reshape(-1)}")
#         #     pass
#         # else:
#         #     pass
#         #     # print(gt_mask_file_path)
#         # mask[not_gt_mask] = 0.0
#         # mask.astype(np.float32)

In [None]:
IMG_HEIGHT = 224
IMG_WIDTH = 224
CHANNELS_COUNT = 3
PIXEL_MAX_VALUE = 255
WHITE_PIXEL_VALUE = PIXEL_MAX_VALUE
MASK_PIXEL_VALUE = WHITE_PIXEL_VALUE

def gray2rgb(img: np.ndarray) -> np.ndarray:
    match img.shape:
        case (_, _):
            return np.dstack([img, img, img])
        case (_, _, 3):
            return img
        case _:
            raise ValueError(f"Invalid img.shape: {img.shape}")

def get_iou(gt, pred):
    pred = pred > 0.5
    un = (gt | pred).sum()
    assert un > 0 or (gt & pred).sum() == 0
    return (gt & pred).sum() / un if un else 0

def imagenet_mean() -> tuple[float, float, float]:
    return (0.485, 0.456, 0.406)

def imagenet_std() -> tuple[float, float, float]:
    return (0.229, 0.224, 0.225)

class BirdsDataset(Dataset):
    def __init__(self, folder, is_train: None | bool = None) -> None:
        assert is_train is None or isinstance(is_train, bool)

        images_folder: str = os.path.join(folder, 'images')
        gt_folder: str = os.path.join(folder, 'gt')

        calc_manually = False
        if calc_manually:
            approx_mean = np.zeros(shape=(CHANNELS_COUNT,))
            approx_std = np.zeros(shape=(CHANNELS_COUNT,))

        img_index_to_class_name: list[str] = []
        img_index_to_file_name: list[str] = []
        for class_name in os.listdir(images_folder):
            class_images_dir: str = path_join(images_folder, class_name)
            class_gt_masks_dir: str = path_join(gt_folder, class_name)
            assert os.path.exists(class_gt_masks_dir), f"no dir with ground truth masks {class_gt_masks_dir} for the class {class_name}"
            for fname in os.listdir(class_images_dir):
                gt_mask_file_path = path_join(class_gt_masks_dir, BirdsDataset._gt_mask_filename_from_img_filename(fname))
                assert os.path.exists(gt_mask_file_path), \
                    f"no ground truth mask {gt_mask_file_path} for the file {fname} with class {class_name}"

                if calc_manually:
                    img_pixels: np.ndarray = gray2rgb(np.asarray(open_image(path_join(class_images_dir, fname)))).astype(np.float32)
                    assert len(img_pixels.shape) == 3 and img_pixels.shape[2] == CHANNELS_COUNT
                    img_pixels /= PIXEL_MAX_VALUE
                    approx_mean += img_pixels.mean(axis=(0, 1))
                    approx_std += img_pixels.std(axis=(0, 1))

                img_index_to_file_name.append(fname)

                # Note: only 1 instance of bytes sequence corresponding to the `class_name`
                #  exists in memory due to the reference semantic of the Python language
                img_index_to_class_name.append(class_name)

        if calc_manually:
            total_images = len(img_index_to_class_name)
            assert total_images == len(img_index_to_file_name) > 0
            approx_mean /= total_images
            approx_std /= total_images

            assert approx_mean.ndim == 1 and len(approx_mean) == CHANNELS_COUNT \
                and np.all((0 < approx_mean) & (approx_mean < PIXEL_MAX_VALUE))
            assert approx_std.ndim == 1 and len(approx_std) == CHANNELS_COUNT \
                and np.all((0 < approx_std) & (approx_std < PIXEL_MAX_VALUE))

        if is_train is None:
            is_train = "train" in folder
        self.transform = BirdsDataset.make_data_transformer(is_train=is_train)
        self._img_index_to_class_name = img_index_to_class_name
        self._img_index_to_file_name = img_index_to_file_name
        self._images_folder = images_folder
        self._gt_folder = gt_folder

    @staticmethod
    def preprocess_image(img: np.ndarray) -> np.ndarray:
        # assert np.issubdtype(img.dtype, np.integer)
        # assert np.all(img <= MASK_PIXEL_VALUE)
        # assert np.all(0 <= img)
        return gray2rgb(img)

    @staticmethod
    def preprocess_mask(mask: np.ndarray) -> np.ndarray:
        if mask.ndim != 2:
            mask = mask[:,:,0]
        # assert np.issubdtype(mask.dtype, np.integer)
        mask = (mask == MASK_PIXEL_VALUE).astype(np.uint8)
        # assert np.all((mask == 0) ^ (mask == 1))
        return mask

    def __getitem__(self, index: int) -> tuple[torch.Tensor, torch.Tensor]:
        img, gt_mask = self._read_img_and_gt_mask_by_index(index)
        match self.transform(image=BirdsDataset.preprocess_image(img), mask=BirdsDataset.preprocess_mask(gt_mask)):
            case {"image": t_img, "mask": t_msk}:
                return t_img, t_msk.long()
        assert False

    def __len__(self) -> int:
        return len(self._img_index_to_file_name)

    @staticmethod
    def make_data_transformer(is_train: bool = False, mean=imagenet_mean(), std=imagenet_std()):
        assert len(mean) == len(std) == CHANNELS_COUNT
        actions: list[A.BasicTransform] = [
            A.Normalize(mean=mean, std=std, max_pixel_value=PIXEL_MAX_VALUE),
        ]
        if is_train:
            actions.extend([
                A.RandomResizedCrop(size=(IMG_HEIGHT, IMG_WIDTH)),
                A.HorizontalFlip(p=0.3),
                A.RandomBrightnessContrast(brightness_limit=(-0.1, 0.1), contrast_limit=(-0.1, 0.1), p=0.3),
            ])
        else:
            actions.extend([
                A.Resize(height=IMG_HEIGHT, width=IMG_WIDTH),
            ])
        actions.extend([
            ToTensorV2(),
        ])
        return A.Compose(actions)

    def _read_img_and_gt_mask_by_index(self, i: int) -> tuple[np.ndarray, np.ndarray]:
        cnm = self._img_index_to_class_name[i]
        fnm = self._img_index_to_file_name[i]
        return (
            np.asarray(open_image(path_join(self._images_folder, cnm, fnm))),
            np.asarray(open_image(path_join(self._gt_folder, cnm, self._gt_mask_filename_from_img_filename(fnm)))),
        )

    @staticmethod
    def _gt_mask_filename_from_img_filename(img_fname: str) -> str:
        return f"{img_fname.removesuffix('jpg')}png"


### Architecture (1 point)
Your task for today is to build your own Unet to solve the segmentation problem.

As an encoder, you can use pre-trained on IMAGENET models(or parts) from torchvision. The decoder must be trained from scratch.
It is forbidden to use data not from the `data` folder.

I advise you to experiment with the number of blocks so as not to overfit on the training sample and get good quality on validation.

In [None]:
from torchvision import models

class DecoderBlock(nn.Module):
    def __init__(self, in_channels, mid_channels, out_channels):
        super().__init__()
        # self.upconv = nn.ConvTranspose2d(in_channels=in_channels, out_channels=out_channels, kernel_size=4, stride=2, padding=1)
        self.upsample = nn.UpsamplingNearest2d(scale_factor=2)
        self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=mid_channels, kernel_size=3, padding=1)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv2d(in_channels=mid_channels, out_channels=out_channels, kernel_size=3, padding=1)
        self.relu2 = nn.ReLU()

    def forward_with_resid(self, resid: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
        return self.forward(torch.cat((resid, self.upsample(x)), dim=1))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.relu2(self.conv2(self.relu1(self.conv1(x))))

class Unet(nn.Module):
    def __init__(self):
        super().__init__()

        # pretrained_model_to_steal_layers_from
        enc_model: torchvision.models.resnet.ResNet = models.resnet152(weights=models.ResNet152_Weights.DEFAULT)

        B = 2
        assert enc_model.conv1(torch.zeros((B, CHANNELS_COUNT, IMG_HEIGHT, IMG_WIDTH))).shape == torch.Size((B, 64, IMG_HEIGHT // 2, IMG_WIDTH // 2))
        assert enc_model.bn1(torch.zeros((B, 64, IMG_HEIGHT // 2, IMG_WIDTH // 2))).shape == torch.Size((B, 64, IMG_HEIGHT // 2, IMG_WIDTH // 2))
        assert enc_model.relu(torch.zeros((B, 64, IMG_HEIGHT // 2, IMG_WIDTH // 2))).shape == torch.Size((B, 64, IMG_HEIGHT // 2, IMG_WIDTH // 2))

        assert enc_model.maxpool(torch.zeros((B, 64, IMG_HEIGHT // 2, IMG_WIDTH // 2))).shape == torch.Size((B, 64, IMG_HEIGHT // 4, IMG_WIDTH // 4))
        assert enc_model.layer1(torch.zeros((B, 64, IMG_HEIGHT // 4, IMG_WIDTH // 4))).shape == torch.Size((B, 256, IMG_HEIGHT // 4, IMG_WIDTH // 4))

        assert enc_model.layer2(torch.zeros((B, 256, IMG_HEIGHT // 4, IMG_WIDTH // 4))).shape == torch.Size((B, 512, IMG_HEIGHT // 8, IMG_WIDTH // 8))

        assert enc_model.layer3(torch.zeros((B, 512, IMG_HEIGHT // 8, IMG_WIDTH // 8))).shape == torch.Size((B, 1024, IMG_HEIGHT // 16, IMG_WIDTH // 16))

        self.inp = nn.Sequential(enc_model.conv1, enc_model.bn1, enc_model.relu)

        # encoder blocks
        self.encoder1 = nn.Sequential(enc_model.maxpool, enc_model.layer1)
        self.encoder2= enc_model.layer2
        self.encoder3= enc_model.layer3

        # decoder blocks
        self.decoder1 = DecoderBlock(in_channels=512 + 1024, mid_channels=1024, out_channels=512)
        self.decoder2 = DecoderBlock(in_channels=256 + 512, mid_channels=512, out_channels=256)
        self.decoder3 = DecoderBlock(in_channels=64 + 256, mid_channels=128, out_channels=64)

        self.out = nn.Sequential(
            nn.UpsamplingNearest2d(scale_factor=2),
            nn.Conv2d(in_channels=64, out_channels=2, kernel_size=1),
        )

    def forward(self, x: torch.Tensor):
        r1 = self.inp(x)
        r2 = self.encoder1(r1)
        r3 = self.encoder2(r2)
        x = self.encoder3(r3)
        x = self.decoder1.forward_with_resid(r3, x)
        x = self.decoder2.forward_with_resid(r2, x)
        x = self.decoder3.forward_with_resid(r1, x)
        return self.out(x)

    def mutable_parameters(self, recurse=True):
        return self.parameters(recurse)


In [None]:
# assert Unet()(torch.zeros(1, CHANNELS_COUNT, IMG_HEIGHT, IMG_WIDTH)).shape == torch.Size((1, 2, IMG_HEIGHT, IMG_WIDTH))

### Train script (0.5 point)

Complete the train and predict scripts.

In [None]:
def get_model(path) -> Unet:
    model = Unet()
    model.load_state_dict(torch.load(path, weights_only=False))
    model.eval()
    return model

In [None]:
def train_segmentation_model(data_path, load_from_state: int = -1):
    BATCH_SIZE = 8
    N_EPOCH = 15
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_dataset = BirdsDataset(data_path + 'train')
    val_dataset = BirdsDataset(data_path + 'val')
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

    assert N_EPOCH > load_from_state >= -1
    if load_from_state == -1:
        model = Unet()
    else:
        model = get_model(f"model_{load_from_state}.pth")
    model = model.to(DEVICE)
    optimizer = torch.optim.Adam(params=model.mutable_parameters(), lr=0.0007)
    criterion = nn.CrossEntropyLoss()
    losses_train, losses_val, ious_train, ious_val = [], [], [], []

    for epoch in (epochs_bar := tqdm(range(load_from_state + 1, N_EPOCH))):
        epochs_bar.set_description(f"Epoch #{epoch}...\n", refresh=True)

        model.train()
        losses_over_batches = []
        ious_over_batches = []
        for inputs, masks in tqdm(train_dataloader):
            masks = masks.to(DEVICE, non_blocking=True)
            inputs = inputs.to(DEVICE)

            optimizer.zero_grad()
            masks_pred = model(inputs)
            loss = criterion(masks_pred, masks)

            loss.backward()
            optimizer.step()

            losses_over_batches.append(loss.detach().cpu().item())
            ious_over_batches.append(get_iou(masks == 1, masks_pred.detach().argmax(1) == 1).cpu().item())


        losses_train.append(np.mean(losses_over_batches))
        ious_train.append(np.mean(ious_over_batches))

        model.eval()
        losses_over_batches = []
        ious_over_batches = []
        with torch.no_grad():
            for inputs, masks in tqdm(val_dataloader):
                masks = masks.to(DEVICE, non_blocking=True)
                inputs = inputs.to(DEVICE)

                masks_pred = model(inputs).detach()
                loss = criterion(masks_pred, masks)

                losses_over_batches.append(loss.cpu().item())
                ious_over_batches.append(get_iou(masks == 1, masks_pred.argmax(1) == 1).cpu().item())

        losses_val.append(np.mean(losses_over_batches))
        ious_val.append(np.mean(ious_over_batches))

        torch.save(model.state_dict(), f'model_{epoch}.pth')

        print(f"Epoch: {epoch}, train loss: {losses_train[-1]}, val loss: {losses_val[-1]}, train iou: {ious_train[-1]}, val iou: {ious_val[-1]}")

Note: in the code below training is started from the 8 epoch because google colab runtime has been disconnected several times

In [None]:
set_random_seed(0xdeadbeef)
train_segmentation_model('data/', load_from_state=8)

Epoch #9...
:   0%|          | 0/6 [00:00<?, ?it/s]
  0%|          | 0/1048 [00:00<?, ?it/s][A
  0%|          | 1/1048 [00:00<09:48,  1.78it/s][A
  0%|          | 2/1048 [00:01<08:52,  1.96it/s][A
  0%|          | 3/1048 [00:01<08:36,  2.02it/s][A
  0%|          | 4/1048 [00:01<08:27,  2.06it/s][A
  0%|          | 5/1048 [00:02<08:17,  2.10it/s][A
  1%|          | 6/1048 [00:02<08:13,  2.11it/s][A
  1%|          | 7/1048 [00:03<08:11,  2.12it/s][A
  1%|          | 8/1048 [00:03<08:14,  2.10it/s][A
  1%|          | 9/1048 [00:04<08:11,  2.11it/s][A
  1%|          | 10/1048 [00:04<08:09,  2.12it/s][A
  1%|          | 11/1048 [00:05<08:06,  2.13it/s][A
  1%|          | 12/1048 [00:05<08:04,  2.14it/s][A
  1%|          | 13/1048 [00:06<08:02,  2.14it/s][A
  1%|▏         | 14/1048 [00:06<08:03,  2.14it/s][A
  1%|▏         | 15/1048 [00:07<08:03,  2.14it/s][A
  2%|▏         | 16/1048 [00:07<08:02,  2.14it/s][A
  2%|▏         | 17/1048 [00:08<08:01,  2.14it/s][A
  2%|▏      

Epoch: 9, train loss: 0.1492978117023488, val loss: 0.07698106761513786, train iou: 0.7371730265840319, val iou: 0.7724241350184787



  0%|          | 0/1048 [00:00<?, ?it/s][A
  0%|          | 1/1048 [00:00<09:51,  1.77it/s][A
  0%|          | 2/1048 [00:01<09:08,  1.91it/s][A
  0%|          | 3/1048 [00:01<08:54,  1.96it/s][A
  0%|          | 4/1048 [00:02<08:48,  1.97it/s][A
  0%|          | 5/1048 [00:02<08:42,  2.00it/s][A
  1%|          | 6/1048 [00:03<08:42,  2.00it/s][A
  1%|          | 7/1048 [00:03<08:41,  2.00it/s][A
  1%|          | 8/1048 [00:04<08:41,  1.99it/s][A
  1%|          | 9/1048 [00:04<08:39,  2.00it/s][A
  1%|          | 10/1048 [00:05<08:39,  2.00it/s][A
  1%|          | 11/1048 [00:05<08:40,  1.99it/s][A
  1%|          | 12/1048 [00:06<08:38,  2.00it/s][A
  1%|          | 13/1048 [00:06<08:36,  2.00it/s][A
  1%|▏         | 14/1048 [00:07<08:36,  2.00it/s][A
  1%|▏         | 15/1048 [00:07<08:37,  2.00it/s][A
  2%|▏         | 16/1048 [00:08<08:36,  2.00it/s][A
  2%|▏         | 17/1048 [00:08<08:39,  1.98it/s][A
  2%|▏         | 18/1048 [00:09<08:39,  1.98it/s][A
  2%|▏    

Epoch: 10, train loss: 0.13771487693195694, val loss: 0.07660577484321865, train iou: 0.7500839281884086, val iou: 0.762005717408928



  0%|          | 0/1048 [00:00<?, ?it/s][A
  0%|          | 1/1048 [00:00<10:33,  1.65it/s][A
  0%|          | 2/1048 [00:01<09:49,  1.77it/s][A
  0%|          | 3/1048 [00:01<09:26,  1.85it/s][A
  0%|          | 4/1048 [00:02<09:06,  1.91it/s][A
  0%|          | 5/1048 [00:02<08:56,  1.94it/s][A
  1%|          | 6/1048 [00:03<08:49,  1.97it/s][A
  1%|          | 7/1048 [00:03<08:44,  1.99it/s][A
  1%|          | 8/1048 [00:04<08:43,  1.99it/s][A
  1%|          | 9/1048 [00:04<08:42,  1.99it/s][A
  1%|          | 10/1048 [00:05<08:39,  2.00it/s][A
  1%|          | 11/1048 [00:05<08:40,  1.99it/s][A
  1%|          | 12/1048 [00:06<08:39,  1.99it/s][A
  1%|          | 13/1048 [00:06<08:38,  2.00it/s][A
  1%|▏         | 14/1048 [00:07<08:36,  2.00it/s][A
  1%|▏         | 15/1048 [00:07<08:37,  2.00it/s][A
  2%|▏         | 16/1048 [00:08<08:35,  2.00it/s][A
  2%|▏         | 17/1048 [00:08<08:36,  2.00it/s][A
  2%|▏         | 18/1048 [00:09<08:35,  2.00it/s][A
  2%|▏    

Epoch: 11, train loss: 0.1373705704241479, val loss: 0.08361289510503411, train iou: 0.7526221587998039, val iou: 0.7558181644840674



  0%|          | 0/1048 [00:00<?, ?it/s][A
  0%|          | 1/1048 [00:00<09:56,  1.75it/s][A
  0%|          | 2/1048 [00:01<09:16,  1.88it/s][A
  0%|          | 3/1048 [00:01<09:02,  1.92it/s][A
  0%|          | 4/1048 [00:02<09:01,  1.93it/s][A
  0%|          | 5/1048 [00:02<09:02,  1.92it/s][A
  1%|          | 6/1048 [00:03<09:01,  1.93it/s][A
  1%|          | 7/1048 [00:03<09:06,  1.90it/s][A
  1%|          | 8/1048 [00:04<09:08,  1.90it/s][A
  1%|          | 9/1048 [00:04<08:57,  1.93it/s][A
  1%|          | 10/1048 [00:05<08:50,  1.96it/s][A
  1%|          | 11/1048 [00:05<08:47,  1.97it/s][A
  1%|          | 12/1048 [00:06<08:44,  1.98it/s][A
  1%|          | 13/1048 [00:06<08:41,  1.98it/s][A
  1%|▏         | 14/1048 [00:07<08:41,  1.98it/s][A
  1%|▏         | 15/1048 [00:07<08:40,  1.98it/s][A
  2%|▏         | 16/1048 [00:08<08:39,  1.99it/s][A
  2%|▏         | 17/1048 [00:08<08:38,  1.99it/s][A
  2%|▏         | 18/1048 [00:09<08:41,  1.98it/s][A
  2%|▏    

Epoch: 12, train loss: 0.1365308440219298, val loss: 0.0745892412643033, train iou: 0.7580127419178723, val iou: 0.7692200317978859



  0%|          | 0/1048 [00:00<?, ?it/s][A
  0%|          | 1/1048 [00:00<10:15,  1.70it/s][A
  0%|          | 2/1048 [00:01<09:24,  1.85it/s][A
  0%|          | 3/1048 [00:01<09:03,  1.92it/s][A
  0%|          | 4/1048 [00:02<08:57,  1.94it/s][A
  0%|          | 5/1048 [00:02<08:52,  1.96it/s][A
  1%|          | 6/1048 [00:03<08:48,  1.97it/s][A
  1%|          | 7/1048 [00:03<08:46,  1.98it/s][A
  1%|          | 8/1048 [00:04<08:52,  1.95it/s][A
  1%|          | 9/1048 [00:04<08:58,  1.93it/s][A
  1%|          | 10/1048 [00:05<09:00,  1.92it/s][A
  1%|          | 11/1048 [00:05<09:04,  1.91it/s][A
  1%|          | 12/1048 [00:06<09:07,  1.89it/s][A
  1%|          | 13/1048 [00:06<09:00,  1.91it/s][A
  1%|▏         | 14/1048 [00:07<08:54,  1.94it/s][A
  1%|▏         | 15/1048 [00:07<08:49,  1.95it/s][A
  2%|▏         | 16/1048 [00:08<08:46,  1.96it/s][A
  2%|▏         | 17/1048 [00:08<08:45,  1.96it/s][A
  2%|▏         | 18/1048 [00:09<08:42,  1.97it/s][A
  2%|▏    

Epoch: 13, train loss: 0.13241025060415268, val loss: 0.07108319519003006, train iou: 0.7617922636275073, val iou: 0.7756700082258745



  0%|          | 0/1048 [00:00<?, ?it/s][A
  0%|          | 1/1048 [00:00<10:12,  1.71it/s][A
  0%|          | 2/1048 [00:01<09:20,  1.87it/s][A
  0%|          | 3/1048 [00:01<09:10,  1.90it/s][A
  0%|          | 4/1048 [00:02<08:56,  1.94it/s][A
  0%|          | 5/1048 [00:02<08:50,  1.97it/s][A
  1%|          | 6/1048 [00:03<08:47,  1.98it/s][A
  1%|          | 7/1048 [00:03<08:47,  1.97it/s][A
  1%|          | 8/1048 [00:04<08:46,  1.98it/s][A
  1%|          | 9/1048 [00:04<08:44,  1.98it/s][A
  1%|          | 10/1048 [00:05<08:41,  1.99it/s][A
  1%|          | 11/1048 [00:05<08:48,  1.96it/s][A
  1%|          | 12/1048 [00:06<08:55,  1.93it/s][A
  1%|          | 13/1048 [00:06<08:58,  1.92it/s][A
  1%|▏         | 14/1048 [00:07<09:01,  1.91it/s][A
  1%|▏         | 15/1048 [00:07<09:02,  1.91it/s][A
  2%|▏         | 16/1048 [00:08<08:54,  1.93it/s][A
  2%|▏         | 17/1048 [00:08<08:49,  1.95it/s][A
  2%|▏         | 18/1048 [00:09<08:46,  1.96it/s][A
  2%|▏    

Epoch: 14, train loss: 0.12767688871027177, val loss: 0.07402137070047585, train iou: 0.7712522351559792, val iou: 0.7644855085421692





You can also experiment with models and write a small report about results. If the report will be meaningful, you will receive an extra point.

### Testing (8 points)
Your model will be tested on the new data, similar to validation, so use techniques to prevent overfitting the model.

* IoU > 0.85 — 8 points
* IoU > 0.80 — 7 points
* IoU > 0.75 — 6 points
* IoU > 0.70 — 5 points
* IoU > 0.60 — 4 points
* IoU > 0.50 — 3 points
* IoU > 0.40 — 2 points
* IoU > 0.30 — 1 points

In [21]:
def read_and_preprocess_image(img_path: str) -> torch.Tensor:
    img = BirdsDataset.preprocess_image(np.asarray(open_image(img_path)))
    return BirdsDataset.make_data_transformer()(image=img)["image"]

def resize_mask(gt_mask: np.ndarray) -> np.ndarray:
    gt_mask = BirdsDataset.preprocess_mask(gt_mask)
    mask = BirdsDataset.make_data_transformer()(image=gt_mask, mask=gt_mask)["mask"].numpy()
    assert tuple(mask.shape) == (IMG_HEIGHT, IMG_WIDTH)
    assert np.all((mask == 0) ^ (mask == 1))
    return mask

def predict(model, img_path: str) -> np.ndarray:
    model.eval()
    model_device = next(model.parameters()).device
    with torch.no_grad():
        x = read_and_preprocess_image(img_path).unsqueeze(0).to(model_device)
        logits_pred = model(x).detach().squeeze(0)
        assert tuple(logits_pred.shape) == (2, IMG_HEIGHT, IMG_WIDTH)
        class_pred = logits_pred.argmax(0)
        assert tuple(class_pred.shape) == (IMG_HEIGHT, IMG_WIDTH)
        return class_pred.cpu().numpy()

In [18]:
model = get_model('model_14.pth').to('cuda')

In [24]:
ious, times = [], []
test_dir = 'data/val/'

for class_name in tqdm(sorted(os.listdir(os.path.join(test_dir, 'images')))):
    for img_name in sorted(os.listdir(os.path.join(test_dir, 'images', class_name))):
        img_path=os.path.join(test_dir, 'images', class_name, img_name)
        t_start = time()
        pred = predict(model, img_path)
        times.append(time() - t_start)

        gt_name = img_name.replace('jpg', 'png')
        gt = np.asarray(Image.open(os.path.join(test_dir, 'gt', class_name, gt_name)), dtype = np.uint8)
        if len(gt.shape) > 2:
            gt = gt[:, :, 0]

        assert tuple(pred.shape) == (IMG_HEIGHT, IMG_WIDTH)
        assert np.all((pred == 0) ^ (pred == 1))
        gt = resize_mask(gt)
        assert tuple(gt.shape) == (IMG_HEIGHT, IMG_WIDTH)
        iou = get_iou(gt == 1, pred > 0.5)
        ious.append(iou)

np.mean(ious), np.mean(times)

100%|██████████| 200/200 [00:55<00:00,  3.63it/s]


(0.730613930432201, 0.036360246528093706)

### Compression (1 point)

Try to speed up the model in any way without losing more than 1% in iou score.
For example [torch2trt](https://github.com/NVIDIA-AI-IOT/torch2trt)

In [67]:
# %cd /content
# %rm -r torch2trt
# %pwd
# !ls -trailh /content

In [68]:
!sudo apt update
!sudo apt upgrade
!sudo apt install tensorrt
!dpkg-query -W tensorrt

!python3 -m pip install --upgrade pip
!python3 -m pip install wheel
!python3 -m pip install onnxruntime
!python3 -m pip cache remove "tensorrt*"
!python3 -m pip install --upgrade tensorrt tensorrt-lean tensorrt-dispatch
!python3 -m pip install tensorrt

!git clone https://github.com/NVIDIA-AI-IOT/torch2trt
%cd torch2trt
!python setup.py install
!cmake -B build . && cmake --build build --target install && ldconfig
%cd scripts
!bash build_contrib.sh
%cd ../..

[33m0% [Working][0m            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
[33m0% [Connecting to archive.ubuntu.com (185.125.190.81)] [Connecting to security.[0m                                                                               Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
Ign:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy Release [5,713 B]
Hit:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Fetched 5,713 

In [75]:
import sys
sys.path.append("/usr/local/lib/python3.10/dist-packages/torch2trt-0.5.0-py3.10.egg")
sys.path.append("/content/torch2trt")
from torch2trt import torch2trt

def get_fast_model(device=None):
    EXAMPLE_BATCH_SIZE = 20
    model = torch2trt(Unet(), [torch.zeros(EXAMPLE_BATCH_SIZE, CHANNELS_COUNT, IMG_HEIGHT, IMG_WIDTH)])
    model.eval()
    if device is not None:
        model = model.to(device)
    return model


In [None]:
fast_model = get_fast_model().to('cuda')

In [None]:
ious, times = [], []
test_dir = 'data/val/'

for class_name in tqdm(sorted(os.listdir(os.path.join(test_dir, 'images')))):
    for img_name in sorted(os.listdir(os.path.join(test_dir, 'images', class_name))):
        t_start = time()
        pred = predict(fast_model, os.path.join(test_dir, 'images', class_name, img_name))
        times.append(time() - t_start)

        gt_name = img_name.replace('jpg', 'png')
        gt = np.asarray(Image.open(os.path.join(test_dir, 'gt', class_name, gt_name)), dtype = np.uint8)
        if len(gt.shape) > 2:
            gt = gt[:, :, 0]

        gt = resize_mask(gt)
        iou = get_iou(gt == 1, pred > 0.5)
        ious.append(iou)

np.mean(ious), np.mean(times)

**Bonus:** For the best iou score on test(without compression) in group you will get 1.5, 1, 0.5 extra points(for 1st, 2nd, 3rd places).