# Dog Heart Vertebral Heart Size Point Detection 
# 1. Build an object detection model using pytorch

First, I create two `Dataset` classes: `LabeledDogHeartDataset` (for labeled data) and `UnlabeledDogHeartDataset` (for unlabeled data). These two classes share some common functionality, so they both inherit from `BaseDogHeartDataset`:

In [2]:
import os
from typing import List, Tuple, Dict, Literal

from PIL import Image
from scipy.io import loadmat

import numpy as np
import torch
import torchvision.transforms as T
from torch.utils.data import Dataset


class BaseDogHeartDataset(Dataset):
    """Common base for the dog-heart datasets.

    Indexes the files found under ``<dataroot>/Images`` and, when
    ``has_labels`` is true, those under ``<dataroot>/Labels``. Both listings
    are sorted by filename. Also provides the image preprocessing pipeline
    shared by the subclasses.
    """

    def __init__(
        self,
        dataroot: str,
        image_resolution: Tuple[int, int],
        has_labels: bool,
    ):
        """Record the configuration and list the data files.

        Args:
            dataroot: Directory containing an ``Images`` sub-folder (and a
                ``Labels`` sub-folder when ``has_labels`` is true).
            image_resolution: Target size handed to ``T.Resize``.
            has_labels: Whether the ``Labels`` folder should be indexed too.
        """
        super().__init__()
        self.dataroot: str = dataroot
        self.image_resolution: Tuple[int, int] = image_resolution
        self.has_labels: bool = has_labels

        self.image_folder: str = os.path.join(dataroot, 'Images')
        self.image_filenames: List[str] = sorted(os.listdir(self.image_folder))

        # Unlabeled datasets never get the point_* attributes.
        if not has_labels:
            return
        self.point_folder: str = os.path.join(dataroot, 'Labels')
        self.point_filenames: List[str] = sorted(os.listdir(self.point_folder))

    def __len__(self) -> int:
        """Return the number of files listed in the image folder."""
        return len(self.image_filenames)

    def transform(self, input: Image.Image) -> torch.Tensor:
        """Convert a PIL image to a resized, channel-normalized tensor.

        The steps run in this order: ``ToTensor``, ``Resize`` to
        ``self.image_resolution``, then ``Normalize`` with three-channel
        mean/std constants (the values commonly used for ImageNet-pretrained
        models).
        """
        steps = [
            T.ToTensor(),
            T.Resize(size=self.image_resolution),
            T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ]
        pipeline = T.Compose(steps)
        return pipeline(input)


class LabeledDogHeartDataset(BaseDogHeartDataset):
    """Dataset yielding an image together with its key points and VHS value.

    The image at position ``idx`` of the sorted image listing is paired with
    the label file at position ``idx`` of the sorted label listing, so both
    folders are assumed to sort into matching order.
    """

    def __init__(self, dataroot: str, image_resolution: Tuple[int, int]):
        super().__init__(dataroot, image_resolution, has_labels=True)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, str, str]:
        """Load sample ``idx``.

        Returns:
            ``(image_tensor, six_points, vhs, image_path, point_path)`` where
            ``six_points`` holds the labeled points mapped onto the resized
            image and then divided by the resized height, and ``vhs`` is the
            flattened ``'VHS'`` entry of the label file.
        """
        # Resolve the image file and its label (.mat) file.
        image_path: str = os.path.join(self.image_folder, self.image_filenames[idx])
        point_path: str = os.path.join(self.point_folder, self.point_filenames[idx])

        image: Image.Image = Image.open(image_path).convert("RGB")
        # PIL reports (width, height); the tensor layout is (C, H, W).
        width_original, height_original = image.size
        image_tensor: torch.Tensor = self.transform(input=image)
        height_new, width_new = image_tensor.shape[1:3]

        mat: Dict[Literal['six_points', 'VHS'], np.ndarray] = loadmat(file_name=point_path)
        six_points: torch.Tensor = torch.as_tensor(mat['six_points'], dtype=torch.float32)

        # Move the points into the resized image's pixel grid:
        # column 0 follows the width ratio, column 1 the height ratio.
        six_points[:, 0] *= width_new / width_original
        six_points[:, 1] *= height_new / height_original
        # Both columns are normalized by the resized height.
        six_points = six_points / height_new

        vhs: torch.Tensor = torch.as_tensor(mat['VHS'], dtype=torch.float32).reshape(-1)
        return image_tensor, six_points, vhs, image_path, point_path


class UnlabeledDogHeartDataset(BaseDogHeartDataset):
    """Dataset yielding preprocessed images only (no label folder is read)."""

    def __init__(self, dataroot: str, image_resolution: Tuple[int, int]):
        super().__init__(dataroot, image_resolution, has_labels=False)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, str]:
        """Return ``(image_tensor, image_path)`` for the image at ``idx``."""
        image_path: str = os.path.join(self.image_folder, self.image_filenames[idx])
        # Force three channels before running the shared preprocessing.
        rgb_image: Image.Image = Image.open(image_path).convert("RGB")
        return self.transform(input=rgb_image), image_path


Create dataset instances:

In [3]:
# Labeled splits: each sample carries key points and a VHS value.
train_dataset = LabeledDogHeartDataset(
    dataroot='Dog_Heart_VHS/train',
    image_resolution=(512, 512),
)
val_dataset = LabeledDogHeartDataset(
    dataroot='Dog_Heart_VHS/validation',
    image_resolution=(512, 512),
)

# Test split: images only.
test_dataset = UnlabeledDogHeartDataset(
    dataroot='Dog_Heart_VHS/test',
    image_resolution=(512, 512),
)

## Model Architecture:

In this project, I built a `Vision Transformer (ViT)` from scratch (https://arxiv.org/abs/2010.11929). The architecture is illustrated in the following figure:

<div style="background-color:white; width:1000px">
    <img src="https://raw.githubusercontent.com/hiepdang-ml/dnn_project_two/master/assets/architecture.png"/>
</div>

First, I build the `PatchPositionEmbedding` layer:

In [4]:
from typing import Tuple, List

import torch
import torch.nn as nn
import torch.nn.functional as F


class PatchPositionEmbedding(nn.Module):
    """Split an image into non-overlapping patches and embed each one.

    A ``Conv2d`` whose kernel size and stride both equal ``patch_size``
    projects every patch to an ``embedding_dim``-sized vector. This module
    itself adds no positional term.
    """

    def __init__(
        self,
        in_channels: int,
        patch_size: int,
        embedding_dim: int,
        image_size: Tuple[int, int],
    ):
        """
        Args:
            in_channels: Number of channels of the input images.
            patch_size: Side length of each square patch.
            embedding_dim: Size of the vector produced for each patch.
            image_size: ``(height, width)`` of the expected input images.
        """
        super().__init__()
        self.in_channels: int = in_channels
        self.patch_size: int = patch_size
        self.embedding_dim: int = embedding_dim
        self.image_size: Tuple[int, int] = image_size

        # Patch grid dimensions (integer division drops any remainder).
        height, width = image_size[0], image_size[1]
        self.n_hpatches: int = height // patch_size
        self.n_wpatches: int = width // patch_size
        self.n_patches: int = self.n_hpatches * self.n_wpatches

        self.projector = nn.Conv2d(
            in_channels=in_channels,
            out_channels=embedding_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Map ``(batch, channels, height, width)`` to ``(batch, n_patches, embedding_dim)``."""
        assert input.ndim == 4  # (batch_size, n_channels, height, width)
        batch_size: int = input.shape[0]

        patch_grid: torch.Tensor = self.projector(input)
        assert patch_grid.shape == (batch_size, self.embedding_dim, self.n_hpatches, self.n_wpatches)

        # Collapse the 2-D patch grid into a single sequence axis.
        tokens: torch.Tensor = patch_grid.flatten(start_dim=2, end_dim=-1)
        assert tokens.shape == (batch_size, self.embedding_dim, self.n_patches)

        # Put the sequence axis before the embedding axis.
        return tokens.transpose(1, 2)

The Transformer Encoder contains a stack of multiple `TransformerBlock` modules:

In [5]:
class TransformerBlock(nn.Module):
    """One pre-norm encoder block: attention sub-layer, then a linear sub-layer.

    Each sub-layer is wrapped in a residual connection:
    ``h = x + gelu(projector1(attention(layer_norm1(x))))`` followed by
    ``out = h + gelu(projector2(layer_norm2(h)))``.

    Note: ``self.dropout`` and ``self.head_embedding_dim`` are created in the
    constructor but are not used by ``forward``; the ``dropout`` rate only
    takes effect inside ``nn.MultiheadAttention``.
    """

    def __init__(
        self,
        embedding_dim: int,
        n_heads: int,
        dropout: float,
    ):
        """
        Args:
            embedding_dim: Size of each token vector; must be divisible by
                ``n_heads``.
            n_heads: Number of attention heads.
            dropout: Dropout probability passed to the attention module (and
                to the unused ``self.dropout`` layer).
        """
        super().__init__()
        self.embedding_dim: int = embedding_dim
        self.n_heads: int = n_heads

        assert embedding_dim % n_heads == 0, 'embedding_dim must be divisible by n_heads'
        self.head_embedding_dim: int = self.embedding_dim // self.n_heads

        # A single linear layer produces queries, keys and values at once.
        self.qkv = nn.Linear(in_features=embedding_dim, out_features=embedding_dim * 3)
        # batch_first=False: the attention module takes (sequence, batch, embedding).
        self.attention = nn.MultiheadAttention(
            embed_dim=embedding_dim,
            num_heads=n_heads,
            dropout=dropout,
            batch_first=False,
        )
        self.projector1 = nn.Linear(in_features=embedding_dim, out_features=embedding_dim)
        self.projector2 = nn.Linear(in_features=embedding_dim, out_features=embedding_dim)
        self.dropout = nn.Dropout(p=dropout)
        self.layer_norm1 = nn.LayerNorm(normalized_shape=embedding_dim)
        self.layer_norm2 = nn.LayerNorm(normalized_shape=embedding_dim)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Transform ``(batch, n_patches, embedding_dim)`` tokens, keeping the shape."""
        assert input.ndim == 3
        assert input.shape[2] == self.embedding_dim
        batch_size, n_patches = input.shape[0], input.shape[1]

        # ---- attention sub-layer -------------------------------------
        normed: torch.Tensor = self.layer_norm1(input)
        packed: torch.Tensor = self.qkv(normed)
        assert packed.shape == (batch_size, n_patches, self.embedding_dim * 3)

        # Separate the q/k/v slots and move to (3, sequence, batch, embedding).
        packed = packed.reshape(batch_size, n_patches, 3, self.embedding_dim).permute(2, 1, 0, 3)
        assert packed.shape == (3, n_patches, batch_size, self.embedding_dim)
        queries, keys, values = packed.unbind(dim=0)

        attended, _ = self.attention(query=queries, key=keys, value=values)
        assert attended.shape == (n_patches, batch_size, self.embedding_dim)
        attended = attended.transpose(0, 1)  # back to (batch, sequence, embedding)

        # First residual connection.
        hidden: torch.Tensor = input + F.gelu(self.projector1(attended))

        # ---- linear sub-layer ----------------------------------------
        # Second residual connection.
        output: torch.Tensor = hidden + F.gelu(self.projector2(self.layer_norm2(hidden)))
        assert output.shape == (batch_size, n_patches, self.embedding_dim)
        return output


In [6]:
class TransformerEncoder(nn.Module):
    """A stack of ``depth`` ``TransformerBlock`` modules followed by a LayerNorm."""

    def __init__(
        self,
        embedding_dim: int,
        n_heads: int,
        depth: int,
        dropout: float
    ):
        """
        Args:
            embedding_dim: Size of each token vector.
            n_heads: Number of attention heads in every block.
            depth: Number of stacked blocks.
            dropout: Dropout probability forwarded to every block.
        """
        super().__init__()
        self.embedding_dim: int = embedding_dim
        self.n_heads: int = n_heads
        self.depth: int = depth
        self.dropout: float = dropout

        stacked_blocks = [
            TransformerBlock(embedding_dim, n_heads, dropout)
            for _ in range(depth)
        ]
        self.blocks = nn.Sequential(*stacked_blocks)
        self.layer_norm = nn.LayerNorm(normalized_shape=embedding_dim)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Run the blocks in order, then normalize; the input shape is preserved."""
        assert input.ndim == 3
        assert input.shape[2] == self.embedding_dim
        expected_shape = (input.shape[0], input.shape[1], self.embedding_dim)

        encoded: torch.Tensor = self.layer_norm(self.blocks(input))
        assert encoded.shape == expected_shape
        return encoded

We also need an `OrthogonalLayer` to ensure `AB` is perpendicular to `CD`:

In [7]:
class OrthogonalLayer(nn.Module):
    """Adjust point 3's y-coordinate so that segment 2-3 is perpendicular to segment 0-1.

    With ``A, B, C, D`` being points 0-3, perpendicularity requires
    ``(D - C) . (A - B) = 0``, which gives
    ``y_D = y_C - (x_D - x_C) * (x_A - x_B) / (y_A - y_B)``.
    Only ``output[:, 3, 1]`` differs from the input.

    The slope divides by ``y_A - y_B``. When that difference is (nearly)
    zero the raw division yields inf/NaN, so its magnitude is clamped to
    ``eps`` with the sign preserved (exactly zero is treated as positive).
    Inputs with ``|y_A - y_B| >= eps`` are unaffected by the clamp.
    """

    def __init__(self, eps: float = 1e-6):
        """
        Args:
            eps: Smallest allowed magnitude for the slope denominator.
        """
        super().__init__()
        self.eps: float = eps

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Return a copy of ``input`` (shape ``(batch, 6, 2)``) with point 3's y replaced."""
        batch_size: int = input.shape[0]
        assert input.shape == (batch_size, 6, 2)

        dx_ab: torch.Tensor = input[:, 0, 0] - input[:, 1, 0]
        dy_ab: torch.Tensor = input[:, 0, 1] - input[:, 1, 1]

        # Guard against division by zero when points 0 and 1 share a y-coordinate.
        eps: torch.Tensor = torch.full_like(dy_ab, self.eps)
        signed_eps: torch.Tensor = torch.where(dy_ab < 0, -eps, eps)
        safe_dy: torch.Tensor = torch.where(dy_ab.abs() < self.eps, signed_eps, dy_ab)

        # Slope of the direction perpendicular to segment 0-1.
        s: torch.Tensor = -dx_ab / safe_dy
        y3: torch.Tensor = s * (input[:, 3, 0] - input[:, 2, 0]) + input[:, 2, 1]

        # Write into a clone so the caller's tensor is left untouched.
        output = input.clone()
        output[:, 3, 1] = y3
        return output

Lastly, we stack all of the above modules to form the Vision Transformer model:

In [8]:
class VisionTransformer(nn.Module):
    """Vision Transformer that regresses six 2-D key points per image.

    Pipeline: patch embedding, plus a learned positional term, then the
    transformer encoder, then flattening all tokens into an MLP head with
    12 outputs, reshaped to ``(batch, 6, 2)`` and passed through
    ``OrthogonalLayer``.
    """

    def __init__(
        self,
        in_channels: int,
        patch_size: int,
        embedding_dim: int,
        image_size: Tuple[int, int],
        depth: int,
        n_heads: int,
        dropout: float,
    ):
        """
        Args:
            in_channels: Number of channels of the input images.
            patch_size: Side length of each square patch.
            embedding_dim: Size of each token vector.
            image_size: ``(height, width)`` of the expected input images.
            depth: Number of encoder blocks.
            n_heads: Number of attention heads per block.
            dropout: Dropout probability forwarded to the encoder. The MLP
                head uses a fixed ``p=0.1`` regardless of this value.
        """
        super().__init__()
        self.in_channels: int = in_channels
        self.out_channels: int = 12  # six points, two coordinates each
        self.patch_size: int = patch_size
        self.embedding_dim: int = embedding_dim
        self.image_size: Tuple[int, int] = image_size
        self.depth: int = depth
        self.n_heads: int = n_heads
        self.dropout: float = dropout

        self.patch_embedding = PatchPositionEmbedding(in_channels, patch_size, embedding_dim, image_size)
        self.encoder = TransformerEncoder(embedding_dim, n_heads, depth, dropout)
        self.orthogonalizer = OrthogonalLayer()

        n_patches: int = self.patch_embedding.n_patches
        flat_features: int = n_patches * embedding_dim

        # Learned positional term, initialized to small uniform values
        # (uniform [0, 1) samples divided by the total feature count).
        scale_pos: float = flat_features
        self.pos_embedding = nn.Parameter(
            data=torch.rand(1, n_patches, embedding_dim) / scale_pos
        )

        self.mlp_head = nn.Sequential(
            nn.Linear(in_features=flat_features, out_features=1024),
            nn.ReLU(),
            nn.Dropout(p=0.1),
            nn.Linear(in_features=1024, out_features=512),
            nn.ReLU(),
            nn.Dropout(p=0.1),
            nn.Linear(in_features=512, out_features=self.out_channels),
        )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Predict key points: ``(batch, channels, height, width)`` to ``(batch, 6, 2)``."""
        assert input.ndim == 4
        batch_size: int = input.shape[0]
        token_shape = (batch_size, self.patch_embedding.n_patches, self.embedding_dim)

        tokens: torch.Tensor = self.patch_embedding(input)
        assert tokens.shape == token_shape

        # pos_embedding broadcasts over the batch axis.
        encoded: torch.Tensor = self.encoder(tokens + self.pos_embedding)
        assert encoded.shape == token_shape

        # Every token feeds the head (no class token / pooling).
        features: torch.Tensor = encoded.flatten(start_dim=1, end_dim=-1)
        points: torch.Tensor = self.mlp_head(features).reshape(batch_size, 6, 2)
        return self.orthogonalizer(points)

Let's test on random input:

In [9]:
# Shape check: run a small model once on a random batch.
net = VisionTransformer(
    in_channels=1,
    patch_size=32,
    embedding_dim=256,
    image_size=(512, 512),
    depth=2,
    n_heads=16,
    dropout=0.1,
)
x = torch.rand(8, 1, 512, 512)  # 8 single-channel 512x512 images
y = net(x)

# Input shape first, then output shape, one per line.
print(x.shape, y.shape, sep='\n')

torch.Size([8, 1, 512, 512])
torch.Size([8, 6, 2])


# 2. Train your model using [Dog VHS Dataset](https://yuad-my.sharepoint.com/:f:/g/personal/youshan_zhang_yu_edu/ErguFJBE4y9KqzEdWWNlXzMBkTbsBaNX9l856SyvQauwJg?e=L3JOuN)

# 3. Evaluate your model using the test images with the [software](https://github.com/YoushanZhang/Dog-Cardiomegaly_VHS)

# 4. Your results should achieve at least 85% accuracy. VHS = 6(AB+CD)/EF

## (10 points, accuracy < 75% --> 0 points)

# 5. Show the comparison between predictions and ground truth
## You need to add the title with: image name, predicted VHS and Ground Truth VHS
<p align="center">
  <img src="Com.png" width="60%"> 
</p>


# Please show the comparison results of images: 1420.png, 1479.png and 1530.png from the validation dataset

# 6. Write a three-page report using LaTeX and upload your paper to ResearchGate or arXiv, and put your paper link here.


# 7. Grading rubric

(1). Code ------- 20 points (you also need to upload your final model as a pt file, prediction CSV file and add paper link)

(2). Grammar ---- 20 points

(3). Introduction & related work --- 10 points

(4). Method  ---- 20 points

(5). Results ---- 20 points

(6). Discussion - 10 points

# 8. Bonus points (10 points if your accuracy is higher than 87.3%)