# Dog Heart Vertebral Heart Size Point Detection 
# 1. Build an object detection model using PyTorch

First, I create two `Dataset` classes: `LabeledDogHeartDataset` (for labeled data) and `UnlabeledDogHeartDataset` (for unlabeled data). Because the two classes share common functionality (file discovery and image preprocessing), they both inherit from `BaseDogHeartDataset`:

In [1]:
import os
from typing import List, Tuple, Dict, Literal

from PIL import Image
from scipy.io import loadmat

import numpy as np
import torch
import torchvision.transforms as T
from torch.utils.data import Dataset


class BaseDogHeartDataset(Dataset):
    """Shared file discovery and image preprocessing for the dog-heart datasets.

    ``dataroot`` must contain an ``Images`` folder and, when ``has_labels`` is
    true, a ``Labels`` folder. Images and labels are paired purely by their
    position in the sorted folder listings, so the two listings have to line
    up one-to-one.

    Subclasses are responsible for implementing ``__getitem__``.
    """

    def __init__(
        self,
        dataroot: str,
        image_resolution: Tuple[int, int],
        has_labels: bool,
    ):
        """Index the files found under ``dataroot``.

        Args:
            dataroot: Directory holding ``Images`` (and ``Labels`` when
                ``has_labels`` is true).
            image_resolution: Target size handed to ``T.Resize``, which
                interprets a 2-tuple as ``(height, width)``.
            has_labels: Whether the ``Labels`` folder should be indexed too.
                ``point_folder`` and ``point_filenames`` are only set when
                this is true.

        Raises:
            ValueError: If ``has_labels`` is true and the number of label
                files differs from the number of images.
        """
        super().__init__()
        self.dataroot: str = dataroot
        self.image_resolution: Tuple[int, int] = image_resolution
        self.image_folder: str = os.path.join(dataroot, 'Images')
        self.image_filenames: List[str] = self._list_visible(self.image_folder)
        self.has_labels: bool = has_labels
        if self.has_labels:
            self.point_folder: str = os.path.join(dataroot, 'Labels')
            self.point_filenames: List[str] = self._list_visible(self.point_folder)
            # Pairing is positional, so a count mismatch would either raise an
            # IndexError late or silently attach labels to the wrong images.
            if len(self.point_filenames) != len(self.image_filenames):
                raise ValueError(
                    f"Found {len(self.image_filenames)} images in "
                    f"{self.image_folder!r} but {len(self.point_filenames)} "
                    f"label files in {self.point_folder!r}; they must match."
                )

    @staticmethod
    def _list_visible(folder: str) -> List[str]:
        """Return the sorted entries of ``folder``, skipping dot-prefixed ones.

        Hidden entries such as ``.ipynb_checkpoints`` or ``.DS_Store`` are not
        samples; counting them would shift the image/label pairing.
        """
        return sorted(
            name for name in os.listdir(folder) if not name.startswith('.')
        )

    def __len__(self) -> int:
        """Return the number of indexed images."""
        return len(self.image_filenames)

    def transform(self, input: "Image.Image") -> torch.Tensor:
        """Convert a PIL image to a resized, channel-normalized tensor.

        The parameter is named ``input`` (shadowing the builtin) because
        subclasses pass it by keyword; renaming it would break them.

        Args:
            input: Image to preprocess; the three-channel mean/std below
                require it to have three channels.

        Returns:
            The image as a tensor resized to ``self.image_resolution`` and
            normalized per channel.
        """
        transformer = T.Compose([
            T.ToTensor(),
            T.Resize(size=self.image_resolution),
            # Commonly used ImageNet per-channel mean / std.
            T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        return transformer(input)


class LabeledDogHeartDataset(BaseDogHeartDataset):
    """Dataset yielding an image together with its landmark points and VHS."""

    def __init__(self, dataroot: str, image_resolution: Tuple[int, int]):
        super().__init__(dataroot, image_resolution, has_labels=True)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, str, str]:
        """Load the ``idx``-th image and its ``.mat`` annotation.

        Returns:
            A 5-tuple ``(image_tensor, six_points, vhs, image_path, point_path)``:

            * ``image_tensor`` - output of ``self.transform`` for the RGB image.
            * ``six_points`` - the ``'six_points'`` array as float32, with
              column 0 scaled by the width ratio and column 1 by the height
              ratio of the resize, then everything divided by the resized
              height.
            * ``vhs`` - the ``'VHS'`` entry flattened to a 1-D float32 tensor.
            * ``image_path`` / ``point_path`` - the files that were read.
        """
        # Image and annotation are matched by position in the sorted listings.
        image_path: str = os.path.join(self.image_folder, self.image_filenames[idx])
        point_path: str = os.path.join(self.point_folder, self.point_filenames[idx])

        rgb_image: Image.Image = Image.open(image_path).convert("RGB")
        src_width, src_height = rgb_image.size  # PIL reports (width, height)

        image_tensor: torch.Tensor = self.transform(input=rgb_image)
        # Tensor layout is (channels, height, width).
        dst_height, dst_width = image_tensor.shape[1:3]

        annotation: Dict[Literal['six_points', 'VHS'], np.ndarray] = loadmat(file_name=point_path)
        landmarks: torch.Tensor = torch.as_tensor(annotation['six_points'], dtype=torch.float32)

        # Move the points from original-pixel to resized-pixel coordinates.
        x_scale = dst_width / src_width
        y_scale = dst_height / src_height
        landmarks[:, 0] = landmarks[:, 0] * x_scale
        landmarks[:, 1] = landmarks[:, 1] * y_scale

        # Both columns share one divisor (the resized height); for a square
        # resolution this maps the coordinates into the unit square.
        landmarks = landmarks / dst_height

        vhs: torch.Tensor = torch.as_tensor(annotation['VHS'], dtype=torch.float32).reshape(-1)
        return image_tensor, landmarks, vhs, image_path, point_path


class UnlabeledDogHeartDataset(BaseDogHeartDataset):
    """Dataset yielding preprocessed images only (no annotation files)."""

    def __init__(self, dataroot: str, image_resolution: Tuple[int, int]):
        super().__init__(dataroot, image_resolution, has_labels=False)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, str]:
        """Return ``(image_tensor, image_path)`` for the ``idx``-th image.

        The image is opened, converted to RGB and passed through
        ``self.transform``; the path is returned alongside it so predictions
        can be traced back to their source file.
        """
        image_path: str = os.path.join(self.image_folder, self.image_filenames[idx])
        rgb_image: Image.Image = Image.open(image_path).convert("RGB")
        return self.transform(input=rgb_image), image_path


Create dataset instances:

In [2]:
# Labeled splits: each sample carries its landmark points and VHS value.
train_dataset = LabeledDogHeartDataset(
    dataroot='Dog_Heart_VHS/train',
    image_resolution=(512, 512),
)
val_dataset = LabeledDogHeartDataset(
    dataroot='Dog_Heart_VHS/validation',
    image_resolution=(512, 512),
)

# Unlabeled split: images only, at the same resolution as the labeled splits.
test_dataset = UnlabeledDogHeartDataset(
    dataroot='Dog_Heart_VHS/test',
    image_resolution=(512, 512),
)

## Model Architecture:

In this project, I built a Vision Transformer (ViT) from scratch (https://arxiv.org/abs/2010.11929). This architecture is illustrated in the following figure:



# 2. Train your model using [Dog VHS Dataset](https://yuad-my.sharepoint.com/:f:/g/personal/youshan_zhang_yu_edu/ErguFJBE4y9KqzEdWWNlXzMBkTbsBaNX9l856SyvQauwJg?e=L3JOuN)

# 3. Evaluate your model using the test images with the [software](https://github.com/YoushanZhang/Dog-Cardiomegaly_VHS)

# 4. Your results should achieve at least 85% accuracy. VHS = 6(AB+CD)/EF

## (10 points, accuracy < 75% --> 0 points)

# 5. Show the comparison between predictions and ground truth
## Each figure title must include: the image name, the predicted VHS, and the ground-truth VHS
<p align="center">
  <img src="Com.png" width="60%"> 
</p>


# Please show the comparison results for images 1420.png, 1479.png and 1530.png from the validation dataset

# 6. Write a three-page report using LaTeX, upload your paper to ResearchGate or arXiv, and put your paper link here.


# 7. Grading rubric

(1). Code ------- 20 points (you also need to upload your final model as a pt file, prediction CSV file and add paper link)

(2). Grammar ---- 20 points

(3). Introduction & related work --- 10 points

(4). Method  ---- 20 points

(5). Results ---- 20 points

(6). Discussion - 10 points

# 8. Bonus points (10 points if your accuracy is higher than 87.3%)