<a href="https://colab.research.google.com/github/laurenwylee/handsome-dan-tracker/blob/main/handsome-dan-ml-model/handsomedandetector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import torch
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.transforms import functional as F
from torchvision import transforms as T
from PIL import Image
import xml.etree.ElementTree as ET
from sklearn.metrics import precision_score, recall_score, f1_score
import time
from tqdm import tqdm
from torch.cuda.amp import GradScaler, autocast
from google.colab import files
import torch.nn as nn


import os
# Debugging aid: sets CUDA_LAUNCH_BLOCKING=1 in this process's environment so that
# CUDA errors are easier to attribute to the call that triggered them.
# NOTE(review): synchronous launches presumably slow training down; consider
# removing this once the pipeline is stable.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"  # Enable CUDA debugging


In [None]:
# Clear storage left over from a previous run.
# Run the Python garbage collector first so that unreachable objects (and any
# tensors they still hold) are released, and only then ask PyTorch to release
# its unoccupied cached CUDA memory; in the opposite order the memory freed by
# the collector would stay in the cache.
import gc
gc.collect()
torch.cuda.empty_cache()

7454

In [None]:
class HandsomeDanDataset(torch.utils.data.Dataset):
    """Object-detection dataset of Handsome Dan, other dogs and dog-free photos.

    Expected layout under ``root``::

        handsomedanpics/<name>.jpg + <name>.xml   boxes labelled 1 (Handsome Dan)
        nondanpics/<name>.jpg + <name>.xml        boxes labelled 2 (other dogs)
        nondanpics/<name>.jpg (no .xml)           no boxes at all

    Annotations are XML files whose ``object/bndbox`` elements carry integer
    ``xmin``, ``ymin``, ``xmax`` and ``ymax`` values.

    Index order is: all Handsome Dan images, then annotated non-Dan images,
    then un-annotated non-Dan images.

    Args:
        root: Directory containing ``handsomedanpics`` and ``nondanpics``.
        transforms: Optional callable applied to the PIL image only; the
            target dict is returned untouched.
    """

    DAN_DIR = "handsomedanpics"
    NON_DAN_DIR = "nondanpics"
    DAN_LABEL = 1        # class id for Handsome Dan boxes
    OTHER_DOG_LABEL = 2  # class id for boxes around any other dog

    def __init__(self, root, transforms=None):
        self.root = root
        self.transforms = transforms

        dan_dir = os.path.join(root, self.DAN_DIR)
        non_dan_dir = os.path.join(root, self.NON_DAN_DIR)

        dan_listing = sorted(os.listdir(dan_dir))
        # Full sorted listing of nondanpics (images and annotation files alike).
        self.non_dan_imgs = sorted(os.listdir(non_dan_dir))

        # Handsome Dan images are only usable when an annotation sits next to them.
        self.dan_imgs = [
            f for f in dan_listing
            if f.endswith(".jpg")
            and os.path.exists(os.path.join(dan_dir, self._annotation_path(f)))
        ]

        # Non-Dan images split into "has a dog annotation" and "no annotation".
        self.non_dan_dog_imgs = [
            f for f in self.non_dan_imgs
            if f.endswith(".jpg")
            and os.path.exists(os.path.join(non_dan_dir, self._annotation_path(f)))
        ]
        annotated = set(self.non_dan_dog_imgs)  # O(1) membership instead of list scans
        self.non_dan_random_imgs = [
            f for f in self.non_dan_imgs
            if f.endswith(".jpg") and f not in annotated
        ]

    @staticmethod
    def _annotation_path(image_path):
        """Return ``image_path`` with its extension swapped for ``.xml``."""
        # splitext only touches the final extension, unlike str.replace which
        # would also rewrite ".jpg" appearing earlier in the path.
        return os.path.splitext(image_path)[0] + ".xml"

    @staticmethod
    def _parse_boxes(label_path, class_label):
        """Read the valid boxes from an annotation file.

        Boxes with ``xmin >= xmax`` or ``ymin >= ymax`` are dropped.

        Returns:
            ``(boxes, labels)`` where ``boxes`` is a float32 tensor of shape
            ``(N, 4)`` and ``labels`` an int64 tensor of shape ``(N,)`` filled
            with ``class_label``. ``N`` may be 0.
        """
        boxes = []
        for obj in ET.parse(label_path).getroot().findall("object"):
            bndbox = obj.find("bndbox")
            xmin, ymin, xmax, ymax = (
                int(bndbox.find(tag).text) for tag in ("xmin", "ymin", "xmax", "ymax")
            )
            if xmin < xmax and ymin < ymax:
                boxes.append([xmin, ymin, xmax, ymax])

        # reshape keeps the (N, 4) layout even when no valid box was found;
        # a bare as_tensor([]) would have shape (0,).
        box_tensor = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        label_tensor = torch.full((len(boxes),), class_label, dtype=torch.int64)
        return box_tensor, label_tensor

    def __getitem__(self, idx):
        """Return ``(image, target)`` for ``idx``.

        ``target`` is a dict with ``boxes`` (float32, ``(N, 4)``), ``labels``
        (int64, ``(N,)``) and ``image_id`` (tensor holding ``idx``).
        """
        dan_count = len(self.dan_imgs)
        dog_count = len(self.non_dan_dog_imgs)

        if idx < dan_count:
            subdir, name, class_label = self.DAN_DIR, self.dan_imgs[idx], self.DAN_LABEL
        elif idx - dan_count < dog_count:
            subdir = self.NON_DAN_DIR
            name = self.non_dan_dog_imgs[idx - dan_count]
            class_label = self.OTHER_DOG_LABEL
        else:
            # Random pictures: no annotation file, hence no boxes.
            subdir = self.NON_DAN_DIR
            name = self.non_dan_random_imgs[idx - dan_count - dog_count]
            class_label = None

        img_path = os.path.join(self.root, subdir, name)
        img = Image.open(img_path).convert("RGB")

        if class_label is None:
            boxes = torch.zeros((0, 4), dtype=torch.float32)
            labels = torch.zeros((0,), dtype=torch.int64)
        else:
            boxes, labels = self._parse_boxes(self._annotation_path(img_path), class_label)

        target = {
            "boxes": boxes,
            "labels": labels,
            "image_id": torch.tensor([idx]),
        }

        if self.transforms:
            img = self.transforms(img)

        return img, target

    def __len__(self):
        """Total number of images across the three groups."""
        return len(self.dan_imgs) + len(self.non_dan_dog_imgs) + len(self.non_dan_random_imgs)


def get_transform(train):
    """Build the image-only preprocessing pipeline.

    Args:
        train: When truthy, add training-time augmentation.

    Returns:
        A ``T.Compose`` that converts the PIL image to a tensor and, for
        training, applies colour jitter.
    """
    transforms = [T.ToTensor()]
    if train:
        # Photometric augmentation only. This pipeline receives the image but
        # not its target, so geometric transforms (flips, rotations, crops)
        # would move the pixels while the bounding boxes stay where they were,
        # leaving the boxes misaligned with the image. Add geometric
        # augmentation only through a transform that updates the boxes too.
        transforms.append(
            T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2)
        )
    return T.Compose(transforms)

def collate_fn(batch):
    """Transpose a batch of samples into a tuple of per-field tuples.

    A batch ``[(img0, tgt0), (img1, tgt1)]`` becomes
    ``((img0, img1), (tgt0, tgt1))``; nothing is stacked.
    """
    columns = zip(*batch)
    return tuple(columns)


In [None]:
# create a folder in your Google Drive called data
# data should contain data/train/ and data/validation/
# each of these should have a folder for handsomedanpics and nondanpics
# every photo in handsomedanpics should have a bounding box around Handsome Dan and a corresponding xml file
# every photo of a dog in nondanpics should have a bounding box around the dog and a corresponding xml file --> provided by the Stanford Dogs dataset on Kaggle
!cp -r drive/MyDrive/data /content/

In [None]:
# Build the training and validation datasets together with their loaders.

# Training split: augmented, reshuffled on every pass.
dataset = HandsomeDanDataset(root='/content/data/train', transforms=get_transform(train=True))
data_loader = torch.utils.data.DataLoader(
    dataset=dataset,
    batch_size=4,
    shuffle=True,
    num_workers=4,
    collate_fn=collate_fn,
)

# Validation split: no augmentation, fixed order.
dataset_val = HandsomeDanDataset(root='/content/data/validation', transforms=get_transform(train=False))
data_loader_val = torch.utils.data.DataLoader(
    dataset=dataset_val,
    batch_size=4,
    shuffle=False,
    num_workers=4,
    collate_fn=collate_fn,
)

In [None]:
# Model, device and optimizer setup.
# use pretrained model, all we are doing is adjusting weights and finetuning since we don't have a super large dataset
# NOTE(review): newer torchvision releases presumably prefer a `weights=` argument over `pretrained=True` — confirm against the installed version.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
# Replace the box-classification head so it predicts this project's classes.
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, 3) # 3 classes for random pics, handsome dan, other dogs

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Optional GPU memory diagnostics (disabled):
# if torch.cuda.is_available():
#     print(torch.cuda.get_device_name(0))
#     print('Memory Usage:')
#     print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
#     print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

model.to(device)

# adjust class weights (random, HD, other dogs)
# NOTE(review): `criterion` is not referenced by the training loop in this file, which
# sums the loss dict returned by `model(images, targets)`; as written these class
# weights do not appear to influence training.
class_weights = torch.tensor([1.2, 1.5, 1.0]).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

# set hyperparameters
# Only parameters that require gradients are handed to the optimizer.
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.001, momentum=0.9, weight_decay=0.0005)
# Multiply the learning rate by 0.1 every 3 scheduler steps (stepped once per epoch in the training loop).
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

Using device: cuda


In [None]:
# Train for 10 epochs; after every epoch evaluate on the validation loader.
# An image counts as a predicted "Handsome Dan" when at least one detection has
# label 1 with a confidence score above 0.3.


def _collect_dan_presence(model, loader, device, score_threshold=0.3, dan_label=1):
    """Return per-image ground-truth and predicted "contains Handsome Dan" flags.

    Puts ``model`` in eval mode and runs it over ``loader`` without gradients.

    Returns:
        ``(labels, preds)``: two equally long lists of 0/1 ints. A label is 1
        when the target holds at least one ``dan_label`` box; a prediction is 1
        when any detection has ``dan_label`` and a score above
        ``score_threshold``.
    """
    model.eval()
    truths, preds = [], []
    with torch.no_grad():
        for images, targets in loader:
            outputs = model([img.to(device) for img in images])
            for target, output in zip(targets, outputs):
                truths.append(int(dan_label in target["labels"].tolist()))
                detections = zip(output["labels"].tolist(), output["scores"].tolist())
                preds.append(int(any(
                    label == dan_label and score > score_threshold
                    for label, score in detections
                )))
    return truths, preds


num_epochs = 10
for epoch in range(num_epochs):
    start_time = time.time()
    model.train()
    epoch_loss = 0.0
    batches_done = 0  # batches that completed an optimizer step
    for images, targets in tqdm(data_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        images = [image.to(device) for image in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        try:
            # In training mode the detection model returns a dict of losses.
            loss_dict = model(images, targets)
            losses = sum(loss_dict.values())

            optimizer.zero_grad()
            losses.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0)
            optimizer.step()
            epoch_loss += losses.item()
            batches_done += 1

        except Exception as e:
            # Best effort: report the failing batch and keep training.
            print(f"Error: {e}")
            continue

    lr_scheduler.step()

    skipped = len(data_loader) - batches_done
    if skipped:
        print(f"Skipped {skipped} of {len(data_loader)} batches because of errors")
    # Average over the batches that actually contributed to epoch_loss;
    # dividing by len(data_loader) would under-report the loss after skips.
    mean_loss = epoch_loss / batches_done if batches_done else float("nan")

    # Evaluation
    all_labels, all_preds = _collect_dan_presence(model, data_loader_val, device, score_threshold=0.3)
    print("Predictions")
    print(all_preds)
    print("Labels")
    print(all_labels)

    # zero_division=0 reports 0 instead of warning when a metric is undefined
    # (for example when no image is predicted positive).
    precision = precision_score(all_labels, all_preds, zero_division=0)
    recall = recall_score(all_labels, all_preds, zero_division=0)
    f1 = f1_score(all_labels, all_preds, zero_division=0)

    epoch_time = time.time() - start_time
    print(f"Epoch: {epoch+1}, Loss: {mean_loss}, Precision: {precision}, Recall: {recall}, F1: {f1}, Time: {epoch_time:.2f} sec")

Epoch 1/10: 100%|██████████| 348/348 [08:04<00:00,  1.39s/it]


Predictions
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Labels
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Epoch: 1, Loss: 0.1866998724243336, Precision: 1.0, Recall: 0.9230769230769231, F1: 0.9600000000000001, Time: 488.59 sec


Epoch 2/10: 100%|██████████| 348/348 [08:00<00:00,  1.38s/it]


Predictions
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Labels
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Epoch: 2, Loss: 0.13449467340719767, Precision: 1.0, Recall: 1.0, F1: 1.0, Time: 484.59 sec


Epoch 3/10: 100%|██████████| 348/348 [08:04<00:00,  1.39s/it]


Predictions
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0]
Labels
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Epoch: 3, Loss: 0.12417328619430291, Precision: 0.8666666666666667, Recall: 1.0, F1: 0.9285714285714286, Time: 487.88 sec


Epoch 4/10: 100%|██████████| 348/348 [08:01<00:00,  1.38s/it]


Predictions
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Labels
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Epoch: 4, Loss: 0.11839266719788998, Precision: 1.0, Recall: 1.0, F1: 1.0, Time: 485.00 sec


Epoch 5/10: 100%|██████████| 348/348 [08:01<00:00,  1.38s/it]


Predictions
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
Labels
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Epoch: 5, Loss: 0.11673090788642822, Precision: 0.9285714285714286, Recall: 1.0, F1: 0.962962962962963, Time: 485.14 sec


Epoch 6/10: 100%|██████████| 348/348 [08:08<00:00,  1.40s/it]


Predictions
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Labels
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Epoch: 6, Loss: 0.11667563989169724, Precision: 1.0, Recall: 1.0, F1: 1.0, Time: 492.59 sec


Epoch 7/10: 100%|██████████| 348/348 [07:56<00:00,  1.37s/it]


Predictions
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Labels
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Epoch: 7, Loss: 0.11538437122329213, Precision: 1.0, Recall: 1.0, F1: 1.0, Time: 480.58 sec


Epoch 8/10: 100%|██████████| 348/348 [07:56<00:00,  1.37s/it]


Predictions
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Labels
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Epoch: 8, Loss: 0.11528274099524508, Precision: 1.0, Recall: 1.0, F1: 1.0, Time: 480.66 sec


Epoch 9/10: 100%|██████████| 348/348 [08:01<00:00,  1.38s/it]


Predictions
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Labels
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Epoch: 9, Loss: 0.11727625229021256, Precision: 1.0, Recall: 1.0, F1: 1.0, Time: 485.54 sec


Epoch 10/10: 100%|██████████| 348/348 [08:02<00:00,  1.39s/it]


Predictions
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Labels
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Epoch: 10, Loss: 0.11538999809765782, Precision: 1.0, Recall: 1.0, F1: 1.0, Time: 486.17 sec


In [None]:
# Standalone evaluation pass, kept in its own cell so it can be re-run without training.
# NOTE(review): this iterates `data_loader` (the shuffled training loader with
# training-time transforms), not `data_loader_val` — confirm that is intended.

model.eval()
all_labels, all_preds = [], []
with torch.no_grad():
    for images, targets in data_loader:
        images = [img.to(device) for img in images]
        outputs = model(images)
        for target, output in zip(targets, outputs):
            # Ground truth: does this image contain a Handsome Dan box (label 1)?
            labels = target["labels"].cpu().numpy()
            all_labels.append(int(1 in labels))

            # Prediction: any label-1 detection scoring above 0.3?
            preds = output["labels"].cpu().numpy()
            scores = output["scores"].cpu().numpy()
            found_dan = False
            for pred, score in zip(preds, scores):
                if pred == 1 and score > 0.3:
                    found_dan = True
                    break
            all_preds.append(1 if found_dan else 0)

print("Predictions", all_preds, sep="\n")
print("Labels", all_labels, sep="\n")

precision, recall, f1 = (
    precision_score(all_labels, all_preds),
    recall_score(all_labels, all_preds),
    f1_score(all_labels, all_preds),
)

print(f"Precision: {precision}, Recall: {recall}, F1: {f1}")

Predictions
[0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 

In [None]:
# True-positive rate (share of Handsome Dan images predicted as Dan) and
# true-negative rate (share of non-Dan images predicted as non-Dan).
totX = 0  # Dan images predicted as Dan
totY = 0  # all Dan images
totA = 0  # non-Dan images predicted as non-Dan
totB = 0  # all non-Dan images
for label, pred in zip(all_labels, all_preds):
    if label == 1:
        totY += 1
        if pred == 1:
            totX += 1
    elif label == 0:
        totB += 1
        if pred == 0:
            totA += 1

# Guard the divisions: a rate is undefined (nan) when its class never occurs.
print(totX / totY if totY else float("nan"))
print(totA / totB if totB else float("nan"))

0.9324009324009324
0.9178794178794178


In [None]:
# Save the model's state_dict to disk, then pass the file to the Colab
# `files.download` helper.
torch.save(obj=model.state_dict(), f='handsome_dan_detector.pth')
files.download('handsome_dan_detector.pth')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>