<a href="https://colab.research.google.com/github/kyrajeep/DL_Projects/blob/master/detect_mask.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install transformers
!pip install timm



In [6]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from bs4 import BeautifulSoup
import torchvision
from torchvision import transforms, datasets, models
import torch
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from PIL import Image
import matplotlib.pyplot as plt
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
import matplotlib.patches as patches
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report


In [7]:
from transformers import AutoModelForImageClassification, ViTImageProcessor
import os

In [8]:
!pip install kaggle
from google.colab import files
files.upload()



Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"kjpark5","key":"71b412d5a628a73927410116e6b58329"}'}

In [9]:
# use the Kaggle API to directly download data
!mkdir ~/.kaggle
#files.upload()
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
#!kaggle datasets list


In [None]:
!kaggle datasets download -d andrewmvd/face-mask-detection
!unzip face-mask-detection.zip


In [11]:
!ls

annotations  face-mask-detection.zip  images  sample_data


In [12]:
images = list(sorted(os.listdir("images/")))
labels = list(sorted(os.listdir("annotations/")))

In [13]:
def generate_box(obj):

    xmin = int(obj.find('xmin').text)
    ymin = int(obj.find('ymin').text)
    xmax = int(obj.find('xmax').text)
    ymax = int(obj.find('ymax').text)

    return [xmin, ymin, xmax, ymax]

def generate_label(obj):
    if obj.find('name').text == "with_mask":
        return 1
    elif obj.find('name').text == "mask_weared_incorrect":
        return 2
    return 0

def generate_target(image_id, file):
    with open(file) as f:
        data = f.read()
        soup = BeautifulSoup(data, 'xml')
        objects = soup.find_all('object')

        num_objs = len(objects)

        # Bounding boxes for objects
        # In coco format, bbox = [xmin, ymin, width, height]
        # In pytorch, the input should be [xmin, ymin, xmax, ymax]
        boxes = []
        labels = []
        for i in objects:
            boxes.append(generate_box(i))
            labels.append(generate_label(i))
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        # Labels (In my case, I only one class: target class or background)
        labels = torch.as_tensor(labels, dtype=torch.int64)
        # Tensorise img_id
        img_id = torch.tensor([image_id])
        # Annotation is in dictionary format
        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["image_id"] = img_id

        return target

In [14]:
class MaskDataset(object):
    def __init__(self, transforms):
        self.transforms = transforms
        # load all image files, sorting them to
        # ensure that they are aligned
        self.imgs = list(sorted(os.listdir("images/")))
#         self.labels = list(sorted(os.listdir("/kaggle/input/face-mask-detection/annotations/")))

    def __getitem__(self, idx):
        # load images ad masks
        file_image = 'maksssksksss'+ str(idx) + '.png'
        file_label = 'maksssksksss'+ str(idx) + '.xml'
        img_path = os.path.join("images/", file_image)
        label_path = os.path.join("annotations/", file_label)
        img = Image.open(img_path).convert("RGB")
        #Generate Label
        target = generate_target(idx, label_path)

        if self.transforms is not None:
            img = self.transforms(img)

        return img, target

    def __len__(self):
        return len(self.imgs)



In [15]:
data_transform = transforms.Compose([
        transforms.ToTensor(),
    ])


In [16]:
#batch data with different sizes without manual padding
#https://stackoverflow.com/questions/65279115/how-to-use-collate-fn-with-dataloaders
def collate_fn(batch):
    return tuple(zip(*batch))

dataset = MaskDataset(data_transform)

In [17]:
train_size=int(len(dataset)*0.7)
test_size=len(dataset)-train_size
print('Length of dataset is', len(dataset), '\nLength of training set is :',train_size,'\nLength of test set is :', test_size)


Length of dataset is 853 
Length of training set is : 597 
Length of test set is : 256


In [18]:
trainset, testset=torch.utils.data.random_split(dataset,[train_size,test_size])

data_loader = torch.utils.data.DataLoader(
 trainset, batch_size=4, collate_fn=collate_fn)


In [19]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
for imgs, annotations in data_loader:
    imgs = list(img.to(device) for img in imgs)
    img_size = imgs[0].size()
    annotations = [{k: v.to(device) for k, v in t.items()} for t in annotations]
    print(annotations)
    break


[{'boxes': tensor([[  3., 112.,  12., 122.],
        [ 27., 178.,  35., 186.],
        [ 25., 107.,  35., 117.],
        [ 51., 108.,  61., 120.],
        [ 43., 137.,  53., 148.],
        [ 63., 156.,  71., 166.],
        [ 49.,  80.,  59.,  90.],
        [ 72.,  82.,  77.,  92.],
        [ 55.,  57.,  64.,  66.],
        [ 34.,  31.,  41.,  40.],
        [104., 149., 116., 162.],
        [130., 196., 143., 209.],
        [117., 135., 127., 147.],
        [116., 105., 124., 114.],
        [107.,  89., 115.,  99.],
        [146., 106., 154., 116.],
        [152., 152., 164., 164.],
        [167., 177., 178., 191.],
        [154., 174., 166., 188.],
        [190., 202., 197., 212.],
        [192., 142., 203., 155.],
        [184., 133., 197., 141.],
        [176.,  98., 187., 108.],
        [139.,  80., 150.,  90.],
        [ 89.,  56.,  98.,  63.],
        [ 85.,  46.,  93.,  55.],
        [107.,  57., 117.,  67.],
        [122.,  52., 130.,  60.],
        [149.,  58., 158.,  67.],
   

In [20]:
from transformers import DetrImageProcessor, DetrForObjectDetection
import torch
from PIL import Image
import requests

#url = "http://images.cocodataset.org/val2017/000000039769.jpg"
#image = Image.open(requests.get(url, stream=True).raw)

# use Detr model from huggingface
# you can specify the revision tag if you don't want the timm dependency
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50", revision="no_timm")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50", revision="no_timm")

inputs = processor(images=imgs, return_tensors="pt", do_rescale=False)
outputs = model(**inputs)




preprocessor_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/6.60k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/167M [00:00<?, ?B/s]

In [21]:
import torch.nn as nn
from transformers import AutoModel, AutoConfig
# Modify the model architecture to enable finetuning

# Load pre-trained DeTR model
model_name = "facebook/detr-resnet-50"
config = AutoConfig.from_pretrained(model_name)
print(config)
model = AutoModel.from_pretrained(model_name, config=config)

# Replace classification head
num_classes = 200  # Assuming 3 classes: person, bag, glasses
model.class_embed = nn.Linear(model.config.decoder_layers, num_classes)
model.num_classes = num_classes  # Update number of classes in the config

# Freeze pre-trained layers
for param in model.parameters():
    param.requires_grad = False

# Make final layers trainable
for param in model.class_embed.parameters():
    param.requires_grad = True


DetrConfig {
  "_name_or_path": "facebook/detr-resnet-50",
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "architectures": [
    "DetrForObjectDetection"
  ],
  "attention_dropout": 0.0,
  "auxiliary_loss": false,
  "backbone": "resnet50",
  "backbone_config": null,
  "backbone_kwargs": null,
  "bbox_cost": 5,
  "bbox_loss_coefficient": 5,
  "class_cost": 1,
  "classifier_dropout": 0.0,
  "d_model": 256,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "dice_loss_coefficient": 1,
  "dilation": false,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_coefficient": 0.1,
  "giou_cost": 2,
  "giou_loss_coefficient": 2,
  "id2label": {
    "0": "N/A",
    "1": "person",
    "2": "bicycle",
    "3": "car",
    "4": "motorcycle",
    "5": "airplane",
    "6": "bus",
    "7": "train",
    "8": "truck",
    "9": "boat",
    "10": "t

In [22]:
# convert outputs (bounding boxes and class logits) to COCO API
# let's only keep detections with score > 0.9
target_sizes = [img.shape[-2:] for img in imgs]
results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.9)[0]

for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    box = [round(i, 2) for i in box.tolist()]
    print(
            f"Detected {model.config.id2label[label.item()]} with confidence "
            f"{round(score.item(), 3)} at location {box}"
    )

Detected handbag with confidence 0.902 at location [313.33, 166.42, 332.2, 187.9]
Detected handbag with confidence 0.903 at location [321.94, 131.88, 344.85, 164.55]
Detected person with confidence 0.957 at location [148.49, 169.49, 194.79, 225.02]
Detected suitcase with confidence 0.957 at location [221.74, 164.95, 237.06, 196.89]
Detected person with confidence 0.91 at location [15.88, 165.5, 56.66, 225.03]
Detected person with confidence 0.928 at location [89.67, 140.46, 131.67, 224.77]
Detected person with confidence 0.94 at location [54.79, 150.23, 85.91, 224.8]
Detected person with confidence 0.95 at location [179.55, 135.06, 226.3, 214.3]
Detected handbag with confidence 0.921 at location [7.49, 198.42, 28.6, 225.05]
Detected person with confidence 0.953 at location [222.2, 112.38, 271.86, 199.33]
Detected person with confidence 0.927 at location [219.51, 182.85, 268.64, 224.95]
Detected person with confidence 0.979 at location [271.31, 122.89, 314.44, 225.03]


In [23]:
def calculate_iou(box1, box2):
    """
    Calculate IoU (Intersection over Union) between two bounding boxes.

    Arguments:
    box1: List or tuple containing [xmin, ymin, xmax, ymax] of the first box.
    box2: List or tuple containing [xmin, ymin, xmax, ymax] of the second box.

    Returns:
    IoU: Intersection over Union between the two boxes.
    """
    # Calculate intersection (area of overlap)
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])

    intersection = max(0, x2 - x1) * max(0, y2 - y1)

    # Calculate union (area of combined bounding boxes)
    area_box1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area_box2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union = area_box1 + area_box2 - intersection

    # Calculate IoU
    iou = intersection / union if union > 0 else 0

    return iou


In [25]:
# Creating a data loader for the test set
test_loader = torch.utils.data.DataLoader(
    testset, batch_size=4, collate_fn=collate_fn
)

# Device configuration
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
threshold = 0.5
# Ground truth and predicted labels
gt_labels_all = []
pred_labels_all = []

# Evaluation loop
for imgs, annotations in test_loader:
    imgs = [img.to(device) for img in imgs]
    img_size = imgs[0].size()
    annotations = [{k: v.to(device) for k, v in t.items()} for t in annotations]
    #model.eval()?
    # Processing the images through the model
    inputs = processor(images=imgs, return_tensors="pt", do_rescale=False)
    outputs = model(**inputs)



    # Post-processing object detection results
    target_sizes = [img.shape[-2:] for img in imgs]
    results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.9)[0]

    # Model predictions
    pred_boxes = results["boxes"].tolist()
    pred_labels = results["labels"].tolist()

    # Ground truth annotations
    gt_boxes = [ann["boxes"].tolist() for ann in annotations]
    gt_labels = [ann["labels"].tolist() for ann in annotations]

    # Compare predictions with ground truth
    for pred_box, pred_label, gt_box, gt_label in zip(pred_boxes, pred_labels, gt_boxes, gt_labels):
        # Compare each predicted box with ground truth boxes for the same image
        for i, gt_box_i in enumerate(gt_box):
            iou = calculate_iou(pred_box, gt_box_i)  # Calculate Intersection over Union
            if iou > threshold:  # You need to define a threshold to consider a detection as correct
                pred_label_name = model.config.id2label[pred_label]
                gt_label_name = model.config.id2label[gt_label[i]]
                print(f"Prediction: {pred_box} - Label: {pred_label_name}")
                print(f"Ground Truth: {gt_box_i} - Label: {gt_label_name}")
                print(f"IoU: {iou}")
                gt_labels_all.append(gt_label_name)
                pred_labels_all.append(pred_label_name)


# with no fine-tuning and threshold at 0.6, no image passed the threshold.
# TODO: finetune?

AttributeError: 'DetrModelOutput' object has no attribute 'logits'