In [2]:
#pytorch
import torch
from torch.utils.data import DataLoader, Dataset

#torchvision 
import torchvision
from torchvision import transforms
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.ops import nms

#misc. libraries
from PIL import Image
import cv2 #opencv
import pandas as pd
from sklearn.model_selection import train_test_split
import os

#pick the compute device: the GPU when CUDA can see one, the CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

#split data
data = pd.read_csv("annotations.csv").drop(columns=["width", "height"])

#split by image, not by annotation row: an image can own several rows (one per box), and a
#row-level split could put the same image into both the training and the validation set.
#unique() keeps first-appearance order, so with one row per image the same images are chosen
#as with a row-level split using the same random_state.
image_files = data["filename"].unique()
train_files, val_files = train_test_split(image_files, test_size=0.1, random_state=42)

in_validation = data["filename"].isin(val_files)
train_data = data[~in_validation]
val_data = data[in_validation]

#build dataset
class CustomDataset(Dataset):
    """Object-detection dataset that yields one sample per image.

    `annotation_file` is a DataFrame with one row per annotated box and the
    columns "filename", "class", "xmin", "ymin", "xmax", "ymax". Rows that
    share a filename are merged into a single sample, so an image with several
    annotated boxes is returned once with all of its boxes instead of once per
    box (which would leave the other boxes unlabelled for that sample).

    Args:
        image_root: folder that the "filename" column is relative to.
        annotation_file: the annotation DataFrame described above.
    """

    #integer label for every value of the "class" column (only 1 in this case)
    labels_as_value = {"bread_mold": 1}
    #columns holding the box corners, in the order the target tensor uses
    box_columns = ["xmin", "ymin", "xmax", "ymax"]

    def __init__(self, image_root, annotation_file):
        self.image_root = image_root
        self.annotation_file = annotation_file
        #(filename, rows) pairs; sort=False keeps images in first-appearance order, so a table
        #with one row per image yields exactly the same sample order as indexing rows directly
        self.samples = list(annotation_file.groupby("filename", sort=False))

    def __len__(self):
        """Number of distinct images (not the number of annotation rows)."""
        return len(self.samples)

    def __getitem__(self, index):
        """Return `(image, target)` for the image at position `index`.

        `image` is the RGB image converted by `transforms.ToTensor`; `target`
        is the dict built by `_build_target`.
        """
        filename, rows = self.samples[index]
        return self._load_image(filename), self._build_target(rows)

    def _load_image(self, filename):
        """Read one image from `image_root` and convert it to an RGB tensor."""
        image_path = os.path.join(self.image_root, filename)
        #the context manager closes the file handle as soon as the pixels are decoded
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        return transforms.ToTensor()(image)

    def _build_target(self, rows):
        """Build the RCNN target dict from all annotation rows of one image.

        Returns a dict with "boxes" (float32, one `[xmin, ymin, xmax, ymax]`
        row per annotation) and "labels" (int64, one entry per annotation).
        Raises KeyError for a class name missing from `labels_as_value`.
        """
        boxes = torch.tensor(rows[self.box_columns].values.tolist(), dtype=torch.float32)
        labels = torch.tensor(
            [self.labels_as_value[class_name] for class_name in rows["class"]],
            dtype=torch.int64,
        )
        return {"boxes": boxes, "labels": labels}

#images can carry different numbers of bounding boxes, so batches are kept as plain lists
#instead of being stacked into one tensor.
def collate_fn(batch):
    """Turn a batch of `(image, target)` pairs into `(images, targets)` lists."""
    #zip(*batch) transposes the pairs: first all images, then all targets
    images, targets = map(list, zip(*batch))
    return images, targets


#create dataloaders: both splits read their images from the same folder
train_dataset, val_dataset = (
    CustomDataset("annotated-data", split) for split in (train_data, val_data)
)

batch_size = 8
#only the training loader is shuffled; validation batches keep a fixed order
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)

cuda


In [3]:
#number of classes the box head predicts (one of the classes is "background")
num_classes = 2

#start from the detector with its default pretrained weights
model = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_fpn(weights="DEFAULT")

#swap the box predictor for a fresh one sized to num_classes; it must accept the same
#feature width the replaced predictor's classification layer was fed
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(
    in_channels=in_features,
    num_classes=num_classes,
)

#left as the last expression so the cell still displays the model structure
model.to(device)

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (0): Conv2dNormActivation(
        (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (1): FrozenBatchNorm2d(16, eps=1e-05)
        (2): Hardswish()
      )
      (1): InvertedResidual(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=16, bias=False)
            (1): FrozenBatchNorm2d(16, eps=1e-05)
            (2): ReLU(inplace=True)
          )
          (1): Conv2dNormActivation(
            (0): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): FrozenBatchNorm2d(16, eps=1e-05)
          )
        )
      )
      (2): InvertedResidual(
        (block):

In [6]:
#hyperparameters
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=0.0005)
num_epochs = 3

#weights of the epoch with the lowest validation loss are written here
checkpoint_path = "model-weights/best_detector.pth"
#torch.save does not create missing folders, so make sure the target folder exists
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

best_val_loss = float("inf")
best_epoch = 0


def _move_to_device(images, targets):
    """Return the collated batch with every tensor moved onto `device`."""
    images = [image.to(device) for image in images]
    targets = [
        {key: value.to(device) if torch.is_tensor(value) else value for key, value in target.items()}
        for target in targets
    ]
    return images, targets


def _total_loss(images, targets):
    """Run the model on one batch and return the sum of the losses it reports.

    The model has to be in train mode when this is called: eval mode does not
    return losses.
    """
    images, targets = _move_to_device(images, targets)
    return sum(model(images, targets).values())


#train the model
for epoch in range(1, num_epochs + 1):
    model.train()

    #training data
    batch_train_loss = []
    for images, targets in train_dataloader:
        optimizer.zero_grad()
        batch_loss = _total_loss(images, targets)
        batch_loss.backward()
        optimizer.step()
        batch_train_loss.append(batch_loss.item())

    avg_train_loss = sum(batch_train_loss) / len(batch_train_loss)

    #validation data
    #the model deliberately stays in train mode (eval mode does not return losses);
    #no_grad() skips gradient tracking and no optimizer step is taken here
    batch_val_loss = []
    with torch.no_grad():
        for images, targets in val_dataloader:
            batch_val_loss.append(_total_loss(images, targets).item())

    avg_val_loss = sum(batch_val_loss) / len(batch_val_loss)

    #save best epoch
    if avg_val_loss < best_val_loss:
        torch.save(model.state_dict(), checkpoint_path)
        best_val_loss = avg_val_loss
        best_epoch = epoch

    print(f"Epoch: {epoch} train loss: {avg_train_loss:.4f} val loss: {avg_val_loss:.4f}")

#summary once all epochs are done
print(f"Best epoch: {best_epoch} Best val loss: {best_val_loss:.4f}")

Epoch: 1 train loss: 0.2074 val loss: 0.1843
Epoch: 2 train loss: 0.1845 val loss: 0.1913
Epoch: 3 train loss: 0.1793 val loss: 0.1851
Best epoch: 1 Best val loss: 0.1843


In [14]:
#testing (implemented in main.py)

test_image_path = "test/test3.jpg"
score_threshold = 0.1      #detections scoring at or below this are discarded
nms_iou_threshold = 0.9    #IoU threshold passed to nms()
display_size = (500, 500)  #(width, height) of the preview image

#preprocess image
with Image.open(test_image_path) as raw_image:
    image = raw_image.convert("RGB")
transform = transforms.Compose([
    transforms.ToTensor()
])
image = transform(image).to(device)

#run model
model.eval()
with torch.no_grad():
    pred = model([image])

bboxes, labels, scores = pred[0]["boxes"], pred[0]["labels"], pred[0]["scores"]

#drop low-score detections, then apply NMS to the remaining bounding boxes
keep = torch.where(scores > score_threshold)[0]
nms_indices = nms(bboxes[keep], scores[keep], nms_iou_threshold)

#get the final bounding boxes, labels, and scores after NMS
final_bboxes = bboxes[keep][nms_indices]
final_labels = labels[keep][nms_indices]
final_scores = scores[keep][nms_indices]

print(final_scores)

font = cv2.FONT_HERSHEY_SIMPLEX
#PIL's Image.open leaves the pixels as stored, while cv2.imread rotates them according to the
#EXIF orientation tag by default. Ignore the tag so the preview has the same pixel layout as
#the tensor the boxes were predicted on.
output_image = cv2.imread(test_image_path, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
if output_image is None:
    #cv2.imread reports failure by returning None instead of raising
    raise RuntimeError(f"cv2 could not read image: {test_image_path}")
height, width, _ = output_image.shape
output_image = cv2.resize(output_image, display_size)

#scale x and y coordinates from the original resolution to the preview resolution
scale_x = display_size[0] / width
scale_y = display_size[1] / height

for box in final_bboxes.cpu().tolist():
    #scale the float coordinates first and round once at the end, so no precision is
    #lost before the multiplication
    x1 = int(round(box[0] * scale_x))
    y1 = int(round(box[1] * scale_y))
    x2 = int(round(box[2] * scale_x))
    y2 = int(round(box[3] * scale_y))

    class_name = "mold"
    output_image = cv2.rectangle(output_image, (x1, y1), (x2, y2), (0, 0, 255), 1)
    #keep the label visible when the box touches the top edge of the image
    text_y = max(y1 - 10, 15)
    output_image = cv2.putText(output_image, class_name, (x1, text_y), font, 0.5, (255, 0, 0), 1, cv2.LINE_AA)


cv2.imshow("output", output_image)
cv2.waitKey(0)
cv2.destroyAllWindows()



tensor([0.2952, 0.2795, 0.2281], device='cuda:0')
