In this notebook, im am using DeeplabV3 along with some pre-processed image and pre-defined training set.

1 import libraries

In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import Dataset,DataLoader
from torchvision import models
from PIL import Image
import albumentations as A
from albumentations.pytorch import ToTensorV2
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import cv2
import pandas as pd
from pycocotools.coco import COCO
from torchvision.models import resnet50, ResNet50_Weights

2 . Then we need to load the data and define the constants that we used.

In [2]:
datalocation = "kagglehub/datasets/wildlifedatasets/seaturtleid2022/versions/3/turtles-data/data"
Batch = 8 # This is the number of training examples utilised in one interation of training. since i do not have a large GPU memory, i used 8
EPOCHS = 10 #The dataset passed through the model for 10 times. I found that 10 is a pretty good fit that it wont lead to overfitting.
Learning_rate = 0.001 # The rate that model updateds its weights
Segments = 4 # there are four segments - background, head, flippers and carapace.
DEVICE = 'cuda' #Use GPU fro training
traning_split = 0.2 # split the dataset based on this - 80% for trainng and 20% for validation

**3. Load and split the dataset, in this step, I split the data set based on timestemp of the image because of the lighting condition. Genearlly the light from morning and afternoon and night would be differnet due to the direction of the sunlight and brightnees of the photo. I was hoping by split the data in this way, the model wont ferver on certain lighting condition.**

In [3]:
# Load COCO annotations
coco = COCO(f"{datalocation}/annotations.json") #load the data using COCO
# Get all image IDs
img_ID = coco.getImgIds()
print(f"Total images in dataset: {len(img_ID)}") #print out total number of images

# Create metadata DataFrame for all images
def get_metadata(img_ID, coco):
    metadata_list = []
    for img_id in img_ID:
        img = coco.loadImgs(img_id)[0]
        metadata_list.append(img)
    return pd.DataFrame(metadata_list)

all_metadata = get_metadata(img_ID, coco)

# Convert timestamp to datetime and categorize into morning, afternoon, night
all_metadata['timetest'] = pd.to_datetime(all_metadata['timestamp'], format="%Y:%m:%d %H:%M:%S", errors='coerce')
all_metadata['time_category'] = all_metadata['timetest'].dt.hour.apply(
    lambda x: 'morning' if 8 <= x < 12 else ('afternoon' if 12 <= x < 18 else ('night' if 18 <= x < 24 or 0 <= x < 8 else 'night'))
)

# Split metadata into morning, afternoon, and night
morning_metadata = all_metadata[all_metadata['time_category'] == 'morning']
afternoon_metadata = all_metadata[all_metadata['time_category'] == 'afternoon']
night_metadata = all_metadata[all_metadata['time_category'] == 'night']

#split the dataset from each time period
train_morning, val_morning = train_test_split(morning_metadata['id'].tolist(), test_size=traning_split, random_state=34)
train_afternoon, val_afternoon = train_test_split(afternoon_metadata['id'].to_list(), test_size=traning_split, random_state=34)
train_night, val_night = train_test_split(night_metadata['id'].to_list(), test_size=traning_split, random_state=34)

# Combine splits from each time category
train_ids = train_morning + train_afternoon + train_night
val_ids = val_morning + val_afternoon + val_night

#print out total number for traing set and validation set just to make sure they add up.
print(f"Training set size: {len(train_ids)} images")
print(f"Validation set size: {len(val_ids)} images")

loading annotations into memory...
Done (t=2.63s)
creating index...
index created!
Total images in dataset: 8729
Training set size: 6982 images
Validation set size: 1747 images


**4. define my own dataset class - so i can used it to pre-process my image - ie. re-size and etc. I am using albumentations for this task**

In [13]:
class TurtleDataset(Dataset):
    def __init__(self, IDs,coco,transform=None):
        self.IDs = IDs
        self.coco = coco
        self.transform = transform
        
    def __len__(self):
        return len(self.IDs)
    
    def __getitem__(self,number):
        id = self.IDs[number]
        info = self.coco.loadImgs(id)[0] #load imfo from coco for each image, so we can load it
        path = os.path.join(datalocation,info['file_name']) # get the image location from it's info in coco
        image = np.array(Image.open(path).convert("RGB"))
        
        maskIDs = self.coco.getAnnIds(imgIds = id) # get all the anns for each image
        masks = self.coco.loadAnns(maskIDs)#load the annotations  - segmentations masks for the annotation id
        
        final_mask = np.zeros((info['height'],info['width']),dtype=np.uint8) # create a empty mask with the same size as the image.
        
        for mask in masks:
            final_mask = np.maximum(final_mask,self.coco.annToMask(mask)*mask["category_id"]) # for each segement, load the ann from coco to binary mask first and the time it by our category ID. So that each pixel in the mask represents the segment for that object.
         # this mask will be the final mask that we use for trainig and validation - it will have all the segment information in one mask instead of 4.
        
        if self.transform: 
            pre_process = self.transform(image=image, mask = final_mask) # pre_process process including resize image and normalize image
            image = pre_process['image'] 
            final_mask = pre_process['mask']
         
        return image,final_mask

**5 pre-processing step. In this step, I'll resize the all the image to the same size. Normalize the image and feed it to tensorV2 using albumentations**

In [5]:
training_Transform = A.Compose([A.Resize(256,256),#resize the image to 256*256
                                A.HorizontalFlip(p=0.5), #readom flip the image to increas the dataset diversity.
                                A.Normalize(mean=(0.485,0.456,0.406),
                                            std=(0.229,0.224,0.225)), #normalsation for each image, the number was recommended by resnet-50, but this could be changed based on my dataset - this step need to be done!!!
                                ToTensorV2()
                                ])

val_Transform = A.Compose([A.Resize(256,256),#resize the image to 256*256
                                A.Normalize(mean=(0.485,0.456,0.406),
                                            std=(0.229,0.224,0.225)), #normalsation for each image, the number was recommended by resnet-50, but this could be changed based on my dataset - this step need to be done!!!
                                ToTensorV2()
                                ])

**6. create the data set and load the pass the dataset to dataloader, so we can pass it to the model**

In [14]:
train_set = TurtleDataset(train_ids,coco,transform=training_Transform) #load the training data to my class, and transform the data.
val_set = TurtleDataset(val_ids,coco,transform=training_Transform)

train_loader = DataLoader(train_set,batch_size=Batch,shuffle=True)# pass the train_set to the dataloader, so we can pass it to the model
val_loader = DataLoader(val_set,batch_size=Batch,shuffle=False)

**7. In this step, I am choosing to load a pre-trained deepLabv3 model on ImageNet and modify the final layer to predict four classes**

In [7]:
model = models.segmentation.deeplabv3_resnet50(weights='COCO_WITH_VOC_LABELS_V1') #load the deeplabv3 model with resnet50 backbone, It's a 50 layer cnn that act as a feature extractor.
#load the default weight that recommended by the offical website.

model.classifier[4] = nn.Conv2d(256,Segments,kernel_size=(1,1)) #replace the last layer of the classifier to our own number of segments, start with kernel size 1*1 so make th calculation faster

model.to(DEVICE) #move the model to GPU


DeepLabV3(
  (backbone): IntermediateLayerGetter(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Se

In [8]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr = Learning_rate)

**7. we are going to train the model with our image**

In [15]:
for e in range(EPOCHS):
    model.train()  # Set model to training mode
    running_loss = 0.0

    for imgs, masks in train_loader:
        imgs, masks = imgs.to(DEVICE), masks.to(DEVICE)

        # CrossEntropyLoss expects target to be of type LongTensor
        masks = masks.long()

        # Forward pass through the model
        output = model(imgs)['out']
        print(f"Output shape: {output.shape}, Mask shape: {masks.shape}")

        # Compute loss
        loss = criterion(output, masks)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()  # Fix typo here
        optimizer.step()

        # Accumulate running loss
        running_loss += loss.item()

    # Calculate average training loss for the epoch
    average_loss = running_loss / len(train_loader)
    print(f"Epoch [{e + 1}/{EPOCHS}], Training Loss: {average_loss:.3f}")

    # Validation loop
    model.eval()  # Set model to evaluation mode
    val_loss = 0.0

    with torch.no_grad():
        for val_imgs, val_masks in val_loader:
            # Move validation images and masks to the device (GPU or CPU)
            val_imgs, val_masks = val_imgs.to(DEVICE), val_masks.to(DEVICE)

            # Forward pass through the model
            valout = model(val_imgs)['out']
            # Compute loss
            loss = criterion(valout, val_masks.long())
            val_loss += loss.item()

    # Calculate average validation loss for the epoch
    average_valloss = val_loss / len(val_loader)
    print(f"Epoch [{e + 1}/{EPOCHS}], Validation Loss: {average_valloss:.3f}")

# Save the model's state dictionary
torch.save(model.state_dict(), "deeplabv3_seaturtle_standard.pth")


            
        

Output shape: torch.Size([8, 4, 256, 256]), Mask shape: torch.Size([8, 256, 256])
Output shape: torch.Size([8, 4, 256, 256]), Mask shape: torch.Size([8, 256, 256])
Output shape: torch.Size([8, 4, 256, 256]), Mask shape: torch.Size([8, 256, 256])
Output shape: torch.Size([8, 4, 256, 256]), Mask shape: torch.Size([8, 256, 256])
Output shape: torch.Size([8, 4, 256, 256]), Mask shape: torch.Size([8, 256, 256])
Output shape: torch.Size([8, 4, 256, 256]), Mask shape: torch.Size([8, 256, 256])
Output shape: torch.Size([8, 4, 256, 256]), Mask shape: torch.Size([8, 256, 256])
Output shape: torch.Size([8, 4, 256, 256]), Mask shape: torch.Size([8, 256, 256])
Output shape: torch.Size([8, 4, 256, 256]), Mask shape: torch.Size([8, 256, 256])
Output shape: torch.Size([8, 4, 256, 256]), Mask shape: torch.Size([8, 256, 256])
Output shape: torch.Size([8, 4, 256, 256]), Mask shape: torch.Size([8, 256, 256])
Output shape: torch.Size([8, 4, 256, 256]), Mask shape: torch.Size([8, 256, 256])
Output shape: to