In [2]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pathlib
import pandas as pd
import cv2
import glob
import xml.etree.ElementTree as ET

import tensorflow as tf
print(tf.__version__)
from tensorflow import keras
from tensorflow.keras.applications import MobileNetV3Large
from sklearn.model_selection import train_test_split

import PIL
from PIL import Image 
from PIL.ImageDraw import Draw

2.19.0


In [2]:
def xml_to_csv(path):
    """
    Convert XML files to CSV
    Args:
        path: Path to the folder containing XML files
        skipNegatives: If True, skip images with no burger annotations
    Returns:
        Pandas DataFrame with columns ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
    """
    xml_list = []
    
    # Get all XML files in the directory
    for xml_file in glob.glob(path + '/*.xml'):
        tree = ET.parse(xml_file)
        root = tree.getroot()
        
        # Get image size
        width = int(root.find('size/width').text)
        height = int(root.find('size/height').text)
        filename = root.find('filename').text

        # Check if there are any burger objects
        objects = root.findall('object')
        
        # Function to safely check if an object is a burger
        def is_burger(obj):
            name_tag = obj.find('name')
            if name_tag is not None and name_tag.text is not None:
                # Remove any whitespace and check if it's 'burger'
                return name_tag.text.strip() == 'burger'
            return False
        
        has_burger = any(is_burger(obj) for obj in objects) if objects else False
        
        if not has_burger:
            xml_list.append({
                'filename': filename,
                'width': width,
                'height': height,
                'class': 0,
                'xmin': 0,
                'ymin': 0,
                'xmax': 0,
                'ymax': 0
            })
        else:
            for obj in objects:
                if obj.find('name').text.strip() == 'burger':
                    bbox = obj.find('bndbox')
                    xml_list.append({
                        'filename': filename,
                        'width': width,
                        'height': height,
                        'class': 1,  # BURGER
                        'xmin': int(bbox.find('xmin').text),
                        'ymin': int(bbox.find('ymin').text),
                        'xmax': int(bbox.find('xmax').text),
                        'ymax': int(bbox.find('ymax').text)
                    })
    
    # Convert to DataFrame
    df = pd.DataFrame(xml_list)
    columns = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
    df = df[columns]
    return df

In [3]:
df = xml_to_csv('Images')
# Print some statistics
print(f"\nTotal annotations found: {len(df)}")
print(f"Number of burger annotations (class 1): {len(df[df['class'] == 1])}")
print(f"Number of non-burger images (class 0): {len(df[df['class'] == 0])}")

# Save to CSV
output_file = 'Dataset/annotations.csv'


Total annotations found: 1939
Number of burger annotations (class 1): 1483
Number of non-burger images (class 0): 456


In [4]:
df.to_csv(output_file, index=False)
print(f"\nSaved annotations to {output_file}")
print("\nFirst few rows of the CSV:")
print(df.head())


Saved annotations to Dataset/annotations.csv

First few rows of the CSV:
                                         filename  width  height  class  xmin  \
0   5_png.rf.e85cf15dded411bf7a41692f82be156d.jpg    640     360      1   225   
1  81_png.rf.b9cdc89c8ab390f442bd80fa9c71e72b.jpg    640     360      1   132   
2  33_png.rf.0e1d003e6ee90bb6ce23c9e4a999687d.jpg    640     360      1   237   
3  41_png.rf.b53130139bf796b74fee4c811a11eb4f.jpg    640     360      1   223   
4  25_png.rf.8b22e889293a9504aede42107e84e5ee.jpg    640     360      1   210   

   ymin  xmax  ymax  
0    37   386   211  
1   118   374   361  
2   125   413   325  
3    36   429   267  
4    49   453   324  


In [5]:
def split_dataset(csv_path, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, random_state=202):
    """
    Split the dataset into train, validation and test sets while maintaining class balance
    
    Args:
        csv_path: Path to the annotations CSV file
        train_ratio: Ratio of training data (default: 0.7)
        val_ratio: Ratio of validation data (default: 0.15)
        test_ratio: Ratio of test data (default: 0.15)
        random_state: Random seed for reproducibility
    """
    # Read the CSV file
    df = pd.read_csv(csv_path)
    
    # Create separate dataframes for positive and negative samples
    negative_samples = df[df['class'] == 0]
    positive_samples = df[df['class'] == 1]
    
    print(f"Total samples: {len(df)}")
    print(f"Positive samples (class 1): {len(positive_samples)}")
    print(f"Negative samples (class 0): {len(negative_samples)}")
    
    # First split: separate test set
    neg_train_val, neg_test = train_test_split(
        negative_samples, 
        test_size=test_ratio, 
        random_state=random_state
    )
    
    pos_train_val, pos_test = train_test_split(
        positive_samples, 
        test_size=test_ratio, 
        random_state=random_state
    )
    
    # Second split: separate train and validation from the remaining data
    val_ratio_adjusted = val_ratio / (train_ratio + val_ratio)
    
    neg_train, neg_val = train_test_split(
        neg_train_val, 
        test_size=val_ratio_adjusted, 
        random_state=random_state
    )
    
    pos_train, pos_val = train_test_split(
        pos_train_val, 
        test_size=val_ratio_adjusted, 
        random_state=random_state
    )
    
    # Combine positive and negative samples for each set
    train_df = pd.concat([pos_train, neg_train])
    val_df = pd.concat([pos_val, neg_val])
    test_df = pd.concat([pos_test, neg_test])
    
    # Shuffle each dataset
    train_df = train_df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    val_df = val_df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    test_df = test_df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    
    # Print statistics
    print("\nDataset split statistics:")
    print(f"Training set: {len(train_df)} samples")
    print(f"  - Positive (class 1): {len(train_df[train_df['class'] == 1])}")
    print(f"  - Negative (class 0): {len(train_df[train_df['class'] == 0])}")
    
    print(f"\nValidation set: {len(val_df)} samples")
    print(f"  - Positive (class 1): {len(val_df[val_df['class'] == 1])}")
    print(f"  - Negative (class 0): {len(val_df[val_df['class'] == 0])}")
    
    print(f"\nTest set: {len(test_df)} samples")
    print(f"  - Positive (class 1): {len(test_df[test_df['class'] == 1])}")
    print(f"  - Negative (class 0): {len(test_df[test_df['class'] == 0])}")
    
    # Save the splits to CSV files
    train_df.to_csv('Dataset/train.csv', index=False)
    val_df.to_csv('Dataset/val.csv', index=False)
    test_df.to_csv('Dataset/test.csv', index=False)
    
    return train_df, val_df, test_df

In [6]:
train_df, val_df, test_df = split_dataset('Dataset/annotations.csv')

Total samples: 1939
Positive samples (class 1): 1483
Negative samples (class 0): 456

Dataset split statistics:
Training set: 1355 samples
  - Positive (class 1): 1037
  - Negative (class 0): 318

Validation set: 292 samples
  - Positive (class 1): 223
  - Negative (class 0): 69

Test set: 292 samples
  - Positive (class 1): 223
  - Negative (class 0): 69


In [7]:
TRAIN_CSV_FILE = 'Dataset/train.csv'
VAL_CSV_FILE = 'Dataset/val.csv'
TEST_CSV_FILE = 'Dataset/test.csv'
IMAGE_DIR = 'Images'

def prepare_dataset(csv_file, image_dir):
    """
    Prepare images, targets and labels from a CSV file
    
    Args:
        csv_file: Path to CSV file containing annotations
        image_dir: Directory containing the images
        
    Returns:
        images: numpy array of images
        targets: dictionary containing class and bbox outputs
        labels: numpy array of class labels
    """
    # Read the CSV file
    df = pd.read_csv(csv_file)
    
    images = []
    bbox_targets = []
    class_labels = []
    
    for index, row in df.iterrows():
        # Extract data from row
        filename = row['filename']
        width = row['width']
        height = row['height']
        class_label = row['class']  # Assuming this is 0 or 1 for binary classification
        xmin = row['xmin']
        ymin = row['ymin']
        xmax = row['xmax']
        ymax = row['ymax']
        
        # Load and preprocess image
        image_path = os.path.join(image_dir, filename)
        img = keras.preprocessing.image.load_img(image_path, target_size=(360, 640))  # Fixed size for all images
        img_array = keras.preprocessing.image.img_to_array(img)
        img_array = img_array / 255.0  # Normalize pixel values
        
        # Normalize bounding box coordinates
        bbox = [
            round(xmin/width, 4),
            round(ymin/height, 4),
            round(xmax/width, 4),
            round(ymax/height, 4)
        ]
        
        images.append(img_array)
        bbox_targets.append(bbox)
        class_labels.append(class_label)
    
    # Convert lists to numpy arrays
    images = np.array(images, dtype=np.float32)
    bbox_targets = np.array(bbox_targets, dtype=np.float32)
    class_labels = np.array(class_labels, dtype=np.float32)
    
    # Create targets dictionary for multi-output model
    targets = {
        'class_output': class_labels,
        'bbox_output': bbox_targets
    }
    
    return images, targets

# Prepare train, validation and test sets
print("Preparing training data...")
train_images, train_targets = prepare_dataset(TRAIN_CSV_FILE, IMAGE_DIR)

print("Preparing validation data...")
val_images, val_targets = prepare_dataset(VAL_CSV_FILE, IMAGE_DIR)

print("Preparing test data...")
test_images, test_targets = prepare_dataset(TEST_CSV_FILE, IMAGE_DIR)

# Print shapes to verify
print("\nDataset shapes:")
print(f"Training images: {train_images.shape}")
print(f"Training class labels: {train_targets['class_output'].shape}")
print(f"Training bbox targets: {train_targets['bbox_output'].shape}")
print(f"\nValidation images: {val_images.shape}")
print(f"Validation class labels: {val_targets['class_output'].shape}")
print(f"Validation bbox targets: {val_targets['bbox_output'].shape}")
print(f"\nTest images: {test_images.shape}")
print(f"Test class labels: {test_targets['class_output'].shape}")
print(f"Test bbox targets: {test_targets['bbox_output'].shape}")

Preparing training data...
Preparing validation data...
Preparing test data...

Dataset shapes:
Training images: (1355, 360, 640, 3)
Training class labels: (1355,)
Training bbox targets: (1355, 4)

Validation images: (292, 360, 640, 3)
Validation class labels: (292,)
Validation bbox targets: (292, 4)

Test images: (292, 360, 640, 3)
Test class labels: (292,)
Test bbox targets: (292, 4)


In [8]:
def prepare_dataset(csv_file, image_dir):
    """
    Prepare images, targets and labels from a CSV file with validation checks
    """
    # Read the CSV file
    df = pd.read_csv(csv_file)
    print(f"Number of records in CSV: {len(df)}")
    
    images = []
    bbox_targets = []
    class_labels = []
    
    for index, row in df.iterrows():
        try:
            # Extract data from row
            filename = row['filename']
            width = float(row['width'])  # Convert to float to avoid division issues
            height = float(row['height'])
            class_name = row['class']
            xmin = float(row['xmin'])
            ymin = float(row['ymin'])
            xmax = float(row['xmax'])
            ymax = float(row['ymax'])
            
            # Validate bounding box coordinates
            if xmin >= xmax or ymin >= ymax:
                print(f"Invalid bbox coordinates in row {index}: xmin={xmin}, xmax={xmax}, ymin={ymin}, ymax={ymax}")
                continue
                
            # Load and preprocess image
            image_path = os.path.join(image_dir, filename)
            if not os.path.exists(image_path):
                print(f"Image not found: {image_path}")
                continue
                
            img = keras.preprocessing.image.load_img(
                image_path, 
                target_size=(360, 640)
            )
            img_array = keras.preprocessing.image.img_to_array(img)
            img_array = img_array / 255.0  # Normalize
            
            # Normalize bounding box coordinates
            bbox = [
                max(0.0, min(1.0, round(xmin/width, 4))),
                max(0.0, min(1.0, round(ymin/height, 4))),
                max(0.0, min(1.0, round(xmax/width, 4))),
                max(0.0, min(1.0, round(ymax/height, 4)))
            ]
            
            # Convert class name to integer if needed
            if isinstance(class_name, str):
                class_label = 1 if class_name.lower() == 'defect' else 0
            else:
                class_label = int(class_name)
            
            images.append(img_array)
            bbox_targets.append(bbox)
            class_labels.append(class_label)
            
        except Exception as e:
            print(f"Error processing row {index}: {str(e)}")
            continue
    
    if not images:
        raise ValueError("No valid images were processed!")
    
    # Convert lists to numpy arrays
    images = np.array(images, dtype=np.float32)
    bbox_targets = np.array(bbox_targets, dtype=np.float32)
    class_labels = np.array(class_labels, dtype=np.float32)
    
    # Print shapes for debugging
    print(f"Images shape: {images.shape}")
    print(f"Bounding box targets shape: {bbox_targets.shape}")
    print(f"Class labels shape: {class_labels.shape}")
    
    # Reshape class labels to have shape (n, 1)
    class_labels = class_labels.reshape(-1, 1)
    
    # Create targets dictionary
    targets = {
        'class_output': class_labels,
        'bbox_output': bbox_targets
    }
    
    return images, targets

# Prepare datasets with error handling
try:
    print("\nPreparing training data...")
    train_images, train_targets = prepare_dataset(TRAIN_CSV_FILE, IMAGE_DIR)
    
    print("\nPreparing validation data...")
    val_images, val_targets = prepare_dataset(VAL_CSV_FILE, IMAGE_DIR)
    
    print("\nPreparing test data...")
    test_images, test_targets = prepare_dataset(TEST_CSV_FILE, IMAGE_DIR)
    
    # Final verification of shapes
    print("\nFinal dataset shapes:")
    print(f"Training:")
    print(f"- Images: {train_images.shape}")
    print(f"- Class labels: {train_targets['class_output'].shape}")
    print(f"- Bounding boxes: {train_targets['bbox_output'].shape}")
    
    print(f"\nValidation:")
    print(f"- Images: {val_images.shape}")
    print(f"- Class labels: {val_targets['class_output'].shape}")
    print(f"- Bounding boxes: {val_targets['bbox_output'].shape}")
    
    print(f"\nTest:")
    print(f"- Images: {test_images.shape}")
    print(f"- Class labels: {test_targets['class_output'].shape}")
    print(f"- Bounding boxes: {test_targets['bbox_output'].shape}")
    
except Exception as e:
    print(f"Error during dataset preparation: {str(e)}")


Preparing training data...
Number of records in CSV: 1355
Invalid bbox coordinates in row 2: xmin=0.0, xmax=0.0, ymin=0.0, ymax=0.0
Invalid bbox coordinates in row 4: xmin=0.0, xmax=0.0, ymin=0.0, ymax=0.0
Invalid bbox coordinates in row 7: xmin=0.0, xmax=0.0, ymin=0.0, ymax=0.0
Invalid bbox coordinates in row 12: xmin=0.0, xmax=0.0, ymin=0.0, ymax=0.0
Invalid bbox coordinates in row 16: xmin=0.0, xmax=0.0, ymin=0.0, ymax=0.0
Invalid bbox coordinates in row 38: xmin=0.0, xmax=0.0, ymin=0.0, ymax=0.0
Invalid bbox coordinates in row 40: xmin=0.0, xmax=0.0, ymin=0.0, ymax=0.0
Invalid bbox coordinates in row 41: xmin=0.0, xmax=0.0, ymin=0.0, ymax=0.0
Invalid bbox coordinates in row 50: xmin=0.0, xmax=0.0, ymin=0.0, ymax=0.0
Invalid bbox coordinates in row 60: xmin=0.0, xmax=0.0, ymin=0.0, ymax=0.0
Invalid bbox coordinates in row 63: xmin=0.0, xmax=0.0, ymin=0.0, ymax=0.0
Invalid bbox coordinates in row 69: xmin=0.0, xmax=0.0, ymin=0.0, ymax=0.0
Invalid bbox coordinates in row 73: xmin=0.0

In [3]:
def create_model():
    base_model = MobileNetV3Large(
        input_shape=(360,640,3),
        include_top=False, 
        weights='imagenet'
    )
    base_model.trainable = True

    x = base_model.output

    x = tf.keras.layers.Conv2D(32, (3, 3), strides=(1, 1), padding='same')(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.LeakyReLU(negative_slope=0.1)(x)

    ########## block 1 ##########
    x = tf.keras.layers.Conv2D(64, (3, 3), strides=(1, 1), padding='same')(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.LeakyReLU(negative_slope=0.1)(x)

    x_shortcut = x

    for i in range(2):
        x = tf.keras.layers.Conv2D(32, (3, 3), strides=(1, 1), padding='same')(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.LeakyReLU(negative_slope=0.1)(x)

        x = tf.keras.layers.Conv2D(64, (3, 3), strides=(1, 1), padding='same')(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.LeakyReLU(negative_slope=0.1)(x)

        x = tf.keras.layers.Add()([x_shortcut, x])
        x = tf.keras.layers.LeakyReLU(negative_slope=0.1)(x)

        x_shortcut = x


    ########## block 2 ##########
    x = tf.keras.layers.Conv2D(128, (3, 3), strides=(1, 1), padding='same')(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.LeakyReLU(negative_slope=0.1)(x)

    x_shortcut = x

    for i in range(2):
        x = tf.keras.layers.Conv2D(64, (3, 3), strides=(1, 1), padding='same')(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.LeakyReLU(negative_slope=0.1)(x)

        x = tf.keras.layers.Conv2D(128, (3, 3), strides=(1, 1), padding='same')(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.LeakyReLU(negative_slope=0.1)(x)

        x = tf.keras.layers.Add()([x_shortcut, x])
        x = tf.keras.layers.LeakyReLU(negative_slope=0.1)(x)

        x_shortcut = x

    ########## block 3 ##########
    x = tf.keras.layers.Conv2D(256, (3, 3), strides=(2, 2), padding='same')(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.LeakyReLU(negative_slope=0.1)(x)

    x_shortcut = x

    for i in range(3):
        x = tf.keras.layers.Conv2D(128, (3, 3), strides=(1, 1), padding='same')(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.LeakyReLU(negative_slope=0.1)(x)

        x = tf.keras.layers.Conv2D(256, (3, 3), strides=(1, 1), padding='same')(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.LeakyReLU(negative_slope=0.1)(x)

        x = tf.keras.layers.Add()([x_shortcut, x])
        x = tf.keras.layers.LeakyReLU(negative_slope=0.1)(x)

        x_shortcut = x

        
    ########## block 4 ##########
    x = tf.keras.layers.Conv2D(512, (3, 3), strides=(2, 2), padding='same')(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.LeakyReLU(negative_slope=0.1)(x)

    x_shortcut = x

    for i in range(5):
        x = tf.keras.layers.Conv2D(256, (3, 3), strides=(1, 1), padding='same')(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.LeakyReLU(negative_slope=0.1)(x)

        x = tf.keras.layers.Conv2D(512, (3, 3), strides=(1, 1), padding='same')(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.LeakyReLU(negative_slope=0.1)(x)

        x = tf.keras.layers.Add()([x_shortcut, x])
        x = tf.keras.layers.LeakyReLU(negative_slope=0.1)(x)

        x_shortcut = x

    ########## block 5 ##########
    x = tf.keras.layers.Conv2D(1024, (3, 3), strides=(2, 2), padding='same')(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.LeakyReLU(negative_slope=0.1)(x)

    x_shortcut = x

    for i in range(8):
        x = tf.keras.layers.Conv2D(512, (3, 3), strides=(1, 1), padding='same')(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.LeakyReLU(negative_slope=0.1)(x)

        x = tf.keras.layers.Conv2D(1024, (3, 3), strides=(1, 1), padding='same')(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.LeakyReLU(negative_slope=0.1)(x)

        x = tf.keras.layers.Add()([x_shortcut, x])
        x = tf.keras.layers.LeakyReLU(negative_slope=0.1)(x)

        x_shortcut = x

    ########## output layers ##########
    x = tf.keras.layers.Conv2D(512, (3, 3), strides=(1, 1), padding='same')(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.LeakyReLU(negative_slope=0.1)(x)

    x = tf.keras.layers.Conv2D(256, (3, 3), strides=(1, 1), padding='same')(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.LeakyReLU(negative_slope=0.1)(x)

    x = tf.keras.layers.Conv2D(128, (3, 3), strides=(1, 1), padding='same')(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.LeakyReLU(negative_slope=0.1)(x)

    # Classification head
    class_branch = tf.keras.layers.GlobalAveragePooling2D()(x)
    class_branch = tf.keras.layers.Dense(64, activation='relu')(class_branch)
    class_output = tf.keras.layers.Dense(1, activation='sigmoid', name='class_output')(class_branch)

    # Bounding box head
    bbox_branch = tf.keras.layers.GlobalAveragePooling2D()(x)
    bbox_branch = tf.keras.layers.Dense(64, activation='relu')(bbox_branch)
    bbox_output = tf.keras.layers.Dense(4, activation='sigmoid', name='bbox_output')(bbox_branch)

    model = tf.keras.Model(
        inputs=base_model.input, 
        outputs={
            'class_output': class_output,
            'bbox_output': bbox_output
        }
    )
    return model

model = create_model()

  return MobileNetV3(


In [8]:
def custom_loss(y_true, y_pred):
    binary_crossentropy = prob_loss = tf.keras.losses.BinaryCrossentropy(
        reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE
    )
    
    prob_loss = binary_crossentropy(
        tf.concat([y_true[:,:,:,0], y_true[:,:,:,5]], axis=0), 
        tf.concat([y_pred[:,:,:,0], y_pred[:,:,:,5]], axis=0)
    )
    
    xy_loss = tf.keras.losses.MSE(
        tf.concat([y_true[:,:,:,1:3], y_true[:,:,:,6:8]], axis=0), 
        tf.concat([y_pred[:,:,:,1:3], y_pred[:,:,:,6:8]], axis=0)
    )
    
    wh_loss = tf.keras.losses.MSE(
        tf.concat([y_true[:,:,:,3:5], y_true[:,:,:,8:10]], axis=0), 
        tf.concat([y_pred[:,:,:,3:5], y_pred[:,:,:,8:10]], axis=0)
    )
    
    bboxes_mask = get_mask(y_true)
    
    xy_loss = xy_loss * bboxes_mask
    wh_loss = wh_loss * bboxes_mask
    
    return prob_loss + xy_loss + wh_loss

def get_mask(y_true):
    anchor_one_mask = tf.where(
        y_true[:,:,:,0] == 0, 
        0.5, 
        5.0
    )
    
    anchor_two_mask = tf.where(
        y_true[:,:,:,5] == 0, 
        0.5, 
        5.0
    )
    
    bboxes_mask = tf.concat(
        [anchor_one_mask,anchor_two_mask],
        axis=0
    )
    
    return bboxes_mask

In [4]:
optimiser = tf.keras.optimizers.Adam(learning_rate=0.0001)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss={
        'class_output': 'binary_crossentropy',
        'bbox_output': 'mse'
    },
    metrics={
        'class_output': 'accuracy',
        'bbox_output': 'mse'
    }
)

model.summary()

In [10]:
callbacks = [
        tf.keras.callbacks.ModelCheckpoint(
            'best_model.h5',
            save_best_only=True,
            monitor='val_loss'
        ),
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=10,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='loss',
            factor=0.2,
            patience=5,
            min_lr=1e-6,
            restore_best_weights=True
        )
    ]

history = model.fit(
        train_images,
        train_targets,
        validation_data=(val_images, val_targets),
        epochs=64,
        batch_size=32,
        callbacks=callbacks
    )

Epoch 1/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - bbox_output_loss: 0.0476 - bbox_output_mse: 0.0476 - class_output_accuracy: 0.9205 - class_output_loss: 0.2750 - loss: 0.3226



[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 4s/step - bbox_output_loss: 0.0471 - bbox_output_mse: 0.0471 - class_output_accuracy: 0.9222 - class_output_loss: 0.2712 - loss: 0.3184 - val_bbox_output_loss: 0.0518 - val_bbox_output_mse: 0.0518 - val_class_output_accuracy: 1.0000 - val_class_output_loss: 0.4102 - val_loss: 0.4619 - learning_rate: 1.0000e-04
Epoch 2/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - bbox_output_loss: 0.0142 - bbox_output_mse: 0.0142 - class_output_accuracy: 1.0000 - class_output_loss: 0.0441 - loss: 0.0583



[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 4s/step - bbox_output_loss: 0.0142 - bbox_output_mse: 0.0142 - class_output_accuracy: 1.0000 - class_output_loss: 0.0439 - loss: 0.0581 - val_bbox_output_loss: 0.0361 - val_bbox_output_mse: 0.0361 - val_class_output_accuracy: 1.0000 - val_class_output_loss: 0.2414 - val_loss: 0.2775 - learning_rate: 1.0000e-04
Epoch 3/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - bbox_output_loss: 0.0091 - bbox_output_mse: 0.0091 - class_output_accuracy: 1.0000 - class_output_loss: 0.0259 - loss: 0.0350



[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 3s/step - bbox_output_loss: 0.0091 - bbox_output_mse: 0.0091 - class_output_accuracy: 1.0000 - class_output_loss: 0.0258 - loss: 0.0349 - val_bbox_output_loss: 0.0259 - val_bbox_output_mse: 0.0259 - val_class_output_accuracy: 1.0000 - val_class_output_loss: 0.1421 - val_loss: 0.1680 - learning_rate: 1.0000e-04
Epoch 4/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - bbox_output_loss: 0.0077 - bbox_output_mse: 0.0077 - class_output_accuracy: 1.0000 - class_output_loss: 0.0182 - loss: 0.0259



[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 3s/step - bbox_output_loss: 0.0077 - bbox_output_mse: 0.0077 - class_output_accuracy: 1.0000 - class_output_loss: 0.0182 - loss: 0.0259 - val_bbox_output_loss: 0.0193 - val_bbox_output_mse: 0.0192 - val_class_output_accuracy: 1.0000 - val_class_output_loss: 0.0866 - val_loss: 0.1058 - learning_rate: 1.0000e-04
Epoch 5/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - bbox_output_loss: 0.0068 - bbox_output_mse: 0.0068 - class_output_accuracy: 1.0000 - class_output_loss: 0.0137 - loss: 0.0205



[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 3s/step - bbox_output_loss: 0.0068 - bbox_output_mse: 0.0068 - class_output_accuracy: 1.0000 - class_output_loss: 0.0137 - loss: 0.0205 - val_bbox_output_loss: 0.0165 - val_bbox_output_mse: 0.0164 - val_class_output_accuracy: 1.0000 - val_class_output_loss: 0.0565 - val_loss: 0.0729 - learning_rate: 1.0000e-04
Epoch 6/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - bbox_output_loss: 0.0053 - bbox_output_mse: 0.0053 - class_output_accuracy: 1.0000 - class_output_loss: 0.0105 - loss: 0.0158



[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 3s/step - bbox_output_loss: 0.0053 - bbox_output_mse: 0.0053 - class_output_accuracy: 1.0000 - class_output_loss: 0.0105 - loss: 0.0158 - val_bbox_output_loss: 0.0153 - val_bbox_output_mse: 0.0153 - val_class_output_accuracy: 1.0000 - val_class_output_loss: 0.0376 - val_loss: 0.0529 - learning_rate: 1.0000e-04
Epoch 7/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - bbox_output_loss: 0.0049 - bbox_output_mse: 0.0049 - class_output_accuracy: 1.0000 - class_output_loss: 0.0083 - loss: 0.0131



[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 3s/step - bbox_output_loss: 0.0049 - bbox_output_mse: 0.0049 - class_output_accuracy: 1.0000 - class_output_loss: 0.0082 - loss: 0.0131 - val_bbox_output_loss: 0.0153 - val_bbox_output_mse: 0.0153 - val_class_output_accuracy: 1.0000 - val_class_output_loss: 0.0273 - val_loss: 0.0426 - learning_rate: 1.0000e-04
Epoch 8/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - bbox_output_loss: 0.0039 - bbox_output_mse: 0.0039 - class_output_accuracy: 1.0000 - class_output_loss: 0.0068 - loss: 0.0107



[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 3s/step - bbox_output_loss: 0.0039 - bbox_output_mse: 0.0039 - class_output_accuracy: 1.0000 - class_output_loss: 0.0068 - loss: 0.0107 - val_bbox_output_loss: 0.0131 - val_bbox_output_mse: 0.0131 - val_class_output_accuracy: 1.0000 - val_class_output_loss: 0.0193 - val_loss: 0.0324 - learning_rate: 1.0000e-04
Epoch 9/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - bbox_output_loss: 0.0032 - bbox_output_mse: 0.0032 - class_output_accuracy: 1.0000 - class_output_loss: 0.0055 - loss: 0.0086



[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 3s/step - bbox_output_loss: 0.0032 - bbox_output_mse: 0.0032 - class_output_accuracy: 1.0000 - class_output_loss: 0.0055 - loss: 0.0086 - val_bbox_output_loss: 0.0129 - val_bbox_output_mse: 0.0128 - val_class_output_accuracy: 1.0000 - val_class_output_loss: 0.0134 - val_loss: 0.0262 - learning_rate: 1.0000e-04
Epoch 10/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - bbox_output_loss: 0.0028 - bbox_output_mse: 0.0028 - class_output_accuracy: 1.0000 - class_output_loss: 0.0046 - loss: 0.0075



[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 3s/step - bbox_output_loss: 0.0029 - bbox_output_mse: 0.0028 - class_output_accuracy: 1.0000 - class_output_loss: 0.0046 - loss: 0.0075 - val_bbox_output_loss: 0.0126 - val_bbox_output_mse: 0.0125 - val_class_output_accuracy: 1.0000 - val_class_output_loss: 0.0105 - val_loss: 0.0230 - learning_rate: 1.0000e-04
Epoch 11/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - bbox_output_loss: 0.0026 - bbox_output_mse: 0.0026 - class_output_accuracy: 1.0000 - class_output_loss: 0.0038 - loss: 0.0063



[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 3s/step - bbox_output_loss: 0.0025 - bbox_output_mse: 0.0025 - class_output_accuracy: 1.0000 - class_output_loss: 0.0038 - loss: 0.0063 - val_bbox_output_loss: 0.0117 - val_bbox_output_mse: 0.0117 - val_class_output_accuracy: 1.0000 - val_class_output_loss: 0.0081 - val_loss: 0.0198 - learning_rate: 1.0000e-04
Epoch 12/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 3s/step - bbox_output_loss: 0.0023 - bbox_output_mse: 0.0023 - class_output_accuracy: 1.0000 - class_output_loss: 0.0033 - loss: 0.0055 - val_bbox_output_loss: 0.0134 - val_bbox_output_mse: 0.0134 - val_class_output_accuracy: 1.0000 - val_class_output_loss: 0.0067 - val_loss: 0.0201 - learning_rate: 1.0000e-04
Epoch 13/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - bbox_output_loss: 0.0014 - bbox_output_mse: 0.0014 - class_output_accuracy: 1.0000 - class_output_loss: 0.0028 - loss: 0.0043



[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 3s/step - bbox_output_loss: 0.0014 - bbox_output_mse: 0.0014 - class_output_accuracy: 1.0000 - class_output_loss: 0.0028 - loss: 0.0043 - val_bbox_output_loss: 0.0128 - val_bbox_output_mse: 0.0128 - val_class_output_accuracy: 1.0000 - val_class_output_loss: 0.0050 - val_loss: 0.0178 - learning_rate: 1.0000e-04
Epoch 14/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 3s/step - bbox_output_loss: 0.0016 - bbox_output_mse: 0.0016 - class_output_accuracy: 1.0000 - class_output_loss: 0.0025 - loss: 0.0041 - val_bbox_output_loss: 0.0143 - val_bbox_output_mse: 0.0143 - val_class_output_accuracy: 1.0000 - val_class_output_loss: 0.0043 - val_loss: 0.0186 - learning_rate: 1.0000e-04
Epoch 15/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - bbox_output_loss: 0.0015 - bbox_output_mse: 0.0015 - class_output_accuracy: 1.0000 - class_output_loss: 0.0022 - loss: 0.0037



[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 3s/step - bbox_output_loss: 0.0015 - bbox_output_mse: 0.0015 - class_output_accuracy: 1.0000 - class_output_loss: 0.0022 - loss: 0.0037 - val_bbox_output_loss: 0.0140 - val_bbox_output_mse: 0.0139 - val_class_output_accuracy: 1.0000 - val_class_output_loss: 0.0034 - val_loss: 0.0173 - learning_rate: 1.0000e-04
Epoch 16/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - bbox_output_loss: 0.0016 - bbox_output_mse: 0.0016 - class_output_accuracy: 1.0000 - class_output_loss: 0.0020 - loss: 0.0036



[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 3s/step - bbox_output_loss: 0.0016 - bbox_output_mse: 0.0016 - class_output_accuracy: 1.0000 - class_output_loss: 0.0020 - loss: 0.0036 - val_bbox_output_loss: 0.0131 - val_bbox_output_mse: 0.0131 - val_class_output_accuracy: 1.0000 - val_class_output_loss: 0.0028 - val_loss: 0.0159 - learning_rate: 1.0000e-04
Epoch 17/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - bbox_output_loss: 0.0015 - bbox_output_mse: 0.0015 - class_output_accuracy: 1.0000 - class_output_loss: 0.0017 - loss: 0.0032



[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 3s/step - bbox_output_loss: 0.0015 - bbox_output_mse: 0.0015 - class_output_accuracy: 1.0000 - class_output_loss: 0.0017 - loss: 0.0032 - val_bbox_output_loss: 0.0134 - val_bbox_output_mse: 0.0133 - val_class_output_accuracy: 1.0000 - val_class_output_loss: 0.0023 - val_loss: 0.0157 - learning_rate: 1.0000e-04
Epoch 18/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 3s/step - bbox_output_loss: 0.0016 - bbox_output_mse: 0.0016 - class_output_accuracy: 1.0000 - class_output_loss: 0.0015 - loss: 0.0031 - val_bbox_output_loss: 0.0173 - val_bbox_output_mse: 0.0173 - val_class_output_accuracy: 1.0000 - val_class_output_loss: 0.0020 - val_loss: 0.0193 - learning_rate: 1.0000e-04
Epoch 19/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 3s/step - bbox_output_loss: 0.0013 - bbox_output_mse: 0.0013 - class_output_accuracy: 1.0000 - class_output_loss: 0.0014 - loss: 0.0027 - val_bbox_output_



[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 3s/step - bbox_output_loss: 0.0013 - bbox_output_mse: 0.0013 - class_output_accuracy: 1.0000 - class_output_loss: 0.0013 - loss: 0.0026 - val_bbox_output_loss: 0.0133 - val_bbox_output_mse: 0.0132 - val_class_output_accuracy: 1.0000 - val_class_output_loss: 0.0017 - val_loss: 0.0149 - learning_rate: 1.0000e-04
Epoch 21/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - bbox_output_loss: 0.0013 - bbox_output_mse: 0.0013 - class_output_accuracy: 1.0000 - class_output_loss: 0.0012 - loss: 0.0025



[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 3s/step - bbox_output_loss: 0.0013 - bbox_output_mse: 0.0013 - class_output_accuracy: 1.0000 - class_output_loss: 0.0012 - loss: 0.0025 - val_bbox_output_loss: 0.0122 - val_bbox_output_mse: 0.0122 - val_class_output_accuracy: 1.0000 - val_class_output_loss: 0.0015 - val_loss: 0.0137 - learning_rate: 1.0000e-04
Epoch 22/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 3s/step - bbox_output_loss: 0.0015 - bbox_output_mse: 0.0015 - class_output_accuracy: 1.0000 - class_output_loss: 0.0011 - loss: 0.0026 - val_bbox_output_loss: 0.0156 - val_bbox_output_mse: 0.0156 - val_class_output_accuracy: 1.0000 - val_class_output_loss: 0.0013 - val_loss: 0.0169 - learning_rate: 1.0000e-04
Epoch 23/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 3s/step - bbox_output_loss: 0.0012 - bbox_output_mse: 0.0012 - class_output_accuracy: 1.0000 - class_output_loss: 9.8980e-04 - loss: 0.0022 - val_bbox_out

In [9]:
test_img = 'Images/1_png.rf.0af495bdd3ab02ab14bef9a8c62a5507.jpg'
model = create_model()
model.load_weights('best_model.h5')

img = keras.preprocessing.image.load_img(test_img, target_size=(360, 640))
img_array = keras.preprocessing.image.img_to_array(img)
img_array = tf.expand_dims(img_array, 0)
predictions = model.predict(img_array)
print(predictions)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
{'class_output': array([[0.99857026]], dtype=float32), 'bbox_output': array([[0.37416387, 0.2110887 , 0.65500563, 0.7219951 ]], dtype=float32)}


In [10]:
bbox = predictions["bbox_output"]
bbox = [bbox[0][0] * 640, bbox[0][1] * 360, bbox[0][2] * 640, bbox[0][3] * 360]
print(bbox)

class_prediction_value = round(predictions["class_output"][0][0])

classes = ["nothing", "burger"]
print("Predicted class: {}".format(classes[class_prediction_value]))

[np.float32(239.46487), np.float32(75.991936), np.float32(419.2036), np.float32(259.91824)]
Predicted class: burger


In [13]:
testing = keras.preprocessing.image.load_img(test_img, target_size=(360, 640))

img_width, img_height = testing.size
x1 = int(bbox[0])
y1 = int(bbox[1])
x2 = int(bbox[2])
y2 = int(bbox[3])

draw1 = Draw(testing)
bbox_pixels = [(x1, y1), (x2, y2)]
draw1.rectangle(bbox_pixels, outline='red', width=3)
testing.show()

In [35]:
cam = cv2.VideoCapture(1) 
cam.set(cv2.CAP_PROP_FPS, 120)

while True:
    ret, frame = cam.read()

    frame_resized = cv2.resize(frame, (640, 360))
    frame_rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)

    img_array = tf.expand_dims(frame_rgb, 0)

    predictions = model.predict(img_array, verbose=0)

    bbox = predictions["bbox_output"]
    bbox = [bbox[0][0] * 640, bbox[0][1] * 360, bbox[0][2] * 640, bbox[0][3] * 360]

    class_prediction_value = round(predictions["class_output"][0][0])
    class_name = classes[class_prediction_value]
    print(class_name)

    if class_name != "nothing":
        x1, y1, x2, y2 = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])
        cv2.rectangle(frame_resized, (x1, y1), (x2, y2), (0, 0, 255), 2)
        cv2.putText(frame_resized, class_name, (x1, y1-10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)

    cv2.imshow('Camera Feed', frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cam.release()
cv2.destroyAllWindows()

burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger
burger


KeyboardInterrupt: 