In [3]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pathlib
import pandas as pd
import cv2
import glob
import traceback
import xml.etree.ElementTree as ET
import albumentations as A

import tensorflow as tf
print(tf.__version__)
from tensorflow import keras
from tensorflow.keras.applications import MobileNetV3Large
from sklearn.model_selection import train_test_split

import PIL
from PIL import Image 
from PIL.ImageDraw import Draw

2.19.0


In [2]:
def xml_to_csv(path):
    """
    Convert XML annotation files into a flat annotation table.

    Every ``*.xml`` file directly inside ``path`` is parsed (in sorted filename
    order, so the row order is reproducible). The fields read from each file are
    ``filename``, ``size/width``, ``size/height`` and, for each ``object``, its
    ``name`` and ``bndbox/{xmin,ymin,xmax,ymax}``.

    * An image with at least one object named ``burger`` produces one row per
      burger box, with class 1. Objects with any other name, or without a
      usable ``name`` tag, are ignored.
    * An image without any burger object produces exactly one placeholder row
      with class 0 and an all-zero box.

    Args:
        path: Path to the folder containing XML files
    Returns:
        Pandas DataFrame with columns ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax'].
        When no XML file is found the frame is empty but still has these columns.
    """
    columns = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']

    def is_burger(obj):
        """Return True when the object's <name> text, stripped, equals 'burger'."""
        name_tag = obj.find('name')
        return (
            name_tag is not None
            and name_tag.text is not None
            and name_tag.text.strip() == 'burger'
        )

    def to_pixel(text):
        # Coordinates may be written as floats ("12.0"); int() alone rejects those.
        return int(round(float(text)))

    xml_list = []

    # sorted(): glob order depends on the file system, and the later split is
    # seeded on row order, so a stable order keeps the splits reproducible.
    for xml_file in sorted(glob.glob(os.path.join(path, '*.xml'))):
        root = ET.parse(xml_file).getroot()

        # Get image size
        width = int(root.find('size/width').text)
        height = int(root.find('size/height').text)
        filename = root.find('filename').text

        # The same safe predicate is used for detection AND extraction, so a
        # nameless object next to a burger can no longer raise AttributeError.
        burgers = [obj for obj in root.findall('object') if is_burger(obj)]

        if not burgers:
            # Negative image: single placeholder row with an all-zero box.
            xml_list.append({
                'filename': filename,
                'width': width,
                'height': height,
                'class': 0,
                'xmin': 0,
                'ymin': 0,
                'xmax': 0,
                'ymax': 0
            })
            continue

        for obj in burgers:
            bbox = obj.find('bndbox')
            xml_list.append({
                'filename': filename,
                'width': width,
                'height': height,
                'class': 1,  # BURGER
                'xmin': to_pixel(bbox.find('xmin').text),
                'ymin': to_pixel(bbox.find('ymin').text),
                'xmax': to_pixel(bbox.find('xmax').text),
                'ymax': to_pixel(bbox.find('ymax').text)
            })

    # Passing `columns` fixes the column order and keeps the schema even when
    # no annotation was found (an empty frame has no columns to select from).
    return pd.DataFrame(xml_list, columns=columns)

In [None]:
# Parse every XML annotation in the Images folder into one table.
df = xml_to_csv('Images')

# Summarise how many rows each class contributes.
is_positive_row = df['class'] == 1
is_negative_row = df['class'] == 0
print(f"\nTotal annotations found: {len(df)}")
print(f"Number of burger annotations (class 1): {int(is_positive_row.sum())}")
print(f"Number of non-burger images (class 0): {int(is_negative_row.sum())}")

# Destination of the combined annotation CSV (written by the next cell).
output_file = 'Dataset/annotations.csv'

In [None]:
# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing (a bare filename has no directory part).
os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)
df.to_csv(output_file, index=False)
print(f"\nSaved annotations to {output_file}")
print("\nFirst few rows of the CSV:")
print(df.head())

In [6]:
# Augmentation pipeline applied by augment_image() via transform(image=...).
# NOTE(review): no bbox_params are passed to A.Compose, so as configured here
# the pipeline receives images only; bounding boxes are not updated alongside.
transform = A.Compose([
    # Always applied (p=1.0): random crop resized to size=[360, 640]
    # (presumably height x width, matching the (360, 640, 3) source images).
    A.RandomResizedCrop(size=[360, 640], scale=(0.5, 1.0), ratio=(0.75, 1.33), p=1.0),
    # Mirror half of the samples.
    A.HorizontalFlip(p=0.5),
    # Geometric jitter, applied to half of the samples.
    A.Affine(
        scale=(0.85, 1.15),      # zoom range
        rotate=(-12, 12),      # rotation range
        translate_percent=(-0.1, 0.1), # shift range, as a fraction
        shear=(-10, 10),          # shear range
        p=0.5
    ),
    # Occlusion: in 20% of the samples exactly one of the two dropout variants is used.
    A.OneOf([
        A.CoarseDropout(num_holes_range=(1, 4), hole_height_range=(0.1, 0.25),
                        hole_width_range=(0.1, 0.35), p=1.0),
        A.GridDropout(ratio=0.3, unit_size_range=(10, 20), fill="inpaint_ns", p=1.0)
    ], p=0.2),
    # Sensor-style degradation: in 50% of the samples either noise or motion blur.
    A.OneOf([
        A.ISONoise(color_shift=(0.01, 0.05), intensity=(0.15, 0.4), p=1.0),
        A.MotionBlur(
            angle_range=(0, 0),
            direction_range=(0.5, 1.0),
            p=1.0)
    ], p=0.5)
])

In [7]:
def augment_image(image_path, output_dir, num_augmentations=3):
    """
    Write several randomly augmented copies of one image to ``output_dir``.

    The image is read with OpenCV, converted to RGB, passed through the
    module-level ``transform`` pipeline ``num_augmentations`` times and each
    result is saved as ``<original stem>_aug_<k>.jpg`` (k starts at 1).

    Only the image is passed to ``transform``; no bounding boxes are
    transformed or written by this function.

    Args:
        image_path: Path of the source image.
        output_dir: Folder receiving the augmented images (created if missing).
        num_augmentations: Number of augmented copies to produce (default: 3).

    Raises:
        ValueError: If the source image cannot be read.
        IOError: If an augmented image cannot be written.
    """
    # cv2.imread signals failure by returning None instead of raising.
    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(f"Could not read image: {image_path}")

    print(f"Processing image: {image_path}, shape: {image.shape}")

    # Make the function usable on its own, not only after the caller created the folder.
    os.makedirs(output_dir, exist_ok=True)

    # OpenCV loads BGR; the pipeline is fed RGB.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Get base filename without extension
    base_name = os.path.splitext(os.path.basename(image_path))[0]

    for i in range(num_augmentations):
        # Each call draws fresh random parameters, so every copy differs.
        augmented_image = transform(image=image)['image']

        # Convert RGB back to BGR for saving
        augmented_image = cv2.cvtColor(augmented_image, cv2.COLOR_RGB2BGR)

        output_filename = f"{base_name}_aug_{i+1}.jpg"
        output_path = os.path.join(output_dir, output_filename)

        # cv2.imwrite returns False on failure instead of raising; without this
        # check the function would report "Saved" for files that do not exist.
        if not cv2.imwrite(output_path, augmented_image):
            raise IOError(f"Could not write augmented image: {output_path}")

        print(f"Saved augmented image: {output_filename}")


In [9]:
# Folder that receives the augmented copies; created on first use.
output_dir = "Augmented_Images"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Source images: every JPG in the Images folder, followed by every PNG.
image_paths = [
    found_path
    for pattern in ("Images/*.jpg", "Images/*.png")
    for found_path in glob.glob(pattern)
]

# Augment image by image; a failure is reported with its traceback and the
# loop moves on to the next file.
for image_path in image_paths:
    try:
        augment_image(image_path, output_dir)
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        print(traceback.format_exc())

print("Augmentation completed!")

Processing image: Images\1-copy_png.rf.75b0339aa72da0f13d9b38adeea13dbe.jpg, shape: (360, 640, 3)
Saved augmented image: 1-copy_png.rf.75b0339aa72da0f13d9b38adeea13dbe_aug_1.jpg
Saved augmented image: 1-copy_png.rf.75b0339aa72da0f13d9b38adeea13dbe_aug_2.jpg
Saved augmented image: 1-copy_png.rf.75b0339aa72da0f13d9b38adeea13dbe_aug_3.jpg
Processing image: Images\100_png.rf.04808b767e584757447ffa48a789af40.jpg, shape: (360, 640, 3)
Saved augmented image: 100_png.rf.04808b767e584757447ffa48a789af40_aug_1.jpg
Saved augmented image: 100_png.rf.04808b767e584757447ffa48a789af40_aug_2.jpg
Saved augmented image: 100_png.rf.04808b767e584757447ffa48a789af40_aug_3.jpg
Processing image: Images\100_png.rf.11490e40b9e259fabbdb0cedefd2fa65.jpg, shape: (360, 640, 3)
Saved augmented image: 100_png.rf.11490e40b9e259fabbdb0cedefd2fa65_aug_1.jpg
Saved augmented image: 100_png.rf.11490e40b9e259fabbdb0cedefd2fa65_aug_2.jpg
Saved augmented image: 100_png.rf.11490e40b9e259fabbdb0cedefd2fa65_aug_3.jpg
Processing

KeyboardInterrupt: 

In [5]:
def _split_rows_by_image(rows, holdout_fraction, random_state):
    """Split annotation rows into (kept, held_out) without separating the rows of one image."""
    image_names = rows['filename'].drop_duplicates()
    if len(image_names) < 2:
        # Nothing to split (class absent or a single image): keep everything.
        return rows, rows.iloc[0:0]

    _, held_names = train_test_split(
        image_names,
        test_size=holdout_fraction,
        random_state=random_state
    )
    held_mask = rows['filename'].isin(held_names)
    return rows[~held_mask], rows[held_mask]


def split_dataset(csv_path, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, random_state=202):
    """
    Split the dataset into train, validation and test sets while maintaining class balance

    Positive (class 1) and negative (class 0) rows are split separately so each
    set keeps roughly the same class mix. The split is made per image
    (``filename``), not per row: an image with several annotated boxes has all
    of its rows in exactly one set, so no image is shared between training and
    evaluation data.

    The three sets are also written to ``Dataset/train.csv``,
    ``Dataset/val.csv`` and ``Dataset/test.csv``.

    Args:
        csv_path: Path to the annotations CSV file
        train_ratio: Ratio of training data (default: 0.7)
        val_ratio: Ratio of validation data (default: 0.15)
        test_ratio: Ratio of test data (default: 0.15)
        random_state: Random seed for reproducibility

    Returns:
        Tuple ``(train_df, val_df, test_df)`` of shuffled DataFrames.

    Raises:
        ValueError: If any ratio is not strictly positive.
    """
    if min(train_ratio, val_ratio, test_ratio) <= 0:
        raise ValueError("train_ratio, val_ratio and test_ratio must all be positive")

    # Ratios are treated as relative weights, so they are honoured even when
    # they do not add up to exactly 1.
    test_fraction = test_ratio / (train_ratio + val_ratio + test_ratio)
    # Share of the remaining (train + val) data that goes to validation.
    val_fraction = val_ratio / (train_ratio + val_ratio)

    # Read the CSV file
    df = pd.read_csv(csv_path)

    # Create separate dataframes for positive and negative samples
    negative_samples = df[df['class'] == 0]
    positive_samples = df[df['class'] == 1]

    print(f"Total samples: {len(df)}")
    print(f"Positive samples (class 1): {len(positive_samples)}")
    print(f"Negative samples (class 0): {len(negative_samples)}")

    # First split: separate test set
    neg_train_val, neg_test = _split_rows_by_image(negative_samples, test_fraction, random_state)
    pos_train_val, pos_test = _split_rows_by_image(positive_samples, test_fraction, random_state)

    # Second split: separate train and validation from the remaining data
    neg_train, neg_val = _split_rows_by_image(neg_train_val, val_fraction, random_state)
    pos_train, pos_val = _split_rows_by_image(pos_train_val, val_fraction, random_state)

    # Combine positive and negative samples for each set
    train_df = pd.concat([pos_train, neg_train])
    val_df = pd.concat([pos_val, neg_val])
    test_df = pd.concat([pos_test, neg_test])

    # Shuffle each dataset
    train_df = train_df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    val_df = val_df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    test_df = test_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Print statistics
    print("\nDataset split statistics:")
    print(f"Training set: {len(train_df)} samples")
    print(f"  - Positive (class 1): {len(train_df[train_df['class'] == 1])}")
    print(f"  - Negative (class 0): {len(train_df[train_df['class'] == 0])}")

    print(f"\nValidation set: {len(val_df)} samples")
    print(f"  - Positive (class 1): {len(val_df[val_df['class'] == 1])}")
    print(f"  - Negative (class 0): {len(val_df[val_df['class'] == 0])}")

    print(f"\nTest set: {len(test_df)} samples")
    print(f"  - Positive (class 1): {len(test_df[test_df['class'] == 1])}")
    print(f"  - Negative (class 0): {len(test_df[test_df['class'] == 0])}")

    # Save the splits to CSV files (to_csv does not create the folder itself)
    os.makedirs('Dataset', exist_ok=True)
    train_df.to_csv('Dataset/train.csv', index=False)
    val_df.to_csv('Dataset/val.csv', index=False)
    test_df.to_csv('Dataset/test.csv', index=False)

    return train_df, val_df, test_df

In [None]:
# Split the combined annotations with the default ratios and seed; split_dataset
# also writes Dataset/train.csv, Dataset/val.csv and Dataset/test.csv.
train_df, val_df, test_df = split_dataset('Dataset/annotations.csv')

In [None]:
# Split CSVs (the same paths split_dataset() writes) and the folder that the
# 'filename' column of those CSVs is resolved against.
TRAIN_CSV_FILE = 'Dataset/train.csv'
VAL_CSV_FILE = 'Dataset/val.csv'
TEST_CSV_FILE = 'Dataset/test.csv'
IMAGE_DIR = 'Images'

def prepare_dataset(csv_file, image_dir):
    """
    Prepare images and targets from a CSV file

    NOTE: a later cell of this notebook defines ``prepare_dataset`` again with
    extra validation; once that cell has run, it replaces this version.

    Each CSV row yields one sample: the image is loaded at 360x640 and the
    box is divided by the recorded image width/height, giving
    (xmin, ymin, xmax, ymax) fractions rounded to 4 decimals.

    Pixel values are kept in the 0-255 range. The model built in this notebook
    uses MobileNetV3Large with its default built-in input rescaling, and the
    inference cells feed raw 0-255 pixels, so dividing by 255 here would both
    rescale twice and make training inputs differ from inference inputs.

    Args:
        csv_file: Path to CSV file containing annotations
        image_dir: Directory containing the images

    Returns:
        images: float32 numpy array of images, one per CSV row
        targets: dictionary with 'class_output' (class labels) and
            'bbox_output' (normalised boxes), both float32 numpy arrays
    """
    # Read the CSV file
    df = pd.read_csv(csv_file)

    images = []
    bbox_targets = []
    class_labels = []

    for _, row in df.iterrows():
        width = row['width']
        height = row['height']

        # Load the image at the fixed network input size.
        image_path = os.path.join(image_dir, row['filename'])
        img = keras.preprocessing.image.load_img(image_path, target_size=(360, 640))
        img_array = keras.preprocessing.image.img_to_array(img)

        # Normalize bounding box coordinates
        bbox = [
            round(row['xmin']/width, 4),
            round(row['ymin']/height, 4),
            round(row['xmax']/width, 4),
            round(row['ymax']/height, 4)
        ]

        images.append(img_array)
        bbox_targets.append(bbox)
        class_labels.append(row['class'])  # 0 or 1

    # Convert lists to numpy arrays
    images = np.array(images, dtype=np.float32)
    bbox_targets = np.array(bbox_targets, dtype=np.float32)
    class_labels = np.array(class_labels, dtype=np.float32)

    # Create targets dictionary for multi-output model
    targets = {
        'class_output': class_labels,
        'bbox_output': bbox_targets
    }

    return images, targets

# Prepare train, validation and test sets
# Load the three splits into memory.
print("Preparing training data...")
train_images, train_targets = prepare_dataset(TRAIN_CSV_FILE, IMAGE_DIR)

print("Preparing validation data...")
val_images, val_targets = prepare_dataset(VAL_CSV_FILE, IMAGE_DIR)

print("Preparing test data...")
test_images, test_targets = prepare_dataset(TEST_CSV_FILE, IMAGE_DIR)

# Report the array shapes of every split as a quick sanity check.
print("\nDataset shapes:")
loaded_splits = (
    ("Training", train_images, train_targets),
    ("Validation", val_images, val_targets),
    ("Test", test_images, test_targets),
)
for split_pos, (split_label, split_images, split_targets) in enumerate(loaded_splits):
    # Splits after the first one are separated by a blank line.
    lead = "" if split_pos == 0 else "\n"
    print(f"{lead}{split_label} images: {split_images.shape}")
    print(f"{split_label} class labels: {split_targets['class_output'].shape}")
    print(f"{split_label} bbox targets: {split_targets['bbox_output'].shape}")

In [None]:
def prepare_dataset(csv_file, image_dir):
    """
    Prepare images and targets from a CSV file with validation checks

    Rows are skipped (with a printed message) when the image file is missing,
    when a positive row has an empty or inverted box, or when any other error
    occurs while processing the row.

    Negative rows (class 0) carry an all-zero placeholder box and are kept;
    the box validity check applies to positive rows only.

    Pixel values are kept in the 0-255 range. The model built in this notebook
    uses MobileNetV3Large with its default built-in input rescaling, and the
    inference cells feed raw 0-255 pixels, so dividing by 255 here would both
    rescale twice and make training inputs differ from inference inputs.

    Args:
        csv_file: Path to CSV file containing annotations
        image_dir: Directory containing the images

    Returns:
        images: float32 numpy array of the images that passed validation
        targets: dictionary with 'class_output' (shape (n, 1)) and
            'bbox_output' (shape (n, 4), (xmin, ymin, xmax, ymax) fractions
            clamped to [0, 1]), both float32

    Raises:
        ValueError: If no row could be processed.
    """
    # Read the CSV file
    df = pd.read_csv(csv_file)
    print(f"Number of records in CSV: {len(df)}")

    images = []
    bbox_targets = []
    class_labels = []

    for index, row in df.iterrows():
        try:
            # Extract data from row
            filename = row['filename']
            width = float(row['width'])  # Convert to float to avoid division issues
            height = float(row['height'])
            class_name = row['class']
            xmin = float(row['xmin'])
            ymin = float(row['ymin'])
            xmax = float(row['xmax'])
            ymax = float(row['ymax'])

            # Resolve the label first: the bbox check below depends on it.
            if isinstance(class_name, str):
                # 'burger' is this dataset's positive class; 'defect' is still
                # accepted so CSVs using the older label keep working.
                class_label = 1 if class_name.strip().lower() in ('burger', 'defect') else 0
            else:
                class_label = int(class_name)

            # Validate bounding box coordinates. Only positives have a real
            # box; negatives are stored as (0, 0, 0, 0) and would otherwise
            # all be rejected here, leaving a dataset without class 0.
            if class_label == 1 and (xmin >= xmax or ymin >= ymax):
                print(f"Invalid bbox coordinates in row {index}: xmin={xmin}, xmax={xmax}, ymin={ymin}, ymax={ymax}")
                continue

            # Load and preprocess image
            image_path = os.path.join(image_dir, filename)
            if not os.path.exists(image_path):
                print(f"Image not found: {image_path}")
                continue

            img = keras.preprocessing.image.load_img(
                image_path,
                target_size=(360, 640)
            )
            img_array = keras.preprocessing.image.img_to_array(img)

            # Normalize bounding box coordinates and clamp them to [0, 1]
            bbox = [
                max(0.0, min(1.0, round(xmin/width, 4))),
                max(0.0, min(1.0, round(ymin/height, 4))),
                max(0.0, min(1.0, round(xmax/width, 4))),
                max(0.0, min(1.0, round(ymax/height, 4)))
            ]

            images.append(img_array)
            bbox_targets.append(bbox)
            class_labels.append(class_label)

        except Exception as e:
            # Best effort: report the row and keep going with the rest.
            print(f"Error processing row {index}: {str(e)}")
            continue

    if not images:
        raise ValueError("No valid images were processed!")

    # Convert lists to numpy arrays
    images = np.array(images, dtype=np.float32)
    bbox_targets = np.array(bbox_targets, dtype=np.float32)
    class_labels = np.array(class_labels, dtype=np.float32)

    # Print shapes for debugging
    print(f"Images shape: {images.shape}")
    print(f"Bounding box targets shape: {bbox_targets.shape}")
    print(f"Class labels shape: {class_labels.shape}")

    # Reshape class labels to have shape (n, 1)
    class_labels = class_labels.reshape(-1, 1)

    # Create targets dictionary
    targets = {
        'class_output': class_labels,
        'bbox_output': bbox_targets
    }

    return images, targets

# Prepare datasets with error handling
# Prepare datasets with error handling
try:
    print("\nPreparing training data...")
    train_images, train_targets = prepare_dataset(TRAIN_CSV_FILE, IMAGE_DIR)

    print("\nPreparing validation data...")
    val_images, val_targets = prepare_dataset(VAL_CSV_FILE, IMAGE_DIR)

    print("\nPreparing test data...")
    test_images, test_targets = prepare_dataset(TEST_CSV_FILE, IMAGE_DIR)

    # Final verification of shapes
    print("\nFinal dataset shapes:")
    print(f"Training:")
    print(f"- Images: {train_images.shape}")
    print(f"- Class labels: {train_targets['class_output'].shape}")
    print(f"- Bounding boxes: {train_targets['bbox_output'].shape}")

    print(f"\nValidation:")
    print(f"- Images: {val_images.shape}")
    print(f"- Class labels: {val_targets['class_output'].shape}")
    print(f"- Bounding boxes: {val_targets['bbox_output'].shape}")

    print(f"\nTest:")
    print(f"- Images: {test_images.shape}")
    print(f"- Class labels: {test_targets['class_output'].shape}")
    print(f"- Bounding boxes: {test_targets['bbox_output'].shape}")

except Exception as e:
    print(f"Error during dataset preparation: {str(e)}")
    # Re-raise: if preparation failed, the arrays are missing or left over from
    # an earlier cell, and the training cells must not silently run on them.
    raise

In [None]:
# Default drawing colours. They must exist before the `def` below because a
# default argument value is evaluated when the function is defined.
# NOTE(review): (255, 0, 0) is red for RGB images (as shown by plt.imshow) and
# blue for BGR images - confirm which channel order callers pass in.
BOX_COLOR = (255, 0, 0)
TEXT_COLOR = (255, 255, 255)


def visualize_bbox(img, bbox, class_name, color=BOX_COLOR, thickness=2):
    """
    Visualizes a single bounding box on the image

    Args:
        img: Image array; it is drawn on in place.
        bbox: Box as (x_min, y_min, width, height) in pixels. Note that this
            is NOT the (xmin, ymin, xmax, ymax) layout used by the annotation
            CSVs of this notebook.
        class_name: Text written in a filled label above the box.
        color: Colour of the box outline and of the label background.
        thickness: Line thickness of the box outline.

    Returns:
        The same image object that was passed in.
    """
    x_min, y_min, w, h = bbox
    x_min, x_max, y_min, y_max = int(x_min), int(x_min + w), int(y_min), int(y_min + h)

    cv2.rectangle(img, (x_min, y_min), (x_max, y_max), color=color, thickness=thickness)

    # Filled label sized to the text, sitting on top of the box.
    ((text_width, text_height), _) = cv2.getTextSize(class_name, cv2.FONT_HERSHEY_SIMPLEX, 0.35, 1)
    # Use the requested colour so the label matches a custom box colour.
    cv2.rectangle(img, (x_min, y_min - int(1.3 * text_height)), (x_min + text_width, y_min), color, -1)
    cv2.putText(
        img,
        text=class_name,
        org=(x_min, y_min - int(0.3 * text_height)),
        fontFace=cv2.FONT_HERSHEY_SIMPLEX,
        fontScale=0.35,
        color=TEXT_COLOR,
        lineType=cv2.LINE_AA,
    )
    return img
 
 
def visualize(image, bboxes, category_ids, category_id_to_name):
    """
    Show a copy of ``image`` with every box drawn and labelled.

    ``bboxes`` and ``category_ids`` are walked in parallel; each id is turned
    into its label text through ``category_id_to_name``. The input image is
    left untouched because drawing happens on a copy. The result is displayed
    in a 12x12 matplotlib figure with the axes hidden.
    """
    canvas = image.copy()
    for box, cat_id in zip(bboxes, category_ids):
        canvas = visualize_bbox(canvas, box, category_id_to_name[cat_id])

    plt.figure(figsize=(12, 12))
    plt.axis("off")
    plt.imshow(canvas)

In [11]:
def _conv_bn_leaky(x, filters, strides=(1, 1)):
    """Apply a 3x3 'same' convolution, batch normalisation and LeakyReLU(0.1)."""
    x = tf.keras.layers.Conv2D(filters, (3, 3), strides=strides, padding='same')(x)
    x = tf.keras.layers.BatchNormalization()(x)
    return tf.keras.layers.LeakyReLU(negative_slope=0.1)(x)


def _residual_stage(x, filters, strides, repeats):
    """Entry convolution to `filters` channels followed by `repeats` residual units."""
    x = _conv_bn_leaky(x, filters, strides=strides)
    for _ in range(repeats):
        shortcut = x
        # Bottleneck pair: halve the channels, then restore them.
        x = _conv_bn_leaky(x, filters // 2)
        x = _conv_bn_leaky(x, filters)
        x = tf.keras.layers.Add()([shortcut, x])
        x = tf.keras.layers.LeakyReLU(negative_slope=0.1)(x)
    return x


def create_model(input_shape=(360, 640, 3)):
    """
    Build the two-headed burger detector.

    A fully trainable, ImageNet-initialised MobileNetV3Large (without its
    classification top) feeds a stack of residual convolution stages, followed
    by two heads that each use global average pooling and a 64-unit dense layer:

    * ``class_output``: one sigmoid unit, the probability of class 1. A single
      unit is what ``binary_crossentropy`` needs for the one 0/1 label per
      sample produced by ``prepare_dataset``.
    * ``bbox_output``: four sigmoid units, the normalised
      (xmin, ymin, xmax, ymax) box.

    MobileNetV3Large is created with its default options, so its built-in
    input rescaling stays active: feed pixel values in the 0-255 range.

    Args:
        input_shape: (height, width, channels) of the input images.

    Returns:
        An uncompiled ``tf.keras.Model`` whose outputs are a dict with the keys
        'class_output' and 'bbox_output'.
    """
    base_model = MobileNetV3Large(
        input_shape=input_shape,
        include_top=False,
        weights='imagenet'
    )
    base_model.trainable = True

    # Stem on top of the backbone features.
    x = _conv_bn_leaky(base_model.output, 32)

    # (filters, strides of the entry convolution, number of residual units)
    stages = (
        (64, (1, 1), 2),    # block 1
        (128, (1, 1), 2),   # block 2
        (256, (2, 2), 3),   # block 3
        (512, (2, 2), 5),   # block 4
        (1024, (2, 2), 8),  # block 5
    )
    for filters, strides, repeats in stages:
        x = _residual_stage(x, filters, strides, repeats)

    # Shared neck in front of the two heads.
    for filters in (512, 256, 128):
        x = _conv_bn_leaky(x, filters)

    # Classification head
    class_branch = tf.keras.layers.GlobalAveragePooling2D()(x)
    class_branch = tf.keras.layers.Dense(64, activation='relu')(class_branch)
    class_output = tf.keras.layers.Dense(1, activation='sigmoid', name='class_output')(class_branch)

    # Bounding box head
    bbox_branch = tf.keras.layers.GlobalAveragePooling2D()(x)
    bbox_branch = tf.keras.layers.Dense(64, activation='relu')(bbox_branch)
    bbox_output = tf.keras.layers.Dense(4, activation='sigmoid', name='bbox_output')(bbox_branch)

    model = tf.keras.Model(
        inputs=base_model.input,
        outputs={
            'class_output': class_output,
            'bbox_output': bbox_output
        }
    )
    return model

# Build a fresh, uncompiled model; it is compiled in the next cell.
model = create_model()

In [None]:
# One optimiser instance, actually handed to compile(). Previously this object
# was created but a second, identical Adam was built inside the compile call.
optimiser = tf.keras.optimizers.Adam(learning_rate=0.0001)

# One loss and one metric per named output of the model.
model.compile(
    optimizer=optimiser,
    loss={
        'class_output': 'binary_crossentropy',
        'bbox_output': 'mse'
    },
    metrics={
        'class_output': 'accuracy',
        'bbox_output': 'mse'
    }
)

model.summary()

In [None]:
callbacks = [
    # Keep the model from the epoch with the lowest validation loss.
    tf.keras.callbacks.ModelCheckpoint(
        'best_model.h5',
        save_best_only=True,
        monitor='val_loss'
    ),
    # Stop after 10 epochs without validation improvement and roll back to the best weights.
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True
    ),
    # Multiply the learning rate by 0.2 after 5 stagnating epochs, down to 1e-6.
    # `restore_best_weights` was removed here: it is an EarlyStopping option,
    # not a ReduceLROnPlateau parameter.
    # NOTE(review): this callback watches the training loss while the other two
    # watch val_loss - confirm that is intended.
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='loss',
        factor=0.2,
        patience=5,
        min_lr=1e-6
    )
]

history = model.fit(
    train_images,
    train_targets,
    validation_data=(val_images, val_targets),
    epochs=64,
    batch_size=32,
    callbacks=callbacks
)

# Final model (after early stopping restored the best weights, if it triggered).
model.save('my_model.keras')

In [None]:
# Image used for the single-image smoke test.
test_img = 'Images/448_png.rf.c0d24997b23fb6a4844cc647456b99d0.jpg'

# Rebuild the architecture and restore the checkpointed weights into it.
model = create_model()
model.load_weights('best_model.h5')

# Load the image at the network input size, then add the leading batch axis.
img = keras.preprocessing.image.load_img(test_img, target_size=(360, 640))
img_array = tf.expand_dims(keras.preprocessing.image.img_to_array(img), 0)

predictions = model.predict(img_array)
print(predictions)

In [None]:
# Scale the normalised (xmin, ymin, xmax, ymax) prediction to 640x360 pixels.
bbox = predictions["bbox_output"]
bbox = [bbox[0][0] * 640, bbox[0][1] * 360, bbox[0][2] * 640, bbox[0][3] * 360]
print(bbox)

# class_output comes from a sigmoid layer, so it already is a probability.
# Applying softmax on top of it (as before) only distorts the value.
score = predictions["class_output"]
print(score)

# Index 0 = no burger, index 1 = burger; 0.5 is the decision threshold.
classes = ["nothing", "burger"]
class_prediction_value = int(score[0][0] >= 0.5)
print("Predicted class: {}".format(classes[class_prediction_value]))

In [13]:
# Reload the test image at the size the predicted box was scaled to.
testing = keras.preprocessing.image.load_img(test_img, target_size=(360, 640))

img_width, img_height = testing.size

# The network is free to predict xmax < xmin (or ymax < ymin), and recent
# Pillow versions reject such a rectangle, so order each axis first.
x1, x2 = sorted((int(bbox[0]), int(bbox[2])))
y1, y2 = sorted((int(bbox[1]), int(bbox[3])))

# Keep the rectangle inside the image.
x1, x2 = max(0, x1), min(img_width - 1, x2)
y1, y2 = max(0, y1), min(img_height - 1, y2)

draw1 = Draw(testing)
bbox_pixels = [(x1, y1), (x2, y2)]
draw1.rectangle(bbox_pixels, outline='red', width=3)
testing.show()

In [None]:
# Index 0 = no burger, index 1 = burger. Defined here so this cell does not
# depend on a name that only exists in another cell.
classes = ["nothing", "burger"]

cam = cv2.VideoCapture(1)
if not cam.isOpened():
    cam.release()
    raise RuntimeError("Could not open camera 1")
cam.set(cv2.CAP_PROP_FPS, 120)

try:
    while True:
        ret, frame = cam.read()
        # read() reports failure through `ret`; `frame` is then unusable.
        if not ret or frame is None:
            print("Could not read a frame from the camera; stopping.")
            break

        # Same size and channel order as the images the model was trained on.
        frame_resized = cv2.resize(frame, (640, 360))
        frame_rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)

        img_array = tf.expand_dims(frame_rgb, 0)

        predictions = model.predict(img_array, verbose=0)

        # Scale the normalised box to the 640x360 frame.
        bbox = predictions["bbox_output"]
        bbox = [bbox[0][0] * 640, bbox[0][1] * 360, bbox[0][2] * 640, bbox[0][3] * 360]

        # Sigmoid probability thresholded at 0.5.
        class_prediction_value = int(predictions["class_output"][0][0] >= 0.5)
        class_name = classes[class_prediction_value]
        print(class_name)

        if class_name != "nothing":
            x1, y1, x2, y2 = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])
            cv2.rectangle(frame_resized, (x1, y1), (x2, y2), (0, 0, 255), 2)
            cv2.putText(frame_resized, class_name, (x1, y1-10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)

        # Show the frame the box was drawn on; the untouched `frame` would
        # never display any detection.
        cv2.imshow('Camera Feed', frame_resized)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the device and close the window even if the loop raised.
    cam.release()
    cv2.destroyAllWindows()