<a href="https://colab.research.google.com/github/avkaz/DeepLearningPetIdentification/blob/preprocess_pipeline/preprop_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import itertools
from itertools import islice

import json
from PIL import Image, ExifTags, ImageDraw
import requests
import io
import tensorflow as tf
import numpy as np
import tensorflow_hub as hub
import matplotlib.pyplot as plt


# Metadata Fetching
def get_data():
    """
    Fetches and parses the pet metadata JSON from the project repository.

    Returns:
        dict: The parsed JSON data as a Python dictionary.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
        ValueError: If the response body is not valid JSON (the JSON decode
            errors raised by `response.json()` are ValueError subclasses).
    """
    url = "https://raw.githubusercontent.com/avkaz/DeepLearningPetIdentification/main/pets_db.json"

    try:
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"An error occurred while fetching data: {e}")
        raise

    # Parsing is handled separately so a malformed body is reported as a
    # parsing problem rather than as a download problem.
    try:
        return response.json()
    except ValueError as e:
        print(f"An error occurred while parsing JSON: {e}")
        raise

# Model Loading
# Module-level cache for the detection callable; it stays None until
# load_detector_model() assigns the loaded model signature to it.
detector = None
# TensorFlow Hub handle that load_detector_model() passes to hub.load().
MODEL_URL = "https://tfhub.dev/tensorflow/ssd_mobilenet_v2/fpnlite_320x320/1"





In [2]:
def load_detector_model():
    """
    Loads the detection model into the module-level `detector` on first use.

    The 'serving_default' signature of the model at MODEL_URL is stored in
    `detector`; if `detector` is already set the call returns immediately.
    """
    global detector
    if detector is not None:
        return  # model already cached by an earlier call

    print("Uploading model...")
    detector = hub.load(MODEL_URL).signatures['serving_default']
    print("Model loaded successfully.")

The function `filter_metadata_with_images` drops every data record whose "images" list is missing or empty.

In [3]:
def filter_metadata_with_images(metadata):
    """
    Filters metadata to include only entries with non-empty 'images' lists.

    Entries that are not dictionaries (which verify_metadata only warns
    about) are dropped as well instead of raising AttributeError.

    Args:
        metadata (dict): The original metadata dictionary.

    Returns:
        dict: A filtered metadata dictionary with entries that have images.
    """
    return {
        key: value
        for key, value in metadata.items()
        if isinstance(value, dict) and value.get("images")
    }

The function `verify_metadata` prints a warning for every data record that is missing any of the required fields (`Plemeno`, `Barva`, `Věk`, `Velikost`, `images`).

In [22]:
# Metadata Verification
def verify_metadata(metadata):
    """
    Reports metadata records that are malformed or lack a required field.

    A record passes when it is a dict containing all of "Plemeno", "Barva",
    "Věk", "Velikost" and "images"; every other record triggers a printed
    warning. Nothing is modified or returned.

    Args:
        metadata (dict): The metadata dictionary to verify.
    """
    required_fields = ("Plemeno", "Barva", "Věk", "Velikost", "images")

    for key, value in metadata.items():
        is_complete = isinstance(value, dict) and all(
            field in value for field in required_fields
        )
        if is_complete:
            continue
        print(f"Warning: Incomplete metadata for key {key}: {value}")

In [21]:
# Function to fix orientation using EXIF
def fix_orientation(image):
    """
    Adjust the image orientation based on its EXIF metadata to account for camera rotation.

    Delegates to Pillow's ImageOps.exif_transpose, which applies the EXIF
    'Orientation' tag (rotations as well as mirrored variants) through the
    public EXIF API instead of the private `_getexif` accessor.

    Arguments:
    image -- The image to fix the orientation for (PIL Image object).

    Returns:
    PIL Image with corrected orientation. If the EXIF data cannot be read,
    the input image is returned unchanged.
    """
    # Local import: the file-level import only brings in Image, ExifTags and ImageDraw.
    from PIL import ImageOps

    try:
        corrected = ImageOps.exif_transpose(image)
    except (AttributeError, KeyError, IndexError, TypeError, ValueError):
        # Best effort: missing or malformed EXIF must not stop preprocessing.
        return image
    return image if corrected is None else corrected

# Function to crop and resize the image based on a bounding box
def crop_and_resize(image, bounding_box, target_size):
    """
    Crops the image using a given bounding box and then resizes it to the target size.

    Arguments:
    image -- The image to crop and resize, laid out as (height, width, channels).
    bounding_box -- A tuple (x1, y1, x2, y2) specifying the pixel coordinates of the bounding box.
    target_size -- The target size (height, width) to resize the image to.

    Returns:
    The cropped and resized image (TensorFlow Tensor); only the first three channels are kept.
    """
    image = tf.convert_to_tensor(image, dtype=tf.float32)
    x1, y1, x2, y2 = (int(coord) for coord in bounding_box)

    # Clamp the box to the image: a negative start would otherwise be
    # interpreted as an index counted from the end of the axis.
    x1, y1 = max(x1, 0), max(y1, 0)
    height, width = image.shape[0], image.shape[1]
    if height is not None:
        y2 = min(y2, height)
    if width is not None:
        x2 = min(x2, width)

    image = image[y1:y2, x1:x2, :3]
    image = tf.image.resize(image, target_size)
    return image

# Function to detect pets in the image (Placeholder function, adjust as needed)
def detect_pet(image):
    """
    Runs the object detector on an image and returns the first confident pet box.

    Arguments:
    image -- RGB image laid out as (height, width, 3). Pixel values may be in
             [0, 255] or in [0, 1] (the range download_and_preprocess_image
             passes in); [0, 1] input is rescaled before detection.

    Returns:
    A NumPy array (x1, y1, x2, y2) in pixel coordinates of the input image, the
    format crop_and_resize consumes, for the first cat/dog detection scoring
    above 0.5; None when nothing qualifies.
    """
    load_detector_model()

    source = tf.convert_to_tensor(image)
    height, width = source.shape[0], source.shape[1]

    pixels = tf.cast(tf.image.resize(source, [640, 640]), tf.float32)
    # The detector is fed uint8 pixels. Casting a [0, 1] image directly would
    # leave only the values 0 and 1, i.e. an almost black picture.
    if float(tf.reduce_max(pixels)) <= 1.0:
        pixels = pixels * 255.0
    pixels = tf.clip_by_value(pixels, 0.0, 255.0)
    input_tensor_uint8 = tf.cast(tf.expand_dims(pixels, axis=0), tf.uint8)

    result = detector(tf.convert_to_tensor(input_tensor_uint8))
    result = {key: value.numpy() for key, value in result.items()}

    required_outputs = ('detection_classes', 'detection_scores', 'detection_boxes')
    if any(name not in result for name in required_outputs):
        return None

    # Byte labels are kept for detectors that emit class names.
    pet_labels = {b"Cat", b"Dog", b"Animal"}
    # NOTE(review): 17 = cat, 18 = dog in the COCO label map that the TF2
    # detection models are documented to use for numeric class ids -- confirm
    # against the model card of MODEL_URL.
    coco_pet_ids = {17, 18}

    detections = zip(
        result['detection_classes'][0],
        result['detection_scores'][0],
        result['detection_boxes'][0],
    )
    for detected_class, detected_score, detected_box in detections:
        if detected_score <= 0.5:
            continue
        if isinstance(detected_class, bytes):
            is_pet = detected_class in pet_labels
        else:
            is_pet = int(detected_class) in coco_pet_ids
        if not is_pet:
            continue

        # NOTE(review): boxes are assumed to be normalised
        # [ymin, xmin, ymax, xmax] as documented for TF2 detection models;
        # they are scaled to pixel (x1, y1, x2, y2) of the original image.
        ymin, xmin, ymax, xmax = (float(v) for v in detected_box)
        return np.array(
            [xmin * width, ymin * height, xmax * width, ymax * height],
            dtype=np.float32,
        )
    return None

# Function to visualize the image
def visualize_image(image, title="Processed Image", visualize=False):
    """
    Optionally displays an image with Matplotlib.

    Arguments:
    image -- The image to show; a TensorFlow tensor or a NumPy array.
    title -- Text placed above the image.
    visualize -- When False (the default) the function does nothing.
    """
    if not visualize:
        return

    # Matplotlib needs a NumPy array, so unwrap tensors first.
    if isinstance(image, tf.Tensor):
        image = image.numpy()

    is_rgb = image.ndim == 3 and image.shape[-1] == 3
    if is_rgb:
        # Three-channel images are clipped to the [0, 1] range.
        image = np.clip(image, 0, 1)
    elif image.ndim == 2:
        # Two-dimensional (grayscale) images are clipped to [0, 255] as uint8.
        image = np.clip(image, 0, 255).astype(np.uint8)

    plt.imshow(image)
    plt.title(title)
    plt.axis("off")
    plt.show()

# Download and Preprocess Image
def download_and_preprocess_image(url, target_size=(224, 224), visualize=False):
    """
    Downloads an image and turns it into a fixed-size float tensor.

    The image is orientation-corrected, converted to RGB, scaled to [0, 1] and
    then either cropped to the box returned by detect_pet or, when no pet is
    found, resized with padding to the target size.

    Arguments:
    url -- Address of the image to download.
    target_size -- (height, width) of the returned image.
    visualize -- When True the processed image is shown with Matplotlib.

    Returns:
    TensorFlow float32 tensor of shape (target_size[0], target_size[1], 3).

    Raises:
    requests.RequestException -- the download failed, timed out or returned an
    HTTP error status.
    """
    response = requests.get(url, timeout=30)
    # Fail here rather than trying to decode an HTML error page as an image.
    response.raise_for_status()

    pil_image = Image.open(io.BytesIO(response.content))
    pil_image = fix_orientation(pil_image)
    # Grayscale, palette and RGBA files would otherwise yield tensors that do
    # not have exactly three channels.
    pil_image = pil_image.convert("RGB")

    image = tf.convert_to_tensor(np.array(pil_image), dtype=tf.float32) / 255.0
    bounding_box = detect_pet(image)

    if bounding_box is not None:
        image = crop_and_resize(image, bounding_box, target_size)
    else:
        # If no pet detected, resize with padding (keeps the whole picture and
        # its aspect ratio instead of cutting out the centre pixels).
        image = tf.image.resize_with_pad(image, target_size[0], target_size[1])

    # Visualize the image if needed
    visualize_image(image, title="Processed Image", visualize=visualize)

    return image

### Function Description: `preprocess_dataset1`

The `preprocess_dataset1` function processes a dataset of pet information to create pairs of images for training, which includes both same-pet pairs (label = 1) and a limited number of different-pet pairs (label = 0).

1. **Same-Pet Pairing (label = 1):**
   - For each pet in the `metadata`, the function checks if the pet has at least 2 images.
   - If the pet has sufficient images, the function generates all possible pairs of images from the same pet.
   - Each image pair is associated with the pet’s metadata attributes, such as breed (`plemeno`), age (`vek`), color (`barva`), and size (`velikost`).
   - These pairs are labeled with `label = 1`, indicating that both images belong to the same pet.

2. **Different-Pet Pairing (label = 0):**
   - The function iterates over all possible pairs of pets in the `metadata` using `itertools.combinations`.
   - To avoid over-representing any pet in the dataset, the function limits the number of different-pet pairs a pet can contribute to, as specified by `max_diff_pet_pairs`.
   - For each valid pair of pets, the function selects the first image from each pet and creates a pair with their respective metadata.
   - These pairs are labeled with `label = 0`, indicating that the images belong to different pets.

3. **Pairing Limits:**
   - The `pet_pair_count` dictionary tracks how many different-pet pairs each pet has been included in.
   - Pets that have already contributed to `max_diff_pet_pairs` are excluded from further different-pet pairings.

4. **Error Handling:**
   - If a pet is missing images or an error occurs during image downloading or processing, the function logs the issue and skips that specific pair.

5. **TensorFlow Dataset Creation:**
   - The function converts the list of image pairs (`data_pairs`) into a TensorFlow dataset using a generator. Each pair contains:
     - Pet attributes for both images: breed (`plemeno`), age (`vek`), color (`barva`), and size (`velikost`).
     - Image pairs: `fotka1` and `fotka2`, representing the images from each pet.
     - A label (`label`), where `1` indicates the images are from the same pet, and `0` indicates the images are from different pets.

6. **Dataset Inspection:**
   - After the TensorFlow dataset is created, the function inspects it and prints a sample of up to 50 examples.
   - For each example, it displays the shape and dtype of the images, along with the decoded metadata values for the pet attributes.


In [33]:
def preprocess_dataset1(metadata, target_size=(224, 224), max_diff_pet_pairs=3):
    """
    Builds a TensorFlow dataset of labelled image pairs.

    Same-pet pairs (label 1) are every combination of two images of one pet.
    Different-pet pairs (label 0) combine the first image of two pets; a pet
    stops contributing once it is part of `max_diff_pet_pairs` such pairs.
    A pair whose images cannot be downloaded or processed is reported and
    skipped. Every image URL is downloaded and preprocessed at most once per
    successful load; the work is done sequentially.

    Args:
        metadata (dict): Maps a pet key to a dict with the attributes
            "Plemeno", "Věk", "Barva", "Velikost" (missing ones default to
            "Unknown") and "images" (list of image URLs).
        target_size (tuple): (height, width) of the preprocessed images.
        max_diff_pet_pairs (int): Maximum number of different-pet pairs a
            single pet may appear in.

    Returns:
        tf.data.Dataset: Elements are dicts with plemeno/vek/barva/velikost/
        fotka for both sides (suffix 1 and 2) plus an int32 "label".
    """
    # (output field prefix, metadata key) in the order the fields are emitted.
    attribute_keys = (
        ("plemeno", "Plemeno"),
        ("vek", "Věk"),
        ("barva", "Barva"),
        ("velikost", "Velikost"),
    )
    image_cache = {}

    def load_image(url):
        # An image takes part in many pairs; fetch and preprocess it once.
        # A failing URL raises before anything is stored, so it is retried
        # the next time it is requested.
        if url not in image_cache:
            image_cache[url] = download_and_preprocess_image(url, target_size)
        return image_cache[url]

    def as_text(value):
        return value if isinstance(value, str) else value.decode("utf-8")

    def make_pair(entry1, url1, entry2, url2, label):
        image1 = load_image(url1)
        image2 = load_image(url2)
        pair = {}
        for suffix, entry, image in (("1", entry1, image1), ("2", entry2, image2)):
            for field, source_key in attribute_keys:
                pair[f"{field}{suffix}"] = as_text(entry.get(source_key, "Unknown"))
            pair[f"fotka{suffix}"] = image
        pair["label"] = label
        return pair

    data_pairs = []
    pet_pair_count = dict.fromkeys(metadata, 0)  # different-pet pairs per pet

    print("Starting dataset preprocessing...")

    # Create image pairs from the same pet (label = 1)
    print("Processing same-pet pairs...")
    for key, entry in metadata.items():
        images = entry.get("images") or []

        print(f"Processing pet: {key} - {entry.get('Jméno', 'Unknown')}")
        if len(images) < 2:
            print(f"Skipping pet {key}: Not enough images ({len(images)})")
            continue

        for url1, url2 in itertools.combinations(images, 2):
            try:
                print(f"Processing same-pet pair: {url1} and {url2}")
                data_pairs.append(make_pair(entry, url1, entry, url2, 1))
            except Exception as e:
                # Deliberate best effort: one bad image must not abort the run.
                print(f"Error processing images for same-pet pair ({url1}, {url2}): {e}")

    same_pet_total = len(data_pairs)
    print(f"Finished processing same-pet pairs. Total same-pet pairs: {same_pet_total}")

    # Create image pairs from different pets (label = 0)
    print("Processing different-pet pairs...")
    for key1, key2 in itertools.combinations(metadata, 2):
        # Skip this pair if either pet has reached the limit
        if pet_pair_count[key1] >= max_diff_pet_pairs or pet_pair_count[key2] >= max_diff_pet_pairs:
            continue

        entry1 = metadata[key1]
        entry2 = metadata[key2]
        images1 = entry1.get("images") or []
        images2 = entry2.get("images") or []

        print(f"Processing different-pet pair: {key1} and {key2}")
        if not images1 or not images2:
            print(f"Skipping pair ({key1}, {key2}): Missing images.")
            continue

        try:
            print(f"Processing images: {images1[0]} and {images2[0]}")
            data_pairs.append(make_pair(entry1, images1[0], entry2, images2[0], 0))
        except Exception as e:
            print(f"Error processing images for different-pet pair ({images1[0]}, {images2[0]}): {e}")
        else:
            # Only successfully created pairs count towards the per-pet limit.
            pet_pair_count[key1] += 1
            pet_pair_count[key2] += 1

    # data_pairs also holds the same-pet pairs, so subtract them for the report.
    print(f"Finished processing different-pet pairs. Total different-pet pairs: {len(data_pairs) - same_pet_total}")

    print("Finalizing dataset creation...")

    # Convert data_pairs to a TensorFlow dataset using a generator
    def data_generator():
        for i, pair in enumerate(data_pairs):
            if i % 100 == 0:
                print(f"Yielding pair {i}/{len(data_pairs)}")
            yield pair

    text_spec = tf.TensorSpec(shape=(), dtype=tf.string)
    image_spec = tf.TensorSpec(shape=(target_size[0], target_size[1], 3), dtype=tf.float32)
    output_signature = {}
    for suffix in ("1", "2"):
        for field, _ in attribute_keys:
            output_signature[f"{field}{suffix}"] = text_spec
        output_signature[f"fotka{suffix}"] = image_spec
    output_signature["label"] = tf.TensorSpec(shape=(), dtype=tf.int32)

    dataset = tf.data.Dataset.from_generator(data_generator, output_signature=output_signature)

    print("Dataset creation completed.")

    # Print a few examples from the dataset
    print("Inspecting the dataset...")
    for i, example in enumerate(dataset.take(50)):  # Show up to 50 examples
        print(f"Example {i + 1}:")
        for key, value in example.items():
            if key.startswith("fotka"):  # Display shape for images
                print(f"  {key}: shape={value.shape}, dtype={value.dtype}")
            else:
                raw_value = value.numpy()
                decoded_value = raw_value.decode("utf-8") if isinstance(raw_value, bytes) else raw_value
                print(f"  {key}: {decoded_value}")

    return dataset


### Function Description: `preprocess_dataset2`

The `preprocess_dataset2` function works the same way as `preprocess_dataset1`, but without the limit on different-pet pairings. It therefore creates every same-pet pair and one different-pet pair for every combination of two pets, which takes considerably more time.

In [30]:
def preprocess_dataset2(metadata, target_size=(224, 224)):
    """
    Preprocesses the dataset to create pairs of images for the same pet and different pets.

    Identical to preprocess_dataset1 except that the number of different-pet
    pairs per pet is not limited: every combination of two pets yields one
    pair built from the first image of each pet. The pairing logic lives in
    preprocess_dataset1 (which must already be defined) so the two functions
    cannot drift apart.

    Args:
        metadata (dict): Maps a pet key to its attribute dict including "images".
        target_size (tuple): (height, width) of the preprocessed images.

    Returns:
        tf.data.Dataset: Dataset with metadata attributes, image pairs and labels.
    """
    # With an infinite cap the per-pet limit check in preprocess_dataset1 can
    # never trigger, which is the only difference between the two functions.
    return preprocess_dataset1(
        metadata,
        target_size=target_size,
        max_diff_pet_pairs=float("inf"),
    )


In [34]:
if __name__ == "__main__":
    # Download the metadata and print a warning for every incomplete record.
    print("Fetching metadata...")
    metadata = get_data()
    verify_metadata(metadata)

    # Keep only the records that have at least one image.
    #print("Filtering metadata to remove entries without images...")
    filtered_metadata = filter_metadata_with_images(metadata)
    print(f"Filtered metadata contains {len(filtered_metadata)} entries (original: {len(metadata)})")

    # Limit to the specified entries for testing
    # (islice takes the first 5 items in the dict's iteration order).
    filtered_metadata = dict(islice(filtered_metadata.items(), 5))
    print(f"Using the first {len(filtered_metadata)} entries for testing.")

    # Re-check the reduced subset.
    verify_metadata(filtered_metadata)

    # Build the pair dataset without a limit on different-pet pairs.
    print("Creating dataset...")
    dataset = preprocess_dataset2(filtered_metadata)

Fetching metadata...
Filtered metadata contains 10328 entries (original: 12050)
Using the first 5 entries for testing.
Creating dataset...
Starting dataset preprocessing...
Processing same-pet pairs...
Processing pet: tanyny-chomutov-2024-12-21 - Tanyny
Processing same-pet pair: https://www.psidetektiv.cz/data/catalog/big/2024/12/22/img190390.jpg and https://www.psidetektiv.cz/data/catalog/big/2024/12/22/img190391.jpg
Processing same-pet pair: https://www.psidetektiv.cz/data/catalog/big/2024/12/22/img190390.jpg and https://www.psidetektiv.cz/data/catalog/big/2024/12/22/img190392.jpg
Processing same-pet pair: https://www.psidetektiv.cz/data/catalog/big/2024/12/22/img190390.jpg and https://www.psidetektiv.cz/data/catalog/big/2024/12/22/img190393.jpg
Processing same-pet pair: https://www.psidetektiv.cz/data/catalog/big/2024/12/22/img190390.jpg and https://www.psidetektiv.cz/data/catalog/big/2024/12/22/img190394.jpg
Processing same-pet pair: https://www.psidetektiv.cz/data/catalog/big/2024

TODO: dataset prep. as input for the efficientnetb0 model, dataset split for training, validation and testing, performance optimization etc.