<a href="https://colab.research.google.com/github/avkaz/DeepLearningPetIdentification/blob/preprocess_pipeline/preprop_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Notebook for creating a dataset of pet pairs

The main goal is to create a dataset for fine-tuning a pretrained model.
Each row consists of two pictures, their metadata, and a label. The label is "1" if both pictures show the same pet and "0" if they show different pets.
The resulting dataset is saved in JSON format.

## Importing Utility library

In [None]:
## 1st - Download the utility.py file from the GitHub repository
## 2nd - Import the downloaded module so it can be used as `utility.<name>`

import requests

# Raw-content URL for the utility.py file
url = "https://raw.githubusercontent.com/avkaz/DeepLearningPetIdentification/main/utility.py"

# Fetch and save the file locally.
# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page from being saved as utility.py.
response = requests.get(url, timeout=30)
response.raise_for_status()
with open("utility.py", "wb") as f:
    f.write(response.content)


import utility
print("utility.py downloaded successfully.")

utility.py downloaded successfully.


## Defining filtering and controlling functions

In [None]:
def filter_metadata_with_images(metadata):
    """
    Filters metadata to include only entries with non-empty 'images' lists.

    Entries whose value is not a dict (e.g. ``None``) are dropped instead of
    raising, so malformed records are excluded from the result.

    Args:
        metadata (dict): The original metadata dictionary.

    Returns:
        dict: A new dictionary holding only the entries whose 'images' value
        is truthy. The input dictionary is not modified.
    """
    return {
        key: value
        for key, value in metadata.items()
        # Guard with isinstance first: .get() only exists on dict entries.
        if isinstance(value, dict) and value.get("images")
    }

In [None]:
# Metadata Verification
def verify_metadata(metadata):
    """
    Reports metadata entries that are malformed or missing required fields.

    An entry is considered complete when it is a dict containing all of the
    keys "Plemeno", "Barva", "Věk", "Velikost" and "images". Every other
    entry triggers a printed warning; nothing is returned or raised.

    Args:
        metadata (dict): The metadata dictionary to verify.
    """
    required_fields = ("Plemeno", "Barva", "Věk", "Velikost", "images")
    for pet_id, record in metadata.items():
        is_complete = isinstance(record, dict) and all(
            field in record for field in required_fields
        )
        if is_complete:
            continue
        print(f"Warning: Incomplete metadata for key {pet_id}: {record}")

In [None]:
def tensor_to_list(tensor):
    """Return the tensor's values as nested Python lists.

    The tensor is first converted with its ``.numpy()`` method and the
    resulting array is then turned into a list via ``.tolist()``.
    """
    as_array = tensor.numpy()
    return as_array.tolist()

In [None]:
def save_pet_pairs_to_json(pet_pairs, pet_name, filename="pet_pairs.json"):
    """Appends the pairs for a specific pet to a JSON file.

    Each call appends one pretty-printed ``{pet_name: pet_pairs}`` object
    followed by a newline, so the file holds a sequence of JSON objects
    rather than a single JSON document.

    Args:
        pet_pairs (list): The pair records to store for this pet.
        pet_name (str): Key under which the pairs are stored.
        filename (str): Path of the file to append to.

    Errors are reported with a printed message and are not re-raised.
    """
    try:
        # Serialize completely before touching the file: json.dump() writes
        # in chunks, so a non-serializable value would otherwise leave a
        # truncated object appended to the output.
        payload = json.dumps({pet_name: pet_pairs}, indent=4)
        with open(filename, 'a', encoding="utf-8") as json_file:  # Open in append mode
            json_file.write(payload + "\n")  # Newline separates each pet's data
        print(f"Pairs for {pet_name} successfully saved to {filename}")
    except Exception as e:
        print(f"Error saving pairs for {pet_name} to JSON: {e}")

## Defining main function

In [None]:
import random
import json
import tensorflow as tf
import utility  # Assuming utility contains download_and_preprocess_image function

def tensor_to_list(tensor):
    """Convert a tensor into nested Python lists of its values.

    Calls the tensor's ``.numpy()`` method and then ``.tolist()`` on the
    resulting array.
    """
    array_view = tensor.numpy()
    pixel_values = array_view.tolist()
    return pixel_values

def save_pet_pairs_to_json(pet_pairs, pet_name, filename="pet_pairs.json"):
    """Appends the pairs for a specific pet to a JSON file.

    Each call appends one pretty-printed ``{pet_name: pet_pairs}`` object
    followed by a newline, so the file holds a sequence of JSON objects
    rather than a single JSON document.

    Args:
        pet_pairs (list): The pair records to store for this pet.
        pet_name (str): Key under which the pairs are stored.
        filename (str): Path of the file to append to.

    Errors are reported with a printed message and are not re-raised.
    """
    try:
        # Serialize completely before touching the file: json.dump() writes
        # in chunks, so a non-serializable value would otherwise leave a
        # truncated object appended to the output.
        payload = json.dumps({pet_name: pet_pairs}, indent=4)
        with open(filename, 'a', encoding="utf-8") as json_file:  # Open in append mode
            json_file.write(payload + "\n")  # Newline separates each pet's data
        print(f"Pairs for {pet_name} successfully saved to {filename}")
    except Exception as e:
        print(f"Error saving pairs for {pet_name} to JSON: {e}")

def _build_pair_record(entry1, url1, entry2, url2, label, target_size):
    """Download both images and assemble one labelled pair record."""
    image1 = utility.download_and_preprocess_image(url1, target_size)
    image2 = utility.download_and_preprocess_image(url2, target_size)

    # Convert tensors to lists of pixel values so the record is JSON-serializable
    image1_list = tensor_to_list(image1)
    image2_list = tensor_to_list(image2)

    return {
        "plemeno1": entry1.get("Plemeno", "Unknown"),
        "vek1": entry1.get("Věk", "Unknown"),
        "barva1": entry1.get("Barva", "Unknown"),
        "velikost1": entry1.get("Velikost", "Unknown"),
        "fotka1": image1_list,
        "plemeno2": entry2.get("Plemeno", "Unknown"),
        "vek2": entry2.get("Věk", "Unknown"),
        "barva2": entry2.get("Barva", "Unknown"),
        "velikost2": entry2.get("Velikost", "Unknown"),
        "fotka2": image2_list,
        "label": label,
    }


def preprocess_dataset_random(metadata, target_size=(224, 224), max_same_pet_pairs=2, max_diff_pet_pairs=2, start_from_index=0):
    """
    Creates image pairs for each pet: same-pet pairs (label 1) and pairs with
    randomly chosen other pets (label 0). Each pet's pairs are saved
    incrementally through save_pet_pairs_to_json.

    Every candidate pair is attempted at most once per pet, so the function
    always terminates even when image downloads keep failing. A pet can end
    up with fewer pairs than requested.

    Args:
        metadata (dict): Maps pet keys to dicts with "Plemeno", "Věk",
            "Barva", "Velikost" and "images" (a list of image URLs).
            The dictionary and its image lists are not modified.
        target_size (tuple): Size passed to utility.download_and_preprocess_image.
        max_same_pet_pairs (int): Maximum number of same-pet pairs per pet.
        max_diff_pet_pairs (int): Maximum number of different-pet pairs per pet.
        start_from_index (int): Index of the first pet to process. Partner
            pets for different-pet pairs are also drawn only from this index on.

    Returns:
        list: NOTE(review): nothing is ever appended to this list, so it is
        always empty; the pairs are only written to disk per pet. Presumably
        this avoids holding all pixel data in memory. Confirm before relying
        on the return value.
    """
    from itertools import combinations

    data_pairs = []

    print("Starting dataset preprocessing...")

    # Start from the requested index
    all_keys = list(metadata.keys())[start_from_index:]

    # Nothing to do for empty metadata or an index past the last pet
    if not all_keys:
        print(f"No pets to process starting from index {start_from_index}.")
        return data_pairs

    print(f"Starting from pet {all_keys[0]} at index {start_from_index}...")

    for idx, key in enumerate(all_keys, start=start_from_index):
        entry = metadata[key]
        # Work on a copy so shuffling below never reorders the caller's metadata
        images = list(entry.get("images") or [])

        print(f"Processing pet {key} ({idx + 1}/{len(all_keys) + start_from_index})...")  # Print current pet number

        if len(images) < 2:
            print(f"Skipping pet {key}: Not enough images ({len(images)})")
            continue

        same_pet_pair_count = 0
        diff_pet_pair_count = 0
        pet_data_pairs = []

        # Same-pet pairs: one pass over all image combinations in random order
        print(f"Processing same-pet pairs for {key}...")
        random.shuffle(images)
        processed_same_pairs = set()  # Guards against repeated URLs in the image list
        for url1, url2 in combinations(images, 2):
            if same_pet_pair_count >= max_same_pet_pairs:
                break

            # Sorting makes the identifier independent of pair order
            pair_id = tuple(sorted([url1, url2]))
            if pair_id in processed_same_pairs:
                print(f"Skipping duplicate same-pet pair: {url1} and {url2}")
                continue

            try:
                print(f"Processing same-pet pair: {url1} and {url2} for {key}")
                pet_data_pairs.append(
                    _build_pair_record(entry, url1, entry, url2, 1, target_size)
                )
                same_pet_pair_count += 1
                processed_same_pairs.add(pair_id)
            except Exception as e:
                print(f"Error processing same-pet pair ({url1}, {url2}): {e}")

        # Different-pet pairs: visit the other pets in random order, each at
        # most once, so no partner is repeated and the loop is bounded
        print(f"Processing different-pet pairs for {key}...")
        other_pets = [k for k in all_keys if k != key]
        random.shuffle(other_pets)
        for key2 in other_pets:
            if diff_pet_pair_count >= max_diff_pet_pairs:
                break

            entry2 = metadata[key2]
            images2 = entry2.get("images") or []
            if not images2:
                print(f"Skipping different-pet pair ({key}, {key2}): Missing images for {key2}.")
                continue

            try:
                print(f"Processing different-pet pair: {key} ({images[0]}) and {key2} ({images2[0]})")
                pet_data_pairs.append(
                    _build_pair_record(entry, images[0], entry2, images2[0], 0, target_size)
                )
                diff_pet_pair_count += 1
            except Exception as e:
                print(f"Error processing different-pet pair ({key}, {key2}): {e}")

        print(f"Finished processing for {key}. Total same-pet pairs: {same_pet_pair_count}, Total different-pet pairs: {diff_pet_pair_count}")

        # After processing this pet, save the pairs for this specific pet to a JSON file
        save_pet_pairs_to_json(pet_data_pairs, key)

    print("Dataset preprocessing completed.")
    return data_pairs


## Triggering the result function

In [None]:
if __name__ == "__main__":
    # islice is used below but is not imported anywhere else in the notebook
    # cells shown here, so import it locally to avoid a NameError.
    from itertools import islice

    print("Fetching metadata...")
    metadata = utility.get_data()
    verify_metadata(metadata)

    print("Filtering metadata to remove entries without images...")
    filtered_metadata = filter_metadata_with_images(metadata)
    print(f"Filtered metadata contains {len(filtered_metadata)} entries (original: {len(metadata)})")

    # Limit to the first 5 entries for testing
    filtered_metadata = dict(islice(filtered_metadata.items(), 5))
    print(f"Using the first {len(filtered_metadata)} entries for testing.")

    verify_metadata(filtered_metadata)

    print("Creating dataset...")
    max_same_pet_pairs = 2  # Max same-pet pairs per pet
    max_diff_pet_pairs = 2  # Max different-pet pairs per pet
    target_size = (96, 96)  # Image size passed on to the preprocessing step
    dataset = preprocess_dataset_random(filtered_metadata, target_size, max_same_pet_pairs, max_diff_pet_pairs)


Fetching metadata...
Filtering metadata to remove entries without images...
Filtered metadata contains 10328 entries (original: 12050)
Using the first 5 entries for testing.
Creating dataset...
Starting dataset preprocessing...
Starting from pet tanyny-chomutov-2024-12-21 at index 0...
Processing pet tanyny-chomutov-2024-12-21 (1/5)...
Processing same-pet pairs for tanyny-chomutov-2024-12-21...
Processing same-pet pair: https://www.psidetektiv.cz/data/catalog/big/2024/12/22/img190394.jpg and https://www.psidetektiv.cz/data/catalog/big/2024/12/22/img190392.jpg for tanyny-chomutov-2024-12-21
Uploading model...
Model loaded successfully.
Processing same-pet pair: https://www.psidetektiv.cz/data/catalog/big/2024/12/22/img190394.jpg and https://www.psidetektiv.cz/data/catalog/big/2024/12/22/img190393.jpg for tanyny-chomutov-2024-12-21
Processing different-pet pairs for tanyny-chomutov-2024-12-21...
Processing different-pet pair: tanyny-chomutov-2024-12-21 (https://www.psidetektiv.cz/data/ca