In [28]:
import os
import shutil
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing import image
from pathlib import Path
import hashlib

def generate_hash(length=5):
    """
    Return a short random hexadecimal token for making file names unique.

    Sixteen bytes from ``os.urandom`` are hashed with SHA-256 and the first
    ``length`` characters of the hex digest are returned (5 by default).
    The digest has 64 hex characters, so larger values of ``length`` yield
    at most 64 characters.
    """
    random_seed = os.urandom(16)
    hex_digest = hashlib.sha256(random_seed).hexdigest()
    return hex_digest[:length]

def _decode_label(label):
    """Return ``label`` as text, decoding it from UTF-8 when it is a ``bytes`` object."""
    return label.decode('utf-8') if isinstance(label, bytes) else label


def _copy_to_unknown(img_path, img_name, parent_folder, level):
    """Copy the image, under its original name, into ``parent_folder``/Unknown and report the level."""
    unknown_folder = os.path.join(parent_folder, "Unknown")
    Path(unknown_folder).mkdir(parents=True, exist_ok=True)
    shutil.copy(img_path, os.path.join(unknown_folder, img_name))
    print(f"Image {img_name} classified as Unknown at {level} level due to low confidence.")


def classify_and_organize_images_hierarchy(model, family_labels, genus_labels, species_labels, input_folder, output_folder, image_size=(224, 224), confidence_threshold=0.5):
    """
    Classify images by family, genus, and species and copy them into a
    hierarchical directory structure (``family/genus/species``).

    For every image the three model outputs are examined in order
    (family -> genus -> species). At each level the class with the highest
    score is selected:

    - If the score is below ``confidence_threshold``, the image is copied
      under its original name into an ``Unknown`` folder inside the last
      accepted level (or inside ``output_folder`` at family level) and the
      remaining levels are skipped.
    - Otherwise the image is copied into that level's folder as
      ``<score%>_<name>[_<parent names>]_<hash><ext>``. Copies made at
      higher levels are kept, so a fully classified image ends up with one
      copy in each of its family, genus, and species folders.

    Source images are copied, never moved or modified.

    Parameters:
    - model: Object whose ``predict`` returns three arrays
      (family, genus, species scores) for a batch of one image.
    - family_labels, genus_labels, species_labels: Sequences of label names
      (``str`` or UTF-8 ``bytes``), indexed by the model's class index.
    - input_folder: Path to the folder containing images to classify.
      Only regular, non-hidden files ending in .jpeg/.jpg/.png are used.
    - output_folder: Path to the folder where organized images will be saved
      (created if missing).
    - image_size: Size the images are resized to before prediction.
    - confidence_threshold: Minimum top score required to accept a level.

    Returns:
    - None. Results are written to ``output_folder`` and progress is printed.
    """
    # Create output folder if it doesn't exist
    Path(output_folder).mkdir(parents=True, exist_ok=True)

    for img_name in os.listdir(input_folder):
        img_path = os.path.join(input_folder, img_name)
        lowered_name = img_name.lower()

        # Skip hidden files and anything without a supported image extension
        if img_name.startswith('.') or not lowered_name.endswith(('.jpeg', '.jpg', '.png')):
            continue

        # Skip directories (or other non-files) that merely look like images by name
        if not os.path.isfile(img_path):
            continue

        # The file is copied byte-for-byte, so the new name must keep a matching
        # extension: PNG stays .png; JPEG inputs keep the existing .jpeg naming.
        extension = '.png' if lowered_name.endswith('.png') else '.jpeg'

        # Load and preprocess the image
        img = image.load_img(img_path, target_size=image_size)
        img_array = image.img_to_array(img)
        img_array = np.expand_dims(img_array, axis=0)  # Add batch dimension
        img_array = tf.keras.applications.resnet_v2.preprocess_input(img_array)

        # Predict family, genus, and species
        family_pred, genus_pred, species_pred = model.predict(img_array)

        # NOTE: each level uses the top class of its own output; nothing here
        # checks that the predicted genus belongs to the predicted family.
        levels = (
            ("Family", family_labels, family_pred),
            ("Genus", genus_labels, genus_pred),
            ("Species", species_labels, species_pred),
        )

        parent_folder = output_folder
        parent_names = []  # accepted names of higher levels, most specific first

        for level, labels, prediction in levels:
            scores = prediction[0]  # single-image batch
            top_idx = np.argmax(scores)
            score = scores[top_idx]

            if score < confidence_threshold:
                _copy_to_unknown(img_path, img_name, parent_folder, level)
                break

            name = _decode_label(labels[top_idx])
            score_pct = int(score * 100)  # Convert score to percentage
            level_folder = os.path.join(parent_folder, name)
            Path(level_folder).mkdir(parents=True, exist_ok=True)

            # e.g. 87_<species>_<genus>_<family>_<hash>.jpeg
            name_parts = [str(score_pct), name, *parent_names, generate_hash()]
            new_img_name = "_".join(name_parts) + extension
            shutil.copy(img_path, os.path.join(level_folder, new_img_name))

            parent_names.insert(0, name)
            parent_folder = level_folder
        else:
            # Reached only when no level fell below the threshold (no break)
            print(f"Image {img_name} classified and saved as {new_img_name} in {level_folder}")

# Example usage:
# classify_and_organize_images_hierarchy(
#     model=your_loaded_model,
#     family_labels=family_labels,
#     genus_labels=genus_labels,
#     species_labels=species_labels,
#     input_folder='/path/to/your/unlabeled_images',
#     output_folder='/path/to/organized_images'
# )

In [26]:
# Load the saved Keras model from disk; it is the `model` passed to
# classify_and_organize_images_hierarchy further down.
model = tf.keras.models.load_model(
    '/Users/leonardo/Documents/Projects/cryptovision/models/hacpl_rn50v2_mo_antropic_classifier.keras'
)

In [5]:
import pandas as pd

def image_directory_to_pandas(image_path):
    """
    Create a pandas DataFrame with image paths and taxonomic labels extracted from a directory structure.

    Every directory under ``image_path`` that directly contains at least one
    image file is treated as a label folder named ``family_genus_species``.
    Hidden files, files without a .jpeg/.jpg/.png extension, and files that
    sit directly inside a directory named ``GT`` are ignored.

    Parameters:
    ----------
    image_path : str
        The root directory containing subfolders with images.

    Returns:
    -------
    pandas.DataFrame
        One row per image, in ``os.walk`` order. Columns:
        - 'image_path': The full path to the image.
        - 'folder_label': The folder name, representing the original label
          (format: 'family_genus_species'), stored as a categorical.
        - 'family': Extracted family name from the folder label.
        - 'genus': Extracted genus name from the folder label.
        - 'species': Combination of genus and species names (e.g., 'genus species').
        If no images are found, an empty DataFrame with these columns is returned.

    Raises:
    ------
    ValueError:
        If any folder that contains images does not have exactly three
        underscore-separated parts ('family_genus_species').
    """
    image_extensions = {".jpeg", ".png", ".jpg"}
    columns = ['image_path', 'folder_label', 'family', 'genus', 'species']
    rows = []

    # Walk through the directory and collect image paths and labels
    for root_dir, _, filenames in os.walk(image_path):
        # The folder name is the label; images directly inside 'GT' are ignored
        folder_label = os.path.basename(root_dir)
        if folder_label == "GT":
            continue

        # Ignore hidden files and non-image files
        image_files = [
            filename for filename in filenames
            if not filename.startswith('.')
            and os.path.splitext(filename)[1].lower() in image_extensions
        ]
        if not image_files:
            continue

        # Validate each label on its own so a malformed folder cannot slip
        # through as missing values just because other folders are well-formed.
        parts = folder_label.split("_")
        if len(parts) != 3:
            raise ValueError(
                "Error splitting folder labels. Ensure that your folder structure follows 'family_genus_species' format."
                f" Offending folder: {root_dir!r}"
            )
        family, genus, species = parts

        for filename in image_files:
            rows.append((
                os.path.join(root_dir, filename),
                folder_label,
                family,
                genus,
                genus + " " + species,
            ))

    # Passing explicit columns keeps the schema stable even when no image was found
    df = pd.DataFrame(rows, columns=columns)
    df['folder_label'] = df['folder_label'].astype("category")

    return df

# Index the processed dataset: one row per image with its family/genus/species labels.
df = image_directory_to_pandas(
    image_path="/Users/leonardo/Documents/Projects/cryptovision/data/processed/cv_images_dataset"
)

In [6]:
 # Extract the unique family, genus, and species from the dataframe
# For each taxonomic rank: take the sorted unique names from the dataframe,
# pass them through a TensorFlow constant, and convert back to a plain Python list.
# NOTE(review): the round trip through tf.constant presumably leaves the entries
# as bytes rather than str - the classifier function decodes bytes labels.
family_labels, genus_labels, species_labels = (
    tf.constant(sorted(df[rank_column].unique())).numpy().tolist()
    for rank_column in ('family', 'genus', 'species')
)

In [29]:
# Classify every image in the input folder and copy it into the
# family/genus/species tree under the output folder (images resized to 299x299).
classify_and_organize_images_hierarchy(
    model=model,
    family_labels=family_labels,
    genus_labels=genus_labels,
    species_labels=species_labels,
    input_folder='/Volumes/T7_shield/CryptoVision/Data/others/chris_images copy',
    output_folder='/Volumes/T7_shield/CryptoVision/Data/others/chris_images_organized',
    image_size=(299, 299),
)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Image LIRS23_0265.JPG classified and saved as 87_Pleurosicya labiata_Pleurosicya_Gobiidae_04e45.jpeg in /Volumes/T7_shield/CryptoVision/Data/others/chris_images_organized/Gobiidae/Pleurosicya/Pleurosicya labiata
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Image LIRS23_0503.JPG classified and saved as 65_Pleurosicya labiata_Pleurosicya_Gobiidae_11fda.jpeg in /Volumes/T7_shield/CryptoVision/Data/others/chris_images_organized/Gobiidae/Pleurosicya/Pleurosicya labiata
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Image LIRS23_0517.JPG classified and saved as 69_Pleurosicya labiata_Pleurosicya_Gobiidae_62975.jpeg in /Volumes/T7_shield/CryptoVision/Data/others/chris_images_organized/Gobiidae/Pleurosicya/Pleurosicya labiata
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Image LIRS23_0271.JPG classified and saved as 77_Pleurosicya labiata_Pleuros

KeyboardInterrupt: 