In [1]:
import os
import pandas as pd

def create_dataframe_from_directory(base_path):
    """
    Build a DataFrame of image paths from a one-folder-per-species directory tree.

    Every sub-folder of ``base_path`` is treated as one species; every file inside
    it whose name ends in .jpg, .jpeg or .png (case-insensitive) becomes one row.
    Sub-folders and files are visited in sorted order so that the row order does
    not depend on the arbitrary order in which ``os.listdir`` reports entries.
    This keeps any seeded split built on top of the frame reproducible.

    Args:
        base_path (str): The path to the base directory containing subfolders for each species.

    Returns:
        pd.DataFrame: A DataFrame with two columns:
                      - 'path': The full path to each image.
                      - 'folder_name': The name of the subfolder, representing the species.
                      The columns are present even when no image is found.
    """
    image_extensions = (".jpg", ".jpeg", ".png")
    records = []

    # os.listdir() gives no ordering guarantee; sort for a stable row order.
    for folder_name in sorted(os.listdir(base_path)):
        folder_path = os.path.join(base_path, folder_name)

        # Only process directories (ignore files that might be at the base path)
        if not os.path.isdir(folder_path):
            continue

        for image_name in sorted(os.listdir(folder_path)):
            # Check if the file is an image by extension
            if image_name.lower().endswith(image_extensions):
                records.append((os.path.join(folder_path, image_name), folder_name))

    # Passing the column names explicitly keeps the schema stable for empty results.
    return pd.DataFrame(records, columns=["path", "folder_name"])

# Build the image index for the whole dataset.
# NOTE(review): this is an absolute, machine-specific path; the notebook cannot run on
# another machine until it is replaced by a configurable dataset directory.
df = create_dataframe_from_directory("/Users/leonardo/Documents/Projects/cryptovision/data/processed/cv_images_dataset")
# Last expression of the cell: shows the first rows as a quick sanity check.
df.head()

Unnamed: 0,path,folder_name
0,/Users/leonardo/Documents/Projects/cryptovisio...,Labridae_Halichoeres_claudia
1,/Users/leonardo/Documents/Projects/cryptovisio...,Labridae_Halichoeres_claudia
2,/Users/leonardo/Documents/Projects/cryptovisio...,Labridae_Halichoeres_claudia
3,/Users/leonardo/Documents/Projects/cryptovisio...,Labridae_Halichoeres_claudia
4,/Users/leonardo/Documents/Projects/cryptovisio...,Labridae_Halichoeres_claudia


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

def split_dataframe(df, test_size=0.2, val_size=0.1, random_state=42):
    """
    Split a pandas DataFrame into train, validation, and test sets,
    stratified by the 'folder_name' column.

    Both sizes are fractions of the *whole* DataFrame. The split is done in two
    steps (test first, then validation out of the remainder), so the validation
    fraction is rescaled internally to keep it relative to the full dataset.

    Args:
        df (pd.DataFrame): The DataFrame containing image paths and labels.
        test_size (float): Proportion of the dataset to include in the test split.
        val_size (float): Proportion of the dataset to include in the validation split.
        random_state (int): Seed for random number generation for reproducibility.

    Returns:
        tuple: Three pandas DataFrames for train, validation, and test sets.

    Raises:
        ValueError: If either size is not strictly between 0 and 1, or if the two
            sizes together leave no data for the training set.
    """
    # Validate up front: otherwise a bad combination only surfaces as an opaque
    # error about a rescaled fraction the caller never passed in.
    for name, value in (("test_size", test_size), ("val_size", val_size)):
        if not 0 < value < 1:
            raise ValueError(f"{name} must be a fraction strictly between 0 and 1, got {value!r}")
    if test_size + val_size >= 1:
        raise ValueError(
            "test_size + val_size must be smaller than 1 so that training data remains, "
            f"got {test_size!r} + {val_size!r}"
        )

    # First, split into train+validation and test sets
    train_val_df, test_df = train_test_split(
        df,
        test_size=test_size,
        stratify=df['folder_name'],
        random_state=random_state
    )

    # Calculate the adjusted validation size relative to the remaining train+val data
    val_relative_size = val_size / (1 - test_size)

    # Split the train+validation set into train and validation sets
    train_df, val_df = train_test_split(
        train_val_df,
        test_size=val_relative_size,
        stratify=train_val_df['folder_name'],
        random_state=random_state
    )

    return train_df, val_df, test_df

# Example usage:
# `df` is the DataFrame created earlier with 'path' and 'folder_name' columns.
# 15% of the rows go to the test set and 15% to the validation set; the rest is training data.
train_df, val_df, test_df = split_dataframe(df, test_size=0.15, val_size=0.15)

# Display the sizes of each split
print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")

Training set size: 6315
Validation set size: 1354
Test set size: 1354


In [7]:
import os
import tensorflow as tf
import pandas as pd

# Notebook-wide configuration constants.
# NOTE(review): confirm that BATCH_SIZE, EPOCHS and SEED are actually passed to the
# dataset/training calls rather than those calls repeating the literal values.
BATCH_SIZE = 32
EPOCHS = 10
# Size that process_path resizes every image to; IMG_SIZE + (3,) is also the model input shape.
IMG_SIZE = (299, 299)
SEED = 42

def extract_family_genus_and_species_from_folder_name(folder_name):
    """
    Derive the taxonomy labels from a "family_genus_species" folder name.

    The name is split on "_" with tf.strings.split and the last three pieces are
    used, counted from the end.

    Returns:
        tuple: (family, genus, species) where species is "<genus> <last piece>".
    """
    parts = tf.strings.split(folder_name, "_")
    family, genus, epithet = parts[-3], parts[-2], parts[-1]

    # The species label is the binomial name: genus and last piece joined by a space.
    species = genus + " " + epithet
    return family, genus, species

def process_path(path, folder_name, family_labels, genus_labels, species_labels):
    """
    Turn one (image path, folder name) pair into a training example.

    The file is read, decoded with three channels and resized to IMG_SIZE. The
    folder name is parsed into family / genus / species, and each of those is
    converted to a one-hot vector over the corresponding label vocabulary.

    Args:
        path: Path of the image file to load.
        folder_name: Folder name in "family_genus_species" form.
        family_labels, genus_labels, species_labels: Label vocabularies; the
            position of a name in its vocabulary is its class index.

    Returns:
        tuple: (image, labels) where labels is a dict with the keys "family",
        "genus" and "species", each holding a one-hot vector.

    NOTE(review): a name that is absent from its vocabulary produces an all-False
    match mask, so the index chosen by argmax is not meaningful. Callers should
    make sure every label occurs in the vocabulary they pass in.
    """
    def encode(name, vocabulary):
        # Position of the matching vocabulary entry, expanded to a one-hot vector.
        index = tf.argmax(tf.equal(vocabulary, name))
        return tf.one_hot(index, len(vocabulary))

    # Read, decode and resize in one pass.
    raw_bytes = tf.io.read_file(path)
    image = tf.image.resize(tf.image.decode_jpeg(raw_bytes, channels=3), IMG_SIZE)

    family, genus, species = extract_family_genus_and_species_from_folder_name(folder_name)

    # Keys must match the names of the model's output layers.
    targets = {
        "family": encode(family, family_labels),
        "genus": encode(genus, genus_labels),
        "species": encode(species, species_labels),
    }
    return image, targets

def build_dataset_from_dataframe(
    df,
    batch_size=32,
    family_labels=None,
    genus_labels=None,
    species_labels=None,
    shuffle=True,
    seed=None,
):
    """
    Build a batched tf.data pipeline of (image, one-hot labels) from a DataFrame.

    Args:
        df (pd.DataFrame): Frame with a 'path' column (image file) and a
            'folder_name' column in "family_genus_species" form.
        batch_size (int): Number of examples per batch.
        family_labels, genus_labels, species_labels (list, optional): Label
            vocabularies to encode against, in index order. Entries may be str
            or bytes (the lists returned by this function contain bytes, which
            is what ``.numpy().tolist()`` yields for string tensors). When
            omitted, the vocabulary is the sorted set of labels found in ``df``.
            Pass the training vocabularies when building validation/test
            datasets so that a class index means the same thing in every split.
        shuffle (bool): Shuffle the examples (reshuffled on every iteration).
        seed (int, optional): Seed for the shuffle.

    Returns:
        tuple: (dataset, family_labels, genus_labels, species_labels); the label
        lists are in class-index order.

    Raises:
        ValueError: If a folder name is not of the form "family_genus_species",
            or if ``df`` contains a label missing from a supplied vocabulary.
    """
    found_families, found_genera, found_species = set(), set(), set()
    for folder_name in df['folder_name']:
        parts = folder_name.split("_")
        if len(parts) != 3:
            raise ValueError(
                f"Expected a folder name of the form 'family_genus_species', got {folder_name!r}"
            )
        family, genus, epithet = parts
        found_families.add(family)
        found_genera.add(genus)
        found_species.add(genus + " " + epithet)

    def resolve_vocabulary(provided, found, level):
        # Without a supplied vocabulary, sort so label indices are deterministic.
        if provided is None:
            return sorted(found)
        vocabulary = [
            label.decode("utf-8") if isinstance(label, bytes) else str(label)
            for label in provided
        ]
        # A label outside the vocabulary would otherwise be encoded silently
        # as an arbitrary class further down the pipeline.
        missing = sorted(found.difference(vocabulary))
        if missing:
            raise ValueError(f"{level} labels missing from the supplied vocabulary: {missing}")
        return vocabulary

    family_vocab = tf.constant(resolve_vocabulary(family_labels, found_families, "family"))
    genus_vocab = tf.constant(resolve_vocabulary(genus_labels, found_genera, "genus"))
    species_vocab = tf.constant(resolve_vocabulary(species_labels, found_species, "species"))

    # Create a TensorFlow dataset from the dataframe's paths and folder names
    path_ds = tf.data.Dataset.from_tensor_slices((df['path'].values, df['folder_name'].values))

    # Shuffle while the elements are still cheap path strings. Shuffling after
    # the map would keep up to len(df) decoded images in the shuffle buffer.
    if shuffle:
        path_ds = path_ds.shuffle(buffer_size=len(df), seed=seed)

    # Map the processing function to the dataset
    image_label_ds = path_ds.map(
        lambda x, y: process_path(x, y, family_vocab, genus_vocab, species_vocab),
        num_parallel_calls=tf.data.AUTOTUNE,
    )

    # Batch and prefetch the dataset
    image_label_ds = image_label_ds.batch(batch_size)
    image_label_ds = image_label_ds.prefetch(buffer_size=tf.data.AUTOTUNE)

    return (
        image_label_ds,
        family_vocab.numpy().tolist(),
        genus_vocab.numpy().tolist(),
        species_vocab.numpy().tolist(),
    )

# Example usage with a pandas DataFrame
import pandas as pd

# train_df, val_df and test_df are the splits created above,
# with columns 'path' (image path) and 'folder_name' (family_genus_species)
train_ds, family_labels, genus_labels, species_labels = build_dataset_from_dataframe(
    train_df,
    batch_size=BATCH_SIZE,
    seed=SEED,
)

# Validation and test data are encoded with the *training* vocabularies so that
# class indices agree with the model's output units; no shuffling is needed.
val_ds, _, _, _ = build_dataset_from_dataframe(
    val_df,
    batch_size=BATCH_SIZE,
    family_labels=family_labels,
    genus_labels=genus_labels,
    species_labels=species_labels,
    shuffle=False,
)
test_ds, _, _, _ = build_dataset_from_dataframe(
    test_df,
    batch_size=BATCH_SIZE,
    family_labels=family_labels,
    genus_labels=genus_labels,
    species_labels=species_labels,
    shuffle=False,
)

2024-10-23 16:21:15.409391: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3 Pro
2024-10-23 16:21:15.409418: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 36.00 GB
2024-10-23 16:21:15.409422: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 13.50 GB
2024-10-23 16:21:15.409434: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-10-23 16:21:15.409444: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [8]:
# Shorthand for tf.data's automatic tuning value.
AUTOTUNE = tf.data.AUTOTUNE

# NOTE(review): this cell rebinds `train_ds` to a pipeline built on top of itself, so
# running it twice stacks a second cache/shuffle/prefetch stage; re-run the
# dataset-building cell first. The dataset is already batched at this point, so
# shuffle(1000) reorders whole batches, not individual images.
train_ds = train_ds.cache().shuffle(1000).prefetch(buffer_size=AUTOTUNE)
# The cached validation pipeline is stored under a new name (`valid_ds`); `val_ds` itself
# stays uncached. NOTE(review): make sure training consumes the one you intend.
valid_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [9]:


# Data augmentation pipeline: random flip, rotation, zoom, translation, contrast and
# brightness layers. It is applied to the model input below, so it is part of the model.
data_augmentation = tf.keras.Sequential(
    [
        tf.keras.layers.RandomFlip("horizontal"),
        tf.keras.layers.RandomRotation(0.2),
        tf.keras.layers.RandomZoom(0.2),
        tf.keras.layers.RandomTranslation(0.1, 0.1),
        tf.keras.layers.RandomContrast(0.2),
        tf.keras.layers.RandomBrightness(0.2),
    ]
)

# Image Preprocessing: the input preprocessing function that belongs to the ResNetV2 family
preprocess_input = tf.keras.applications.resnet_v2.preprocess_input

# ImageNet-pretrained ResNet50V2 without its classification head, used as a feature extractor.
base_model = tf.keras.applications.ResNet50V2(
    input_shape=IMG_SIZE + (3,),
    include_top=False,
    weights='imagenet'
)
# Freeze the backbone: only the layers added below are trained.
base_model.trainable = False

# Define the inputs and apply augmentation
inputs = tf.keras.Input(shape=IMG_SIZE + (3,))
x = data_augmentation(inputs)
x = preprocess_input(x)
# The backbone is always called with training=False.
x = base_model(x, training=False)
x = tf.keras.layers.GlobalAveragePooling2D()(x)
x = tf.keras.layers.Dropout(0.2)(x)

# Define family output (one unit per family label; the layer name must match the
# "family" key of the label dict produced by process_path)
family_output = tf.keras.layers.Dense(len(family_labels), activation='softmax', name='family')(x)

# Concatenate the family output with the base model output
family_features = tf.keras.layers.Concatenate()([x, family_output])

# Define genus output, using family features as additional input
genus_hidden = tf.keras.layers.Dense(256, activation='relu')(family_features)
genus_output = tf.keras.layers.Dense(len(genus_labels), activation='softmax', name='genus')(genus_hidden)

# Concatenate the family and genus outputs with the base model output
genus_features = tf.keras.layers.Concatenate()([x, family_output, genus_output])

# Define species output, using both family and genus features as additional input
species_hidden = tf.keras.layers.Dense(256, activation='relu')(genus_features)
species_output = tf.keras.layers.Dense(len(species_labels), activation='softmax', name='species')(species_hidden)

# Create the hierarchical model: one input, three outputs (family, genus, species)
model = tf.keras.Model(inputs, [family_output, genus_output, species_output])


# Compile with one categorical cross-entropy loss per output, keyed by output layer name.
# NOTE(review): `metrics` is a one-element list for a three-output model — confirm the
# installed Keras version accepts that, or supply one entry per output.
base_learning_rate = 0.0001
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=base_learning_rate),
    loss={
        'family': 'categorical_crossentropy',
        'genus': 'categorical_crossentropy',
        'species': 'categorical_crossentropy'
    },
    metrics=['accuracy']
)

# %%
model.summary()

In [10]:
# (Re-)compile the model. This replaces any earlier compile configuration: the same
# loss is applied to all three outputs and one accuracy metric is listed per output
# (family, genus, species).
base_learning_rate = 0.0001
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=base_learning_rate),
    loss='categorical_crossentropy',
    metrics=['accuracy', 'accuracy', 'accuracy'],
)

# Train for the configured number of epochs. Validation uses `valid_ds`, the cached
# validation pipeline, so validation images are not re-decoded on every epoch.
history = model.fit(
    train_ds,
    epochs=EPOCHS,
    validation_data=valid_ds,
)

Epoch 1/10


2024-10-23 16:21:32.795878: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m 44/198[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m51s[0m 333ms/step - family_accuracy: 0.1034 - family_loss: 3.0499 - genus_accuracy: 0.0481 - genus_loss: 3.9571 - loss: 11.1429 - species_accuracy: 0.0457 - species_loss: 4.1242

KeyboardInterrupt: 