In [1]:
import os
import shutil
import imagehash
import pandas as pd
from PIL import Image
from tqdm import tqdm

def image_directory_to_pandas(image_path):
    """
    Create a pandas DataFrame with image paths and taxonomic labels extracted from a directory structure.

    Parameters:
    ----------
    image_path : str
        The root directory containing subfolders with images.

    Returns:
    -------
    pandas.DataFrame
        A DataFrame containing image paths and label information. Columns include:
        - 'path': The full path to the image.
        - 'folder_label': The folder name, representing the original label (format: 'family_genus_species').
        - 'family': Extracted family name from the folder label.
        - 'genus': Extracted genus name from the folder label.
        - 'species': Combination of genus and species names (e.g., 'genus species').

    Raises:
    ------
    ValueError:
        If the folder label format does not match the expected 'family_genus_species' format.
    """
    labels = []
    paths = []

    # Walk through the directory and collect image paths and labels
    for root_dir, _, filenames in os.walk(image_path):
        for filename in filenames:
            # Ignore hidden files and non-image files
            if filename.startswith('.') or os.path.splitext(filename)[1].lower() not in {".jpeg", ".png", ".jpg"}:
                continue

            # Extract the folder name as the label, ignoring 'GT' directories
            folder_label = os.path.basename(root_dir)
            if folder_label != "GT":
                labels.append(folder_label)
                paths.append(os.path.join(root_dir, filename))

    # Create DataFrame with paths and folder labels
    df = pd.DataFrame({'image_path': paths, 'folder_label': labels})
    df['folder_label'] = df['folder_label'].astype("category")

    # Split the folder_label into 'family', 'genus', and 'species'
    try:
        df[['family', 'genus', 'species']] = df['folder_label'].str.split("_", expand=True)
        df['species'] = df['genus'] + " " + df['species']
    except ValueError as e:
        raise ValueError(
            "Error splitting folder labels. Ensure that your folder structure follows 'family_genus_species' format."
        ) from e

    # Return the dataframe with specified columns
    return df[['image_path', 'folder_label', 'family', 'genus', 'species']]


def copy_image_direcory(source_path, destination_path):
    """
    Copy images from a source directory to a destination directory while preserving 
    their subfolder structure, format, and size.

    Parameters:
    ----------
    source_path : str
        The path to the source directory containing subfolders with images.
    destination_path : str
        The path to the destination directory where images will be copied.
    """
    # Ensure the destination path exists
    os.makedirs(destination_path, exist_ok=True)

    # Walk through the source directory
    for root_dir, _, filenames in os.walk(source_path):
        for filename in filenames:
            # Skip hidden files and only process image files
            if filename.startswith('.') or os.path.splitext(filename)[1].lower() not in {".jpeg", ".png", ".jpg"}:
                continue

            # Full path of the source image
            source_image_path = os.path.join(root_dir, filename)
            
            # Create a relative path from the source directory to maintain the folder structure
            relative_path = os.path.relpath(root_dir, source_path)
            
            # Construct the corresponding destination directory path
            destination_dir = os.path.join(destination_path, relative_path)
            
            # Ensure the destination directory exists
            os.makedirs(destination_dir, exist_ok=True)
            
            # Copy the image to the destination directory
            destination_image_path = os.path.join(destination_dir, filename)
            shutil.copy2(source_image_path, destination_image_path)

            print(f"Copied {source_image_path} to {destination_image_path}")

    print("Image copy operation completed.")
    
    
def split_image_dataframe(df, test_size=0.2, val_size=0.1, random_state=42, stratify_by='folder_name'):
    """
    Split a pandas DataFrame into train, validation, and test sets,
    stratified by the 'folder_name' column.

    Args:
        df (pd.DataFrame): The DataFrame containing image paths and labels.
        test_size (float): Proportion of the dataset to include in the test split.
        val_size (float): Proportion of the dataset to include in the validation split.
        random_state (int): Seed for random number generation for reproducibility.

    Returns:
        tuple: Three pandas DataFrames for train, validation, and test sets.
    """
    # First, split into train+validation and test sets
    train_val_df, test_df = train_test_split(
        df,
        test_size=test_size,
        stratify=df[stratify_by],
        random_state=random_state
    )
    
    # Calculate the adjusted validation size relative to the remaining train+val data
    val_relative_size = val_size / (1 - test_size)
    
    # Split the train+validation set into train and validation sets
    train_df, val_df = train_test_split(
        train_val_df,
        test_size=val_relative_size,
        stratify=train_val_df[stratify_by],
        random_state=random_state
    )
    
    return train_df, val_df, test_df


def copy_dataset_images_to_directory(df, target_directory, path_column='image_path', label_column='folder_label'):
    """
    Copies images to a specified target directory while preserving 
    the subfolder structure based on a DataFrame's label column.
    Keeps original image format, size, and metadata intact.

    Parameters:
    ----------
    df : pandas.DataFrame
        A DataFrame containing the following columns:
        - 'path_column': The path to the source image (default: 'image_path').
        - 'label_column': The label to be used for creating subfolders (default: 'folder_label').
    target_directory : str
        The path to the target directory where images will be copied.
    path_column : str, optional
        The name of the column in the DataFrame that contains image paths.
        Default is 'image_path'.
    label_column : str, optional
        The name of the column in the DataFrame that contains the folder labels.
        Default is 'folder_label'.

    Returns:
    -------
    None
        Copies the images to the specified target directory while maintaining structure.
    """
    # Iterate through the DataFrame with a progress bar
    for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Copying images", unit="image"):
        image_path = row[path_column]
        label = row[label_column]
        
        # Create the subfolder in the target directory
        target_subfolder = os.path.join(target_directory, label)
        os.makedirs(target_subfolder, exist_ok=True)
        
        # Construct the target path for the image
        target_path = os.path.join(target_subfolder, os.path.basename(image_path))
        
        # Copy the image to the target directory, preserving metadata
        shutil.copy2(image_path, target_path)

    print(f"All images have been copied to {target_directory} with metadata preserved.")
    
    return True
    
    
def rename_and_convert_images(input_path, suffix):
    """
    Converts images to JPEG format (if necessary) and renames them according to the given pattern.
    
    Args:
    - input_path (str): Path to the main directory containing subfolders.
    - suffix (str): A suffix to add at the beginning of each renamed image.
    
    Example:
    If a subfolder is named "Family_Genus_species", the images inside it will be renamed to:
    "{suffix}_Genus_species_1.jpg", "{suffix}_Genus_species_2.jpg", etc.
    """
    # Iterate through each subfolder in the input path
    for subfolder in os.listdir(input_path):
        subfolder_path = os.path.join(input_path, subfolder)
        
        # Check if it's a directory
        if os.path.isdir(subfolder_path):
            # Extract genus and species from the subfolder name (assumes the format "Family_Genus_species")
            try:
                _, genus, species = subfolder.split('_')
            except ValueError:
                print(f"Skipping folder '{subfolder}': not in 'Family_Genus_species' format.")
                continue

            # Iterate through all files in the subfolder
            image_files = [f for f in os.listdir(subfolder_path) if f.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp', '.tiff'))]
            image_files.sort()  # Sort files to ensure consistent numbering

            # Rename each image with the desired pattern and convert if necessary
            for idx, image_name in enumerate(image_files, start=1):
                # Define the new image name with .jpeg extension
                new_image_name = f"{suffix}_{genus}_{species}_{idx}.jpeg"
                
                # Define the full paths for the old and new image names
                old_image_path = os.path.join(subfolder_path, image_name)
                new_image_path = os.path.join(subfolder_path, new_image_name)
                
                # Convert the image to .jpeg format if necessary
                try:
                    with Image.open(old_image_path) as img:
                        # Convert to RGB mode if the image has an alpha channel (like .png)
                        if img.mode in ("RGBA", "P"):
                            img = img.convert("RGB")
                        
                        # Save the image in .jpeg format with the new name
                        img.save(new_image_path, "JPEG", quality=95)
                        
                        # Optionally, remove the old file if it was converted
                        if old_image_path != new_image_path:
                            os.remove(old_image_path)
                    
                    print(f"Converted and renamed: {old_image_path} -> {new_image_path}")
                
                except Exception as e:
                    print(f"Error processing {old_image_path}: {e}")

    print("Renaming and conversion completed.")
    
    return True


def generate_image_hash(image_path):
    """
    Generate a perceptual hash for an image using the average hash method.

    Parameters:
    ----------
    image_path : str
        The file path of the image for which to generate the hash.

    Returns:
    -------
    ImageHash
        An average hash object for the provided image.
    """
    with Image.open(image_path) as img:
        return imagehash.average_hash(img)


def move_matching_images(target_images_folder, source_folder, destination_folder):
    """
    Detect and move duplicate images from the source folder to the destination folder
    based on matching image hashes in the target images folder.

    Parameters:
    ----------
    target_images_folder : str
        Path to the folder containing target images for comparison.
    source_folder : str
        Path to the folder containing source images to check for duplicates.
    destination_folder : str
        Path to the folder where duplicate images will be moved.

    Returns:
    -------
    None
        Moves duplicate images to the specified destination folder and prints the moved files.
    """
    # Ensure destination folder exists
    os.makedirs(destination_folder, exist_ok=True)
    
    # Generate hashes for target images
    target_hashes = {}
    for file_name in os.listdir(target_images_folder):
        # Process only image files, ignoring hidden files
        if file_name.lower().endswith(('jpg', 'jpeg', 'png')) and not file_name.startswith('.'):
            target_image_path = os.path.join(target_images_folder, file_name)
            target_hash = generate_image_hash(target_image_path)
            target_hashes[file_name] = target_hash

    # Compare and move matching images from source to destination
    for file_name in os.listdir(source_folder):
        # Process only image files, ignoring hidden files
        if file_name.lower().endswith(('jpg', 'jpeg', 'png')) and not file_name.startswith('.'):
            source_image_path = os.path.join(source_folder, file_name)
            source_hash = generate_image_hash(source_image_path)
            
            # Check for matching hashes
            for target_name, target_hash in target_hashes.items():
                if source_hash == target_hash:
                    destination_path = os.path.join(destination_folder, file_name)
                    shutil.move(source_image_path, destination_path)
                    print(f"Moved: {source_image_path} -> {destination_path}")
                    break
                
                

In [None]:
from cryptovision.tools import image_directory_to_pandas

In [3]:
rename_and_convert_images('/Volumes/T7_shield/CryptoVision/Data/fish_functions/training/test', 'lab_')

Error processing /Volumes/T7_shield/CryptoVision/Data/fish_functions/training/test/Blenniidae_Ecsenius_bicolor/._LIRS18_014_B1_134_JMC_3614_Cirripectes_variolosus.jpeg: cannot identify image file '/Volumes/T7_shield/CryptoVision/Data/fish_functions/training/test/Blenniidae_Ecsenius_bicolor/._LIRS18_014_B1_134_JMC_3614_Cirripectes_variolosus.jpeg'
Error processing /Volumes/T7_shield/CryptoVision/Data/fish_functions/training/test/Blenniidae_Ecsenius_bicolor/._LIRS18_014_B1_135_JMC_3615_Cirripectes_variolosus.jpeg: cannot identify image file '/Volumes/T7_shield/CryptoVision/Data/fish_functions/training/test/Blenniidae_Ecsenius_bicolor/._LIRS18_014_B1_135_JMC_3615_Cirripectes_variolosus.jpeg'
Error processing /Volumes/T7_shield/CryptoVision/Data/fish_functions/training/test/Blenniidae_Ecsenius_bicolor/._LIRS18_014_B1_136_JMC_3616_Cirripectes_variolosus.jpeg: cannot identify image file '/Volumes/T7_shield/CryptoVision/Data/fish_functions/training/test/Blenniidae_Ecsenius_bicolor/._LIRS18_01

True

In [31]:
# Lab Image set
df_sjb = image_directory_to_pandas(
    "/Users/leonardo/Library/CloudStorage/Box-Box/CryptoVision/Data/sjb/species",
)

df_web = image_directory_to_pandas(
    "/Users/leonardo/Library/CloudStorage/Box-Box/CryptoVision/Data/web/species/train",
)

# Concatenate both dataframes
df_raw = pd.concat([df_sjb, df_web], ignore_index=True)

print(f'Original Dataset Shape: {df_raw.shape}')

# Set Train, Validation & Test dataframes
label_count = df_raw['folder_label'].value_counts()

valid_labels = label_count[label_count >= 50].index
filtered_df_raw = df_raw[df_raw['folder_label'].isin(valid_labels)]

print(f'Filtered Dataset Shape: {filtered_df_raw.shape}')
print(f'Number of Classes: {filtered_df_raw["folder_label"].nunique()}')

Original Dataset Shape: (9492, 5)
Filtered Dataset Shape: (9023, 5)
Number of Classes: 57


In [32]:
copy_dataset_images_to_directory(
    filtered_df_raw, 
    '/Users/leonardo/Documents/Projects/cryptovision/data/processed/cv_images_dataset',
    'image_path',
    'folder_label'
)

Copying images: 100%|██████████| 9023/9023 [00:04<00:00, 1970.34image/s]

All images have been copied to /Users/leonardo/Documents/Projects/cryptovision/data/processed/cv_images_dataset with metadata preserved.





In [28]:
train_ds, val_ds, test_ds = split_image_dataframe(df_raw, 0.2, 0.2, 1, 'folder_label')

print(train_ds.shape)
print(val_ds.shape)
print(test_ds.shape)

(5694, 5)
(1899, 5)
(1899, 5)


In [None]:
# Create source dataset metadata
def source_metadata(path, version, description, date_created, previous_version):
    
    df = image_directory_to_pandas(path)
    
    sample_path = df['image_path'].iloc[0]
    
    metadata = {
        'version': version,
        'source': path,
        'description': description,
        'date_created': date_created,
        'previous_version': previous_version,
        'images': df.shape[0],
        'species': df['species'].nunique(),
        'genus': df['genus'].nunique(),
        'family': df['family'].nunique(),
        'format': sample_path.split('.')[-1].upper()
    }
    
    return metadata
    

In [2]:
from cryptovision.tools import image_directory_to_pandas

In [7]:
df = image_directory_to_pandas(
    '/Volumes/T7_shield/CryptoVision/Data/Images/Sources/INaturaList/Species/v250116/images'
)

print(df.shape)
print(df['folder_label'].nunique())
print(df['species'].nunique())
print(df['genus'].nunique())
print(df['family'].nunique())

(17060, 5)
163
163
74
26


In [5]:
sample_path.split('.')[-1].upper()

'JPEG'

In [10]:
from PIL import Image
import os

def find_small_images(base_path, min_width=384, min_height=384, image_extensions=(".jpg", ".jpeg", ".png")):
    """
    Search for images in all subdirectories of a given path that have dimensions below the specified size.

    Args:
        base_path (str): The base directory to search.
        min_width (int): Minimum allowed width for images. Default is 384.
        min_height (int): Minimum allowed height for images. Default is 384.
        image_extensions (tuple): Tuple of allowed image file extensions. Default is (".jpg", ".jpeg", ".png").

    Returns:
        list: A list of paths for images that have dimensions below the specified size.
    """
    small_images = []

    for root, _, files in os.walk(base_path):
        for file in files:
            if file.lower().endswith(image_extensions) and not file.startswith('.'):
                image_path = os.path.join(root, file)
                try:
                    with Image.open(image_path) as img:
                        width, height = img.size
                        if width < min_width or height < min_height:
                            small_images.append(image_path)
                except Exception as e:
                    print(f"Error processing {image_path}: {e}")

    return small_images

In [11]:
base_directory = "/Volumes/T7_shield/CryptoVision/Data/Images/Sources/INaturaList/Species/v250116/images"
small_images_list = find_small_images(base_directory, min_width=384, min_height=384)

# Print the results
print(f"Found {len(small_images_list)} images with dimensions below 384x384.")
for img_path in small_images_list:
    print(img_path)

Found 616 images with dimensions below 384x384.
/Volumes/T7_shield/CryptoVision/Data/Images/Sources/INaturaList/Species/v250116/images/Apogonidae_Apogon_binotatus/inatlist_Apogon_binotatus_00107.jpeg
/Volumes/T7_shield/CryptoVision/Data/Images/Sources/INaturaList/Species/v250116/images/Apogonidae_Apogon_binotatus/inatlist_Apogon_binotatus_00086.jpeg
/Volumes/T7_shield/CryptoVision/Data/Images/Sources/INaturaList/Species/v250116/images/Apogonidae_Apogon_binotatus/inatlist_Apogon_binotatus_00028.jpeg
/Volumes/T7_shield/CryptoVision/Data/Images/Sources/INaturaList/Species/v250116/images/Apogonidae_Apogon_binotatus/inatlist_Apogon_binotatus_00018.jpeg
/Volumes/T7_shield/CryptoVision/Data/Images/Sources/INaturaList/Species/v250116/images/Apogonidae_Apogon_binotatus/inatlist_Apogon_binotatus_00085.jpeg
/Volumes/T7_shield/CryptoVision/Data/Images/Sources/INaturaList/Species/v250116/images/Apogonidae_Apogon_binotatus/inatlist_Apogon_binotatus_00087.jpeg
/Volumes/T7_shield/CryptoVision/Data/Ima

In [13]:
import cv2

def find_blurry_images(base_path, threshold=100, image_extensions=(".jpg", ".jpeg", ".png")):
    blurry_images = []

    for root, _, files in os.walk(base_path):
        for file in files:
            if file.lower().endswith(image_extensions) and not file.startswith('.'):
                image_path = os.path.join(root, file)
                try:
                    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
                    laplacian_var = cv2.Laplacian(img, cv2.CV_64F).var()
                    if laplacian_var < threshold:  # Threshold for blur detection
                        blurry_images.append((image_path, laplacian_var))
                except Exception as e:
                    print(f"Error processing {image_path}: {e}")
    
    return blurry_images

base_directory = "/Volumes/T7_shield/CryptoVision/Data/Images/Sources/INaturaList/Species/v250116/images"
blurry_images_list = find_blurry_images(base_directory, threshold=100)

# Print the results
print(f"Found {len(blurry_images_list)} blurry images.")
for img_path, variance in blurry_images_list:
    print(f"Image: {img_path}, Variance: {variance}")

Found 5001 blurry images.
Image: /Volumes/T7_shield/CryptoVision/Data/Images/Sources/INaturaList/Species/v250116/images/Apogonidae_Apogon_binotatus/inatlist_Apogon_binotatus_00133.jpeg, Variance: 26.286242534618435
Image: /Volumes/T7_shield/CryptoVision/Data/Images/Sources/INaturaList/Species/v250116/images/Apogonidae_Apogon_binotatus/inatlist_Apogon_binotatus_00134.jpeg, Variance: 38.405852601731816
Image: /Volumes/T7_shield/CryptoVision/Data/Images/Sources/INaturaList/Species/v250116/images/Apogonidae_Apogon_binotatus/inatlist_Apogon_binotatus_00130.jpeg, Variance: 28.400157209565243
Image: /Volumes/T7_shield/CryptoVision/Data/Images/Sources/INaturaList/Species/v250116/images/Apogonidae_Apogon_binotatus/inatlist_Apogon_binotatus_00129.jpeg, Variance: 14.280107351453639
Image: /Volumes/T7_shield/CryptoVision/Data/Images/Sources/INaturaList/Species/v250116/images/Apogonidae_Apogon_binotatus/inatlist_Apogon_binotatus_00128.jpeg, Variance: 98.24854868297294
Image: /Volumes/T7_shield/Cryp

In [21]:
import pandas as pd

# Set sources

df_lab = image_directory_to_pandas(
    '/Volumes/T7_shield/CryptoVision/Data/Images/Sources/Lab/SJB/Processed/Species/v250106/images', 'lab'
)

df_lirs23 = image_directory_to_pandas(
    '/Volumes/T7_shield/CryptoVision/Data/Images/Sources/Lab/LIRS23/Processed/Species/v250115/images', 'lirs23'
)

df_scls = image_directory_to_pandas(
    '/Volumes/T7_shield/CryptoVision/Data/Images/Sources/Lab/SCLS/Processed/Species/v250115/images', 'scls'
)

df_cbc24 = image_directory_to_pandas(
    '/Volumes/T7_shield/CryptoVision/Data/Images/Sources/Lab/CBC24/Processed/Species/v250115/images', 'cbc24'
)

df_web = image_directory_to_pandas(
    '/Volumes/T7_shield/CryptoVision/Data/Images/Sources/Web/Species/v250117/images', 'web'
)

df_inat = image_directory_to_pandas(
    '/Volumes/T7_shield/CryptoVision/Data/Images/Sources/INaturaList/Species/v250116/images', 'inat'
)

# Concatenate all dataframes
df_raw = pd.concat([df_lab, df_lirs23, df_scls, df_cbc24, df_web, df_inat], ignore_index=True)

og_size = df_raw.shape[0]

val_count = df['species'].value_counts()

valid_labels = val_count[val_count >= 50].index

df_filtered = df_raw[df_raw['species'].isin(valid_labels)]

print(f'Original Dataset Shape: {df_raw.shape}')
print(f'Filtered Dataset Shape: {df_filtered.shape} > Reduced by {(df_filtered.shape[0] / og_size - 1) * 100:.2f}%')


Original Dataset Shape: (32023, 6)
Filtered Dataset Shape: (25324, 6) > Reduced by -20.92%


In [27]:
print(df_filtered['family'].nunique())
print(df_filtered['genus'].nunique())
print(df_filtered['species'].nunique())

19
57
98


In [26]:
import shutil
from tqdm import tqdm

new_dir = '/Volumes/T7_shield/CryptoVision/Data/Images/Datasets/v2.0.0/images'

for index, data in tqdm(df_filtered.iterrows(), total=df_filtered.shape[0], desc="Copying images", unit="image"):
    
    image_name = os.path.basename(data['image_path'])
    folder_name = data['folder_label']
    
    os.makedirs(os.path.join(new_dir, folder_name), exist_ok=True)
    
    new_image_path = os.path.join(new_dir, folder_name, image_name)
    
    shutil.copy2(data['image_path'], new_image_path)

25324it [02:39, 158.91it/s]


In [28]:
import os
import shutil
from PIL import Image
import imagehash

def find_and_move_duplicates(base_path, target_folder, image_extensions=(".jpg", ".jpeg", ".png")):
    """
    Find duplicate images in a directory and move them to a target folder.

    Args:
        base_path (str): The base directory to search for duplicate images.
        target_folder (str): The directory where duplicate images will be moved.
        image_extensions (tuple): Tuple of allowed image file extensions.

    Returns:
        list: A list of tuples containing duplicate image paths and their original counterparts.
    """
    if not os.path.exists(target_folder):
        os.makedirs(target_folder)

    hashes = {}
    duplicates = []

    for root, _, files in os.walk(base_path):
        for file in files:
            if file.lower().endswith(image_extensions) and not file.startswith('.'):
                image_path = os.path.join(root, file)
                try:
                    with Image.open(image_path) as img:
                        img_hash = imagehash.average_hash(img)
                        if img_hash in hashes:
                            duplicates.append((image_path, hashes[img_hash]))
                            # Move the duplicate image to the target folder
                            shutil.move(image_path, os.path.join(target_folder, os.path.basename(image_path)))
                        else:
                            hashes[img_hash] = image_path
                except Exception as e:
                    print(f"Error processing {image_path}: {e}")

    return duplicates

# Example usage
base_directory = "/Volumes/T7_shield/CryptoVision/Data/Images/Datasets/v2.0.0/images"
target_directory = "/Volumes/T7_shield/CryptoVision/Data/Images/Rejected/Duplicates_v2"


for folder in os.listdir(base_directory):
    
    if not os.path.isdir(os.path.join(base_directory, folder)):
        continue
    
    duplicates = find_and_move_duplicates(
        os.path.join(base_directory, folder), 
        os.path.join(target_directory, folder)
    )


Error processing /Volumes/T7_shield/CryptoVision/Data/Images/Datasets/v2.0.0/images/Chaenopsidae_Acanthemblemaria_aspera/._IMG_8989_Chaenopsidae.jpeg: cannot identify image file '/Volumes/T7_shield/CryptoVision/Data/Images/Datasets/v2.0.0/images/Chaenopsidae_Acanthemblemaria_aspera/._IMG_8989_Chaenopsidae.jpeg'
Error processing /Volumes/T7_shield/CryptoVision/Data/Images/Datasets/v2.0.0/images/Chaenopsidae_Acanthemblemaria_aspera/._IMG_8995_Chaenopsidae.jpeg: cannot identify image file '/Volumes/T7_shield/CryptoVision/Data/Images/Datasets/v2.0.0/images/Chaenopsidae_Acanthemblemaria_aspera/._IMG_8995_Chaenopsidae.jpeg'
Error processing /Volumes/T7_shield/CryptoVision/Data/Images/Datasets/v2.0.0/images/Chaenopsidae_Acanthemblemaria_aspera/._IMG_8998_Chaenopsidae.jpeg: cannot identify image file '/Volumes/T7_shield/CryptoVision/Data/Images/Datasets/v2.0.0/images/Chaenopsidae_Acanthemblemaria_aspera/._IMG_8998_Chaenopsidae.jpeg'
Error processing /Volumes/T7_shield/CryptoVision/Data/Images/

In [29]:
df_v2 = image_directory_to_pandas('/Volumes/T7_shield/CryptoVision/Data/Images/Datasets/v2.0.0/images')

df_v2.shape

(24415, 5)

In [30]:
df_v2['folder_label'].value_counts()

folder_label
Gobiidae_Nemateleotris_magnifica            888
Labridae_Pseudocheilinus_hexataenia         714
Tetraodontidae_Canthigaster_solandri        712
Gobiidae_Istigobius_decoratus               681
Chaetodontidae_Chaetodon_quadrimaculatus    645
                                           ... 
Apogonidae_Siphamia_tubifer                  67
Gobiidae_Priolepis_cincta                    66
Gobiidae_Eviota_prasina                      53
Gobiidae_Trimma_okinawae                     52
Pseudochromidae_Cypho_ purpurascens           3
Name: count, Length: 98, dtype: int64