In [14]:
import os
import shutil
import numpy as np
import pandas as pd
from PIL import Image
from pathlib import Path
from sklearn.model_selection import train_test_split

def create_image_dataframe(image_path, split_flag=True, column_name=["label"]):
    """Index every image below ``image_path`` into a DataFrame of paths and labels.

    The tree is walked recursively. Each image is labelled with the name of the
    folder that directly contains it. Hidden files (leading dot), files whose
    extension is not .jpeg/.png/.jpg (compared case-insensitively) and files
    sitting directly in a folder named "GT" are skipped.

    Args:
        image_path (str): Root directory to scan.
        split_flag (bool): When True, the label is split on "_" into the
            columns ``family``, ``genus`` and ``species`` (``species`` is then
            rewritten as "<genus> <species>") and only ``path`` plus the
            columns listed in ``column_name`` are returned. When False,
            ``path`` and the label column renamed to ``column_name[0]`` are
            returned.
        column_name (list[str]): Columns returned next to ``path`` when
            ``split_flag`` is True; new name of the label column otherwise.
            The default list is only read, never mutated.

    Returns:
        pandas.DataFrame: One row per image found.

    Raises:
        KeyError: If ``split_flag`` is True, the label split failed (a message
            is printed) and ``column_name`` requests a column that was
            therefore never created.
    """
    image_extensions = (".jpeg", ".png", ".jpg")
    label = []
    path = []

    for dirname, _, filenames in os.walk(image_path):
        # The containing folder is the last *path* component. str.split()
        # would split on whitespace and never isolate the folder name.
        folder = os.path.split(dirname)[1]
        if folder == "GT":
            continue

        for filename in filenames:
            if filename.startswith('.'):
                continue  # Ignore files starting with a dot
            # Lower-case the extension so "IMG.JPG" is treated like "img.jpg".
            if os.path.splitext(filename)[1].lower() in image_extensions:
                label.append(folder)
                path.append(os.path.join(dirname, filename))

    df_og = pd.DataFrame({"path": path, "label": label})
    df_og['label'] = df_og['label'].astype("category")

    if split_flag:
        try:
            # Split the 'label' column into 'family', 'genus', and 'species' columns
            df_og[["family", "genus", "species"]] = df_og['label'].str.split("_", expand=True)
            df_og['species'] = df_og['genus'] + " " + df_og['species']
        except ValueError:
            # Best effort: report the problem and still return what is available.
            print("Error splitting labels. Check your folder format -> {family}_{genus}_{species}.")

        return df_og[['path'] + list(column_name)]

    # Return a renamed copy instead of mutating the frame in place.
    return df_og.rename(columns={'label': column_name[0]})
    
def copy_images_to_directory(df, target_directory):
    """Copy every image listed in ``df`` into ``target_directory/<label>/``.

    Args:
        df (pandas.DataFrame): Must provide a ``path`` column (source file) and
            a ``label`` column (name of the destination subfolder).
        target_directory (str): Root of the destination tree; label subfolders
            are created on demand.

    Notes:
        Destination file names are the source basenames. If two rows of the
        same call would land on the same destination (same label, same
        basename, different source folder), the later one gets a numeric
        suffix ("img.jpg" -> "img_1.jpg") instead of silently overwriting the
        earlier copy. Only destinations produced by the current call are
        tracked, so re-running the call with the same ``df`` rewrites the same
        files rather than piling up duplicates.
    """
    written = set()  # destination paths produced by this call

    for image_path, label in zip(df['path'], df['label']):
        target_subfolder = os.path.join(target_directory, label)

        # Create the subfolder if it doesn't exist (no separate exists() check needed)
        os.makedirs(target_subfolder, exist_ok=True)

        target_path = os.path.join(target_subfolder, os.path.basename(image_path))
        if target_path in written:
            # Same destination already used in this call: pick the first free suffix.
            stem, ext = os.path.splitext(target_path)
            suffix = 1
            while f"{stem}_{suffix}{ext}" in written:
                suffix += 1
            target_path = f"{stem}_{suffix}{ext}"
        written.add(target_path)

        # Copy the image to the target subfolder
        shutil.copy(image_path, target_path)
        
def train_valid_test_split(df, valid_size=0.15, test_size=0.15, random_state=42, stratify_column='label'):
    """Split ``df`` into stratified train, validation and test frames.

    Two passes of ``train_test_split`` are chained: the test rows are carved
    out of ``df`` first, then the validation rows are carved out of what is
    left.

    Args:
        df: DataFrame to split; must contain ``stratify_column``.
        valid_size: Used as ``test_size`` of the second pass, i.e. it is
            measured against the rows remaining after the test split, not
            against ``df``.
        test_size: Used as ``test_size`` of the first pass (measured against ``df``).
        random_state: Seed forwarded to both passes.
        stratify_column: Column whose values are passed as ``stratify`` in both passes.

    Returns:
        tuple: ``(train_df, valid_df, test_df)``.
    """

    def _hold_out(frame, fraction):
        # One stratified pass: returns (kept rows, held-out rows).
        return train_test_split(
            frame,
            test_size=fraction,
            random_state=random_state,
            stratify=frame[stratify_column],
        )

    # Pass 1: set the test rows aside.
    remainder_df, test_df = _hold_out(df, test_size)

    # Pass 2: set the validation rows aside from the remainder.
    train_df, valid_df = _hold_out(remainder_df, valid_size)

    return train_df, valid_df, test_df

def convert_images_to_jpeg(directory):
    """
    Recursively converts all images in the given directory and its subdirectories to JPEG format.

    Every file whose name ends (case-insensitively) with one of the supported
    extensions is opened, converted to RGB and written next to the original as
    ``<stem>.jpeg``. The encoded image is first written to a staging file; the
    original is deleted only after that write succeeded, so a failed
    conversion leaves the source image untouched. One line is printed per
    file, reporting success or failure.

    Args:
    directory (str): Path to the directory containing images.

    Notes:
    Files that already end in ``.jpeg`` are re-encoded as well.
    NOTE(review): re-encoding a JPEG is lossy - confirm this is intended.
    An existing ``<stem>.jpeg`` next to e.g. ``<stem>.png`` is overwritten.
    """
    supported_formats = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff')

    for root, _, files in os.walk(directory):
        for file in files:
            if not file.lower().endswith(supported_formats):
                continue

            file_path = os.path.join(root, file)
            jpeg_path = os.path.splitext(file_path)[0] + '.jpeg'
            # Encode into a staging file so the original stays intact until
            # the JPEG has been written completely.
            staging_path = jpeg_path + '.tmp'
            try:
                with Image.open(file_path) as img:
                    rgb_img = img.convert('RGB')  # Convert to RGB (returns a new image)

                try:
                    # Format is given explicitly because the staging name
                    # does not end in .jpeg.
                    rgb_img.save(staging_path, 'JPEG')
                except Exception:
                    # Do not leave a half-written staging file behind.
                    if os.path.exists(staging_path):
                        os.remove(staging_path)
                    raise

                # The JPEG is safely on disk: drop the original, then move the
                # staging file to its final name (which may equal file_path).
                os.remove(file_path)
                os.replace(staging_path, jpeg_path)
                print(f"Converted {file_path} to {jpeg_path}")
            except Exception as e:
                print(f"Failed to convert {file_path}: {e}")

In [3]:
# Source folders: lab image set and web-scraped image set
sjb_species_dir = "/Users/leonardo/Library/CloudStorage/GoogleDrive-leonardofonseca.r@gmail.com/My Drive/04_projects/CryptoVision/Data/sjb/species"
web_species_dir = "/Users/leonardo/Library/CloudStorage/GoogleDrive-leonardofonseca.r@gmail.com/My Drive/04_projects/CryptoVision/Data/web_scrapping/species/train"

# One dataframe (path, label) per source
df_sjb = create_image_dataframe(sjb_species_dir)
df_web = create_image_dataframe(web_species_dir)

# Stack both sources into one frame with a fresh 0..n-1 index
df_raw = pd.concat((df_sjb, df_web), ignore_index=True)

print(df_raw.shape)

df_raw.head()

(9519, 2)


Unnamed: 0,path,label
0,/Users/leonardo/Library/CloudStorage/GoogleDri...,Labridae_Halichoeres_claudia
1,/Users/leonardo/Library/CloudStorage/GoogleDri...,Labridae_Halichoeres_claudia
2,/Users/leonardo/Library/CloudStorage/GoogleDri...,Labridae_Halichoeres_claudia
3,/Users/leonardo/Library/CloudStorage/GoogleDri...,Labridae_Halichoeres_claudia
4,/Users/leonardo/Library/CloudStorage/GoogleDri...,Labridae_Halichoeres_claudia


In [20]:
# Prepare the frame that feeds the train/validation/test split:
# count the images per label ...
label_count = df_raw['label'].value_counts()

# ... and keep only the rows whose label has at least 50 images
valid_labels = label_count.loc[label_count.ge(50)].index
filtered_df_raw = df_raw.loc[df_raw['label'].isin(valid_labels)]

In [21]:
# test_size is taken from the full frame; valid_size is taken from what is
# left after the test rows were removed.
train_df, valid_df, test_df = train_valid_test_split(
    filtered_df_raw,
    test_size=0.2,
    valid_size=0.2,
)

print(*(frame.shape for frame in (train_df, valid_df, test_df)))

(5789, 2) (1448, 2) (1810, 2)


In [22]:
common_dir = '/Users/leonardo/Documents/Projects/cryptovision/data/processed'

# Materialise each split as <common_dir>/<split>/<label>/<image>
for split_name, split_df in (('train', train_df), ('test', test_df), ('valid', valid_df)):
    copy_images_to_directory(split_df, os.path.join(common_dir, split_name))


In [26]:

# Re-index the processed folders, one dataframe per split
new_train_df, new_test_df, new_valid_df = (
    create_image_dataframe(os.path.join(common_dir, split_name))
    for split_name in ('train', 'test', 'valid')
)

print(*(frame.shape for frame in (new_train_df, new_valid_df, new_test_df)))


(5789, 2) (1448, 2) (1810, 2)


In [24]:
# Convert every processed split to JPEG (same order as before: valid, test, train)
for split_name in ('valid', 'test', 'train'):
    convert_images_to_jpeg(os.path.join(common_dir, split_name))

Converted /Users/leonardo/Documents/Projects/cryptovision/data/processed/valid/Labridae_Halichoeres_claudia/62132550_fish_photo_15731.png to /Users/leonardo/Documents/Projects/cryptovision/data/processed/valid/Labridae_Halichoeres_claudia/62132550_fish_photo_15731.jpeg
Converted /Users/leonardo/Documents/Projects/cryptovision/data/processed/valid/Labridae_Halichoeres_claudia/fish_photo_15737.png to /Users/leonardo/Documents/Projects/cryptovision/data/processed/valid/Labridae_Halichoeres_claudia/fish_photo_15737.jpeg
Converted /Users/leonardo/Documents/Projects/cryptovision/data/processed/valid/Labridae_Halichoeres_claudia/98055_web_Halichoeres_claudia_10.jpeg to /Users/leonardo/Documents/Projects/cryptovision/data/processed/valid/Labridae_Halichoeres_claudia/98055_web_Halichoeres_claudia_10.jpeg
Converted /Users/leonardo/Documents/Projects/cryptovision/data/processed/valid/Labridae_Halichoeres_claudia/fish_photo_6009.png to /Users/leonardo/Documents/Projects/cryptovision/data/processed

In [25]:
# Entries of the train folder that have no counterpart in the valid folder
list_train, list_valid = (
    os.listdir(os.path.join(common_dir, split_name))
    for split_name in ("train", "valid")
)

list(set(list_train) - set(list_valid))

[]

In [27]:
# Re-scan the processed folders right before writing the CSVs. The JPEG
# conversion renames files (.png/.jpg -> .jpeg), so frames built before the
# conversion cell ran would list paths that no longer exist. Scanning here
# keeps the CSVs in sync with the files on disk regardless of cell order.
new_train_df = create_image_dataframe(os.path.join(common_dir, 'train'))
new_test_df = create_image_dataframe(os.path.join(common_dir, 'test'))
new_valid_df = create_image_dataframe(os.path.join(common_dir, 'valid'))

# Written to the current working directory; the row index is kept as the first column.
new_train_df.to_csv('train.csv')
new_test_df.to_csv('test.csv')
new_valid_df.to_csv('valid.csv')