In [1]:
import os
from pathlib import Path
import pandas as pd
import numpy as np

In [2]:
def create_image_dataframe(image_path, split_flag=True, column_name=["label"]):
    label = []
    path = []

    for dirname, _, filenames in os.walk(image_path):
        for filename in filenames:
            if filename.startswith('.'):
                continue  # Ignore files starting with a dot
            if os.path.splitext(filename)[1] in (".jpeg", ".png", ".jpg"):
                if dirname.split()[-1] != "GT":
                    label.append(os.path.split(dirname)[1])
                    path.append(os.path.join(dirname, filename))

    df_og = pd.DataFrame(columns=["path", 'label'])
    df_og["path"] = path
    df_og['label'] = label
    df_og['label'] = df_og['label'].astype("category")

    if split_flag:
        try:
            # Split the 'label' column into 'family', 'genus', and 'species' columns
            df_og[["family", "genus", "species"]] = df_og['label'].str.split("_", expand=True)
            df_og['species'] = df_og['genus'] + " " + df_og['species']
        except ValueError:
            print("Error splitting labels. Check your folder format -> {family}_{species}_{genus}.")

        return df_og[['path'] + column_name]

    else:
        df_og.rename(columns={'label': column_name[0]}, inplace=True)
        return df_og

In [3]:
df_web = create_image_dataframe(
    "/Users/leonardo/Library/CloudStorage/GoogleDrive-leonardofonseca.r@gmail.com/My Drive/04_projects/CryptoVision/Data/web_scrapping/species/train",
    True,
    ["family", "genus", "species"],
)

df_web.head()

Unnamed: 0,path,family,genus,species
0,/Users/leonardo/Library/CloudStorage/GoogleDri...,Labridae,Halichoeres,Halichoeres claudia
1,/Users/leonardo/Library/CloudStorage/GoogleDri...,Labridae,Halichoeres,Halichoeres claudia
2,/Users/leonardo/Library/CloudStorage/GoogleDri...,Labridae,Halichoeres,Halichoeres claudia
3,/Users/leonardo/Library/CloudStorage/GoogleDri...,Labridae,Halichoeres,Halichoeres claudia
4,/Users/leonardo/Library/CloudStorage/GoogleDri...,Labridae,Halichoeres,Halichoeres claudia


In [4]:
df_sjb = create_image_dataframe(
    "/Users/leonardo/Library/CloudStorage/GoogleDrive-leonardofonseca.r@gmail.com/My Drive/04_projects/CryptoVision/Data/sjb/species",
    True,
    ["family", "genus", "species"],
)

df_sjb.head()

Unnamed: 0,path,family,genus,species
0,/Users/leonardo/Library/CloudStorage/GoogleDri...,Labridae,Halichoeres,Halichoeres claudia
1,/Users/leonardo/Library/CloudStorage/GoogleDri...,Labridae,Halichoeres,Halichoeres claudia
2,/Users/leonardo/Library/CloudStorage/GoogleDri...,Labridae,Halichoeres,Halichoeres claudia
3,/Users/leonardo/Library/CloudStorage/GoogleDri...,Labridae,Halichoeres,Halichoeres claudia
4,/Users/leonardo/Library/CloudStorage/GoogleDri...,Labridae,Halichoeres,Halichoeres claudia


In [5]:
df = pd.concat([df_sjb, df_web]).reset_index(drop=True)

df.head()

Unnamed: 0,path,family,genus,species
0,/Users/leonardo/Library/CloudStorage/GoogleDri...,Labridae,Halichoeres,Halichoeres claudia
1,/Users/leonardo/Library/CloudStorage/GoogleDri...,Labridae,Halichoeres,Halichoeres claudia
2,/Users/leonardo/Library/CloudStorage/GoogleDri...,Labridae,Halichoeres,Halichoeres claudia
3,/Users/leonardo/Library/CloudStorage/GoogleDri...,Labridae,Halichoeres,Halichoeres claudia
4,/Users/leonardo/Library/CloudStorage/GoogleDri...,Labridae,Halichoeres,Halichoeres claudia


In [7]:
from sklearn.model_selection import train_test_split

# Split the filtered DataFrame into training and testing sets
X_train, X_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df['species'],
    random_state=42,
)

In [8]:
X_train.to_csv('/Users/leonardo/Documents/Projects/cryptovision/data/processed/train.csv', index=False)
X_test.to_csv('/Users/leonardo/Documents/Projects/cryptovision/data/processed/test.csv', index=False)

In [10]:
df_test = pd.read_csv('/Users/leonardo/Documents/Projects/cryptovision/data/processed/test.csv')

df_test.head()

Unnamed: 0,path,family,genus,species
0,/Users/leonardo/Library/CloudStorage/GoogleDri...,Tetraodontidae,Canthigaster,Canthigaster solandri
1,/Users/leonardo/Library/CloudStorage/GoogleDri...,Gobiidae,Gobiodon,Gobiodon histrio
2,/Users/leonardo/Library/CloudStorage/GoogleDri...,Tripterygiidae,Ucla,Ucla xenogrammus
3,/Users/leonardo/Library/CloudStorage/GoogleDri...,Tripterygiidae,Ucla,Ucla xenogrammus
4,/Users/leonardo/Library/CloudStorage/GoogleDri...,Chaenopsidae,Acanthemblemaria,Acanthemblemaria aspera
