In [None]:
import os
import re
import pandas as pd

def create_fruit_dataframe(folder: str = "data/fruit") -> pd.DataFrame:
    """
    Build a DataFrame with one row per fruit image stem found in ``folder``.

    Files whose name ends in .png, .jpg or .jpeg (any letter case) are
    scanned. A file whose stem ends in ``_color`` is treated as the color
    image of the stem without that suffix; every other image is treated as
    the grayscale image of its own stem. Files sharing a stem are merged
    into a single row.

    Parameters
    ----------
    folder : str
        Directory to scan (not recursive).

    Returns
    -------
    pandas.DataFrame
        Columns ``object``, ``grayscale_image`` and ``color_image``.
        ``object`` is the stem with any trailing digits removed (rows are
        still grouped by the full stem, so several rows may share the same
        ``object``). The two image columns hold file paths, or a missing
        value when that variant was not found. The columns are present
        even when no image is found.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when ``folder`` cannot be listed.
    """
    image_exts = (".png", ".jpg", ".jpeg")
    color_suffix = "_color"
    columns = ["object", "grayscale_image", "color_image"]

    records = {}

    # os.listdir returns entries in arbitrary order; sort for reproducible rows.
    for fname in sorted(os.listdir(folder)):
        # Lower-case the name so ".PNG" / ".JPG" files are not silently skipped.
        if not fname.lower().endswith(image_exts):
            continue

        path = os.path.join(folder, fname)
        base = os.path.splitext(fname)[0]

        # Strip only the trailing "_color"; str.replace would also delete
        # occurrences in the middle of the stem and break the pairing.
        if base.endswith(color_suffix):
            obj_name = base[: -len(color_suffix)]
            target_column = "color_image"
        else:
            obj_name = base
            target_column = "grayscale_image"

        # Create the row on first sight of this stem, then fill in the path.
        record = records.setdefault(
            obj_name,
            {
                "object": obj_name,
                "grayscale_image": None,
                "color_image": None,
            },
        )
        record[target_column] = path

    # Passing the columns explicitly keeps the schema stable for an empty folder.
    frame = pd.DataFrame(list(records.values()), columns=columns)

    # Drop the trailing index digits so "apple1", "apple2" share the label "apple".
    frame["object"] = frame["object"].str.replace(r"\d+$", "", regex=True)

    return frame

In [26]:
# Index the fruit images on disk (one row per image stem).
df = create_fruit_dataframe("data/fruit")

# Persist the index next to the images; create_fruit_dataframe only picks up
# .png/.jpg/.jpeg files, so the parquet file is ignored on a re-run.
df.to_parquet("data/fruit/fruit_images.parquet")

# Must be the last expression of the cell to be rendered by the notebook.
df