In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from pathlib import Path

from sklearn import preprocessing
from sklearn.model_selection import train_test_split 
from tensorflow.keras.preprocessing.image import img_to_array, load_img # type: ignore

In [None]:
# Notebook configuration: everything a reader may want to tune lives here.
# Size every image is resized to when loaded; None keeps each image's original size.
# NOTE(review): Keras documents load_img's target_size as (height, width) - confirm
# that (300, 225) is in the intended order.
IMAGE_SIZE = (300, 225)
# Dataset root, relative to the directory the notebook is run from
DATA_PATH = Path("../data/fungi-clef-2025")
# Folder holding the metadata CSVs of the few-shot subset
MD_PATH = DATA_PATH.joinpath("metadata/FungiTastic-FewShot")
# Folder holding the image files of the few-shot subset
IMAGE_PATH = DATA_PATH.joinpath("images/FungiTastic-FewShot")
# Name of the metadata column that contains the target class
LABEL = "class"

### Data Processing

In [None]:
# Read the metadata table of each official split.
# The official validation CSV is used as our held-out test set because it ships with labels.
train_csv = MD_PATH / "FungiTastic-FewShot-Train.csv"
val_csv = MD_PATH / "FungiTastic-FewShot-Val.csv"
md_train_val, md_test = (pd.read_csv(csv_file) for csv_file in (train_csv, val_csv))

In [None]:
# Rows without a label cannot be used for supervised training, so keep only labelled rows
has_label = md_train_val[LABEL].notna()
md_train_val = md_train_val[has_label]
# Stratified splitting needs at least two samples per class, so discard singleton classes
class_sizes = md_train_val.groupby(LABEL)[LABEL].transform("size")
md_train_val = md_train_val[class_sizes > 1]

In [None]:
# Split the labelled training data into training (80%) and validation (20%) subsets.
# Stratifying on the label keeps the class proportions the same in both subsets.
# A fixed random_state makes the shuffle deterministic, so a Restart & Run All
# reproduces the same train/val membership (and therefore the same saved arrays).
md_train, md_val = train_test_split(
    md_train_val,
    test_size=0.20,
    stratify=md_train_val[LABEL],
    random_state=42,
)

In [None]:
# Tag every row with the name of the split it belongs to, so the splits can be
# told apart again after the tables are combined
for split_frame, split_name in ((md_train, "train"), (md_val, "val"), (md_test, "test")):
    split_frame["split"] = split_name

In [None]:
# Combine the three splits into a single metadata table (row labels are kept as-is)
md_df = pd.concat((md_train, md_val, md_test), axis=0)

# Build the on-disk location of every image.
# Test rows are read from the "val" image folder; train and val rows both come from "train".
# Available resolution folders: 300p, 500p, 720p, fullsize
md_df["image_path"] = [
    IMAGE_PATH / f"{'val' if split == 'test' else 'train'}/300p/{filename}"
    for split, filename in zip(md_df["split"], md_df["filename"])
]

In [None]:
# Encode each class as an integer ID.
# The label column is read through LABEL (rather than a hard-coded "class") so the
# encoding always uses the same column as the filtering and stratification steps.
le = preprocessing.LabelEncoder()
le.fit(md_df[LABEL])
# Keep the original class value next to its encoded ID for easy lookup
md_df["class_label"] = md_df[LABEL]
md_df["class_idx"] = le.transform(md_df[LABEL])

### Image Loading

In [None]:
images = []
labels = []

# Walk the image paths and their encoded classes in lockstep, in table order
for image_path, class_idx in zip(md_df["image_path"], md_df["class_idx"]):
    # Read the image from disk, resized to IMAGE_SIZE, and keep it as a pixel array
    images.append(img_to_array(load_img(image_path, target_size=IMAGE_SIZE)))

    # Record the encoded class that belongs to this image
    labels.append(class_idx)

In [None]:
# Join the per-image arrays along a new leading axis to get one numpy array
images = np.stack(images, axis=0)

In [None]:
# Rescale all of the images so their pixel values lie in [0, 1]
images = np.divide(images, 255.0)

In [None]:
# Turn the list of labels into a numpy array so it can be mask-indexed like the images
labels = np.asarray(labels)

In [None]:
# Boolean masks marking which rows of the combined table belong to each split
train_idx = md_df["split"].eq("train")
val_idx = md_df["split"].eq("val")
test_idx = md_df["split"].eq("test")

# Slice the stacked images and their labels back into the three splits
train_images, train_labels = images[train_idx], labels[train_idx]
val_images, val_labels = images[val_idx], labels[val_idx]
test_images, test_labels = images[test_idx], labels[test_idx]

In [None]:
# Report the array shape of each split as a quick sanity check
shape_report = [
    f"Shape train images: {train_images.shape}",
    f"Shape val images: {val_images.shape}",
    f"Shape test images: {test_images.shape}",
]
print("\n".join(shape_report))

In [None]:
# Name the output directory after the configured image size so the folder always
# describes the arrays stored inside it (IMAGE_SIZE = (300, 225) -> "./300x225").
# IMAGE_SIZE may be None (images kept at their original size); that case gets its own name.
size_tag = "x".join(str(dim) for dim in IMAGE_SIZE) if IMAGE_SIZE is not None else "original"
SAVE_AS = Path(f"./{size_tag}")
SAVE_AS.mkdir(parents=True, exist_ok=True)

# Save the image data
np.save(SAVE_AS / "train_images.npy", train_images)
np.save(SAVE_AS / "val_images.npy", val_images)
np.save(SAVE_AS / "test_images.npy", test_images)

# Save the label data
np.save(SAVE_AS / "train_labels.npy", train_labels)
np.save(SAVE_AS / "val_labels.npy", val_labels)
np.save(SAVE_AS / "test_labels.npy", test_labels)

# Save the metadata for all splits in one CSV (the "split" column tells them apart)
md_df.to_csv(SAVE_AS / "metadata.csv")
