In [2]:
import os
import pandas as pd
from tqdm import tqdm  # For progress bars during processing

In [6]:
def load_data(styles_path: str, images_path: str):
    """
    Read the styles and images CSV files into pandas DataFrames.

    The styles file is parsed with ``on_bad_lines='warn'``, so rows with an
    unexpected number of fields are reported and skipped rather than
    aborting the load. The images file is parsed with pandas defaults.

    Args:
        styles_path (str): Location of styles.csv.
        images_path (str): Location of images.csv.

    Returns:
        tuple: ``(styles_df, images_df)`` in that order.
    """
    # Styles is read first, then images; both are returned as a 2-tuple.
    return (
        pd.read_csv(styles_path, on_bad_lines='warn'),
        pd.read_csv(images_path),
    )


def filter_valid_entries(styles_df: pd.DataFrame, image_dir: str) -> pd.DataFrame:
    """
    Filter rows with all required labels and existing image files.

    Args:
        styles_df (pd.DataFrame): Raw styles dataframe.
        image_dir (str): Path to folder containing image files.

    Returns:
        pd.DataFrame: Filtered dataframe.
    """
    # Keep only rows with all required fields
    required_cols = ['id', 'gender', 'baseColour', 'season', 'articleType']
    styles_df = styles_df.dropna(subset=required_cols)

    # Convert IDs to string to match image filenames
    styles_df["id"] = styles_df["id"].astype(str)

    # Get set of available image filenames (without .jpg)
    existing_image_ids = set([
        img_name.split(".")[0]
        for img_name in os.listdir(image_dir)
        if img_name.endswith('.jpg')
    ])

    # Keep only rows where image exists
    styles_df = styles_df[styles_df["id"].isin(existing_image_ids)]

    return styles_df


def standardize_fields(styles_df: pd.DataFrame) -> pd.DataFrame:
    """
    Standardize capitalization and remove unwanted categories.

    ``gender``, ``baseColour`` and ``season`` are converted to title case.
    Rows whose season is ``"Others"`` or whose base colour is ``"Multi"`` or
    ``"Combo"`` (after title-casing) are then dropped. The input dataframe
    is left unchanged; all work happens on a copy.

    Args:
        styles_df (pd.DataFrame): Filtered dataframe.

    Returns:
        pd.DataFrame: Cleaned and standardized dataframe.

    Raises:
        KeyError: If one of the three categorical columns is missing.
    """
    # Work on a copy so the caller's frame is not rewritten as a side effect
    # (and so pandas does not warn when the input is itself a slice).
    cleaned = styles_df.copy()

    # Normalize capitalization across categorical fields
    for column in ("gender", "baseColour", "season"):
        cleaned[column] = cleaned[column].str.title()

    # Remove invalid/unwanted values
    invalid_seasons = ["Others"]
    invalid_colours = ["Multi", "Combo"]

    keep = (
        ~cleaned["season"].isin(invalid_seasons)
        & ~cleaned["baseColour"].isin(invalid_colours)
    )
    return cleaned[keep]


def save_cleaned_data(styles_df: pd.DataFrame, save_path: str):
    """
    Save cleaned DataFrame to a CSV file.

    The parent directory of ``save_path`` is created if it does not exist.
    The dataframe index is not written. A confirmation line is printed
    after the file has been written.

    Args:
        styles_df (pd.DataFrame): Cleaned dataframe.
        save_path (str): Output path for cleaned CSV.
    """
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    parent_dir = os.path.dirname(save_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    styles_df.to_csv(save_path, index=False)
    print(f"✅ Cleaned data saved to: {save_path}")


def preprocess(styles_csv, images_csv, image_folder, save_path):
    """
    Run the preprocessing pipeline end to end: load, filter, standardize, save.

    Args:
        styles_csv (str): Path to styles.csv
        images_csv (str): Path to images.csv (read by load_data, but the
            resulting dataframe is discarded here)
        image_folder (str): Directory where image files are stored
        save_path (str): Output path to save cleaned styles
    """
    # Both files are loaded; only the styles table feeds the later steps.
    raw_styles, _unused_images = load_data(styles_csv, images_csv)

    with_images = filter_valid_entries(raw_styles, image_folder)
    standardized = standardize_fields(with_images)

    save_cleaned_data(standardized, save_path)
# Pipeline inputs and output, given as absolute Windows paths (raw strings
# so the backslashes are kept literally).
# NOTE(review): these locations are hard-coded for one machine; change them
# before running elsewhere.
styles_csv = r"D:\CODING\Machine Learning\PROJECTS\fashion-product-classifier\data\raw\fashion-product-images-dataset\styles.csv"
images_csv = r"D:\CODING\Machine Learning\PROJECTS\fashion-product-classifier\data\raw\fashion-product-images-dataset\images.csv"
image_folder = r"D:\CODING\Machine Learning\PROJECTS\fashion-product-classifier\data\raw\fashion-product-images-dataset\images"
save_path = r"D:\CODING\Machine Learning\PROJECTS\fashion-product-classifier\data\processed\cleaned-styles.csv"

# Run the whole pipeline as soon as this cell/module executes.
preprocess(styles_csv, images_csv, image_folder, save_path)

Skipping line 6569: expected 10 fields, saw 11
Skipping line 7399: expected 10 fields, saw 11
Skipping line 7939: expected 10 fields, saw 11
Skipping line 9026: expected 10 fields, saw 11
Skipping line 10264: expected 10 fields, saw 11
Skipping line 10427: expected 10 fields, saw 11
Skipping line 10905: expected 10 fields, saw 11
Skipping line 11373: expected 10 fields, saw 11
Skipping line 11945: expected 10 fields, saw 11
Skipping line 14112: expected 10 fields, saw 11
Skipping line 14532: expected 10 fields, saw 11
Skipping line 15076: expected 10 fields, saw 12
Skipping line 29906: expected 10 fields, saw 11
Skipping line 31625: expected 10 fields, saw 11
Skipping line 33020: expected 10 fields, saw 11
Skipping line 35748: expected 10 fields, saw 11
Skipping line 35962: expected 10 fields, saw 11
Skipping line 37770: expected 10 fields, saw 11
Skipping line 38105: expected 10 fields, saw 11
Skipping line 38275: expected 10 fields, saw 11
Skipping line 38404: expected 10 fields, saw

✅ Cleaned data saved to: D:\CODING\Machine Learning\PROJECTS\fashion-product-classifier\data\processed\cleaned-styles.csv
