In [1]:
import os
import pandas as pd

def delete_unmatched_rows(
    csv_path,
    image_folder,
    id_column="ImageName",
    label_column="ValidatedType",
    output_path="C:/Users/Christian/Desktop/N1_data/clean_data.csv",
):
    """
    Writes a cleaned copy of a CSV that keeps only rows whose image ID matches a
    file in the image folder. Compares only the base ID (no extension).

    The cleaned file has exactly two columns, ``file_name`` (taken from
    ``id_column``) and ``label`` (taken from ``label_column``), with one row per
    distinct ID (the first occurrence wins). The input CSV is only read; it is
    modified only if ``output_path`` points at the same file.

    Args:
        csv_path: Path to the CSV file to clean
        image_folder: Path to the folder containing image files
        id_column: Column in the CSV containing image IDs (no extension)
        label_column: Column in the CSV containing the label to keep
        output_path: Path the cleaned CSV is written to (replaced if it exists)

    Raises:
        KeyError: If ``id_column`` or ``label_column`` is not a column of the CSV.
    """

    # Read the IDs as text: with type inference an ID such as "00123" would be
    # parsed as the integer 123 and could never match the file "00123.jpg".
    # Missing IDs stay missing instead of turning into the string "nan".
    df = pd.read_csv(csv_path, dtype={id_column: str})

    missing_columns = [col for col in (id_column, label_column) if col not in df.columns]
    if missing_columns:
        raise KeyError(f"Column(s) not found in {csv_path}: {missing_columns}")

    # Base names (extension stripped) of the regular files in the folder;
    # sub-directories are ignored.
    with os.scandir(image_folder) as entries:
        folder_ids = {os.path.splitext(entry.name)[0] for entry in entries if entry.is_file()}

    # Keep matching rows, one per ID, reduced to the two output columns.
    filtered_df = (
        df[df[id_column].isin(folder_ids)]
        .drop_duplicates(subset=[id_column])
        .loc[:, [id_column, label_column]]
        .rename(columns={id_column: 'file_name', label_column: 'label'})
    )

    filtered_df.to_csv(output_path, index=False)

    print(f"‚úÖ Filtered CSV: kept {len(filtered_df)} of {len(df)} rows.")
    # Interpolate the variable rather than a nested string literal: reusing the
    # enclosing quote character inside an f-string fails before Python 3.12.
    print(f"üìù Overwritten: {output_path}")

# Example run: filter the sampled dataset CSV against the image folder.
if __name__ == "__main__":
    image_folder = "C:/Users/Christian/Desktop/N1_data/image_data/"
    csv_path = "C:/Users/Christian/Desktop/N1_data/sampled_dataset.csv"

    # id_column is left at its default ("ImageName").
    delete_unmatched_rows(csv_path=csv_path, image_folder=image_folder)

‚úÖ Filtered CSV: kept 7930 of 9414 rows.
üìù Overwritten: C:/Users/Christian/Desktop/N1_data/clean_data.csv
