In [27]:
import os
import hashlib
from PIL import Image
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tqdm import tqdm

PROJECT_ROOT = os.path.abspath("..")
DATASET_PATH = os.path.join(PROJECT_ROOT, "data", "raw", "emotion-fer")
PROCESSED_PATH = os.path.join(PROJECT_ROOT, "data", "processed")
os.makedirs(PROCESSED_PATH, exist_ok=True)

In [28]:
splits = [d for d in os.listdir(DATASET_PATH) if os.path.isdir(os.path.join(DATASET_PATH, d))]
data_records = []

for split in splits:
    split_path = os.path.join(DATASET_PATH, split)
    for label in os.listdir(split_path):
        label_path = os.path.join(split_path, label)
        if not os.path.isdir(label_path):
            continue
        for file_name in os.listdir(label_path):
            file_path = os.path.join(label_path, file_name)
            data_records.append({"split": split, "label": label, "filepath": file_path})

df = pd.DataFrame(data_records)

In [29]:
def file_hash(path):
    try:
        with open(path, "rb") as f:
            return hashlib.md5(f.read()).hexdigest()
    except Exception:
        return None

hashes = {}
duplicates = []

In [30]:

for path in tqdm(df["filepath"], desc="Checking duplicates"):
    h = file_hash(path)
    if h:
        if h in hashes:
            duplicates.append(path)
        else:
            hashes[h] = path

corrupted = []
for path in tqdm(df["filepath"], desc="Checking corrupted images"):
    try:
        img = Image.open(path)
        img.verify()
    except Exception:
        corrupted.append(path)

df_clean = df[~df["filepath"].isin(duplicates + corrupted)].reset_index(drop=True)

target_size = (48, 48)
processed_records = []

Checking duplicates: 100%|██████████| 35887/35887 [00:04<00:00, 8737.49it/s]
Checking corrupted images: 100%|██████████| 35887/35887 [00:07<00:00, 4800.63it/s]


In [31]:
for row in tqdm(df_clean.itertuples(), desc="Processing images"):
    try:
        img = Image.open(row.filepath).convert("L")
        img = img.resize(target_size)
        label_path = os.path.join(PROCESSED_PATH, row.label)
        os.makedirs(label_path, exist_ok=True)
        processed_file_path = os.path.join(label_path, os.path.basename(row.filepath))
        img.save(processed_file_path)
        processed_records.append({"label": row.label, "filepath": processed_file_path})
    except Exception:
        continue

df_final = pd.DataFrame(processed_records)
le = LabelEncoder()
df_final["label_encoded"] = le.fit_transform(df_final["label"])

Processing images: 34034it [00:06, 4942.68it/s] 


In [32]:
X_train, X_temp, y_train, y_temp = train_test_split(
    df_final["filepath"], df_final["label_encoded"], test_size=0.3, stratify=df_final["label_encoded"], random_state=42
)

In [33]:
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

In [34]:
train_df = pd.DataFrame({"filepath": X_train, "label": y_train})
val_df = pd.DataFrame({"filepath": X_val, "label": y_val})
test_df = pd.DataFrame({"filepath": X_test, "label": y_test})


In [35]:
train_df.to_csv(os.path.join(PROCESSED_PATH, "train.csv"), index=False)
val_df.to_csv(os.path.join(PROCESSED_PATH, "val.csv"), index=False)
test_df.to_csv(os.path.join(PROCESSED_PATH, "test.csv"), index=False)

In [36]:
df_final.to_csv(os.path.join(PROCESSED_PATH, "full_processed_dataset.csv"), index=False)

summary = {
    "total_original_images": len(df),
    "corrupted_images_removed": len(corrupted),
    "duplicate_images_removed": len(duplicates),
    "total_processed_images": len(df_final),
    "num_classes": df_final["label"].nunique(),
    "train_count": len(train_df),
    "val_count": len(val_df),
    "test_count": len(test_df)
}

print(summary)

{'total_original_images': 35887, 'corrupted_images_removed': 0, 'duplicate_images_removed': 1853, 'total_processed_images': 3535, 'num_classes': 2, 'train_count': 2474, 'val_count': 530, 'test_count': 531}


In [37]:
missing_files = []
for path in df_final["filepath"]:
    if not os.path.exists(path):
        missing_files.append(path)

empty_images = []
for path in df_final["filepath"]:
    try:
        img = Image.open(path)
        if img.size[0] == 0 or img.size[1] == 0:
            empty_images.append(path)
    except:
        empty_images.append(path)

processed_class_counts = df_final["label"].value_counts()

hashes_processed = {}
duplicates_processed = []
for path in tqdm(df_final["filepath"], desc="Checking duplicates in processed dataset"):
    try:
        with open(path, "rb") as f:
            h = hashlib.md5(f.read()).hexdigest()
        if h in hashes_processed:
            duplicates_processed.append(path)
        else:
            hashes_processed[h] = path
    except:
        continue

checks_summary = {
    "missing_files": len(missing_files),
    "empty_images": len(empty_images),
    "duplicates_in_processed": len(duplicates_processed),
    "class_counts": processed_class_counts.to_dict()
}

print("========== POST-PROCESSING DATA CHECKS ==========")
for key, value in checks_summary.items():
    print(f"{key}: {value}")
print("=================================================")

Checking duplicates in processed dataset: 100%|██████████| 3535/3535 [00:00<00:00, 9490.41it/s] 

missing_files: 0
empty_images: 0
duplicates_in_processed: 0
class_counts: {'surprised': 2460, 'sad': 1075}



