# Data Preprocessing Pipeline
**Author:** G8  
**Task:** 1.1 - Single Object Image Processing  
**Timeline:** Until Feb 1, 2026

**Purpose:**
- Preprocess raw images to 224x224 RGB
- Split into train/val/test (70/15/15)
- Create class mapping for model training

## Setup and Imports

In [30]:
import os
import cv2
import numpy as np
import pandas as pd
from pathlib import Path
import shutil
from tqdm import tqdm
import matplotlib.pyplot as plt
import json

# Seed NumPy's global generator once, up front, for reproducibility
np.random.seed(42)

# Emit the status banner with a single call: one argument per output line
print(
    "Libraries imported successfully!",
    f"OpenCV version: {cv2.__version__}",
    f"NumPy version: {np.__version__}",
    sep="\n",
)

Libraries imported successfully!
OpenCV version: 4.8.1
NumPy version: 1.24.3


## Configuration

In [31]:
# Project paths - adjust if needed.
# The parent directory is used as the project root only when the *current*
# directory's own name contains 'notebooks'. Testing the whole path string
# would also match any ancestor directory (e.g. ".../notebooks_archive/proj")
# and silently pick the wrong root.
_cwd = Path.cwd()
PROJECT_ROOT = _cwd.parent if 'notebooks' in _cwd.name else _cwd
# NOTE: "Milstone" matches the folder name this path points at; do not "fix" the spelling here.
RAW_DATA_PATH = PROJECT_ROOT.parent / "Discriminative Project Milstone_1"

# Output paths
RAW_ORGANIZED = PROJECT_ROOT / "data" / "raw"
PROCESSED_PATH = PROJECT_ROOT / "data" / "processed" / "single_objects"
STATS_PATH = PROJECT_ROOT / "data" / "statistics"

# Preprocessing parameters
TARGET_SIZE = (224, 224)
TRAIN_RATIO = 0.70
VAL_RATIO = 0.15
TEST_RATIO = 0.15

# Fail fast on an inconsistent split definition (tolerance absorbs float rounding)
if abs(TRAIN_RATIO + VAL_RATIO + TEST_RATIO - 1.0) > 1e-9:
    raise ValueError(
        f"Split ratios must sum to 1.0, got {TRAIN_RATIO + VAL_RATIO + TEST_RATIO}"
    )

print("Configuration:")
print(f"  Project root: {PROJECT_ROOT}")
print(f"  Raw data: {RAW_DATA_PATH}")
print(f"  Target size: {TARGET_SIZE}")
print(f"  Split: {TRAIN_RATIO}/{VAL_RATIO}/{TEST_RATIO}")

Configuration:
  Project root: /Users/kevin/Documents/GitHub/Python/VESKL/11.DAE/NEU/NEU_IE7615/Prj/Discriminative/G8/CNN_Attendance_System
  Raw data: /Users/kevin/Documents/GitHub/Python/VESKL/11.DAE/NEU/NEU_IE7615/Prj/Discriminative/G8/Discriminative Project Milstone_1
  Target size: (224, 224)
  Split: 0.7/0.15/0.15


## Step 1: Organize Raw Data
Copy from `Discriminative Project Milstone_1/*_OBJ*` to `data/raw/OBJ*/`

In [32]:
def extract_object_id(folder_name):
    """
    Extract a standardized object ID from a raw folder name.

    The name is upper-cased and split on 'OBJ'. Within the first 10
    characters that follow the first 'OBJ', the first contiguous run of
    digits is taken as the object number and zero-padded to at least three
    digits. Non-digit characters before the run (e.g. an underscore) are
    skipped.

    Examples: 'images_OBJ001' -> 'OBJ001', 'IMAGES_OBJ786' -> 'OBJ786',
              'images_OBJ001_v2' -> 'OBJ001'

    Args:
        folder_name: Folder name (str) expected to contain 'OBJ' followed by
            a number.

    Returns:
        'OBJ' plus the zero-padded number, or None when the name has no 'OBJ'
        marker or no digits in the inspected window.
    """
    parts = folder_name.upper().split('OBJ')
    if len(parts) < 2:
        return None

    digits = []
    for ch in parts[1][:10]:
        if ch.isdigit():
            digits.append(ch)
        elif digits:
            # End of the first digit run: stop so later numbers (such as a
            # "_v2" suffix) are not glued onto the object number.
            break

    if not digits:
        return None
    return f"OBJ{''.join(digits).zfill(3)}"

print("="*80)
print("ORGANIZING RAW DATA")
print("="*80)

# Stop early with an explicit message when the raw dataset is not where the
# configuration points, instead of failing inside the directory listing.
if not RAW_DATA_PATH.is_dir():
    raise FileNotFoundError(f"Raw data folder not found: {RAW_DATA_PATH}")

# Find all object folders
obj_folders = sorted(f for f in RAW_DATA_PATH.iterdir() if f.is_dir() and '_OBJ' in f.name.upper())

print(f"\nFound {len(obj_folders)} object folders")

# Organize into standardized structure
RAW_ORGANIZED.mkdir(parents=True, exist_ok=True)

# Lower-case extensions only: each suffix is lower-cased before the lookup,
# so mixed-case names such as "photo.Jpg" are copied as well.
exts = {'.jpg', '.jpeg', '.png', '.bmp'}
organized_ids = set()    # distinct object IDs actually written
skipped_folders = []     # folders whose name yields no object ID

for folder in tqdm(obj_folders, desc="Organizing folders"):
    obj_id = extract_object_id(folder.name)
    if not obj_id:
        skipped_folders.append(folder.name)
        continue

    target_folder = RAW_ORGANIZED / obj_id
    target_folder.mkdir(exist_ok=True)

    # Copy image files (regular files only; copy2 cannot copy a directory)
    for img_file in folder.iterdir():
        if img_file.is_file() and img_file.suffix.lower() in exts:
            shutil.copy2(img_file, target_folder / img_file.name)

    organized_ids.add(obj_id)

# Count distinct IDs so two source folders that map to the same object are
# not reported as two objects.
organized_count = len(organized_ids)

print(f"\nOrganized {organized_count} objects into: {RAW_ORGANIZED}")
if skipped_folders:
    print(f"Skipped {len(skipped_folders)} folders with no parsable object ID: {skipped_folders}")

ORGANIZING RAW DATA

Found 39 object folders


Organizing folders: 100%|██████████| 39/39 [00:02<00:00, 17.58it/s]


Organized 39 objects into: /Users/kevin/Documents/GitHub/Python/VESKL/11.DAE/NEU/NEU_IE7615/Prj/Discriminative/G8/CNN_Attendance_System/data/raw





## Step 2: Preprocess Images
Resize to 224x224 if needed and prepare for training

In [33]:
def preprocess_image(img_path, target_size=(224, 224)):
    """
    Preprocess a single image.

    1. Read with OpenCV (BGR) and convert to RGB
    2. Resize to target_size only if needed
    3. Keep as uint8 for storage (normalize during training)

    Args:
        img_path: Location of the image file; converted with str() before
            being handed to cv2.imread.
        target_size: Desired output size as (width, height), which is the
            order cv2.resize expects for its size argument.

    Returns:
        The RGB image array, or None when cv2.imread could not read the file.
    """
    img = cv2.imread(str(img_path))
    if img is None:
        return None

    # BGR to RGB
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # shape[:2] is (height, width) while target_size is (width, height), so
    # compare like with like; a direct tuple comparison is only correct for
    # square targets.
    target_w, target_h = target_size
    height, width = img_rgb.shape[:2]
    if (width, height) != (target_w, target_h):
        img_rgb = cv2.resize(img_rgb, (target_w, target_h), interpolation=cv2.INTER_AREA)

    return img_rgb


print("="*80)
print("PREPROCESSING IMAGES")
print("="*80)

object_folders = sorted([f for f in RAW_ORGANIZED.iterdir() if f.is_dir()])
print(f"\nProcessing {len(object_folders)} objects...")

proc_stats = []
temp_output = PROCESSED_PATH / "all_preprocessed"

# Same formats the organizing step copies into data/raw, .bmp included, so no
# copied image is silently left out. Suffixes are lower-cased before lookup.
exts = {'.jpg', '.jpeg', '.png', '.bmp'}

for obj_folder in tqdm(object_folders, desc="Processing"):
    obj_id = obj_folder.name

    # Get images (sorted so the processing order does not depend on the file system)
    imgs = sorted(f for f in obj_folder.iterdir() if f.is_file() and f.suffix.lower() in exts)

    success, fail = 0, 0
    output_folder = temp_output / obj_id
    output_folder.mkdir(parents=True, exist_ok=True)

    for img_file in imgs:
        out_path = output_folder / f"{img_file.stem}.jpg"
        try:
            img_proc = preprocess_image(img_file, TARGET_SIZE)
            if img_proc is None:
                error = "could not read image"
            # Save as BGR. imwrite signals failure through its return value,
            # so it must be checked before counting the file as processed.
            elif not cv2.imwrite(str(out_path), cv2.cvtColor(img_proc, cv2.COLOR_RGB2BGR)):
                error = "could not write output"
            else:
                error = None
        except Exception as exc:
            # Best-effort batch: one bad file must not stop the run, but the
            # reason is kept so it can be reported below.
            error = str(exc)

        if error is None:
            success += 1
        else:
            fail += 1
            # tqdm.write prints without breaking the progress bar
            tqdm.write(f"  Failed: {img_file} ({error})")

    proc_stats.append({'object_id': obj_id, 'successful': success, 'failed': fail})

# Explicit columns keep the summary below working even when no object folder was found
df_proc = pd.DataFrame(proc_stats, columns=['object_id', 'successful', 'failed'])
print(f"\nTotal processed: {df_proc['successful'].sum()}")
print(f"Failed: {df_proc['failed'].sum()}")
print(f"Avg per object: {df_proc['successful'].mean():.1f}")

PREPROCESSING IMAGES

Processing 39 objects...


Processing: 100%|██████████| 39/39 [00:02<00:00, 15.04it/s]


Total processed: 4108
Failed: 0
Avg per object: 105.3





## Step 3: Train/Val/Test Split

In [34]:
print("="*80)
print("SPLITTING DATASET")
print("="*80)

preprocessed_root = PROCESSED_PATH / "all_preprocessed"
objects = sorted([f for f in preprocessed_root.iterdir() if f.is_dir()])

# Start from empty split folders. Files are only ever copied in, so leftovers
# from an earlier run (different data or different shuffle) would otherwise
# stay behind and the same image could end up in more than one split.
for split_name in ('train', 'val', 'test'):
    split_root = PROCESSED_PATH / split_name
    if split_root.exists():
        shutil.rmtree(split_root)
    split_root.mkdir(parents=True, exist_ok=True)

# Dedicated generator seeded right here, so re-running only this cell yields
# the same assignment. RandomState(42) produces the same stream as the global
# generator does immediately after np.random.seed(42).
split_rng = np.random.RandomState(42)

split_stats = []

for obj_folder in tqdm(objects, desc="Splitting"):
    obj_id = obj_folder.name
    # Sort first so the shuffle starts from a file-system-independent order
    images = sorted(list(obj_folder.glob("*.jpg")))

    if len(images) == 0:
        continue

    # Shuffle
    split_rng.shuffle(images)

    # Split indices. Both cut points are rounded down and the test split
    # takes whatever remains, so rounding leftovers land in 'test'.
    n = len(images)
    train_end = int(n * TRAIN_RATIO)
    val_end = train_end + int(n * VAL_RATIO)

    splits = {
        'train': images[:train_end],
        'val': images[train_end:val_end],
        'test': images[val_end:]
    }

    # Copy to split folders
    for split_name, files in splits.items():
        output_folder = PROCESSED_PATH / split_name / obj_id
        output_folder.mkdir(parents=True, exist_ok=True)
        for img_file in files:
            shutil.copy2(img_file, output_folder / img_file.name)

    split_stats.append({
        'object_id': obj_id,
        'total': n,
        'train': len(splits['train']),
        'val': len(splits['val']),
        'test': len(splits['test'])
    })

# Explicit columns keep the summary and the CSV well-formed even with no data
df_split = pd.DataFrame(split_stats, columns=['object_id', 'total', 'train', 'val', 'test'])
print(f"\nSPLIT SUMMARY:")
print(f"  Train: {df_split['train'].sum()} ({df_split['train'].sum()/df_split['total'].sum()*100:.1f}%)")
print(f"  Val:   {df_split['val'].sum()} ({df_split['val'].sum()/df_split['total'].sum()*100:.1f}%)")
print(f"  Test:  {df_split['test'].sum()} ({df_split['test'].sum()/df_split['total'].sum()*100:.1f}%)")

# Save stats (the statistics folder is created on demand; to_csv does not create it)
STATS_PATH.mkdir(parents=True, exist_ok=True)
df_split.to_csv(STATS_PATH / 'split_distribution.csv', index=False)
print(f"\nSaved: {STATS_PATH / 'split_distribution.csv'}")

SPLITTING DATASET


Splitting: 100%|██████████| 39/39 [00:01<00:00, 24.89it/s]


SPLIT SUMMARY:
  Train: 2871 (69.9%)
  Val:   611 (14.9%)
  Test:  626 (15.2%)

Saved: /Users/kevin/Documents/GitHub/Python/VESKL/11.DAE/NEU/NEU_IE7615/Prj/Discriminative/G8/CNN_Attendance_System/data/statistics/split_distribution.csv





## Step 4: Create Class Mapping

In [35]:
print("="*80)
print("CREATING CLASS MAPPING")
print("="*80)

# Class names are the sub-directory names of the training split, in sorted order
train_path = PROCESSED_PATH / "train"
classes = [entry.name for entry in train_path.iterdir() if entry.is_dir()]
classes.sort()

# Index -> name comes straight from the sorted positions; name -> index is its inverse
idx_to_class = dict(enumerate(classes))
class_to_idx = {cls: idx for idx, cls in idx_to_class.items()}

# Save mapping
# NOTE: JSON object keys are always strings, so the integer keys of
# 'idx_to_class' are stored as "0", "1", ... in the file.
mapping = {
    'class_to_idx': class_to_idx,
    'idx_to_class': idx_to_class,
    'num_classes': len(classes)
}

mapping_file = PROJECT_ROOT / "data" / "class_mapping.json"
mapping_file.write_text(json.dumps(mapping, indent=2))

print(f"\nTotal classes: {len(classes)}")
print(f"\nFirst 10:")
for idx, cls in enumerate(classes[:10]):
    print(f"  {cls} -> {idx}")
print(f"\nSaved: {mapping_file}")

CREATING CLASS MAPPING

Total classes: 39

First 10:
  OBJ001 -> 0
  OBJ002 -> 1
  OBJ003 -> 2
  OBJ004 -> 3
  OBJ005 -> 4
  OBJ006 -> 5
  OBJ007 -> 6
  OBJ008 -> 7
  OBJ009 -> 8
  OBJ010 -> 9

Saved: /Users/kevin/Documents/GitHub/Python/VESKL/11.DAE/NEU/NEU_IE7615/Prj/Discriminative/G8/CNN_Attendance_System/data/class_mapping.json


## Step 5: Verification

In [36]:
print("="*80)
print("VERIFICATION")
print("="*80)

# TARGET_SIZE is passed to cv2.resize as (width, height); an image array's
# shape is (height, width, channels), and imread's default mode yields 3 channels.
expected_shape = (TARGET_SIZE[1], TARGET_SIZE[0], 3)
expected_text = ", ".join(str(dim) for dim in expected_shape)

for split in ['train', 'val', 'test']:
    split_path = PROCESSED_PATH / split

    print(f"\n{split.upper()}:")

    # Report a missing split instead of aborting the remaining checks
    if not split_path.is_dir():
        print(f"  Missing folder: {split_path}")
        continue

    # Sorted so the sampled image is the same on every run
    objs = sorted(f for f in split_path.iterdir() if f.is_dir())
    total_imgs = sum(len(list(f.glob("*.jpg"))) for f in objs)

    print(f"  Objects: {len(objs)}")
    print(f"  Images: {total_imgs}")

    # Check sample image: first image of the first object folder that has any
    for f in objs:
        imgs = sorted(f.glob("*.jpg"))
        if imgs:
            img = cv2.imread(str(imgs[0]))
            if img is None:
                # imread returns None rather than raising on an unreadable file
                print(f"  Sample unreadable: {imgs[0]}")
            else:
                flag = "" if img.shape == expected_shape else "  <-- MISMATCH"
                print(f"  Sample shape: {img.shape} (expected: {expected_text}){flag}")
            break

print("\n" + "="*80)
print("TASK 1.1 COMPLETED!")
print("="*80)

VERIFICATION

TRAIN:
  Objects: 39
  Images: 2871
  Sample shape: (224, 224, 3) (expected: 224, 224, 3)

VAL:
  Objects: 39
  Images: 611
  Sample shape: (224, 224, 3) (expected: 224, 224, 3)

TEST:
  Objects: 39
  Images: 626
  Sample shape: (224, 224, 3) (expected: 224, 224, 3)

TASK 1.1 COMPLETED!
