# 1. Examining imbalance in outcome variable in the training set

In [1]:
import os
import shutil
import random
from pathlib import Path
from sklearn.model_selection import train_test_split
import numpy as np

In [3]:
# Root of the raw dataset and its two class sub-folders.
data_dir = Path("data") / "raw"
fraud_dir = data_dir.joinpath("Fraud")
non_fraud_dir = data_dir.joinpath("Non-Fraud")

In [5]:
# Count the image files (matched by extension, case-insensitively) in each class folder.
fraud_count = sum(
    1
    for entry in os.listdir(fraud_dir)
    if entry.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp'))
)
non_fraud_count = sum(
    1
    for entry in os.listdir(non_fraud_dir)
    if entry.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp'))
)

In [6]:
# Summarise how skewed the two classes are.
total = fraud_count + non_fraud_count
if total == 0:
    # Without this guard the percentage below fails with a bare ZeroDivisionError.
    raise ValueError("No images found in either class folder; cannot compute class balance.")

fraud_pct = (fraud_count / total) * 100
# With zero fraud images the ratio is unbounded: report infinity rather than dividing by zero.
imbalance_ratio = non_fraud_count / fraud_count if fraud_count else float("inf")

print(f"Total images: {total}")
print(f"Fraud images: {fraud_count} ({fraud_pct:.2f}%)")
print(f"Non-Fraud images: {non_fraud_count} ({100 - fraud_pct:.2f}%)")
print(f"Imbalance Ratio (Non-Fraud/Fraud): {imbalance_ratio:.2f}")

Total images: 5200
Fraud images: 200 (3.85%)
Non-Fraud images: 5000 (96.15%)
Imbalance Ratio (Non-Fraud/Fraud): 25.00


# 2. Train-Validation Split

In [7]:
# Seed both the NumPy and the standard-library global random generators.
np.random.seed(42)
random.seed(42)

# Source folder of the raw images and the two destination folders of the split.
raw_dir, train_dir, val_dir = "data/raw/", "data/train/", "data/val/"

def create_balanced_split(test_size=0.2, random_state=12345):
    """Copy the raw images into per-class train and validation folders.

    Despite the name, this does not rebalance the classes: each class folder
    under ``raw_dir`` is split independently with the same ``test_size``, so
    the class proportions of the raw data carry over to both outputs.

    Images are copied (not moved) with ``shutil.copy2`` into
    ``<train_dir>/<class>/`` and ``<val_dir>/<class>/``. Files already present
    in the output folders are not removed.

    Args:
        test_size: Share of each class sent to the validation folder;
            forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split``. The split is
            driven by this value, not by the global ``random``/NumPy seeds.

    Returns:
        None. Progress and per-class counts are printed.
    """
    class_names = ["Fraud", "Non-Fraud"]
    image_exts = ('.jpg', '.jpeg', '.png', '.bmp')

    for output in [train_dir, val_dir]:
        for class_name in class_names:
            os.makedirs(os.path.join(output, class_name), exist_ok=True)

    for class_name in class_names:
        class_path = os.path.join(raw_dir, class_name)

        if not os.path.exists(class_path):
            print(f"Warning: {class_path} doesn't exist, skipping...")
            continue

        # os.listdir returns entries in arbitrary order, so a fixed
        # random_state alone does not pin the split. Sorting first makes the
        # same files land in the same subset on every run and machine, which
        # also keeps a rerun from copying one image into both train and val.
        image_files = sorted(
            f for f in os.listdir(class_path)
            if f.lower().endswith(image_exts)
        )

        if len(image_files) == 0:
            print(f"Warning: No images found in {class_path}")
            continue

        print(f"  Processing {class_name}: {len(image_files)} images")

        train_files, val_files = train_test_split(
            image_files,
            test_size=test_size,
            random_state=random_state,
            shuffle=True
        )

        # Same copy routine for both subsets; only the destination root differs.
        for subset_dir, subset_files in ((train_dir, train_files), (val_dir, val_files)):
            for file_name in subset_files:
                src = os.path.join(class_path, file_name)
                dst = os.path.join(subset_dir, class_name, file_name)
                shutil.copy2(src, dst)

        print(f"    Train: {len(train_files)}, Val: {len(val_files)}")

In [8]:
# Run the split: copies each class's raw images into the train and validation folders.
create_balanced_split()

  Processing Fraud: 200 images
    Train: 160, Val: 40
  Processing Non-Fraud: 5000 images
    Train: 4000, Val: 1000


# 3. Addressing the Imbalance

### Due to the severe imbalance, we should...
    - Use BALANCED split for training (create_balanced_oversample_split)
    - Use STRATIFIED split for final evaluation
    - Use Focal Loss or class weighting
    - Focus on F1-score, not accuracy
    - Monitor fraud detection rate carefully

In [40]:
# NOTE: these rebind fraud_dir / non_fraud_dir as plain strings (the first
# section of the notebook defines them as Path objects).
fraud_dir = "data/raw/Fraud"
non_fraud_dir = "data/raw/Non-Fraud"

fraud_files = [f for f in os.listdir(fraud_dir)
               if f.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp'))]
non_fraud_files = [f for f in os.listdir(non_fraud_dir)
                   if f.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp'))]

if not fraud_files:
    # Oversampling needs at least one fraud image; fail with a clear message
    # instead of a ZeroDivisionError from the division below.
    raise ValueError(f"No fraud images found in {fraud_dir}; nothing to oversample.")

# Oversample fraud up to the non-fraud count: every fraud image is reused
# `oversample_factor` times, and `remainder` more copies close the gap.
target_fraud_count = len(non_fraud_files)
oversample_factor, remainder = divmod(target_fraud_count, len(fraud_files))

print(f"   Original: {len(fraud_files)} fraud, {len(non_fraud_files)} non-fraud")
print(f"   Strategy: Each fraud image used {oversample_factor} times + {remainder} extra")

   Original: 200 fraud, 5000 non-fraud
   Strategy: Each fraud image used 25 times + 0 extra
