# 📊 Exploratory Data Analysis
## SVM+ORB Mustache Try-On System

This notebook explores:
1. Dataset statistics and class distribution
2. Sample training images
3. ORB feature visualization
4. BoVW histogram analysis
5. Feature-space separability (t-SNE)
6. Model performance metrics (if a trained model is available)

In [None]:
import sys
sys.path.append('..')

import cv2
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json

from pipelines.dataset import FaceDataset
from pipelines.features import ORBFeatureExtractor, BoVWEncoder
from pipelines.utils import visualize_keypoints

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

%matplotlib inline

## 1. Dataset Statistics

In [None]:
# Build the face / non-face dataset from the image folders and split it.
cascade_path = '../assets/cascades/haarcascade_frontalface_default.xml'

dataset_kwargs = dict(
    pos_dir='../data/faces',
    neg_dir='../data/non_faces',
    face_cascade_path=cascade_path,
    random_state=42,
)
dataset = FaceDataset(**dataset_kwargs)

# load_and_split returns the statistics that the plotting cell below indexes
# by key ('train', 'val_pos', ...); print them as indented JSON for reference.
stats = dataset.load_and_split(roi_size=(128, 128))
stats_json = json.dumps(stats, indent=2)
print(stats_json)

In [None]:
# Two side-by-side bar charts built from `stats`:
#   left  - total number of samples per split
#   right - face vs. non-face counts per split
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
size_ax, class_ax = axes

splits = ['Train', 'Val', 'Test']

# --- Left panel: split sizes ---
sizes = [stats[key] for key in ('train', 'val', 'test')]
size_ax.bar(splits, sizes, color=['#2ecc71', '#3498db', '#e74c3c'])
size_ax.set_title('Dataset Split Sizes', fontsize=14, fontweight='bold')
size_ax.set_ylabel('Number of Samples')
size_ax.grid(axis='y', alpha=0.3)

# --- Right panel: class distribution per split ---
pos_counts = [stats[key] for key in ('train_pos', 'val_pos', 'test_pos')]
neg_counts = [stats[key] for key in ('train_neg', 'val_neg', 'test_neg')]

x = np.arange(len(splits))
width = 0.35

# Each class gets a bar shifted half a bar-width left / right of the tick.
grouped_bars = [
    (x - width/2, pos_counts, 'Face', '#3498db'),
    (x + width/2, neg_counts, 'Non-Face', '#e74c3c'),
]
for bar_positions, bar_heights, series_label, series_color in grouped_bars:
    class_ax.bar(bar_positions, bar_heights, width,
                 label=series_label, color=series_color)

class_ax.set_title('Class Distribution', fontsize=14, fontweight='bold')
class_ax.set_ylabel('Count')
class_ax.set_xticks(x)
class_ax.set_xticklabels(splits)
class_ax.legend()
class_ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 2. Visualize Sample Images

In [None]:
# Show a random gallery of training images: faces on the top row, non-faces
# on the bottom row.
X_train, y_train = dataset.get_train_data()

fig, axes = plt.subplots(2, 8, figsize=(16, 4))

# Positions of each class in the training set. `face_indices` is reused by the
# ORB keypoint visualization cell further down.
face_indices = np.where(y_train == 1)[0]
nonface_indices = np.where(y_train == 0)[0]


def show_random_samples(row_axes, candidate_indices, title):
    """Fill one row of axes with distinct, randomly chosen training images.

    Args:
        row_axes: The Matplotlib axes of one subplot row.
        candidate_indices: Indices into ``X_train`` to sample from.
        title: Title placed above every image that is drawn.

    Images are drawn without replacement so the same sample never appears
    twice in a row. If there are fewer candidates than axes (including none
    at all), the remaining axes are simply left blank instead of raising.
    """
    n_shown = min(len(row_axes), len(candidate_indices))
    if n_shown:
        chosen = np.random.choice(candidate_indices, size=n_shown, replace=False)
    else:
        chosen = []

    for ax, idx in zip(row_axes, chosen):
        # Same BGR -> RGB conversion as before, so colours look right in Matplotlib.
        ax.imshow(cv2.cvtColor(X_train[idx], cv2.COLOR_BGR2RGB))
        ax.set_title(title, fontsize=10)

    # Hide ticks/frames on every axis, including any left without an image.
    for ax in row_axes:
        ax.axis('off')


show_random_samples(axes[0], face_indices, 'Face')
show_random_samples(axes[1], nonface_indices, 'Non-Face')

plt.suptitle('Training Samples', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

## 3. ORB Feature Visualization

In [None]:
# Detect ORB keypoints on random face samples and draw them on the images.
orb_extractor = ORBFeatureExtractor(n_features=500)

fig, axes = plt.subplots(2, 4, figsize=(16, 8))

# Pick distinct face samples (no duplicates in the gallery). If there are fewer
# faces than panels, the surplus panels stay empty instead of raising.
n_shown = min(axes.size, len(face_indices))
if n_shown:
    chosen_faces = np.random.choice(face_indices, size=n_shown, replace=False)
else:
    chosen_faces = []

for ax, idx in zip(axes.flat, chosen_faces):
    # Work on a copy, as before.
    img = X_train[idx].copy()

    # Only the keypoints are needed here; the descriptors are discarded.
    kp, _ = orb_extractor.extract_keypoints_and_descriptors(img)

    # The keypoint-count cell below treats a falsy result as "no keypoints";
    # normalise it the same way here so len() cannot fail on it.
    # NOTE(review): assumes visualize_keypoints accepts an empty list - confirm.
    kp = kp if kp else []

    # Draw keypoints, then convert BGR -> RGB for Matplotlib.
    img_kp = visualize_keypoints(img, kp, color=(0, 255, 0))
    img_kp = cv2.cvtColor(img_kp, cv2.COLOR_BGR2RGB)

    ax.imshow(img_kp)
    ax.set_title(f'{len(kp)} keypoints', fontsize=10)

# Hide ticks/frames on every panel, including any left without an image.
for ax in axes.flat:
    ax.axis('off')

plt.suptitle('ORB Keypoint Detection', fontsize=16, fontweight='bold', y=0.98)
plt.tight_layout()
plt.show()

In [None]:
# Count ORB keypoints on the first (up to) 100 training images, plot the
# distribution of those counts and print summary statistics.
def _count_keypoints(image):
    """Return the number of ORB keypoints found in `image` (0 if the result is falsy)."""
    keypoints, _ = orb_extractor.extract_keypoints_and_descriptors(image)
    return len(keypoints) if keypoints else 0


keypoint_counts = [
    _count_keypoints(X_train[i]) for i in range(min(100, len(X_train)))
]
mean_count = np.mean(keypoint_counts)

fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(keypoint_counts, bins=30, color='#3498db', alpha=0.7, edgecolor='black')
# Dashed vertical line marks the mean count.
ax.axvline(mean_count, color='red', linestyle='--',
           label=f'Mean: {mean_count:.1f}')
ax.set_title('ORB Keypoint Count Distribution', fontsize=14, fontweight='bold')
ax.set_xlabel('Number of Keypoints')
ax.set_ylabel('Frequency')
ax.legend()
ax.grid(alpha=0.3)
plt.show()

print("Keypoint Statistics:")
print(f"  Mean: {mean_count:.1f}")
print(f"  Median: {np.median(keypoint_counts):.1f}")
print(f"  Min: {np.min(keypoint_counts)}")
print(f"  Max: {np.max(keypoint_counts)}")

## 4. Bag of Visual Words Analysis

In [None]:
# Build the BoVW codebook from ORB descriptors of a training subset.
print("Building BoVW codebook...")

# Collect descriptors from the first (up to) 100 training images. Images for
# which the extractor returns None or an empty result contribute nothing.
# NOTE(review): the len() check assumes descriptors are a sized array - confirm.
descriptors_list = []
for i in range(min(100, len(X_train))):
    desc = orb_extractor.extract_descriptors(X_train[i])
    if desc is not None and len(desc) > 0:
        descriptors_list.append(desc)

# Fail early with a clear message rather than handing an empty list to fit().
if not descriptors_list:
    raise RuntimeError(
        "No ORB descriptors could be extracted from the training subset; "
        "cannot build the BoVW codebook."
    )

# Create codebook
bovw_encoder = BoVWEncoder(k=128, random_state=42)
bovw_encoder.fit(descriptors_list, max_descriptors=50000)

# The check mark was mis-encoded (mojibake) in the original message.
print("✓ Codebook created!")

In [None]:
# Plot the BoVW histogram of six random training samples, coloured by class.
fig, axes = plt.subplots(2, 3, figsize=(15, 8))

for ax in axes.flat:
    # Get random sample
    idx = np.random.choice(len(X_train))
    img = X_train[idx]
    is_face = y_train[idx] == 1
    label = 'Face' if is_face else 'Non-Face'

    # Extract and encode.
    # NOTE(review): the codebook cell guards against extract_descriptors
    # returning None; here the result is passed straight to encode(), so this
    # relies on encode() accepting None - confirm.
    desc = orb_extractor.extract_descriptors(img)
    hist = bovw_encoder.encode(desc)

    # Plot histogram (blue for faces, red for non-faces)
    ax.bar(range(len(hist)), hist, width=1.0,
           color='#3498db' if is_face else '#e74c3c')
    ax.set_title(f'{label} - L1 norm: {np.sum(hist):.2f}', fontsize=10)
    ax.set_xlabel('Visual Word ID')
    ax.set_ylabel('Frequency')

    # Leave 10% headroom above the tallest bar. An all-zero histogram would
    # otherwise request identical lower/upper limits, so fall back to 1.0.
    peak = max(hist)
    ax.set_ylim([0, peak * 1.1 if peak > 0 else 1.0])

plt.suptitle('BoVW Histogram Examples (k=128)', fontsize=16, fontweight='bold', y=0.995)
plt.tight_layout()
plt.show()

## 5. Feature Space Visualization (t-SNE)

In [None]:
from sklearn.manifold import TSNE

# Project BoVW histograms of a random training subset to 2D with t-SNE and
# colour the points by class.

# Use at most 200 samples. Sampling without replacement raises if more samples
# are requested than exist, so cap the request at the training-set size.
n_samples = min(200, len(X_train))
sample_indices = np.random.choice(len(X_train), n_samples, replace=False)

print("Encoding features...")
features = []
labels = []

for idx in sample_indices:
    desc = orb_extractor.extract_descriptors(X_train[idx])
    hist = bovw_encoder.encode(desc)
    features.append(hist)
    labels.append(y_train[idx])

features = np.array(features)
labels = np.array(labels)

print("Running t-SNE...")
# scikit-learn requires perplexity < number of samples; keep the original 30
# whenever the subset is large enough and shrink it for small datasets.
perplexity = min(30, max(1, n_samples - 1))
tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
features_2d = tsne.fit_transform(features)

# Plot one scatter series per class so each gets its own legend entry.
plt.figure(figsize=(10, 8))
for label, name, color in [(0, 'Non-Face', '#e74c3c'), (1, 'Face', '#3498db')]:
    mask = labels == label
    plt.scatter(features_2d[mask, 0], features_2d[mask, 1],
                label=name, alpha=0.6, s=50, c=color)

plt.title('t-SNE Visualization of BoVW Features', fontsize=14, fontweight='bold')
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

## 6. Model Performance (if trained)

In [None]:
# Show the saved model configuration and its test metrics, if a trained model
# has written ../models/config.json; otherwise just say so.
config_path = Path('../models/config.json')

if config_path.exists():
    # Explicit encoding so the file is read the same way on every platform.
    with open(config_path, 'r', encoding='utf-8') as f:
        config = json.load(f)

    print("Model Configuration:")
    print(json.dumps({
        'ORB Features': config['orb_features'],
        'BoVW k': config['bovw_k'],
        'SVM Kernel': config['svm_kernel'],
        'Best Params': config['best_params']
    }, indent=2))

    print("\nTest Results:")
    print(json.dumps(config['test_results'], indent=2))

    # Plot metrics
    metrics = config['test_results']
    metric_names = ['accuracy', 'precision', 'recall', 'f1']
    values = [metrics[m] for m in metric_names]

    plt.figure(figsize=(10, 6))
    bars = plt.bar(metric_names, values, color=['#2ecc71', '#3498db', '#e74c3c', '#f39c12'])
    plt.title('Model Performance Metrics', fontsize=14, fontweight='bold')
    plt.ylabel('Score')
    # Value labels are drawn 0.02 above each bar, so leave headroom above 1.0;
    # with an upper limit of exactly 1.0 the labels of scores near 1 would
    # fall outside the axes.
    plt.ylim([0, 1.1])

    # Add value labels on bars
    for bar, val in zip(bars, values):
        plt.text(bar.get_x() + bar.get_width()/2, val + 0.02,
                 f'{val:.3f}', ha='center', va='bottom', fontweight='bold')

    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.show()
else:
    print("No trained model found. Run training first!")

## 7. Conclusion

This EDA revealed:
- Dataset balance and split quality
- ORB feature density and distribution
- BoVW histogram characteristics
- Feature separability in 2D space
- Model performance metrics

Next steps:
1. Train with optimal hyperparameters
2. Test on webcam/images
3. Fine-tune mustache overlay parameters