# Phase 9.1: Initial Data Exploration

This notebook explores the processed sensor data (the arrays written to `data/processed` by `prepare_data.py` from the raw recordings) and provides initial insights into:
- Data structure and quality
- Signal characteristics
- Class distributions
- FFT frequency content

## Setup & Configuration

In [None]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Add project to path
project_root = Path('..')
sys.path.insert(0, str(project_root))

# Configure plotting
%matplotlib inline
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Configuration
DATA_DIR = project_root / 'data' / 'processed'
RAW_DATA_DIR = project_root / 'data' / 'raw'
OUTPUT_DIR = Path('outputs/exploration')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

CLASSES = ['No Leak', '1/16"', '3/32"', '1/8"']
SAMPLING_RATE = 10000  # Hz
N_CHANNELS = 9  # 3 accelerometers × 3 axes

print(f"Project root: {project_root}")
print(f"Data directory: {DATA_DIR}")
print(f"Output directory: {OUTPUT_DIR}")

## Data Loading

In [None]:
# Read the six processed arrays (features X and labels y for each split).
# If any file is absent, every variable is set to None so that later cells
# can skip themselves via `if X_train is not None`.
try:
    # Files are read in the listed order; the six names are only bound once
    # every load has succeeded.
    X_train, y_train, X_val, y_val, X_test, y_test = (
        np.load(DATA_DIR / array_file)
        for array_file in (
            'X_train.npy', 'y_train.npy',
            'X_val.npy', 'y_val.npy',
            'X_test.npy', 'y_test.npy',
        )
    )
except FileNotFoundError:
    print("⚠ Processed data not found. Please run prepare_data.py first.")
    print("  Command: python scripts/prepare_data.py --raw-data data/raw --output-dir data/processed")
    X_train = X_val = X_test = None
    y_train = y_val = y_test = None
else:
    # Report the array shapes of each split.
    print("✓ Processed data loaded successfully")
    print(f"  Training:   X={X_train.shape}, y={y_train.shape}")
    print(f"  Validation: X={X_val.shape}, y={y_val.shape}")
    print(f"  Test:       X={X_test.shape}, y={y_test.shape}")

## Data Statistics

In [None]:
if X_train is not None:
    # --- Label counts per split -------------------------------------------
    print("\n=== Class Distribution ===")
    labels_by_split = {'Train': y_train, 'Val': y_val, 'Test': y_test}
    for split_name, split_labels in labels_by_split.items():
        print(f"\n{split_name}:")
        n_split = len(split_labels)
        # Only classes that actually occur in this split are listed.
        present_classes, n_per_class = np.unique(split_labels, return_counts=True)
        for pos, class_idx in enumerate(present_classes):
            n_in_class = n_per_class[pos]
            share = 100 * n_in_class / n_split
            print(f"  {CLASSES[class_idx]:10s}: {n_in_class:4d} ({share:5.1f}%)")

    # --- Global statistics over all three splits --------------------------
    print("\n=== Signal Statistics ===")
    # Stack the splits along the sample axis (axis 0).
    X_all = np.concatenate((X_train, X_val, X_test), axis=0)
    for stat_line in (
        f"Mean: {X_all.mean():.6f}",
        f"Std:  {X_all.std():.6f}",
        f"Min:  {X_all.min():.6f}",
        f"Max:  {X_all.max():.6f}",
        f"Shape: {X_all.shape} (samples, freq_bins, channels)",
    ):
        print(stat_line)

## Signal Characteristics Per Class

In [None]:
if X_train is not None:
    # One panel per class on a 2×2 grid: mean spectrum with a ±1 std band.
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Pool the three splits so each panel summarises every sample of a class.
    X_all = np.concatenate((X_train, X_val, X_test), axis=0)
    y_all = np.concatenate((y_train, y_val, y_test))

    for idx, class_name in enumerate(CLASSES):
        row, col = divmod(idx, 2)
        ax = axes[row, col]
        class_data = X_all[y_all == idx]

        # Reduce over samples (axis 0) and channels (axis 2), leaving one
        # value per frequency bin.
        mean_spec = np.mean(class_data, axis=(0, 2))
        std_spec = np.std(class_data, axis=(0, 2))
        band_low = mean_spec - std_spec
        band_high = mean_spec + std_spec

        freq_bins = np.arange(mean_spec.shape[0])
        ax.plot(freq_bins, mean_spec, 'b-', linewidth=2, label='Mean')
        ax.fill_between(freq_bins, band_low, band_high, alpha=0.3, label='±1 Std')

        ax.set_title(f'{class_name} (n={len(class_data)})', fontsize=12, fontweight='bold')
        ax.set(xlabel='Frequency Bin', ylabel='FFT Magnitude')
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'signal_characteristics.png', dpi=150, bbox_inches='tight')
    plt.show()
    print("✓ Saved: signal_characteristics.png")

## Class Distribution Visualization

In [None]:
if X_train is not None:
    # Bar chart of the number of samples per class over all three splits.
    fig, ax = plt.subplots(figsize=(10, 6))

    y_all = np.hstack([y_train, y_val, y_test])
    total = len(y_all)

    # Count every entry of CLASSES explicitly (0 when a class is absent).
    # np.unique only reports classes that occur, which would give ax.bar
    # fewer heights than labels as soon as one class has no samples.
    counts = np.array([(y_all == class_idx).sum() for class_idx in range(len(CLASSES))])
    colors = sns.color_palette("husl", len(CLASSES))

    # Labels that do not map onto CLASSES are not plotted; say so rather
    # than silently dropping them.
    n_unmapped = total - int(counts.sum())
    if n_unmapped:
        print(f"⚠ {n_unmapped} samples have labels outside 0..{len(CLASSES) - 1} and are not plotted")

    bars = ax.bar(CLASSES, counts, color=colors, alpha=0.7, edgecolor='black', linewidth=2)

    # Add value labels on bars
    for bar, count in zip(bars, counts):
        height = bar.get_height()
        # Guard the percentage against an empty dataset.
        pct = 100 * count / total if total else 0.0
        ax.text(bar.get_x() + bar.get_width()/2., height,
               f'{int(count)}\n({pct:.1f}%)',
               ha='center', va='bottom', fontsize=11, fontweight='bold')

    ax.set_ylabel('Number of Samples', fontsize=12)
    ax.set_title('Class Distribution (All Data)', fontsize=14, fontweight='bold')
    ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'class_distribution.png', dpi=150, bbox_inches='tight')
    plt.show()
    print("✓ Saved: class_distribution.png")

## Per-Channel Analysis

In [None]:
if X_train is not None:
    # Per-channel signal energy, shown as overlaid per-class histograms.
    X_all = np.vstack([X_train, X_val, X_test])
    y_all = np.hstack([y_train, y_val, y_test])

    # Display names, indexed by position along the last axis of X_all.
    channel_names = [
        'Acc0_X', 'Acc0_Y', 'Acc0_Z',
        'Acc1_X', 'Acc1_Y', 'Acc1_Z',
        'Acc2_X', 'Acc2_Y', 'Acc2_Z'
    ]

    fig, axes = plt.subplots(3, 3, figsize=(15, 10))
    axes = axes.flatten()

    for ch_idx in range(N_CHANNELS):
        channel_data = X_all[:, :, ch_idx]
        # Energy of one sample = sum of squared values across axis 1.
        channel_energy = np.sum(channel_data ** 2, axis=1)

        # Use one set of bin edges for all classes of this channel. With a
        # bare `bins=20`, each class would get edges from its own value range,
        # so bar widths would differ and heights would not be comparable.
        bin_edges = np.histogram_bin_edges(channel_energy, bins=20)

        ax = axes[ch_idx]
        for class_idx, class_name in enumerate(CLASSES):
            mask = y_all == class_idx
            ax.hist(channel_energy[mask], bins=bin_edges, alpha=0.6, label=class_name)

        ax.set_title(f'{channel_names[ch_idx]}', fontweight='bold')
        ax.set_xlabel('Signal Energy')
        ax.set_ylabel('Frequency')
        ax.legend(fontsize=8)
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'channel_energy_distribution.png', dpi=150, bbox_inches='tight')
    plt.show()
    print("✓ Saved: channel_energy_distribution.png")

## Data Quality Assessment

In [None]:
if X_train is not None:
    # Basic sanity checks on the pooled feature array: NaN/Inf/zero counts
    # and summary statistics.
    X_all = np.vstack([X_train, X_val, X_test])
    # Denominator for percentages; never 0 so an empty array prints 0%.
    n_values = max(X_all.size, 1)

    print("\n=== Data Quality Checks ===")

    # Check for NaN/Inf
    nan_count = np.isnan(X_all).sum()
    inf_count = np.isinf(X_all).sum()
    print(f"\nNaN values:     {nan_count} ({100*nan_count/n_values:.4f}%)")
    print(f"Inf values:     {inf_count} ({100*inf_count/n_values:.4f}%)")

    # Check for zeros
    zero_count = (X_all == 0).sum()
    print(f"Zero values:    {zero_count} ({100*zero_count/n_values:.4f}%)")

    # Check value ranges.
    # A single NaN/Inf would turn min/max/mean/std into nan/inf, hiding the
    # range of the usable data, so the statistics use finite values only.
    # When the data is clean the array is used as-is (no copy).
    non_finite = int(nan_count + inf_count)
    finite_vals = X_all if non_finite == 0 else X_all[np.isfinite(X_all)]
    if finite_vals.size:
        print(f"\nValue range:    [{finite_vals.min():.6f}, {finite_vals.max():.6f}]")
        print(f"Mean:           {finite_vals.mean():.6f}")
        print(f"Std:            {finite_vals.std():.6f}")
        if non_finite:
            print(f"  (statistics exclude {non_finite} non-finite values)")
    else:
        # Empty dataset, or every value is NaN/Inf: nothing to summarise.
        print("\nValue range:    n/a (no finite values)")

    # Check shape consistency.
    # Reaching this line means np.vstack succeeded, which requires all splits
    # to share the same per-sample shape.
    print(f"\nShape consistency: OK (all samples have shape {X_all.shape[1:]})")

    print("\n✓ Data quality assessment complete")

## Summary

In [None]:
if X_train is not None:
    # Final recap: dataset sizes, class names, feature layout, the figures
    # written by the plotting cells, and suggested next steps.
    n_train, n_val, n_test = len(X_train), len(X_val), len(X_test)
    banner = "="*60

    report_lines = [
        "\n" + banner,
        "DATA EXPLORATION SUMMARY",
        banner,
        f"\nDataset size:    {n_train + n_val + n_test} samples",
        f"Train/Val/Test:  {n_train}/{n_val}/{n_test}",
        f"Classes:         {len(CLASSES)} ({', '.join(CLASSES)})",
        f"Features:        {X_train.shape[1]} frequency bins × {X_train.shape[2]} channels",
        f"\nOutputs saved to: {OUTPUT_DIR}",
        "  - signal_characteristics.png",
        "  - class_distribution.png",
        "  - channel_energy_distribution.png",
        "\n✓ Data exploration complete!",
        "\nNext steps:",
        "  1. Review signal characteristics per class",
        "  2. Check for data quality issues",
        "  3. Verify class balance",
        "  4. Go to 02_fft_comparison.ipynb to validate FFT method",
    ]
    for report_line in report_lines:
        print(report_line)