# 1. Initial Data Exploration, Preprocessing, and Heuristic Labeling

This notebook covers the initial steps of loading example CPAP data, performing preprocessing, detecting breaths and events, and then applying a heuristic (rule-based) labeling approach to a subset of events. This labeled subset will serve as the initial 'ground truth' for training our classification model.

**Assumed File Structure:**
```
repository_root/
├── data/                     # CPAP data CSVs will go here
│   └── dummy_cpap_data.csv   # Example data file
├── notebooks/
│   └── 01_Initial_Data_Exploration_and_Labeling.ipynb
├── src/
│   ├── data_loader.py
│   ├── preprocessing.py
│   ├── breath_detection.py
│   ├── event_detection.py
│   ├── feature_engineering.py
│   └── classification_model.py
└── results/
    └── (plots, outputs will be saved here)
```

**Note:** This notebook uses placeholder data generation. In a real scenario, you would replace `generate_dummy_cpap_data_for_notebook` with actual data loading using `src.data_loader.load_cpap_data`.

## 1.1 Setup and Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import os

# Make the repository root importable so that the `src.*` modules resolve.
# The notebook is assumed to live in the 'notebooks' directory, one level below the root.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from src.data_loader import load_cpap_data, resample_data
from src.preprocessing import butterworth_filter, flag_high_leak_periods, calculate_rolling_baseline
from src.breath_detection import detect_breaths_from_flow
from src.event_detection import detect_apneas_hypopneas
from src.feature_engineering import extract_features_for_event

# Configure plotting and table display defaults for the rest of the notebook.
# Default figure size (width, height) for plots that do not set their own.
plt.rcParams['figure.figsize'] = (15, 5)
# Allow pandas to print up to 100 rows before truncating the output.
pd.set_option('display.max_rows', 100)

In [None]:
# Helper function to generate dummy data for this notebook (mimicking OSCAR CSV structure)
def generate_dummy_cpap_data_for_notebook(num_seconds=600, sampling_hz=25, filename="dummy_cpap_data.csv"):
    """Write a synthetic CPAP recording to ``../data/<filename>`` and return its path.

    The recording contains a 0.25 Hz sinusoidal flow signal with noise, a slowly
    varying pressure signal, a leak signal with one high-leak period (40%-45% of
    the recording), one obstructive-like event starting at 20% of the recording
    (15 s of low, flat flow with a pressure increase) and one central-like event
    starting at 60% of the recording (12 s of near-zero flow with a 2 s fade
    out/in on either side).

    Event windows are clamped to the recording length, so short recordings are
    generated with truncated events instead of raising.

    Args:
        num_seconds: Length of the recording in seconds.
        sampling_hz: Samples per second.
        filename: Name of the CSV file created inside ``../data``.

    Returns:
        The path of the CSV file that was written.
    """
    num_samples = num_seconds * sampling_hz
    sample_idx = np.arange(num_samples)
    time_stamps = pd.to_datetime(sample_idx / sampling_hz, unit='s')

    # Base flow simulating breathing (0.25 Hz = 15 bpm) plus noise
    base_flow = 0.3 * np.sin(2 * np.pi * 0.25 * sample_idx / sampling_hz)
    base_flow += 0.05 * np.random.randn(num_samples)

    # Pressure (simple auto-adjust simulation)
    pressure = 8 + 0.5 * np.sin(2 * np.pi * 0.01 * sample_idx / sampling_hz)
    pressure += 0.1 * np.random.randn(num_samples)

    # Leak rate with one sustained high-leak period
    leak_rate = 2.0 + np.abs(0.5 * np.random.randn(num_samples))
    leak_rate[int(num_samples * 0.4):int(num_samples * 0.45)] = 25

    # Obstructive-like event: low, flat flow and a pressure increase.
    # The end is clamped so the replacement always matches the slice length.
    obs_start = int(num_samples * 0.2)
    obs_end = min(obs_start + 15 * sampling_hz, num_samples)
    base_flow[obs_start:obs_end] = (
        0.05 * np.sin(2 * np.pi * 0.2 * np.arange(obs_end - obs_start) / sampling_hz) + 0.02
    )
    pressure[obs_start:obs_end + 5 * sampling_hz] += 2  # slice end past the array is truncated

    # Central-like event: smooth cessation with a linear fade out and fade in.
    fade_len = 2 * sampling_hz
    cen_start = int(num_samples * 0.6)
    cen_end = min(cen_start + 12 * sampling_hz, num_samples)

    # Fade out: scale goes from 1 down towards 0 over the samples before the event.
    fade_out_start = max(cen_start - fade_len, 0)
    fade_out_idx = np.arange(fade_out_start, cen_start)
    base_flow[fade_out_start:cen_start] *= (cen_start - fade_out_idx) / fade_len

    base_flow[cen_start:cen_end] = 0.01 * np.random.randn(cen_end - cen_start)  # Near zero

    # Fade in: scale goes from 0 up towards 1 over the samples after the event.
    fade_in_end = min(cen_end + fade_len, num_samples)
    fade_in_idx = np.arange(cen_end, fade_in_end)
    base_flow[cen_end:fade_in_end] *= (fade_in_idx - cen_end) / fade_len

    df = pd.DataFrame({
        'Timestamp': time_stamps,
        'FlowRate': base_flow,
        'Pressure': pressure,
        'LeakRate': leak_rate,
        'MinuteVent': 6.0 + np.sin(2 * np.pi * 0.02 * sample_idx / sampling_hz),
        'RespRate': 15 + np.random.randn(num_samples),
        'TidalVol': 0.4 + 0.05 * np.random.randn(num_samples)
    })

    data_dir = '../data'
    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(data_dir, exist_ok=True)
    filepath = os.path.join(data_dir, filename)
    df.to_csv(filepath, index=False)
    print(f"Generated dummy data at {filepath}")
    return filepath

# Generate the example recording, or point `cpap_data_filepath` at your own export.
dummy_data_filepath = generate_dummy_cpap_data_for_notebook()
# cpap_data_filepath = '../data/your_oscar_export.csv' # Replace with actual path
cpap_data_filepath = dummy_data_filepath

# Load data using the data_loader module, mapping the CSV column names used by
# the dummy generator onto the loader's column arguments.
raw_df = load_cpap_data(cpap_data_filepath,
                        timestamp_col='Timestamp',
                        flow_rate_col='FlowRate',
                        pressure_col='Pressure',
                        leak_rate_col='LeakRate',
                        minute_vent_col='MinuteVent',
                        resp_rate_col='RespRate',
                        tidal_vol_col='TidalVol')

# This notebook treats a None result as "loading failed".
if raw_df is not None:
    print("Data loaded successfully:")
    raw_df.info()
    # A bare `raw_df.head()` inside an `if` block is evaluated and discarded
    # (only a cell's final top-level expression is displayed), so print it.
    print(raw_df.head())
else:
    print("Failed to load data.")

## 1.2 Data Resampling and Preprocessing

In [None]:
SAMPLING_FREQ_HZ = 25 # Target sampling frequency

# Reset first so that re-running this cell after a failed load cannot leave a
# stale `df_resampled` from an earlier run for the later cells to pick up.
df_resampled = None

if raw_df is not None:
    df_resampled = resample_data(raw_df, target_freq_hz=SAMPLING_FREQ_HZ)
    # This notebook treats a None result as "resampling failed".
    if df_resampled is not None:
        print(f"\nResampled data to {SAMPLING_FREQ_HZ} Hz:")
        df_resampled.info()
        
        # Apply low-pass filter to flow rate
        df_resampled['flow_rate_filtered'] = butterworth_filter(
            df_resampled['flow_rate'], 
            filter_type='lowpass', 
            cutoff_freq_hz=2.0, # Cutoff to remove noise but keep breath shape
            sampling_freq_hz=SAMPLING_FREQ_HZ
        )
        
        # Flag high leak periods
        df_resampled['high_leak'] = flag_high_leak_periods(
            df_resampled['leak_rate'], 
            leak_threshold=20.0, # L/min, example threshold
            min_duration_sec=10.0, 
            sampling_freq_hz=SAMPLING_FREQ_HZ
        )
        
        # Calculate flow baseline (e.g., rolling median of filtered flow for amplitude)
        # For event detection, baseline should reflect typical breath amplitude.
        # Using a rolling median of absolute flow might be a simple proxy here.
        df_resampled['flow_baseline'] = calculate_rolling_baseline(
            df_resampled['flow_rate_filtered'].abs(), # Use absolute flow for amplitude baseline
            window_sec=120, 
            sampling_freq_hz=SAMPLING_FREQ_HZ,
            quantile=0.5 # Median
        )
        
        print("\nPreprocessed data head:")
        print(df_resampled[['flow_rate', 'flow_rate_filtered', 'flow_baseline', 'high_leak']].head())
        
        # Plot to verify: flow signals on top, leak with high-leak shading below
        plt.figure(figsize=(15, 8))
        ax1 = plt.subplot(211)
        df_resampled['flow_rate'].plot(label='Raw Flow', alpha=0.7, ax=ax1)
        df_resampled['flow_rate_filtered'].plot(label='Filtered Flow', ax=ax1)
        df_resampled['flow_baseline'].plot(label='Flow Baseline (Abs Median)', linestyle='--', ax=ax1)
        ax1.set_title('Flow Data and Baseline')
        ax1.legend()
        
        ax2 = plt.subplot(212, sharex=ax1)
        df_resampled['leak_rate'].plot(label='Leak Rate', ax=ax2)
        # Shade the full leak range wherever the high-leak flag is set
        ax2.fill_between(df_resampled.index, 0, df_resampled['leak_rate'].max(), 
                         where=df_resampled['high_leak'], color='red', alpha=0.3, label='High Leak')
        ax2.set_title('Leak Data')
        ax2.legend()
        plt.tight_layout()
        plt.show()
        
    else:
        print("Resampling failed.")
else:
    print("Skipping preprocessing as data loading failed.")

## 1.3 Breath and Event Detection

In [None]:
# Detect apnea/hypopnea candidates on the preprocessed signals. `events_df` is
# always defined afterwards (an empty frame when nothing could be detected).
if 'df_resampled' not in locals() or df_resampled is None:
    print("Skipping event detection as preprocessing failed or was skipped.")
    events_df = pd.DataFrame()
else:
    # Breath Detection (optional for this notebook's main goal, but good for context)
    # breaths_df = detect_breaths_from_flow(df_resampled['flow_rate_filtered'], 
    #                                       sampling_freq_hz=SAMPLING_FREQ_HZ)
    # if not breaths_df.empty:
    #     print(f"\nDetected {len(breaths_df)} breaths.")
    #     print(breaths_df.head())
    # else:
    #     print("No breaths detected.")

    # Event Detection settings, gathered in one place for readability
    detection_settings = dict(
        flow_series=df_resampled['flow_rate_filtered'],  # filtered flow
        baseline_flow_series=df_resampled['flow_baseline'],
        sampling_freq_hz=SAMPLING_FREQ_HZ,
        apnea_threshold_ratio=0.1,  # flow < 10% of baseline
        hypopnea_upper_threshold_ratio=0.7,  # flow < 70% of baseline (reduction >30%)
        hypopnea_lower_threshold_ratio=0.1,  # flow > 10% of baseline (reduction <90%)
        min_event_duration_s=10.0,
        high_leak_flags=df_resampled['high_leak'],
    )
    events_df = detect_apneas_hypopneas(**detection_settings)

    if events_df.empty:
        print("No apnea/hypopnea events detected.")
        events_df = pd.DataFrame()  # keep a defined (empty) frame for later steps
    else:
        summary_columns = ['event_start_time', 'event_end_time', 'event_type', 'event_duration_s', 'avg_flow_reduction_percent', 'excluded_due_to_leak']
        print(f"\nDetected {len(events_df)} apnea/hypopnea candidate events:")
        print(events_df[summary_columns])

## 1.4 Heuristic Labeling of Events

Now, we'll define some heuristic rules to assign initial labels ('likely_obstructive', 'likely_central', 'ambiguous') to the detected events. This requires visual inspection and domain knowledge. We'll create a helper function to plot events for inspection.

**Heuristic Ideas (examples, needs refinement based on data):**
*   **Likely Obstructive:**
    *   Apnea/hypopnea with significant flow flattening *before* the event (or *during* it, in the case of a hypopnea).
    *   Sharp, large recovery breaths immediately following the event.
    *   Associated with increased pressure from an auto-CPAP (if pressure data shows this clearly).
    *   High variability in flow leading up to an apnea.
*   **Likely Central:**
    *   Smooth, tapered decrease and increase in flow for apneas.
    *   Often part of a periodic breathing pattern (e.g., Cheyne-Stokes like waxing/waning flow over longer periods).
    *   Hypopneas show proportional reduction in flow without significant shape change (inspiratory flow still rounded).
    *   No significant auto-CPAP pressure increase specifically for the event.
*   **Ambiguous:** Events that don't clearly fit either pattern or have conflicting features.

In [None]:
def plot_event_for_labeling(event_idx, event_series, flow_signal, pressure_signal, baseline_signal, window_s=30):
    """Plot flow and pressure around an event and save the figure for manual labeling.

    The figure is written to
    ``../results/event_plots_for_labeling/event_<event_idx>_<type>.png`` where
    ``<type>`` is the event type with any ``_candidate`` suffix removed. The
    figure is always closed, even if plotting or saving fails.

    Args:
        event_idx: Identifier used in the plot title and the output file name.
        event_series: Row describing the event; must provide ``event_start_time``,
            ``event_end_time``, ``event_type`` and ``event_duration_s``.
        flow_signal: Flow series, sliced with ``.loc`` between the plot bounds
            (assumed to be indexed by timestamps comparable to the event times).
        pressure_signal: Pressure series, sliced the same way as ``flow_signal``.
        baseline_signal: Optional flow baseline series; skipped when ``None``.
        window_s: Seconds of context shown before the start and after the end.
    """
    event_start = event_series['event_start_time']
    event_end = event_series['event_end_time']
    event_type = event_series['event_type']

    plot_start = event_start - pd.Timedelta(seconds=window_s)
    plot_end = event_end + pd.Timedelta(seconds=window_s)

    fig = plt.figure(figsize=(18, 7))
    try:
        # Flow Plot
        ax1 = fig.add_subplot(211)
        flow_signal.loc[plot_start:plot_end].plot(label='Flow', ax=ax1, color='cornflowerblue')
        if baseline_signal is not None:
            baseline_signal.loc[plot_start:plot_end].plot(label='Flow Baseline', linestyle='--', color='orange', ax=ax1)
        ax1.axvspan(event_start, event_end, color='red', alpha=0.2, label=f'{event_type}')
        ax1.set_title(f'Event {event_idx}: {event_type} ({event_series["event_duration_s"]:.1f}s) | Start: {event_start.time()}')
        ax1.legend()
        ax1.grid(True, which='both', linestyle=':', linewidth=0.5)

        # Pressure Plot
        ax2 = fig.add_subplot(212, sharex=ax1)
        pressure_signal.loc[plot_start:plot_end].plot(label='Pressure', ax=ax2, color='green')
        ax2.axvspan(event_start, event_end, color='red', alpha=0.2)
        ax2.set_title('Pressure')
        ax2.legend()
        ax2.grid(True, which='both', linestyle=':', linewidth=0.5)

        fig.tight_layout()
        # In a real interactive session, you might display this and then prompt for a label.
        # For this notebook, we'll save it.
        results_dir = '../results/event_plots_for_labeling'
        os.makedirs(results_dir, exist_ok=True)  # no check-then-create race
        plot_name = f"event_{event_idx}_{event_type.replace('_candidate','')}.png"
        fig.savefig(os.path.join(results_dir, plot_name))
    finally:
        # Close this specific figure even on failure so figures do not
        # accumulate when the function is called for many events.
        plt.close(fig)

def _heuristic_label_for_event(position, event):
    """Return the heuristic label for one event.

    These rules are very basic examples and would need significant refinement
    based on real data patterns.

    Args:
        position: 0-based position of the event in ``events_df`` (not its index
            label), matching the numbering used for the saved event plots.
        event: Row of ``events_df``; ``event_type`` and
            ``avg_flow_reduction_percent`` are read.

    Returns:
        ``'likely_obstructive'``, ``'likely_central'`` or ``'ambiguous'``.
    """
    is_apnea = event['event_type'] == 'apnea_candidate'
    is_hypopnea = event['event_type'] == 'hypopnea_candidate'

    is_obstructive_like = False
    is_central_like = False

    # Rule 1: Very high flow reduction apneas are often central if not other obstructive signs
    if is_apnea and event['avg_flow_reduction_percent'] > 95:
        is_central_like = True  # Initial guess

    # Rule 2: Hypopneas with low flow reduction might be central if flow shape is normal
    # (Requires flow shape features not yet calculated here, but placeholder for idea)
    if is_hypopnea and event['avg_flow_reduction_percent'] < 50:
        is_central_like = True  # Initial guess

    # Rule 3: Events with significant pressure increase during/after might be obstructive
    # (Requires analyzing pressure signal around event - feature engineering step will do this)
    # For now, rely on the dummy data: its first event is obstructive-like.
    if position == 0 and is_hypopnea:
        is_obstructive_like = True
        is_central_like = False  # Override

    # Rule 4: Second event in dummy data is central-like apnea
    if position == 1 and is_apnea:
        is_central_like = True
        is_obstructive_like = False

    if is_obstructive_like and not is_central_like:
        return 'likely_obstructive'
    if is_central_like and not is_obstructive_like:
        return 'likely_central'
    # Conflicting or no strong indicators: keep ambiguous
    return 'ambiguous'


if not events_df.empty:
    print("\nPlotting a few events for manual inspection (plots saved to results/event_plots_for_labeling/):")
    # Plot first few events as examples (up to 5, or fewer if not many events)
    num_events_to_plot = min(len(events_df), 5)
    for i in range(num_events_to_plot):
        event = events_df.iloc[i]
        plot_event_for_labeling(i, event, 
                                df_resampled['flow_rate_filtered'], 
                                df_resampled['pressure'],
                                df_resampled['flow_baseline'])
        print(f"  Plot saved for event {i}: {event['event_type']} starting at {event['event_start_time'].time()}")

    # --- Apply Heuristic Rules (Example) ---
    # Rules that refer to the "first"/"second" event use the row position, so they
    # agree with the plot numbering above even if the index is not 0..n-1.
    # The labels are assigned in one step, in row order.
    events_df['heuristic_label'] = [
        _heuristic_label_for_event(position, event_row)
        for position, (_, event_row) in enumerate(events_df.iterrows())
    ]

    print("\nEvents with heuristic labels (first few):")
    print(events_df[['event_start_time', 'event_type', 'heuristic_label']].head(10))
    print("\nLabel distribution:")
    print(events_df['heuristic_label'].value_counts())

    # Save the events_df with heuristic labels for the next notebook (feature engineering)
    events_with_labels_path = '../data/events_with_heuristic_labels.csv'
    # Make sure the target directory exists (e.g. when the input data lives elsewhere)
    os.makedirs(os.path.dirname(events_with_labels_path), exist_ok=True)
    events_df.to_csv(events_with_labels_path, index=False)
    print(f"\nSaved events with heuristic labels to: {events_with_labels_path}")

else:
    print("Skipping heuristic labeling as no events were detected or previous steps failed.")

## 1.5 Next Steps

1.  **Refine Heuristic Labeling:** The rules above are very basic. For a real dataset, this step would involve more careful inspection of event plots and defining more robust rules, possibly incorporating features from `feature_engineering.py` in an iterative loop.
2.  **Feature Engineering:** Proceed to the next notebook (`02_Feature_Engineering_and_Model_Training.ipynb`) to calculate detailed features for these labeled events.
3.  **Model Training:** Train a classifier using the engineered features and these heuristic labels.
4.  **Evaluation and Iteration:** Evaluate the model. If performance is insufficient, revisit heuristic labeling, feature engineering, or try different models.