In [None]:
# Packages

import pandas as pd
import plotly.graph_objects as go
import plotly.subplots as sp
import pytz
from datetime import timedelta

In [None]:

PSG_FILE = '../data/bishkek_csr/03_train_ready/nasal_files/05-04-2025_nasal.csv'
RESPECK_FILE = '../data/bishkek_csr/03_train_ready/respeck/05-04-2025_respeck.csv'
LABELS_FILE = '../data/bishkek_csr/03_train_ready/event_exports/05-04-2025_event_export.csv'

# Timezone that every timestamp index below is converted to.
tz = pytz.timezone('Asia/Bishkek')


def _read_csv_indexed_by_local_time(csv_path, epoch_ms_column):
    """
    Read a CSV and index it by a timezone-aware 'timestamp'.

    The values in `epoch_ms_column` are parsed as milliseconds since the Unix
    epoch, localized to UTC and converted to `tz`. The result is stored in a
    'timestamp' column, which then becomes the index of the returned frame.
    """
    frame = pd.read_csv(csv_path)
    parsed = pd.to_datetime(frame[epoch_ms_column], unit='ms')
    frame['timestamp'] = parsed.dt.tz_localize('UTC').dt.tz_convert(tz)
    return frame.set_index('timestamp')


# Respeck rows carry their time in 'alignedTimestamp'; the PSG and label
# exports use 'UnixTimestamp'.
respeck_df = _read_csv_indexed_by_local_time(RESPECK_FILE, 'alignedTimestamp')
psg_df = _read_csv_indexed_by_local_time(PSG_FILE, 'UnixTimestamp')
labels_df = _read_csv_indexed_by_local_time(LABELS_FILE, 'UnixTimestamp')

In [None]:
import plotly.graph_objects as go
import plotly.subplots as sp
from datetime import timedelta
import pandas as pd
import math

def plot_single_event(respeck_df, psg_df, event_time, event_row, buffer_minutes=2):
    """
    Plot a single event with PSG and Respeck data in separate subplots.

    The top subplot shows the PSG 'Resp nasal' column; the bottom subplot shows
    the Respeck 'breathingSignal', 'x', 'y' and 'z' columns (each only if it is
    present). The event is shaded from `event_time` to `event_time + Duration`
    and its onset is marked with a dashed line. The figure is displayed with
    `fig.show()`; nothing is returned.

    Parameters:
    - respeck_df: DataFrame with Respeck sensor data, sliced by timestamp
    - psg_df: DataFrame with PSG data, sliced by timestamp
    - event_time: Timestamp of the event
    - event_row: Series containing event details; 'Event' and 'Duration' are
      read with `.get`. 'Duration' may use a comma as decimal separator.
    - buffer_minutes: Minutes to show before and after event
    """
    event_display_name = event_row.get('Event', 'Event')

    # Create figure with 2 subplots (only the top one gets a title)
    fig = sp.make_subplots(
        rows=2, cols=1,
        subplot_titles=[
            f'{event_display_name} at {event_time.strftime("%Y-%m-%d %H:%M:%S")}',
            None
        ],
        vertical_spacing=0.1
    )

    buffer_td = timedelta(minutes=buffer_minutes)
    start_time = event_time - buffer_td
    end_time = event_time + buffer_td

    # Parse duration, falling back to 30 s when it is missing or unusable.
    # A missing CSV cell arrives as NaN: float() accepts it, but
    # timedelta(seconds=nan) raises, so non-finite values must be rejected too.
    duration = 30.0
    duration_val = event_row.get('Duration')
    if duration_val is not None:
        try:
            parsed_duration = float(str(duration_val).replace(',', '.'))
        except (ValueError, TypeError):
            parsed_duration = float('nan')
        if math.isfinite(parsed_duration):
            duration = parsed_duration

    event_end_time = event_time + timedelta(seconds=duration)

    # Filter data to the plotting window
    respeck_window = respeck_df[start_time:end_time]
    psg_window = psg_df[start_time:end_time]

    # Plot PSG data (top subplot)
    if not psg_window.empty and 'Resp nasal' in psg_window.columns:
        fig.add_trace(go.Scatter(
            x=psg_window.index,
            y=psg_window['Resp nasal'],
            mode='lines',
            name='PSG Nasal Resp',
            line=dict(color='blue', width=2),
            opacity=0.8
        ), row=1, col=1)

    # Plot Respeck data (bottom subplot): (column, legend name, colour, width, opacity)
    respeck_traces = [
        ('breathingSignal', 'Respeck Breathing', 'orange', 2, 0.9),
        ('x', 'Respeck X', 'red', 1, 0.7),
        ('y', 'Respeck Y', 'green', 1, 0.7),
        ('z', 'Respeck Z', 'magenta', 1, 0.7),
    ]
    if not respeck_window.empty:
        for col_name, trace_name, color, width, opacity in respeck_traces:
            if col_name not in respeck_window.columns:
                continue
            fig.add_trace(go.Scatter(
                x=respeck_window.index,
                y=respeck_window[col_name],
                mode='lines',
                name=trace_name,
                line=dict(color=color, width=width),
                opacity=opacity
            ), row=2, col=1)

    # Add event markers to both subplots; only the top one carries the text label
    for r in [1, 2]:
        fig.add_vrect(
            x0=event_time, x1=event_end_time,
            fillcolor="red", opacity=0.2,
            layer="below", line_width=0,
            annotation_text=f"{event_display_name} ({duration:.1f}s)" if r == 1 else "",
            annotation_position="top left",
            row=r, col=1
        )
        fig.add_vline(
            x=event_time,
            line_dash="dash", line_color="red", line_width=1.5,
            opacity=0.8,
            row=r, col=1
        )

    # Set x-axis ranges and link them so zooming one subplot zooms the other
    fig.update_xaxes(range=[start_time, end_time], row=1, col=1)
    fig.update_xaxes(range=[start_time, end_time], matches='x', row=2, col=1)

    # Update axis titles
    fig.update_yaxes(title_text="PSG Nasal Resp.", row=1, col=1)
    fig.update_yaxes(title_text="Respeck Data", row=2, col=1)
    fig.update_xaxes(title_text="Time", row=2, col=1)

    # Update layout
    fig.update_layout(
        height=800,
        title_text=f"Analysis of {event_display_name}",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
    )

    fig.show()


def plot_events_batch(respeck_df, psg_df, osa_events_df, buffer_minutes=2, events_per_plot=5):
    """
    Plot events in batches to avoid subplot limit issues.

    One figure is shown per batch. Each event takes two stacked rows: the PSG
    'Resp nasal' column on top and the Respeck 'breathingSignal'/'x'/'y'/'z'
    columns (each only if present) below. Nothing is returned.

    Parameters:
    - respeck_df: DataFrame with Respeck sensor data, sliced by timestamp
    - psg_df: DataFrame with PSG data, sliced by timestamp
    - osa_events_df: DataFrame with event labels, indexed by event time;
      'Event' and 'Duration' are read from each row
    - buffer_minutes: Minutes to show before and after each event
    - events_per_plot: Number of events to show per plot (must be >= 1)

    Raises:
    - ValueError: if events_per_plot is smaller than 1
    """

    if osa_events_df.empty:
        print("No events found in the data.")
        return

    if events_per_plot < 1:
        # Would otherwise surface as a ZeroDivisionError or an empty batch range.
        raise ValueError("events_per_plot must be at least 1")

    n_events = len(osa_events_df)
    n_batches = math.ceil(n_events / events_per_plot)

    print(f"Found {n_events} events. Creating {n_batches} batch plots...")

    buffer_td = timedelta(minutes=buffer_minutes)

    for batch_idx in range(n_batches):
        start_idx = batch_idx * events_per_plot
        end_idx = min(start_idx + events_per_plot, n_events)
        batch_events = osa_events_df.iloc[start_idx:end_idx]

        n_events_in_batch = len(batch_events)

        # Create subplot titles: one title per event (PSG row), none for the Respeck row
        subplot_titles = []
        for i, (event_time, event_row) in enumerate(batch_events.iterrows()):
            event_display_name = event_row.get('Event', 'Event')
            subplot_titles.append(f'{event_display_name} #{start_idx + i + 1} at {event_time.strftime("%H:%M:%S")}')
            subplot_titles.append(None)

        # Calculate appropriate vertical spacing
        total_rows = n_events_in_batch * 2
        max_spacing = 1.0 / (total_rows - 1) if total_rows > 1 else 0.1
        vertical_spacing = min(0.03, max_spacing * 0.8)  # Use 80% of max allowed

        # Create subplots
        fig = sp.make_subplots(
            rows=total_rows, cols=1,
            subplot_titles=subplot_titles,
            vertical_spacing=vertical_spacing
        )

        # Every batch is its own figure, so each figure needs its own legend.
        # A trace name gets a legend entry the first time it is drawn in this
        # figure and is hidden from the legend afterwards.
        legend_names_shown = set()

        def _first_use(trace_name):
            """Return True only the first time trace_name is drawn in this figure."""
            if trace_name in legend_names_shown:
                return False
            legend_names_shown.add(trace_name)
            return True

        for idx, (event_time, event_row) in enumerate(batch_events.iterrows()):
            psg_row = idx * 2 + 1
            respeck_row = idx * 2 + 2

            start_time = event_time - buffer_td
            end_time = event_time + buffer_td

            # Parse duration, falling back to 30 s when missing or unusable.
            # NaN/inf pass float() but make timedelta() raise, so reject them.
            duration = 30.0
            duration_val = event_row.get('Duration')
            if duration_val is not None:
                try:
                    parsed_duration = float(str(duration_val).replace(',', '.'))
                except (ValueError, TypeError):
                    parsed_duration = float('nan')
                if math.isfinite(parsed_duration):
                    duration = parsed_duration

            event_end_time = event_time + timedelta(seconds=duration)

            # Filter data
            respeck_window = respeck_df[start_time:end_time]
            psg_window = psg_df[start_time:end_time]

            # Plot PSG data
            if not psg_window.empty and 'Resp nasal' in psg_window.columns:
                fig.add_trace(go.Scatter(
                    x=psg_window.index, y=psg_window['Resp nasal'], mode='lines',
                    name='PSG Nasal Resp', line=dict(color='blue', width=2),
                    opacity=0.8, showlegend=_first_use('PSG Nasal Resp')
                ), row=psg_row, col=1)

            # Plot Respeck data
            if not respeck_window.empty:
                if 'breathingSignal' in respeck_window.columns:
                    fig.add_trace(go.Scatter(
                        x=respeck_window.index, y=respeck_window['breathingSignal'],
                        mode='lines', name='Respeck Breathing',
                        line=dict(color='orange', width=2), opacity=0.9,
                        showlegend=_first_use('Respeck Breathing')
                    ), row=respeck_row, col=1)

                for col_name, color in [('x', 'red'), ('y', 'green'), ('z', 'magenta')]:
                    if col_name in respeck_window.columns:
                        trace_name = f'Respeck {col_name.upper()}'
                        fig.add_trace(go.Scatter(
                            x=respeck_window.index, y=respeck_window[col_name],
                            mode='lines', name=trace_name,
                            line=dict(color=color, width=1), opacity=0.7,
                            showlegend=_first_use(trace_name)
                        ), row=respeck_row, col=1)

            # Add event markers; only the PSG row carries the text annotation
            event_display_name = event_row.get('Event', 'Event')
            for r in [psg_row, respeck_row]:
                fig.add_vrect(
                    x0=event_time, x1=event_end_time, fillcolor="red", opacity=0.2,
                    layer="below", line_width=0,
                    annotation_text=f"{event_display_name} ({duration:.1f}s)" if r == psg_row else "",
                    annotation_position="top left", row=r, col=1
                )
                fig.add_vline(
                    x=event_time, line_dash="dash", line_color="red", line_width=1.5,
                    opacity=0.8, row=r, col=1
                )

            # Set x-axis ranges and link the Respeck row to its PSG row.
            # plotly names the first x-axis 'x' and later ones 'x2', 'x3', ...
            psg_axis_id = 'x' if psg_row == 1 else f'x{psg_row}'
            fig.update_xaxes(range=[start_time, end_time], row=psg_row, col=1)
            fig.update_xaxes(range=[start_time, end_time], matches=psg_axis_id, row=respeck_row, col=1)

            # Update axis titles
            fig.update_yaxes(title_text="PSG Nasal", row=psg_row, col=1)
            fig.update_yaxes(title_text="Respeck", row=respeck_row, col=1)
            if idx == n_events_in_batch - 1:  # Only add x-axis title to last subplot
                fig.update_xaxes(title_text="Time", row=respeck_row, col=1)

        # Update layout
        fig.update_layout(
            height=max(800, 300 * n_events_in_batch),
            title_text=f"Events Batch {batch_idx + 1}/{n_batches} (Events {start_idx + 1}-{end_idx})",
            legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
        )

        fig.show()


def plot_events_individual(respeck_df, psg_df, osa_events_df, buffer_minutes=2, max_events=None):
    """
    Show one figure per event by calling plot_single_event for each row.

    Parameters:
    - respeck_df: DataFrame with Respeck sensor data
    - psg_df: DataFrame with PSG data
    - osa_events_df: DataFrame with event labels, indexed by event time
    - buffer_minutes: Minutes to show before and after each event
    - max_events: Upper bound on how many leading events are plotted
      (None plots every event)
    """
    if osa_events_df.empty:
        print("No events found in the data.")
        return

    if max_events is None:
        total = len(osa_events_df)
        selected_events = osa_events_df
    else:
        # Keep only the leading rows, never more than are available
        total = min(len(osa_events_df), max_events)
        selected_events = osa_events_df.iloc[:total]

    print(f"Creating {total} individual plots...")

    for position, (event_time, event_row) in enumerate(selected_events.iterrows(), start=1):
        print(f"Plotting event {position}/{total}")
        plot_single_event(respeck_df, psg_df, event_time, event_row, buffer_minutes)

# Option 1: Plot events in batches (recommended for many events)
# plot_events_batch(respeck_df, psg_df, labels_df[labels_df['Event'] == 'Obstructive Apnea'], 
#                   buffer_minutes=15, events_per_plot=3)

# Option 2: Plot each event individually (good for detailed analysis)
# plot_events_individual(respeck_df, psg_df, labels_df[labels_df['Event'] == 'Obstructive Apnea'], 
#                        buffer_minutes=15, max_events=5)

# Option 3: Plot a single specific event
# event_idx = 0  # First event
# events = labels_df[labels_df['Event'] == 'Obstructive Apnea']
# if not events.empty:
#     event_time, event_row = list(events.iterrows())[event_idx]
#     plot_single_event(respeck_df, psg_df, event_time, event_row, buffer_minutes=15)

In [None]:
from calculateContinuousBreathFeatures import *
from respiratoryFeatures import *

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns

def load_and_process_events(csv_file_path, target_tz=None):
    """
    Load and process the event CSV file, handling the comma decimal format.

    Parameters:
    - csv_file_path: Path to the CSV file. It must provide the columns
      'UnixTimestamp' (milliseconds since the epoch), 'Event' and 'Duration'.
    - target_tz: Optional timezone. When given, 'start_time'/'end_time' are
      localized to UTC and converted to it, so they can be compared with a
      timezone-aware index. The default (None) keeps them timezone-naive.

    Returns:
    - events_df: DataFrame sorted by start time with added 'start_time',
      'duration_seconds' and 'end_time' columns

    Raises:
    - ValueError: if a 'Duration' value cannot be parsed as a number
    """
    # Load the CSV
    events_df = pd.read_csv(csv_file_path)

    print(f"Loaded {len(events_df)} events from CSV")
    print(f"Event types: {events_df['Event'].value_counts().to_dict()}")

    # Convert Unix timestamp to datetime
    start_time = pd.to_datetime(events_df['UnixTimestamp'], unit='ms')
    if target_tz is not None:
        start_time = start_time.dt.tz_localize('UTC').dt.tz_convert(target_tz)
    events_df['start_time'] = start_time

    # Parse duration (handle comma as decimal separator). Going through str
    # first also covers files whose durations were already read as numbers,
    # where the .str accessor alone would raise.
    events_df['duration_seconds'] = (
        events_df['Duration'].astype(str).str.replace(',', '.', regex=False).astype(float)
    )

    # Calculate end time
    events_df['end_time'] = events_df['start_time'] + pd.to_timedelta(events_df['duration_seconds'], unit='s')

    # Sort by start time
    events_df = events_df.sort_values('start_time').reset_index(drop=True)

    print(f"Time range: {events_df['start_time'].min()} to {events_df['end_time'].max()}")
    print(f"Duration range: {events_df['duration_seconds'].min():.1f}s to {events_df['duration_seconds'].max():.1f}s")

    return events_df

def create_labeled_respiratory_data(respeck_df, events_df, breathing_col='breathingSignal'):
    """
    Create a labeled version of respiratory data with precise event timing.

    Every sample whose index lies within [start_time, end_time] of an event
    (both ends inclusive) receives that event's label, duration and id; all
    other samples are labeled 'Normal'. When events overlap, the event
    processed later overwrites the earlier one. The input frame is not modified.

    Parameters:
    - respeck_df: DataFrame with respiratory data. A non-datetime index is
      converted with pd.to_datetime on the returned copy.
    - events_df: Processed events DataFrame with 'Event', 'start_time',
      'end_time' and 'duration_seconds' columns
    - breathing_col: Name of the breathing signal column (currently unused;
      kept for interface compatibility)

    Returns:
    - labeled_df: Copy of respeck_df with added 'event_label',
      'event_duration' and 'event_id' columns ('event_id' is the event's
      index label in events_df)
    """
    print("Creating labeled respiratory data...")

    # Copy first so the index conversion below never touches the caller's frame
    labeled_df = respeck_df.copy()

    # Ensure the working copy has a datetime index
    if not isinstance(labeled_df.index, pd.DatetimeIndex):
        print("Converting respeck_df index to datetime...")
        labeled_df.index = pd.to_datetime(labeled_df.index)

    # Initialize all as 'Normal'
    labeled_df['event_label'] = 'Normal'
    labeled_df['event_duration'] = np.nan
    labeled_df['event_id'] = np.nan

    # Check time overlap
    respeck_start = labeled_df.index.min()
    respeck_end = labeled_df.index.max()
    events_start = events_df['start_time'].min()
    events_end = events_df['end_time'].max()

    print(f"Respeck data: {respeck_start} to {respeck_end}")
    print(f"Events data: {events_start} to {events_end}")

    # Find overlapping events
    overlapping_events = events_df[
        (events_df['end_time'] >= respeck_start) &
        (events_df['start_time'] <= respeck_end)
    ].copy()

    print(f"Found {len(overlapping_events)} events overlapping with respiratory data")

    # Label each event period
    events_applied = 0
    for idx, event in overlapping_events.iterrows():
        # Find respiratory data points within this event
        event_mask = (
            (labeled_df.index >= event['start_time']) &
            (labeled_df.index <= event['end_time'])
        )

        data_points_in_event = event_mask.sum()

        # An event can overlap the recording span yet fall between two samples
        if data_points_in_event > 0:
            # Apply event label to this time period
            labeled_df.loc[event_mask, 'event_label'] = event['Event']
            labeled_df.loc[event_mask, 'event_duration'] = event['duration_seconds']
            labeled_df.loc[event_mask, 'event_id'] = idx
            events_applied += 1

            if events_applied <= 5:  # Show first 5 for debugging
                print(f"  Event {idx}: {event['Event']} at {event['start_time']} "
                      f"({data_points_in_event} data points)")

    print(f"Successfully applied {events_applied} events to respiratory data")

    # Summary statistics
    label_counts = labeled_df['event_label'].value_counts()
    print(f"\nLabel distribution:")
    for label, count in label_counts.items():
        percentage = (count / len(labeled_df)) * 100
        print(f"  {label}: {count:,} samples ({percentage:.1f}%)")

    return labeled_df

def analyze_event_coverage(labeled_df, events_df):
    """
    Analyze how well events are covered in the respiratory data.

    An event counts as "applied" when its index label in events_df appears in
    labeled_df['event_id'] on a row whose 'event_label' is not 'Normal'.

    Parameters:
    - labeled_df: DataFrame with 'event_label' and 'event_id' columns
    - events_df: Events DataFrame with 'Event' and 'duration_seconds' columns

    Returns:
    - coverage_df: One row per event type with the columns Event_Type,
      Total_Events, Applied_Events, Coverage_Rate (percent), Total_Duration
      and Applied_Duration, sorted by Coverage_Rate descending. It is empty
      (with the same columns) when events_df has no events.
    """
    print("\n=== Event Coverage Analysis ===")

    # Get unique events that were actually applied
    event_samples = labeled_df[labeled_df['event_label'] != 'Normal']
    applied_ids = event_samples['event_id'].dropna().unique()

    print(f"Events successfully applied: {len(applied_ids)} out of {len(events_df)}")

    # 'event_id' is a float column (it holds NaN for normal samples), so match
    # by membership instead of using the float values as .loc labels.
    # Selected once here rather than twice per event type.
    applied_events_df = events_df[events_df.index.isin(applied_ids)]

    stats_columns = [
        'Event_Type', 'Total_Events', 'Applied_Events',
        'Coverage_Rate', 'Total_Duration', 'Applied_Duration',
    ]

    # Analyze coverage by event type
    coverage_stats = []
    for event_type in events_df['Event'].unique():
        type_events = events_df[events_df['Event'] == event_type]
        applied_type_events = applied_events_df[applied_events_df['Event'] == event_type]
        n_total = len(type_events)
        n_applied = len(applied_type_events)

        coverage_stats.append({
            'Event_Type': event_type,
            'Total_Events': n_total,
            'Applied_Events': n_applied,
            'Coverage_Rate': n_applied / n_total * 100 if n_total > 0 else 0,
            'Total_Duration': type_events['duration_seconds'].sum(),
            'Applied_Duration': applied_type_events['duration_seconds'].sum()
        })

    # Explicit columns keep sort_values working when there are no event types
    coverage_df = pd.DataFrame(coverage_stats, columns=stats_columns)
    coverage_df = coverage_df.sort_values('Coverage_Rate', ascending=False)

    print("\nCoverage by event type:")
    print(coverage_df.to_string(index=False))

    return coverage_df

def extract_features_by_precise_labels(labeled_df, feature_extraction_func, 
                                     segment_length_minutes=5):
    """
    Extract features using precise event labels (no time windows needed).

    Consecutive samples sharing the same 'event_label' form one period. Periods
    with fewer than 50 samples are skipped. For each remaining period the
    extraction function is called as
    `feature_extraction_func(timestamps, breathing_values)`.

    Parameters:
    - labeled_df: DataFrame with 'event_label' and 'breathingSignal' columns.
      Timestamps are taken from a 'timestamp' column when present, otherwise
      from the index. Side effect: a helper column 'label_group' is written
      to this frame.
    - feature_extraction_func: Your respiratory feature extraction function;
      expected to return a dict-like object or None
    - segment_length_minutes: Currently unused; kept for interface
      compatibility

    Returns:
    - features_list: List of feature dictionaries, each extended with the keys
      'event_type', 'event_time', 'period_duration' and 'sample_count'
    """
    print(f"Extracting features using precise labels...")

    all_features = []

    # Group consecutive samples with the same label: the counter increases
    # every time the label differs from the previous row's label
    labeled_df['label_group'] = (labeled_df['event_label'] != labeled_df['event_label'].shift()).cumsum()

    # The labeled frame normally keeps its timestamps in the index only
    has_timestamp_column = 'timestamp' in labeled_df.columns

    # Collect the periods together with the rows that belong to them
    event_periods = []
    period_frames = []
    for group_id, group_data in labeled_df.groupby('label_group'):
        if len(group_data) < 50:  # Skip very short segments
            continue

        start_time = group_data.index.min()
        end_time = group_data.index.max()

        event_periods.append({
            'event_type': group_data['event_label'].iloc[0],
            'start_time': start_time,
            'end_time': end_time,
            'duration': (end_time - start_time).total_seconds(),
            'sample_count': len(group_data)
        })
        period_frames.append(group_data)

    print(f"Found {len(event_periods)} distinct event periods")

    # Extract features for each period
    for i, (period, period_data) in enumerate(zip(event_periods, period_frames)):
        try:
            if 'breathingSignal' not in period_data.columns:
                continue

            if has_timestamp_column:
                timestamps = period_data['timestamp'].values
            else:
                # Same values the notebook's later "fixed" cell passes after
                # copying the index into a 'timestamp' column
                timestamps = period_data.index.to_numpy()

            # Extract features
            features = feature_extraction_func(
                timestamps,
                period_data['breathingSignal'].values
            )

            if features is not None:
                # Add metadata
                features['event_type'] = period['event_type']
                features['event_time'] = period['start_time']
                features['period_duration'] = period['duration']
                features['sample_count'] = period['sample_count']

                all_features.append(features)

                if i % 50 == 0:
                    print(f"  Processed {i+1}/{len(event_periods)} periods...")

        except Exception as e:
            # Best effort: one failing period must not abort the whole run
            print(f"Error processing period {i}: {e}")
            continue

    print(f"Successfully extracted features for {len(all_features)} periods")
    return all_features

def visualize_event_timeline(labeled_df, sample_hours=2):
    """
    Visualize the event labeling over time.

    Only the first `sample_hours` hours of `labeled_df` are drawn. The upper
    axis shows the breathing signal with non-'Normal' samples highlighted per
    label; the lower axis shows which label each sample carries. Per-label
    sample counts for the window are printed afterwards. Nothing is returned.

    Parameters:
    - labeled_df: DataFrame indexed by time with 'breathingSignal' and
      'event_label' columns
    - sample_hours: Length of the window, counted from the first index value
    """
    print(f"Creating timeline visualization for first {sample_hours} hours...")

    # Sample data for visualization
    start_time = labeled_df.index.min()
    end_time = start_time + pd.Timedelta(hours=sample_hours)
    sample_data = labeled_df[(labeled_df.index >= start_time) & (labeled_df.index <= end_time)]

    if len(sample_data) == 0:
        print("No data available for visualization period")
        return

    # Create visualization
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10), sharex=True)

    # Plot respiratory signal
    ax1.plot(sample_data.index, sample_data['breathingSignal'], 'b-', alpha=0.7, linewidth=0.5)
    ax1.set_ylabel('Breathing Signal')
    ax1.set_title(f'Respiratory Signal with Event Labels ({sample_hours} hours)')
    ax1.grid(True, alpha=0.3)

    # One colour per label, assigned in order of first appearance
    unique_labels = sample_data['event_label'].unique()
    colors = plt.cm.Set1(np.linspace(0, 1, len(unique_labels)))
    label_colors = dict(zip(unique_labels, colors))

    # Highlight the samples that belong to events
    has_event_points = False
    for label in unique_labels:
        if label == 'Normal':
            continue
        label_data = sample_data[sample_data['event_label'] == label]
        if len(label_data) > 0:
            ax1.scatter(label_data.index, label_data['breathingSignal'],
                       c=[label_colors[label]], alpha=0.6, s=1, label=label)
            has_event_points = True

    # Without any labelled artist the legend would be empty
    if has_event_points:
        ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    # Plot label timeline. Categorical codes follow the sorted category order,
    # not the order of appearance, so the tick labels must come from the
    # categories; otherwise names can end up next to the wrong row.
    label_categorical = pd.Categorical(sample_data['event_label'])
    label_numeric = label_categorical.codes
    categories = list(label_categorical.categories)
    category_colors = np.array([label_colors[category] for category in categories])
    ax2.scatter(sample_data.index, label_numeric, c=category_colors[label_numeric],
               s=2, alpha=0.8)
    ax2.set_ylabel('Event Type')
    ax2.set_xlabel('Time')
    ax2.set_yticks(range(len(categories)))
    ax2.set_yticklabels(categories)
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Print event statistics for this period. The spacing of the first two
    # samples is used as the sampling interval; a single-sample window has none.
    if len(sample_data) > 1:
        sample_interval_seconds = (sample_data.index[1] - sample_data.index[0]).total_seconds()
    else:
        sample_interval_seconds = 0.0

    print(f"\nEvent statistics for {sample_hours}-hour sample:")
    event_stats = sample_data['event_label'].value_counts()
    for label, count in event_stats.items():
        duration_minutes = count * sample_interval_seconds / 60
        print(f"  {label}: {count:,} samples ({duration_minutes:.1f} minutes)")

def process_existing_events_df(events_df, target_tz=None):
    """
    Process your existing events DataFrame to add proper timestamps.

    Parameters:
    - events_df: Your existing events DataFrame with 'UnixTimestamp'
      (milliseconds since the epoch), 'Event' and 'Duration' columns. It is
      not modified.
    - target_tz: Timezone the event times are converted to. Defaults to the
      notebook-level `tz` so existing calls behave as before.

    Returns:
    - processed_events_df: Copy sorted by start time, with a fresh 0..n-1
      index and added 'start_time', 'duration_seconds' and 'end_time' columns

    Raises:
    - ValueError: if a 'Duration' value cannot be parsed as a number
    """
    print(f"Processing existing events DataFrame with {len(events_df)} events")

    if target_tz is None:
        # Backward-compatible default: the global defined in the data-loading cell
        target_tz = tz

    # Create a copy to avoid modifying original
    processed_df = events_df.copy()

    # Convert Unix timestamp to datetime (assuming milliseconds)
    processed_df['start_time'] = (
        pd.to_datetime(processed_df['UnixTimestamp'], unit='ms')
        .dt.tz_localize('UTC')
        .dt.tz_convert(target_tz)
    )

    # Parse duration (handle comma as decimal separator). Going through str
    # first also covers a 'Duration' column that was already read as numbers,
    # where the .str accessor alone would raise.
    processed_df['duration_seconds'] = (
        processed_df['Duration'].astype(str).str.replace(',', '.', regex=False).astype(float)
    )

    # Calculate end time
    processed_df['end_time'] = processed_df['start_time'] + pd.to_timedelta(processed_df['duration_seconds'], unit='s')

    # Sort by start time
    processed_df = processed_df.sort_values('start_time').reset_index(drop=True)

    print(f"Event types: {processed_df['Event'].value_counts().to_dict()}")
    print(f"Time range: {processed_df['start_time'].min()} to {processed_df['end_time'].max()}")
    print(f"Duration range: {processed_df['duration_seconds'].min():.1f}s to {processed_df['duration_seconds'].max():.1f}s")

    return processed_df

def comprehensive_precise_analysis(respeck_df, events_df, feature_extraction_func):
    """
    Run the five-step precise-event pipeline and collect its outputs.

    The steps are: process the events frame, label the respiratory data,
    analyze coverage, draw a 2-hour timeline and extract per-period features.

    Parameters:
    - respeck_df: Respiratory DataFrame
    - events_df: Existing events DataFrame
    - feature_extraction_func: Respiratory feature extraction function

    Returns:
    - dict with the keys 'labeled_df', 'events_df' (the processed events),
      'coverage_df' and 'features_list'
    """
    print("=== Comprehensive Precise Event Analysis ===")
    outputs = {}

    # Step 1: normalize the events table
    print("\n1. Processing existing events DataFrame...")
    events_processed = process_existing_events_df(events_df)

    # Step 2: attach an event label to every respiratory sample
    print("\n2. Creating labeled respiratory data...")
    labeled = create_labeled_respiratory_data(respeck_df, events_processed)
    outputs['labeled_df'] = labeled
    outputs['events_df'] = events_processed

    # Step 3: how many events actually landed on samples
    print("\n3. Analyzing event coverage...")
    outputs['coverage_df'] = analyze_event_coverage(labeled, events_processed)

    # Step 4: visual sanity check of the labels
    print("\n4. Creating timeline visualization...")
    visualize_event_timeline(labeled, sample_hours=2)

    # Step 5: per-period features
    print("\n5. Extracting features with precise labels...")
    outputs['features_list'] = extract_features_by_precise_labels(labeled, feature_extraction_func)

    return outputs

# Usage example: run the full pipeline on the frames loaded earlier in this notebook.

# NOTE(review): calculate_TS_breathFeatures is not defined in this cell; presumably it
# comes from one of the star-imports in the preceding cell — confirm.

# Run comprehensive precise analysis; the returned dict is kept in `results`
results = comprehensive_precise_analysis(
    respeck_df=respeck_df,  # Respeck DataFrame indexed by timestamp
    events_df=labels_df,    # Event labels DataFrame
    feature_extraction_func=calculate_TS_breathFeatures
)




In [None]:
# The labeled frame keeps its timestamps in the index only, while the feature
# extraction reads them from a 'timestamp' column, so expose the index as one.

print("=== Fixing Feature Extraction ===")

# Start from a copy of the labeled frame stored in `results` and attach the
# index values as a regular column in one step
labeled_df = results['labeled_df'].copy().assign(timestamp=lambda frame: frame.index)
print("✅ Added 'timestamp' column from index")

# Confirm the column is there and show the first few values
print(f"Now has 'timestamp' column: {'timestamp' in labeled_df.columns}")
print(f"Sample timestamps: {labeled_df['timestamp'].head(3).tolist()}")

# Now let's re-run the feature extraction with the corrected DataFrame
def extract_features_by_precise_labels_fixed(labeled_df, feature_extraction_func, 
                                           segment_length_minutes=5):
    """
    Extract features using precise event labels with proper timestamp handling.

    Consecutive samples sharing the same 'event_label' form one period. Periods
    with fewer than 50 samples are skipped. For each remaining period the
    extraction function is called as
    `feature_extraction_func(timestamps, breathing_values)`.

    Parameters:
    - labeled_df: DataFrame with 'event_label' and 'breathingSignal' columns.
      Timestamps come from the 'timestamp' column when present, otherwise from
      the index. Side effect: a helper column 'label_group' is written to
      this frame.
    - feature_extraction_func: Feature extraction function; expected to return
      a dict-like object or None
    - segment_length_minutes: Currently unused; kept for interface
      compatibility

    Returns:
    - List of feature dictionaries, each extended with the keys 'event_type',
      'event_time', 'period_duration' and 'sample_count'
    """
    from collections import Counter

    print(f"Extracting features using precise labels...")

    all_features = []

    # Group consecutive samples with the same label: the counter increases
    # every time the label differs from the previous row's label
    labeled_df['label_group'] = (labeled_df['event_label'] != labeled_df['event_label'].shift()).cumsum()

    has_timestamp_column = 'timestamp' in labeled_df.columns

    # Collect the periods together with the rows that belong to them
    event_periods = []
    period_frames = []
    for group_id, group_data in labeled_df.groupby('label_group'):
        if len(group_data) < 50:  # Skip very short segments
            continue

        start_time = group_data.index.min()
        end_time = group_data.index.max()

        event_periods.append({
            'event_type': group_data['event_label'].iloc[0],
            'start_time': start_time,
            'end_time': end_time,
            'duration': (end_time - start_time).total_seconds(),
            'sample_count': len(group_data)
        })
        period_frames.append(group_data)

    print(f"Found {len(event_periods)} distinct event periods")

    # Show breakdown by event type
    period_counts = Counter(period['event_type'] for period in event_periods)

    print("Periods by event type:")
    for event_type, count in sorted(period_counts.items()):
        print(f"  {event_type}: {count} periods")

    # Extract features for each period
    successful_extractions = 0
    failed_extractions = 0
    for i, (period, period_data) in enumerate(zip(event_periods, period_frames)):
        try:
            if 'breathingSignal' not in period_data.columns:
                continue

            # Prefer the explicit column; fall back to the index so a frame
            # without the column still works
            timestamp_source = period_data['timestamp'] if has_timestamp_column else period_data.index

            features = feature_extraction_func(
                timestamp_source.to_numpy(),
                period_data['breathingSignal'].values
            )

            if features is not None:
                # Add metadata
                features['event_type'] = period['event_type']
                features['event_time'] = period['start_time']
                features['period_duration'] = period['duration']
                features['sample_count'] = period['sample_count']

                all_features.append(features)
                successful_extractions += 1

                if i % 100 == 0:
                    print(f"  Processed {i+1}/{len(event_periods)} periods... ({successful_extractions} successful)")

        except Exception as e:
            # Best effort: keep going, but count every failure and print the
            # first five so later ones are not lost without a trace
            failed_extractions += 1
            if failed_extractions <= 5:
                print(f"Error processing period {i} ({period['event_type']}): {e}")
            continue

    print(f"Successfully extracted features for {successful_extractions} out of {len(event_periods)} periods")
    if failed_extractions:
        print(f"{failed_extractions} periods raised errors (only the first 5 are shown above)")

    # Show successful extractions by event type
    success_counts = Counter(features['event_type'] for features in all_features)

    print("Successful extractions by event type:")
    for event_type, count in sorted(success_counts.items()):
        print(f"  {event_type}: {count} periods")

    return all_features

# Test on a small sample first
print("\n=== Testing Feature Extraction on Sample ===")
# Copy the slice so adding a column does not write into a view of labeled_df
# (avoids pandas' chained-assignment warning)
sample_data = labeled_df.head(1000).copy()
sample_data['timestamp'] = sample_data.index

try:
    test_features = calculate_TS_breathFeatures(
        sample_data['timestamp'].to_numpy(),
        sample_data['breathingSignal'].values
    )

    if test_features is not None:
        print("✅ Feature extraction test successful!")
        print(f"Extracted features: {list(test_features.keys())}")
    else:
        print("❌ Feature extraction returned None")

except Exception as e:
    # A failure on the sample is reported but does not stop the full run below
    print(f"❌ Feature extraction failed: {e}")
    print("This might be due to signal quality or length issues")

# Now run the full feature extraction
print("\n=== Running Full Feature Extraction ===")
features_list = extract_features_by_precise_labels_fixed(
    labeled_df, 
    calculate_TS_breathFeatures
)

print(f"\n=== Results ===")
print(f"Total features extracted: {len(features_list)}")

# Update the results so later cells see the new features and the frame that
# now carries the 'timestamp' column
results['features_list'] = features_list
results['labeled_df'] = labeled_df  # Updated with timestamp column


In [None]:
# Add these functions after your existing code

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

def _summarize_feature_values(feature_name, values_clean):
    """Return event-level summary statistics for one non-empty, finite 1-D array.

    Keys are ``f'{feature_name}_<stat>'``; insertion order fixes the column
    order of the aggregated DataFrame.
    """
    mean_value = np.mean(values_clean)
    std_value = np.std(values_clean)
    min_value = np.min(values_clean)
    max_value = np.max(values_clean)
    q25 = np.percentile(values_clean, 25)
    q75 = np.percentile(values_clean, 75)
    as_series = pd.Series(values_clean)

    summary = {
        # Basic statistics
        f'{feature_name}_mean': mean_value,
        f'{feature_name}_std': std_value,
        f'{feature_name}_median': np.median(values_clean),
        f'{feature_name}_min': min_value,
        f'{feature_name}_max': max_value,
        f'{feature_name}_range': max_value - min_value,
        # Percentiles
        f'{feature_name}_q25': q25,
        f'{feature_name}_q75': q75,
        f'{feature_name}_iqr': q75 - q25,
        # Shape of the distribution (NaN when there are too few values)
        f'{feature_name}_skew': as_series.skew(),
        f'{feature_name}_kurtosis': as_series.kurtosis(),
        # Coefficient of variation; defined as 0 when the mean is exactly 0
        f'{feature_name}_cv': std_value / np.abs(mean_value) if mean_value != 0 else 0,
        # Count of valid values
        f'{feature_name}_count': len(values_clean),
    }

    # Breath-level series additionally get a linear trend and mean step change
    trend_features = ('auc_values', 'breath_durations', 'inhalation_durations', 'exhalation_durations')
    if feature_name in trend_features and len(values_clean) >= 2:
        positions = np.arange(len(values_clean))
        summary[f'{feature_name}_trend'] = np.polyfit(positions, values_clean, 1)[0]
        summary[f'{feature_name}_rate_change'] = np.mean(np.diff(values_clean))

    return summary


def aggregate_breath_features(features_list):
    """
    Convert list of feature dictionaries to aggregated DataFrame for analysis.
    Handles time-series data by computing statistical summaries.

    Parameters:
    - features_list: list of dicts; each must contain 'event_type' and
      'event_time', may contain 'period_duration' / 'sample_count', and any
      number of further entries holding either a sequence of numbers or a
      single number.

    Returns:
    - DataFrame with one row per input dict. Sequence entries are expanded to
      '<name>_mean', '<name>_std', ... columns; finite scalar entries (Python
      or numpy numbers) are copied under their own name. An empty DataFrame is
      returned for an empty input list.
    """
    print(f"=== Aggregation Debug Info ===")
    print(f"Number of feature dictionaries: {len(features_list)}")
    
    if len(features_list) == 0:
        print("❌ ERROR: features_list is empty!")
        return pd.DataFrame()
    
    # Show what the first feature dictionary looks like
    print(f"Keys in first feature dict: {list(features_list[0].keys())}")
    print(f"Sample event_type: {features_list[0].get('event_type', 'MISSING')}")
    
    # Entries that describe the period itself or carry raw signals; never summarised
    skipped_keys = ('event_type', 'event_time', 'period_duration', 'sample_count', 'timestamp', 'breathingSignal')
    # numpy integer / float32 scalars are not subclasses of int / float, so they
    # are listed explicitly; otherwise such features would be dropped silently.
    scalar_types = (int, float, np.integer, np.floating)
    
    aggregated_features = []
    
    for features in features_list:
        event_features = {
            'event_type': features['event_type'],
            'event_time': features['event_time'],
            'period_duration': features.get('period_duration', np.nan),
            'sample_count': features.get('sample_count', np.nan)
        }
        
        for feature_name, values in features.items():
            if feature_name in skipped_keys:
                continue
            
            # Unwrap pandas containers to their underlying array
            if hasattr(values, 'values'):
                values = values.values
            
            if isinstance(values, (list, np.ndarray, pd.Series)):
                try:
                    values_array = np.array(values, dtype=float)
                    # Drop NaN and infinite values before computing statistics
                    values_clean = values_array[np.isfinite(values_array)]
                    if len(values_clean) > 0:
                        event_features.update(_summarize_feature_values(feature_name, values_clean))
                except (ValueError, TypeError):
                    # Best effort: entries that cannot be read as numbers
                    # (e.g. strings, ragged lists) are left out of the row
                    continue
                    
            elif isinstance(values, scalar_types) and np.isfinite(values):
                # Single value features are copied through unchanged
                event_features[feature_name] = values
        
        aggregated_features.append(event_features)
    
    return pd.DataFrame(aggregated_features)

def statistical_comparison_multi_event(features_df):
    """
    Perform statistical comparisons between each event type and normal.

    For every numeric column (except 'period_duration' and 'sample_count') and
    every non-'Normal' event type, runs Welch's t-test against the 'Normal'
    rows and computes Cohen's d with a pooled standard deviation. A feature is
    only compared when both groups have more than one non-null value.

    Parameters:
    - features_df: DataFrame with an 'event_type' column and numeric feature columns

    Returns:
    - DataFrame with one row per (event_type, feature) comparison, or None when
      there are no 'Normal' rows. The frame always carries the full set of
      result columns, even when no comparison could be made.
    """
    if 'Normal' not in features_df['event_type'].values:
        print("No 'Normal' samples found for comparison")
        return None
    
    # Get numerical columns
    numerical_cols = features_df.select_dtypes(include=[np.number]).columns
    numerical_cols = [col for col in numerical_cols if col not in ['period_duration', 'sample_count']]
    
    # Get normal data
    normal_data = features_df[features_df['event_type'] == 'Normal']
    
    comparison_results = []
    event_types = [et for et in features_df['event_type'].unique() if et != 'Normal']
    
    print(f"\n=== Statistical Comparisons vs Normal (n={len(normal_data)}) ===")
    
    for event_type in event_types:
        event_data = features_df[features_df['event_type'] == event_type]
        print(f"\n{event_type} (n={len(event_data)}) vs Normal:")
        
        for col in numerical_cols:
            normal_vals = normal_data[col].dropna()
            event_vals = event_data[col].dropna()
            
            if len(normal_vals) > 1 and len(event_vals) > 1:
                # Welch's t-test (does not assume equal variances)
                t_stat, p_value = stats.ttest_ind(event_vals, normal_vals, equal_var=False)
                
                event_mean, normal_mean = event_vals.mean(), normal_vals.mean()
                
                # Effect size (Cohen's d) on the pooled standard deviation;
                # reported as 0 when the pooled deviation is 0 or undefined
                pooled_std = np.sqrt(((len(event_vals) - 1) * event_vals.var() + 
                                    (len(normal_vals) - 1) * normal_vals.var()) / 
                                   (len(event_vals) + len(normal_vals) - 2))
                cohens_d = (event_mean - normal_mean) / pooled_std if pooled_std > 0 else 0
                
                comparison_results.append({
                    'event_type': event_type,
                    'feature': col,
                    'event_mean': event_mean,
                    'event_std': event_vals.std(),
                    'normal_mean': normal_mean,
                    'normal_std': normal_vals.std(),
                    'mean_difference': event_mean - normal_mean,
                    't_statistic': t_stat,
                    'p_value': p_value,
                    'cohens_d': cohens_d,
                    'significant': p_value < 0.05,
                    'effect_size_magnitude': 'Large' if abs(cohens_d) > 0.8 else 'Medium' if abs(cohens_d) > 0.5 else 'Small'
                })
    
    # Explicit schema: with zero comparisons a plain DataFrame([]) has no
    # columns at all and every lookup below (and in callers) raises KeyError.
    # The dtypes are pinned so an empty frame still supports boolean masking.
    float_columns = ['event_mean', 'event_std', 'normal_mean', 'normal_std',
                     'mean_difference', 't_statistic', 'p_value', 'cohens_d']
    result_columns = ['event_type', 'feature'] + float_columns + ['significant', 'effect_size_magnitude']
    column_dtypes = {name: float for name in float_columns}
    column_dtypes['significant'] = bool
    comparison_df = pd.DataFrame(comparison_results, columns=result_columns).astype(column_dtypes)
    
    # Show significant results for each event type
    for event_type in event_types:
        event_results = comparison_df[comparison_df['event_type'] == event_type]
        significant_results = event_results[event_results['significant']].sort_values('p_value')
        
        if len(significant_results) > 0:
            print(f"\nSignificant features for {event_type} ({len(significant_results)} total):")
            for _, row in significant_results.head(5).iterrows():
                direction = "↑" if row['mean_difference'] > 0 else "↓"
                print(f"  {direction} {row['feature']}: p={row['p_value']:.4f}, "
                      f"effect={row['cohens_d']:.3f} ({row['effect_size_magnitude']})")
    
    return comparison_df

def plot_event_comparisons(features_df, comparison_features=None):
    """
    Create comprehensive visualizations comparing all event types vs normal.

    Draws one box plot per feature (grouped by 'event_type') on a 3-column
    grid and annotates every x tick with the number of rows of that event type.

    Parameters:
    - features_df: DataFrame with an 'event_type' column and the feature columns
    - comparison_features: optional list of column names to plot; when None the
      12 numeric columns with the highest variance are used

    Returns:
    - None. Shows the figure; prints a message and returns early when there is
      nothing to plot.
    """
    if comparison_features is None:
        # Select top features based on variance
        numeric_cols = features_df.select_dtypes(include=[np.number]).columns
        feature_variance = features_df[numeric_cols].var().sort_values(ascending=False)
        comparison_features = feature_variance.head(12).index.tolist()
    else:
        comparison_features = list(comparison_features)
    
    # Get event type counts
    event_counts = features_df['event_type'].value_counts()
    print(f"Sample sizes: {event_counts.to_dict()}")
    
    n_features = len(comparison_features)
    if n_features == 0:
        # plt.subplots() rejects a grid with zero rows
        print("No features available to plot")
        return
    
    n_cols = 3
    n_rows = (n_features + n_cols - 1) // n_cols
    
    # squeeze=False always yields a 2-D array, so one flatten covers every grid shape
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows), squeeze=False)
    axes = axes.ravel()
    
    for ax, feature in zip(axes, comparison_features):
        # Box plot with all event types
        sns.boxplot(data=features_df, x='event_type', y=feature, ax=ax)
        ax.set_title(f'{feature}')
        ax.set_xlabel('')
        
        # Relabel the ticks once: event name plus its sample size, rotated for legibility
        tick_names = [label.get_text() for label in ax.get_xticklabels()]
        ax.set_xticklabels(
            [f'{name}\n(n={event_counts.get(name, 0)})' for name in tick_names],
            rotation=45, ha='right'
        )
    
    # Hide the unused cells of the grid
    for ax in axes[n_features:]:
        ax.set_visible(False)
    
    plt.tight_layout()
    plt.show()

def plot_top_significant_features(comparison_df, features_df, top_n=10):
    """
    Plot the top most significant features across all event types.

    Takes the rows of comparison_df flagged 'significant', keeps the top_n with
    the largest absolute Cohen's d (so strong decreases rank alongside strong
    increases), and draws one 'Normal' vs event-type box plot per row.

    Parameters:
    - comparison_df: DataFrame with 'event_type', 'feature', 'significant',
      'cohens_d', 'mean_difference' and 'p_value' columns, or None
    - features_df: DataFrame with an 'event_type' column and the feature columns
    - top_n: maximum number of (event_type, feature) pairs to plot

    Returns:
    - None. Shows the figure, or prints a message when there is nothing to plot.
    """
    if comparison_df is None or comparison_df.empty:
        print("No comparison results available")
        return
    
    # Rank by magnitude of the effect; ranking on the signed value would never
    # select features that are strongly *reduced* during an event
    significant_features = (
        comparison_df[comparison_df['significant']]
        .assign(abs_cohens_d=lambda df: df['cohens_d'].abs())
        .nlargest(top_n, 'abs_cohens_d')
    )
    
    if len(significant_features) == 0:
        print("No significant features found")
        return
    
    print(f"Plotting top {len(significant_features)} most significant features:")
    
    # Create subplots
    n_features = len(significant_features)
    n_cols = 2
    n_rows = (n_features + n_cols - 1) // n_cols
    
    # squeeze=False always yields a 2-D array, so one flatten covers every grid shape
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    axes = axes.ravel()
    
    for ax, (_, row) in zip(axes, significant_features.iterrows()):
        feature = row['feature']
        event_type = row['event_type']
        
        # Restrict to the two groups being compared; fixed order keeps
        # 'Normal' on the left in every panel
        plot_data = features_df[features_df['event_type'].isin(['Normal', event_type])]
        sns.boxplot(data=plot_data, x='event_type', y=feature, ax=ax,
                    order=['Normal', event_type])
        
        # Add statistics to title
        direction = "↑" if row['mean_difference'] > 0 else "↓"
        ax.set_title(f'{direction} {feature} ({event_type})\n'
                     f'p={row["p_value"]:.4f}, effect={row["cohens_d"]:.3f}')
        ax.set_xlabel('')
    
    # Hide the unused cells of the grid
    for ax in axes[n_features:]:
        ax.set_visible(False)
    
    plt.tight_layout()
    plt.show()

# Drive the multi-event analysis: aggregate -> compare vs Normal -> plot -> save CSVs.
# Nothing is computed or written when the extraction step produced no features.
if len(features_list) == 0:
    print("❌ No features were extracted. Check the error messages above.")

else:
    print("\n=== Running Analysis ===")

    # One row of summary statistics per extracted period
    features_df = aggregate_breath_features(features_list)
    print(f"Created aggregated features DataFrame: {features_df.shape}")

    # Returns None when there are no 'Normal' periods to compare against
    comparison_results = statistical_comparison_multi_event(features_df)

    # Plots are only drawn when a comparison exists
    if comparison_results is not None:
        print("\nCreating visualizations...")
        plot_event_comparisons(features_df)
        plot_top_significant_features(comparison_results, features_df, top_n=8)

    # Persist the aggregated features, and the comparison table when there is one
    features_df.to_csv('aggregated_respiratory_features_fixed.csv', index=False)
    if comparison_results is not None:
        comparison_results.to_csv('statistical_comparison_results_fixed.csv', index=False)

    print("✅ Analysis complete!")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

def create_osa_binary_classification(features_df):
    """
    Create binary OSA vs Non-OSA classification from your features DataFrame.

    Parameters:
    - features_df: DataFrame with an 'event_type' column (not modified)

    Returns:
    - Copy of features_df with two extra columns: 'is_osa' (1 where event_type
      is 'Obstructive Apnea', else 0) and 'binary_label' ('OSA' / 'Non-OSA').
      An empty input yields an empty frame with the same extra columns.
    """
    print("=== Creating OSA Binary Classification ===")
    
    # Create binary labels: OSA vs Non-OSA
    features_df_binary = features_df.copy()
    features_df_binary['is_osa'] = (features_df_binary['event_type'] == 'Obstructive Apnea').astype(int)
    features_df_binary['binary_label'] = features_df_binary['is_osa'].map({1: 'OSA', 0: 'Non-OSA'})
    
    # Show class distribution
    class_counts = features_df_binary['binary_label'].value_counts()
    n_total = len(features_df_binary)
    n_osa = class_counts.get('OSA', 0)
    # Guard the percentage: an empty frame would otherwise divide by zero
    osa_percentage = (n_osa / n_total * 100) if n_total > 0 else 0.0
    print(f"Class distribution:")
    print(f"  OSA: {n_osa} samples")
    print(f"  Non-OSA: {class_counts.get('Non-OSA', 0)} samples")
    print(f"  OSA percentage: {osa_percentage:.1f}%")
    
    # Show what's included in Non-OSA
    non_osa_breakdown = features_df[features_df['event_type'] != 'Obstructive Apnea']['event_type'].value_counts()
    print(f"\nNon-OSA breakdown:")
    for event_type, count in non_osa_breakdown.items():
        print(f"  {event_type}: {count}")
    
    return features_df_binary

def statistical_comparison_osa_vs_rest(features_df_binary):
    """
    Statistical comparison: OSA vs Non-OSA (everything else).

    For every numeric column (except 'period_duration', 'sample_count' and
    'is_osa') runs Welch's t-test between the 'OSA' and 'Non-OSA' rows and
    computes Cohen's d with a pooled standard deviation. A feature is only
    compared when both groups have more than one non-null value.

    Parameters:
    - features_df_binary: DataFrame with a 'binary_label' column holding
      'OSA' / 'Non-OSA' and numeric feature columns

    Returns:
    - DataFrame with one row per compared feature. The frame always carries
      the full set of result columns, even when no feature could be compared.
    """
    print("\n=== OSA vs Non-OSA Statistical Analysis ===")
    
    # Get numerical columns
    numerical_cols = features_df_binary.select_dtypes(include=[np.number]).columns
    numerical_cols = [col for col in numerical_cols if col not in ['period_duration', 'sample_count', 'is_osa']]
    
    # Separate OSA vs Non-OSA
    osa_data = features_df_binary[features_df_binary['binary_label'] == 'OSA']
    non_osa_data = features_df_binary[features_df_binary['binary_label'] == 'Non-OSA']
    
    print(f"OSA samples: {len(osa_data)}")
    print(f"Non-OSA samples: {len(non_osa_data)}")
    
    comparison_results = []
    
    for col in numerical_cols:
        osa_vals = osa_data[col].dropna()
        non_osa_vals = non_osa_data[col].dropna()
        
        if len(osa_vals) > 1 and len(non_osa_vals) > 1:
            # Welch's t-test (does not assume equal variances)
            t_stat, p_value = stats.ttest_ind(osa_vals, non_osa_vals, equal_var=False)
            
            osa_mean, non_osa_mean = osa_vals.mean(), non_osa_vals.mean()
            
            # Effect size (Cohen's d) on the pooled standard deviation;
            # reported as 0 when the pooled deviation is 0 or undefined
            pooled_std = np.sqrt(((len(osa_vals) - 1) * osa_vals.var() + 
                                (len(non_osa_vals) - 1) * non_osa_vals.var()) / 
                               (len(osa_vals) + len(non_osa_vals) - 2))
            cohens_d = (osa_mean - non_osa_mean) / pooled_std if pooled_std > 0 else 0
            
            comparison_results.append({
                'feature': col,
                'osa_mean': osa_mean,
                'osa_std': osa_vals.std(),
                'non_osa_mean': non_osa_mean,
                'non_osa_std': non_osa_vals.std(),
                'mean_difference': osa_mean - non_osa_mean,
                't_statistic': t_stat,
                'p_value': p_value,
                'cohens_d': cohens_d,
                'significant': p_value < 0.05,
                'effect_size_magnitude': 'Large' if abs(cohens_d) > 0.8 else 'Medium' if abs(cohens_d) > 0.5 else 'Small'
            })
    
    # Explicit schema: with zero comparisons a plain DataFrame([]) has no
    # columns at all and the 'significant' lookup below raises KeyError.
    # The dtypes are pinned so an empty frame still supports boolean masking.
    float_columns = ['osa_mean', 'osa_std', 'non_osa_mean', 'non_osa_std',
                     'mean_difference', 't_statistic', 'p_value', 'cohens_d']
    result_columns = ['feature'] + float_columns + ['significant', 'effect_size_magnitude']
    column_dtypes = {name: float for name in float_columns}
    column_dtypes['significant'] = bool
    comparison_df = pd.DataFrame(comparison_results, columns=result_columns).astype(column_dtypes)
    
    # Report the significant features, smallest p-value first
    significant_results = comparison_df[comparison_df['significant']].sort_values('p_value')
    
    print(f"\nSignificant features for OSA detection ({len(significant_results)} total):")
    for _, row in significant_results.head(10).iterrows():
        direction = "↑" if row['mean_difference'] > 0 else "↓"
        print(f"  {direction} {row['feature']}: p={row['p_value']:.4f}, "
              f"effect={row['cohens_d']:.3f} ({row['effect_size_magnitude']})")
    
    return comparison_df

def plot_osa_classification_features(features_df_binary, comparison_df, top_n=12):
    """
    Plot top discriminative features for OSA classification.

    Draws one 'Non-OSA' vs 'OSA' box plot per selected feature on a 3-column
    grid. Features are the significant rows of comparison_df with the largest
    absolute Cohen's d; when no comparison table is supplied (None or empty)
    the numeric columns with the highest variance are used instead.

    Parameters:
    - features_df_binary: DataFrame with a 'binary_label' column holding
      'OSA' / 'Non-OSA' and the feature columns
    - comparison_df: DataFrame with 'feature', 'significant', 'cohens_d',
      'mean_difference' and 'p_value' columns, or None
    - top_n: maximum number of features to plot

    Returns:
    - None. Shows the figure, or prints a message when there is nothing to plot.
    """
    print(f"\n=== Plotting Top {top_n} OSA Classification Features ===")
    
    have_stats = comparison_df is not None and not comparison_df.empty
    
    if have_stats:
        # Top significant features, ranked by magnitude of the effect
        significant_features = comparison_df[comparison_df['significant']].copy()
        significant_features['abs_cohens_d'] = significant_features['cohens_d'].abs()
        significant_features = significant_features.nlargest(top_n, 'abs_cohens_d')
        feature_names = significant_features['feature'].tolist()
    else:
        # Fallback to variance-based selection; the label and bookkeeping
        # columns are excluded because they are not candidate features
        numeric_cols = features_df_binary.select_dtypes(include=[np.number]).columns
        numeric_cols = [col for col in numeric_cols if col not in ['period_duration', 'sample_count', 'is_osa']]
        feature_variance = features_df_binary[numeric_cols].var().sort_values(ascending=False)
        feature_names = feature_variance.head(top_n).index.tolist()
    
    n_features = len(feature_names)
    if n_features == 0:
        # plt.subplots() rejects a grid with zero rows
        print("No features available to plot")
        return
    
    # The box order is pinned so the hand-written tick labels below always
    # sit under the matching box, whichever class appears first in the data
    class_order = ['Non-OSA', 'OSA']
    class_sizes = features_df_binary['binary_label'].value_counts()
    tick_labels = [f'{name}\n(n={class_sizes.get(name, 0)})' for name in class_order]
    
    n_cols = 3
    n_rows = (n_features + n_cols - 1) // n_cols
    
    # squeeze=False always yields a 2-D array, so one flatten covers every grid shape
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows), squeeze=False)
    axes = axes.ravel()
    
    for ax, feature in zip(axes, feature_names):
        # Box plot: OSA vs Non-OSA
        sns.boxplot(data=features_df_binary, x='binary_label', y=feature, ax=ax,
                    order=class_order)
        
        # Add statistics to title if available
        title = feature
        if have_stats:
            feature_stats = comparison_df[comparison_df['feature'] == feature]
            if not feature_stats.empty:
                row = feature_stats.iloc[0]
                direction = "↑" if row['mean_difference'] > 0 else "↓"
                title = f'{direction} {feature}\np={row["p_value"]:.4f}, effect={row["cohens_d"]:.3f}'
        ax.set_title(title)
        
        ax.set_xlabel('')
        
        # Add sample sizes
        ax.set_xticklabels(tick_labels)
    
    # Hide the unused cells of the grid
    for ax in axes[n_features:]:
        ax.set_visible(False)
    
    plt.tight_layout()
    plt.show()

def machine_learning_osa_classification(features_df_binary):
    """
    Build and evaluate machine learning models for OSA classification.

    Trains a class-weighted random forest on a stratified 70/30 split of the
    numeric feature columns (missing values filled with 0), cross-validates on
    the training part, reports test metrics and plots the ROC curve and the
    15 most important features.

    Parameters:
    - features_df_binary: DataFrame with an 'is_osa' column (0/1) and numeric
      feature columns; 'period_duration' and 'sample_count' are not used as
      features

    Returns:
    - dict with 'model', 'scaler', 'feature_importance', 'cv_scores',
      'auc_score' and 'confusion_matrix', or None when the data cannot be
      split into two classes. 'cv_scores' is an empty array when the training
      set is too small to cross-validate.
    """
    print("\n=== Machine Learning OSA Classification ===")
    
    # Prepare features and labels
    feature_cols = features_df_binary.select_dtypes(include=[np.number]).columns
    feature_cols = [col for col in feature_cols if col not in ['period_duration', 'sample_count', 'is_osa']]
    
    X = features_df_binary[feature_cols].fillna(0)
    y = features_df_binary['is_osa']
    
    print(f"Features used: {len(feature_cols)}")
    print(f"Total samples: {len(X)}")
    print(f"OSA samples: {y.sum()}")
    print(f"Non-OSA samples: {len(y) - y.sum()}")
    
    if len(np.unique(y)) < 2:
        print("❌ Cannot perform classification: only one class present.")
        return None
    
    if y.value_counts().min() < 2:
        # A stratified split needs at least two members of every class
        print("❌ Cannot perform classification: each class needs at least 2 samples.")
        return None
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )
    
    # Scale features (fitted on the training part only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Train Random Forest
    rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
    rf.fit(X_train_scaled, y_train)
    
    # Cross-validation. Stratified folds cannot outnumber the members of the
    # smallest class, so the fold count is capped by the minority class size.
    n_folds = int(min(5, y_train.value_counts().min()))
    if n_folds >= 2:
        cv_scores = cross_val_score(rf, X_train_scaled, y_train, cv=n_folds, scoring='roc_auc')
        print(f"\nCross-validation AUC scores: {cv_scores}")
        print(f"Mean CV AUC: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")
    else:
        cv_scores = np.array([])
        print("\nSkipping cross-validation: fewer than 2 minority-class samples in the training set")
    
    # Test performance
    y_pred = rf.predict(X_test_scaled)
    y_pred_proba = rf.predict_proba(X_test_scaled)[:, 1]
    
    print(f"\n=== Test Set Performance ===")
    print(classification_report(y_test, y_pred, labels=[0, 1], target_names=['Non-OSA', 'OSA']))
    
    # AUC Score
    auc_score = roc_auc_score(y_test, y_pred_proba)
    print(f"AUC Score: {auc_score:.3f}")
    
    # Confusion Matrix; labels are pinned so the matrix is always 2x2 and the
    # cell indexing below is valid
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    print(f"\nConfusion Matrix:")
    print(f"                Predicted")
    print(f"Actual    Non-OSA    OSA")
    print(f"Non-OSA      {cm[0,0]:3d}    {cm[0,1]:3d}")
    print(f"OSA          {cm[1,0]:3d}    {cm[1,1]:3d}")
    
    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': rf.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print(f"\nTop 10 Most Important Features for OSA Classification:")
    for _, row in feature_importance.head(10).iterrows():
        print(f"  {row['feature']}: {row['importance']:.4f}")
    
    # Plot ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {auc_score:.3f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve for OSA Classification')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()
    
    # Plot feature importance (most important at the top)
    plt.figure(figsize=(12, 8))
    top_features = feature_importance.head(15)
    plt.barh(range(len(top_features)), top_features['importance'])
    plt.yticks(range(len(top_features)), top_features['feature'])
    plt.xlabel('Feature Importance')
    plt.title('Top 15 Features for OSA Classification')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()
    
    return {
        'model': rf,
        'scaler': scaler,
        'feature_importance': feature_importance,
        'cv_scores': cv_scores,
        'auc_score': auc_score,
        'confusion_matrix': cm
    }

# OSA vs Non-OSA pipeline for the current features_df: label -> compare -> plot -> save
print("=== OSA BINARY CLASSIFICATION ANALYSIS ===")

# Step 1: add the 'is_osa' / 'binary_label' columns
features_df_binary = create_osa_binary_classification(features_df)

# Step 2: per-feature OSA vs Non-OSA statistics
comparison_results = statistical_comparison_osa_vs_rest(features_df_binary)

# Step 3: box plots of the most discriminative features
plot_osa_classification_features(features_df_binary, comparison_results, top_n=12)

# Step 4 (currently disabled): model training
# ml_results = machine_learning_osa_classification(features_df_binary)

# Step 5: write the labelled data and the comparison table to CSV
print("\n=== Saving Results ===")
features_df_binary.to_csv('osa_binary_classification_data.csv', index=False)
if comparison_results is not None:
    comparison_results.to_csv('osa_vs_rest_statistical_comparison.csv', index=False)

# Closing summary, one message per line
print(
    "\n✅ OSA Classification Analysis Complete!",
    "Key files saved:",
    "  - osa_binary_classification_data.csv",
    "  - osa_vs_rest_statistical_comparison.csv",
    sep="\n",
)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
import os
import glob
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

class MultiNightOSAAnalyzer:
    """
    Comprehensive multi-night OSA analysis framework.
    """
    
    def __init__(self, data_directory):
        """
        Initialize the analyzer with a directory containing multiple nights of data.
        
        Parameters:
        - data_directory: Path to directory containing night-specific data files
        """
        self.data_directory = data_directory
        self.night_results = {}
        self.aggregated_features = None
        self.aggregated_comparison = None
        self.final_model = None
        
    def process_single_night(self, respeck_data, events_data, night_id, feature_extraction_func):
        """
        Process a single night's data using your existing pipeline.
        
        Parameters:
        - respeck_data: DataFrame with respiratory data for this night
        - events_data: DataFrame with events for this night
        - night_id: Identifier for this night (e.g., date, patient_night_1, etc.)
        - feature_extraction_func: Your feature extraction function
        
        Returns:
        - night_results: Dictionary with results for this night
        """
        print(f"\n=== Processing Night: {night_id} ===")
        
        try:
            # Use your existing pipeline functions
            # Step 1: Process events
            processed_events_df = self.process_existing_events_df(events_data)
            
            # Step 2: Create labeled respiratory data
            labeled_df = self.create_labeled_respiratory_data(respeck_data, processed_events_df)
            
            # Step 3: Add timestamp column
            labeled_df['timestamp'] = labeled_df.index
            
            # Step 4: Extract features
            features_list = self.extract_features_by_precise_labels_fixed(
                labeled_df, feature_extraction_func
            )
            
            # Step 5: Aggregate features
            if len(features_list) > 0:
                features_df = self.aggregate_breath_features(features_list)
                
                # Add night identifier
                features_df['night_id'] = night_id
                
                # Create binary OSA classification
                features_df_binary = self.create_osa_binary_classification(features_df)
                
                night_result = {
                    'night_id': night_id,
                    'labeled_df': labeled_df,
                    'features_df': features_df_binary,
                    'n_samples': len(features_df_binary),
                    'n_osa': len(features_df_binary[features_df_binary['binary_label'] == 'OSA']),
                    'n_non_osa': len(features_df_binary[features_df_binary['binary_label'] == 'Non-OSA']),
                    'success': True
                }
                
                print(f"✅ Night {night_id}: {len(features_df_binary)} samples "
                      f"({night_result['n_osa']} OSA, {night_result['n_non_osa']} Non-OSA)")
                
                return night_result
                
            else:
                print(f"❌ Night {night_id}: No features extracted")
                return {'night_id': night_id, 'success': False, 'error': 'No features extracted'}
                
        except Exception as e:
            print(f"❌ Night {night_id}: Error - {e}")
            return {'night_id': night_id, 'success': False, 'error': str(e)}
    
    def process_multiple_nights(self, night_data_list, feature_extraction_func):
        """
        Process multiple nights of data.
        
        Parameters:
        - night_data_list: List of dictionaries, each containing:
          {'night_id': str, 'respeck_data': DataFrame, 'events_data': DataFrame}
        - feature_extraction_func: Your feature extraction function
        
        Returns:
        - processing_summary: Summary of processing results
        """
        print("=== MULTI-NIGHT OSA ANALYSIS ===")
        print(f"Processing {len(night_data_list)} nights of data...")
        
        successful_nights = []
        failed_nights = []
        
        for night_data in night_data_list:
            result = self.process_single_night(
                night_data['respeck_data'],
                night_data['events_data'], 
                night_data['night_id'],
                feature_extraction_func
            )
            
            if result['success']:
                successful_nights.append(result)
                self.night_results[result['night_id']] = result
            else:
                failed_nights.append(result)
        
        print(f"\n=== Processing Summary ===")
        print(f"Successful nights: {len(successful_nights)}")
        print(f"Failed nights: {len(failed_nights)}")
        
        if failed_nights:
            print(f"Failed night details:")
            for failed in failed_nights:
                print(f"  {failed['night_id']}: {failed['error']}")
        
        return {
            'successful_nights': len(successful_nights),
            'failed_nights': len(failed_nights),
            'successful_results': successful_nights,
            'failed_results': failed_nights
        }
    
    def aggregate_all_nights(self):
        """
        Aggregate features from all successfully processed nights.
        """
        print("\n=== Aggregating Multi-Night Data ===")
        
        if not self.night_results:
            print("❌ No night results to aggregate")
            return None
        
        # Combine all features DataFrames
        all_features = []
        night_summary = []
        
        for night_id, night_result in self.night_results.items():
            features_df = night_result['features_df']
            all_features.append(features_df)
            
            night_summary.append({
                'night_id': night_id,
                'total_samples': night_result['n_samples'],
                'osa_samples': night_result['n_osa'],
                'non_osa_samples': night_result['n_non_osa'],
                'osa_percentage': (night_result['n_osa'] / night_result['n_samples']) * 100
            })
        
        # Concatenate all features
        self.aggregated_features = pd.concat(all_features, ignore_index=True)
        
        print(f"Aggregated data summary:")
        print(f"  Total nights: {len(self.night_results)}")
        print(f"  Total samples: {len(self.aggregated_features)}")
        
        # Show class distribution
        class_counts = self.aggregated_features['binary_label'].value_counts()
        print(f"  OSA samples: {class_counts.get('OSA', 0)}")
        print(f"  Non-OSA samples: {class_counts.get('Non-OSA', 0)}")
        print(f"  OSA percentage: {(class_counts.get('OSA', 0) / len(self.aggregated_features) * 100):.1f}%")
        
        # Show per-night breakdown
        night_summary_df = pd.DataFrame(night_summary)
        print(f"\nPer-night breakdown:")
        print(night_summary_df.to_string(index=False))
        
        return self.aggregated_features
    
    def statistical_analysis_aggregated(self):
        """
        Perform statistical analysis on aggregated multi-night data.
        """
        print("\n=== Multi-Night Statistical Analysis ===")
        
        if self.aggregated_features is None:
            print("❌ No aggregated features available. Run aggregate_all_nights() first.")
            return None
        
        # Statistical comparison
        self.aggregated_comparison = self.statistical_comparison_osa_vs_rest(self.aggregated_features)
        
        return self.aggregated_comparison
    
    def build_final_model(self, test_size=0.2, cv_folds=10):
        """
        Build final OSA classification model using all aggregated data.
        """
        print(f"\n=== Building Final OSA Classification Model ===")
        
        if self.aggregated_features is None:
            print("❌ No aggregated features available")
            return None
        
        # Prepare features and labels
        feature_cols = self.aggregated_features.select_dtypes(include=[np.number]).columns
        feature_cols = [col for col in feature_cols if col not in ['period_duration', 'sample_count', 'is_osa']]
        
        X = self.aggregated_features[feature_cols].fillna(0)
        y = self.aggregated_features['is_osa']
        
        print(f"Model training summary:")
        print(f"  Features: {len(feature_cols)}")
        print(f"  Total samples: {len(X)}")
        print(f"  OSA samples: {y.sum()}")
        print(f"  Non-OSA samples: {len(y) - y.sum()}")
        print(f"  Nights included: {self.aggregated_features['night_id'].nunique()}")
        
        # Stratified split to maintain class balance
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=y
        )
        
        # Scale features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        
        # Train Random Forest with optimized parameters
        rf = RandomForestClassifier(
            n_estimators=200,
            max_depth=10,
            min_samples_split=5,
            min_samples_leaf=2,
            random_state=42,
            class_weight='balanced',
            n_jobs=-1
        )
        rf.fit(X_train_scaled, y_train)
        
        # Stratified cross-validation
        skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
        cv_scores = cross_val_score(rf, X_train_scaled, y_train, cv=skf, scoring='roc_auc')
        
        print(f"\n{cv_folds}-Fold Cross-Validation Results:")
        print(f"  AUC scores: {cv_scores}")
        print(f"  Mean AUC: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")
        
        # Test performance
        y_pred = rf.predict(X_test_scaled)
        y_pred_proba = rf.predict_proba(X_test_scaled)[:, 1]
        
        test_auc = roc_auc_score(y_test, y_pred_proba)
        
        print(f"\nTest Set Performance:")
        print(f"  AUC: {test_auc:.3f}")
        print(classification_report(y_test, y_pred, target_names=['Non-OSA', 'OSA']))
        
        # Feature importance
        feature_importance = pd.DataFrame({
            'feature': feature_cols,
            'importance': rf.feature_importances_
        }).sort_values('importance', ascending=False)
        
        print(f"\nTop 10 Most Important Features:")
        for _, row in feature_importance.head(10).iterrows():
            print(f"  {row['feature']}: {row['importance']:.4f}")
        
        # Store final model
        self.final_model = {
            'model': rf,
            'scaler': scaler,
            'feature_importance': feature_importance,
            'cv_scores': cv_scores,
            'test_auc': test_auc,
            'feature_cols': feature_cols,
            'confusion_matrix': confusion_matrix(y_test, y_pred)
        }
        
        return self.final_model
    
    def plot_multi_night_results(self):
        """
        Create comprehensive visualizations for multi-night analysis.
        """
        print("\n=== Creating Multi-Night Visualizations ===")
        
        if self.aggregated_features is None:
            print("❌ No aggregated features available")
            return
        
        # 1. Per-night OSA distribution
        plt.figure(figsize=(15, 8))
        
        plt.subplot(2, 3, 1)
        night_counts = self.aggregated_features.groupby(['night_id', 'binary_label']).size().unstack(fill_value=0)
        night_counts.plot(kind='bar', ax=plt.gca(), color=['lightblue', 'salmon'])
        plt.title('OSA Distribution by Night')
        plt.xlabel('Night ID')
        plt.ylabel('Sample Count')
        plt.legend(['Non-OSA', 'OSA'])
        plt.xticks(rotation=45)
        
        # 2. Overall class distribution
        plt.subplot(2, 3, 2)
        class_counts = self.aggregated_features['binary_label'].value_counts()
        plt.pie(class_counts.values, labels=class_counts.index, autopct='%1.1f%%', colors=['lightblue', 'salmon'])
        plt.title('Overall OSA Distribution')
        
        # 3. Top discriminative features (if comparison available)
        if self.aggregated_comparison is not None:
            plt.subplot(2, 3, 3)
            top_features = self.aggregated_comparison[self.aggregated_comparison['significant']].nlargest(10, lambda x: abs(x)['cohens_d'])
            plt.barh(range(len(top_features)), top_features['cohens_d'])
            plt.yticks(range(len(top_features)), [f[:20] + '...' if len(f) > 20 else f for f in top_features['feature']])
            plt.xlabel('Effect Size (Cohen\'s d)')
            plt.title('Top 10 Discriminative Features')
        
        # 4. Model performance (if model available)
        if self.final_model is not None:
            plt.subplot(2, 3, 4)
            cv_scores = self.final_model['cv_scores']
            plt.boxplot(cv_scores)
            plt.ylabel('AUC Score')
            plt.title(f'Cross-Validation Performance\n(Mean: {cv_scores.mean():.3f})')
            
            # 5. Feature importance
            plt.subplot(2, 3, 5)
            top_importance = self.final_model['feature_importance'].head(10)
            plt.barh(range(len(top_importance)), top_importance['importance'])
            plt.yticks(range(len(top_importance)), [f[:20] + '...' if len(f) > 20 else f for f in top_importance['feature']])
            plt.xlabel('Importance')
            plt.title('Top 10 Feature Importance')
            
            # 6. Confusion matrix
            plt.subplot(2, 3, 6)
            cm = self.final_model['confusion_matrix']
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                       xticklabels=['Non-OSA', 'OSA'], yticklabels=['Non-OSA', 'OSA'])
            plt.title('Confusion Matrix')
            plt.xlabel('Predicted')
            plt.ylabel('Actual')
        
        plt.tight_layout()
        plt.show()
    
    def save_results(self, output_directory='multi_night_osa_results'):
        """
        Save all results to files.
        """
        print(f"\n=== Saving Results to {output_directory} ===")
        
        os.makedirs(output_directory, exist_ok=True)
        
        # Save aggregated features
        if self.aggregated_features is not None:
            self.aggregated_features.to_csv(f'{output_directory}/aggregated_features.csv', index=False)
            print("✅ Saved aggregated_features.csv")
        
        # Save statistical comparison
        if self.aggregated_comparison is not None:
            self.aggregated_comparison.to_csv(f'{output_directory}/statistical_comparison.csv', index=False)
            print("✅ Saved statistical_comparison.csv")
        
        # Save model and feature importance
        if self.final_model is not None:
            self.final_model['feature_importance'].to_csv(f'{output_directory}/feature_importance.csv', index=False)
            
            # Save model summary
            model_summary = {
                'cv_mean_auc': self.final_model['cv_scores'].mean(),
                'cv_std_auc': self.final_model['cv_scores'].std(),
                'test_auc': self.final_model['test_auc'],
                'total_samples': len(self.aggregated_features),
                'osa_samples': self.aggregated_features['is_osa'].sum(),
                'n_nights': self.aggregated_features['night_id'].nunique(),
                'n_features': len(self.final_model['feature_cols'])
            }
            
            pd.DataFrame([model_summary]).to_csv(f'{output_directory}/model_summary.csv', index=False)
            print("✅ Saved feature_importance.csv and model_summary.csv")
        
        print(f"All results saved to {output_directory}/")
    
    # Placeholder helper methods.
    # Replace each stub below (process_existing_events_df, create_labeled_respiratory_data, etc.)
    # with the implementation from your existing code; until then every stub just returns None.
    
    def process_existing_events_df(self, events_df):
        """
        Placeholder for the existing events-processing helper.

        Not implemented here: ignores events_df and returns None. Paste the
        existing implementation in before relying on it.
        """
        return None
    
    def create_labeled_respiratory_data(self, respeck_df, events_df):
        """
        Placeholder for the existing respiratory-data labelling helper.

        Not implemented here: ignores both arguments and returns None. Paste
        the existing implementation in before relying on it.
        """
        return None
    
    def extract_features_by_precise_labels_fixed(self, labeled_df, feature_extraction_func):
        """
        Placeholder for the existing label-based feature extraction helper.

        Not implemented here: ignores both arguments (feature_extraction_func
        is never called) and returns None. Paste the existing implementation
        in before relying on it.
        """
        return None
    
    def aggregate_breath_features(self, features_list):
        """
        Placeholder for the existing breath-feature aggregation helper.

        Not implemented here: ignores features_list and returns None. Paste
        the existing implementation in before relying on it.
        """
        return None
    
    def create_osa_binary_classification(self, features_df):
        """
        Placeholder for the existing OSA / Non-OSA labelling helper.

        Not implemented here: ignores features_df and returns None. Paste the
        existing implementation in before relying on it.
        """
        return None
    
    def statistical_comparison_osa_vs_rest(self, features_df_binary):
        """
        Placeholder for the existing OSA-vs-rest statistical comparison helper.

        Not implemented here: ignores features_df_binary and returns None.
        Paste the existing implementation in before relying on it.
        """
        return None

# Usage example (kept inside a string literal so that it is not executed):
"""
# Example usage for multi-night analysis

# Prepare your data - list of dictionaries with night data
night_data_list = [
    {
        'night_id': '2025-04-04',
        'respeck_data': respeck_df_night1,  # Your respeck DataFrame for night 1
        'events_data': events_df_night1     # Your events DataFrame for night 1
    },
    {
        'night_id': '2025-04-05', 
        'respeck_data': respeck_df_night2,
        'events_data': events_df_night2
    },
    # Add more nights...
]

# Initialize analyzer
analyzer = MultiNightOSAAnalyzer('path/to/data/directory')

# Process all nights
processing_summary = analyzer.process_multiple_nights(night_data_list, calculate_TS_breathFeatures)

# Aggregate results
aggregated_features = analyzer.aggregate_all_nights()

# Statistical analysis
comparison_results = analyzer.statistical_analysis_aggregated()

# Build final model
final_model = analyzer.build_final_model(test_size=0.2, cv_folds=10)

# Create visualizations
analyzer.plot_multi_night_results()

# Save all results
analyzer.save_results('multi_night_osa_analysis_results')

print("Multi-night OSA analysis complete!")
"""

In [None]:
import pandas as pd
import numpy as np
import os
import glob
import pytz
from datetime import datetime
import re

class AutomatedMultiNightLoader:
    """
    Automatically load multiple nights of data from organized directory structure.
    """

    # Night files are named "<DD-MM-YYYY>_<kind>.csv"; format of that date prefix.
    DATE_FORMAT = '%d-%m-%Y'

    def __init__(self, base_directory='../data/bishkek_csr/03_train_ready'):
        """
        Initialize with base directory containing the organized data.
        
        Expected structure:
        base_directory/
        ├── respeck/
        │   ├── 05-04-2025_respeck.csv
        │   ├── 06-04-2025_respeck.csv
        │   └── ...
        ├── nasal_files/
        │   ├── 05-04-2025_nasal.csv
        │   ├── 06-04-2025_nasal.csv
        │   └── ...
        └── event_exports/
            ├── 05-04-2025_event_export.csv
            ├── 06-04-2025_event_export.csv
            └── ...
        """
        self.base_directory = base_directory
        self.respeck_dir = os.path.join(base_directory, 'respeck')
        self.nasal_dir = os.path.join(base_directory, 'nasal_files')
        self.events_dir = os.path.join(base_directory, 'event_exports')
        self.timezone = pytz.timezone('Asia/Bishkek')

    @classmethod
    def _chronological_key(cls, date):
        """Sort key ordering DD-MM-YYYY strings by calendar date (unparseable ones last)."""
        try:
            return (0, datetime.strptime(date, cls.DATE_FORMAT), date)
        except ValueError:
            # Matches the digit pattern but is not a real calendar date
            # (e.g. "99-99-2025"): keep it, ordered after every valid date.
            return (1, datetime.max, date)

    def _load_timestamped_csv(self, file_path, timestamp_column):
        """Read a CSV and index it by its epoch-millisecond column converted to self.timezone."""
        frame = pd.read_csv(file_path)
        utc_timestamps = pd.to_datetime(frame[timestamp_column], unit='ms').dt.tz_localize('UTC')
        frame['timestamp'] = utc_timestamps.dt.tz_convert(self.timezone)
        return frame.set_index('timestamp')

    def find_all_dates(self):
        """
        Scan directories to find all available dates with complete data sets.
        
        Returns:
        - available_dates: List of "DD-MM-YYYY" dates that have all three
          required files, in chronological order
        """
        print("=== Scanning for Available Nights ===")

        # Get all candidate files of each kind
        respeck_files = glob.glob(os.path.join(self.respeck_dir, '*_respeck.csv'))
        nasal_files = glob.glob(os.path.join(self.nasal_dir, '*_nasal.csv'))
        event_files = glob.glob(os.path.join(self.events_dir, '*_event_export.csv'))

        print(f"Found files:")
        print(f"  Respeck files: {len(respeck_files)}")
        print(f"  Nasal files: {len(nasal_files)}")
        print(f"  Event files: {len(event_files)}")

        # Extract dates from filenames
        def extract_date(filename):
            # Extract date pattern like "05-04-2025" from filename
            match = re.search(r'(\d{2}-\d{2}-\d{4})', os.path.basename(filename))
            return match.group(1) if match else None

        def dates_in(files):
            # Parse each filename once and drop files without a date
            return {date for date in map(extract_date, files) if date}

        respeck_dates = dates_in(respeck_files)
        nasal_dates = dates_in(nasal_files)
        event_dates = dates_in(event_files)

        # Find dates that have all three types of files
        complete_dates = respeck_dates & nasal_dates & event_dates

        # Sort dates chronologically. A plain string sort of DD-MM-YYYY orders
        # by day-of-month first (01-05-2025 before 05-04-2025), so sort on the
        # parsed date instead.
        complete_dates = sorted(complete_dates, key=self._chronological_key)

        print(f"\nDates with complete data sets: {len(complete_dates)}")
        for date in complete_dates:
            print(f"  {date}")

        if len(complete_dates) == 0:
            print("❌ No complete data sets found!")
            print(f"Respeck dates: {sorted(respeck_dates, key=self._chronological_key)}")
            print(f"Nasal dates: {sorted(nasal_dates, key=self._chronological_key)}")
            print(f"Event dates: {sorted(event_dates, key=self._chronological_key)}")

        return complete_dates

    def load_single_night(self, date):
        """
        Load data for a single night using the same format as your existing code.
        
        Parameters:
        - date: Date string in format "DD-MM-YYYY"
        
        Returns:
        - night_data: Dictionary with 'night_id', 'respeck_data', 'psg_data',
          'events_data' and 'success': True on success, or 'night_id',
          'success': False and 'error' (message string) on any failure
        """
        try:
            print(f"Loading data for {date}...")

            # Construct file paths
            respeck_file = os.path.join(self.respeck_dir, f'{date}_respeck.csv')
            nasal_file = os.path.join(self.nasal_dir, f'{date}_nasal.csv')
            events_file = os.path.join(self.events_dir, f'{date}_event_export.csv')

            # Verify files exist
            for file_path, file_type in [(respeck_file, 'respeck'), (nasal_file, 'nasal'), (events_file, 'events')]:
                if not os.path.exists(file_path):
                    raise FileNotFoundError(f"{file_type} file not found: {file_path}")

            # All three files share one layout: an epoch-millisecond column
            # that becomes a timezone-aware 'timestamp' index.
            respeck_df = self._load_timestamped_csv(respeck_file, 'alignedTimestamp')
            psg_df = self._load_timestamped_csv(nasal_file, 'UnixTimestamp')
            labels_df = self._load_timestamped_csv(events_file, 'UnixTimestamp')

            print(f"✅ {date}: Loaded {len(respeck_df)} respeck, {len(psg_df)} PSG, {len(labels_df)} events")

            return {
                'night_id': date,
                'respeck_data': respeck_df,
                'psg_data': psg_df,
                'events_data': labels_df,
                'success': True
            }

        except Exception as e:
            # Deliberately broad: one unreadable night must not abort a
            # multi-night load; the error is reported back to the caller.
            print(f"❌ {date}: Error loading data - {e}")
            return {
                'night_id': date,
                'success': False,
                'error': str(e)
            }

    def load_all_nights(self, max_nights=None):
        """
        Load data for all available nights.
        
        Parameters:
        - max_nights: Maximum number of nights to load (None for all); the
          chronologically earliest nights are kept
        
        Returns:
        - night_data_list: List of night data dictionaries (successful loads only)
        - loading_summary: dict with 'total_nights', 'successful', 'failed'
          and 'errors'
        """
        print("=== LOADING ALL NIGHTS ===")

        # Find available dates
        available_dates = self.find_all_dates()

        if not available_dates:
            # Same keys as the regular summary so callers can read
            # 'total_nights' unconditionally.
            return [], {'total_nights': 0, 'successful': 0, 'failed': 0, 'errors': []}

        # Limit number of nights if specified
        if max_nights is not None:
            available_dates = available_dates[:max_nights]
            print(f"Loading first {len(available_dates)} nights (limited by max_nights={max_nights})")

        # Load each night
        night_data_list = []
        errors = []

        for date in available_dates:
            night_data = self.load_single_night(date)

            if night_data['success']:
                night_data_list.append(night_data)
            else:
                errors.append(f"{date}: {night_data['error']}")

        loading_summary = {
            'total_nights': len(available_dates),
            'successful': len(night_data_list),
            'failed': len(errors),
            'errors': errors
        }

        print(f"\n=== Loading Summary ===")
        print(f"Total nights attempted: {loading_summary['total_nights']}")
        print(f"Successfully loaded: {loading_summary['successful']}")
        print(f"Failed to load: {loading_summary['failed']}")

        if errors:
            print(f"Errors encountered:")
            for error in errors:
                print(f"  {error}")

        return night_data_list, loading_summary

    def preview_data_structure(self):
        """
        Preview the data structure and file organization.

        Prints, for each expected sub-directory, the number of CSV files and
        the first three filenames (or that the directory is missing).
        """
        print("=== Data Structure Preview ===")

        for subdir in ['respeck', 'nasal_files', 'event_exports']:
            full_path = os.path.join(self.base_directory, subdir)
            if os.path.exists(full_path):
                files = glob.glob(os.path.join(full_path, '*.csv'))
                print(f"{subdir}/: {len(files)} files")
                if files:
                    # Show first few filenames
                    for file in sorted(files)[:3]:
                        print(f"  {os.path.basename(file)}")
                    if len(files) > 3:
                        print(f"  ... and {len(files) - 3} more")
            else:
                print(f"{subdir}/: Directory not found")

# Enhanced MultiNightOSAAnalyzer that works with the automated loader
class EnhancedMultiNightOSAAnalyzer(MultiNightOSAAnalyzer):
    """
    Enhanced version that works with the automated data loader.
    """
    
    def __init__(self, base_directory='../data/bishkek_csr/03_train_ready'):
        """Create the analyzer and an AutomatedMultiNightLoader for the same base directory."""
        super().__init__(base_directory)
        self.loader = AutomatedMultiNightLoader(base_directory)
    
    def load_and_process_all_nights(self, feature_extraction_func, max_nights=None):
        """
        Automatically load and process all available nights.
        
        Parameters:
        - feature_extraction_func: Your feature extraction function
        - max_nights: Maximum number of nights to process (None for all)
        
        Returns:
        - processing_summary: dict with 'loading' (loader summary),
          'processing' (result of process_multiple_nights, or None when
          nothing could be loaded), 'overall_success' (number of nights in
          self.night_results, 0 when nothing was loaded) and
          'total_attempted'. When nothing was loaded the loader summary keys
          are also present at the top level.
        """
        print("=== AUTOMATED MULTI-NIGHT OSA ANALYSIS ===")
        
        # Step 1: Load all night data
        night_data_list, loading_summary = self.loader.load_all_nights(max_nights)
        
        if not night_data_list:
            print("❌ No data loaded successfully")
            # Return the same keys as the success path so callers can read
            # 'overall_success' unconditionally; the loader's own keys stay at
            # the top level for callers that relied on the old return value.
            return {
                **loading_summary,
                'loading': loading_summary,
                'processing': None,
                'overall_success': 0,
                'total_attempted': loading_summary.get('total_nights', 0)
            }
        
        # Step 2: Process all loaded nights
        processing_summary = self.process_multiple_nights(night_data_list, feature_extraction_func)
        
        # Combine loading and processing summaries
        combined_summary = {
            'loading': loading_summary,
            'processing': processing_summary,
            'overall_success': len(self.night_results),
            'total_attempted': loading_summary['total_nights']
        }
        
        return combined_summary

# Complete usage example
def run_complete_multi_night_analysis(base_directory='../data/bishkek_csr/03_train_ready', 
                                    max_nights=None):
    """
    Run complete multi-night OSA analysis with automated data loading.
    
    Loads and processes the nights, then (if at least one night succeeded)
    aggregates features, runs the statistical comparison, builds the final
    model, plots the results and saves them to
    'complete_multi_night_osa_analysis/'.
    
    Parameters:
    - base_directory: Path to your data directory
    - max_nights: Maximum nights to analyze (None for all)
    
    Returns:
    - analyzer: Complete analyzer with all results (returned early, without
      results, when no night could be processed)
    """
    
    print("🌙 COMPLETE MULTI-NIGHT OSA ANALYSIS 🌙")
    print("=" * 50)
    
    # Step 1: Initialize analyzer
    analyzer = EnhancedMultiNightOSAAnalyzer(base_directory)
    
    # Step 2: Preview available data
    analyzer.loader.preview_data_structure()
    
    # Step 3: Load and process all nights
    processing_summary = analyzer.load_and_process_all_nights(
        calculate_TS_breathFeatures,  # Your feature extraction function
        max_nights=max_nights
    )
    
    # When loading fails the summary may be the bare loader summary, which has
    # no 'overall_success' key; treat a missing key as zero processed nights.
    nights_processed = processing_summary.get('overall_success', 0)
    if nights_processed == 0:
        print("❌ No nights processed successfully")
        return analyzer
    
    # Step 4: Aggregate results
    print("\n" + "=" * 50)
    analyzer.aggregate_all_nights()
    
    # Step 5: Statistical analysis
    analyzer.statistical_analysis_aggregated()
    
    # Step 6: Build final model
    analyzer.build_final_model(test_size=0.2, cv_folds=10)
    
    # Step 7: Create visualizations
    analyzer.plot_multi_night_results()
    
    # Step 8: Save all results
    analyzer.save_results('complete_multi_night_osa_analysis')
    
    print("\n" + "=" * 50)
    print("✅ COMPLETE MULTI-NIGHT ANALYSIS FINISHED!")
    print(f"📊 Processed {nights_processed} nights successfully")
    print(f"💾 Results saved to 'complete_multi_night_osa_analysis/'")
    
    return analyzer

# Simple usage:
# The full pipeline is expensive and every run overwrites both `analyzer` and the
# saved results directory, so it is run exactly once here. Set MAX_NIGHTS_TO_ANALYZE
# to a small number (e.g. 5) for a quick test, or to None to use every available night.
MAX_NIGHTS_TO_ANALYZE = 10

analyzer = run_complete_multi_night_analysis(
    base_directory='../data/bishkek_csr/03_train_ready',
    max_nights=MAX_NIGHTS_TO_ANALYZE
)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
import os
import glob
import pytz
from datetime import datetime, timedelta
import re
import warnings
warnings.filterwarnings('ignore')

class CompleteMultiNightOSAAnalyzer:
    """
    Complete multi-night OSA analysis with all required functions implemented.
    """
    
    def __init__(self, base_directory='../data/bishkek_csr/03_train_ready'):
        """
        Record the input locations, the target timezone and empty result slots.

        Parameters:
        - base_directory: Root folder; the 'respeck', 'nasal_files' and
          'event_exports' sub-folders are resolved relative to it
        """
        # Result slots. night_results and aggregated_features are filled by the
        # processing / aggregation methods of this class; the other two start
        # as None and are not assigned by any method visible in this cell.
        self.night_results = {}
        self.aggregated_features = None
        self.aggregated_comparison = None
        self.final_model = None

        # Timestamps read from the CSV files are converted to this timezone.
        self.timezone = pytz.timezone('Asia/Bishkek')

        # One sub-folder per data source.
        self.base_directory = base_directory
        self.respeck_dir, self.nasal_dir, self.events_dir = (
            os.path.join(base_directory, subfolder)
            for subfolder in ('respeck', 'nasal_files', 'event_exports')
        )
    
    def find_all_dates(self):
        """Find all available dates with complete data sets."""
        print("=== Scanning for Available Nights ===")
        
        respeck_files = glob.glob(os.path.join(self.respeck_dir, '*_respeck.csv'))
        nasal_files = glob.glob(os.path.join(self.nasal_dir, '*_nasal.csv'))
        event_files = glob.glob(os.path.join(self.events_dir, '*_event_export.csv'))
        
        print(f"Found files:")
        print(f"  Respeck files: {len(respeck_files)}")
        print(f"  Nasal files: {len(nasal_files)}")
        print(f"  Event files: {len(event_files)}")
        
        def extract_date(filename):
            match = re.search(r'(\d{2}-\d{2}-\d{4})', os.path.basename(filename))
            return match.group(1) if match else None
        
        respeck_dates = set([extract_date(f) for f in respeck_files if extract_date(f)])
        nasal_dates = set([extract_date(f) for f in nasal_files if extract_date(f)])
        event_dates = set([extract_date(f) for f in event_files if extract_date(f)])
        
        complete_dates = respeck_dates & nasal_dates & event_dates
        complete_dates = sorted(list(complete_dates))
        
        print(f"\nDates with complete data sets: {len(complete_dates)}")
        for date in complete_dates:
            print(f"  {date}")
        
        return complete_dates
    
    def load_single_night(self, date):
        """Load data for a single night."""
        try:
            print(f"Loading data for {date}...")
            
            respeck_file = os.path.join(self.respeck_dir, f'{date}_respeck.csv')
            nasal_file = os.path.join(self.nasal_dir, f'{date}_nasal.csv')
            events_file = os.path.join(self.events_dir, f'{date}_event_export.csv')
            
            # Load Respeck data
            respeck_df = pd.read_csv(respeck_file)
            respeck_df['timestamp'] = pd.to_datetime(respeck_df['alignedTimestamp'], unit='ms')
            respeck_df['timestamp'] = respeck_df['timestamp'].dt.tz_localize('UTC').dt.tz_convert(self.timezone)
            respeck_df.set_index('timestamp', inplace=True)
            
            # Load Events data
            labels_df = pd.read_csv(events_file)
            labels_df['timestamp'] = pd.to_datetime(labels_df['UnixTimestamp'], unit='ms')
            labels_df['timestamp'] = labels_df['timestamp'].dt.tz_localize('UTC').dt.tz_convert(self.timezone)
            labels_df.set_index('timestamp', inplace=True)
            
            print(f"✅ {date}: Loaded {len(respeck_df)} respeck, {len(labels_df)} events")
            
            return {
                'night_id': date,
                'respeck_data': respeck_df,
                'events_data': labels_df,
                'success': True
            }
            
        except Exception as e:
            print(f"❌ {date}: Error loading data - {e}")
            return {
                'night_id': date,
                'success': False,
                'error': str(e)
            }
    
    def process_existing_events_df(self, events_df):
        """Process events DataFrame to add proper timestamps."""
        print(f"Processing existing events DataFrame with {len(events_df)} events")
        
        processed_df = events_df.copy()
        processed_df['start_time'] = pd.to_datetime(processed_df['UnixTimestamp'], unit='ms').dt.tz_localize('UTC').dt.tz_convert(self.timezone)
        processed_df['duration_seconds'] = processed_df['Duration'].str.replace(',', '.').astype(float)
        processed_df['end_time'] = processed_df['start_time'] + pd.to_timedelta(processed_df['duration_seconds'], unit='s')
        processed_df = processed_df.sort_values('start_time').reset_index(drop=True)
        
        print(f"Event types: {processed_df['Event'].value_counts().to_dict()}")
        return processed_df
    
    def create_labeled_respiratory_data(self, respeck_df, events_df, breathing_col='breathingSignal'):
        """Create labeled respiratory data with precise event timing."""
        print("Creating labeled respiratory data...")
        
        if not isinstance(respeck_df.index, pd.DatetimeIndex):
            respeck_df.index = pd.to_datetime(respeck_df.index)
        
        labeled_df = respeck_df.copy()
        labeled_df['event_label'] = 'Normal'
        labeled_df['event_duration'] = np.nan
        labeled_df['event_id'] = np.nan
        
        respeck_start = labeled_df.index.min()
        respeck_end = labeled_df.index.max()
        events_start = events_df['start_time'].min()
        events_end = events_df['end_time'].max()
        
        print(f"Respeck data: {respeck_start} to {respeck_end}")
        print(f"Events data: {events_start} to {events_end}")
        
        # Handle timezone issues
        if respeck_start.tz is not None and events_start.tz is None:
            events_df = events_df.copy()
            events_df['start_time'] = events_df['start_time'].dt.tz_localize(respeck_start.tz)
            events_df['end_time'] = events_df['end_time'].dt.tz_localize(respeck_start.tz)
            events_start = events_df['start_time'].min()
            events_end = events_df['end_time'].max()
        elif respeck_start.tz is None and events_start.tz is not None:
            labeled_df.index = labeled_df.index.tz_localize(events_start.tz)
            respeck_start = labeled_df.index.min()
            respeck_end = labeled_df.index.max()
        
        # Check for time overlap
        overlap_start = max(respeck_start, events_start)
        overlap_end = min(respeck_end, events_end)
        
        if overlap_end <= overlap_start:
            # Try adding 24 hours to events if they're from previous day
            time_diff = events_start - respeck_start
            if time_diff.total_seconds() < -12*3600:
                events_df = events_df.copy()
                events_df['start_time'] = events_df['start_time'] + pd.Timedelta(hours=24)
                events_df['end_time'] = events_df['end_time'] + pd.Timedelta(hours=24)
                events_start = events_df['start_time'].min()
                events_end = events_df['end_time'].max()
                overlap_start = max(respeck_start, events_start)
                overlap_end = min(respeck_end, events_end)
        
        if overlap_end > overlap_start:
            overlap_duration = overlap_end - overlap_start
            print(f"✅ Overlap found: {overlap_start} to {overlap_end} ({overlap_duration})")
        else:
            print(f"❌ No overlap found between respeck and events data")
            return labeled_df
        
        # Find overlapping events
        overlapping_events = events_df[
            (events_df['end_time'] >= respeck_start) & 
            (events_df['start_time'] <= respeck_end)
        ].copy()
        
        print(f"Found {len(overlapping_events)} events overlapping with respiratory data")
        
        # Label each event period
        events_applied = 0
        total_event_samples = 0
        
        for idx, event in overlapping_events.iterrows():
            event_mask = (
                (labeled_df.index >= event['start_time']) & 
                (labeled_df.index <= event['end_time'])
            )
            
            data_points_in_event = event_mask.sum()
            
            if data_points_in_event > 0:
                labeled_df.loc[event_mask, 'event_label'] = event['Event']
                labeled_df.loc[event_mask, 'event_duration'] = event['duration_seconds']
                labeled_df.loc[event_mask, 'event_id'] = idx
                events_applied += 1
                total_event_samples += data_points_in_event
                
                if events_applied <= 5:
                    print(f"  Event {idx}: {event['Event']} at {event['start_time']} "
                          f"({data_points_in_event} data points)")
        
        print(f"Successfully applied {events_applied} events to respiratory data")
        print(f"Total event samples: {total_event_samples}")
        
        # Summary statistics
        label_counts = labeled_df['event_label'].value_counts()
        print(f"\nLabel distribution:")
        for label, count in label_counts.items():
            percentage = (count / len(labeled_df)) * 100
            print(f"  {label}: {count:,} samples ({percentage:.1f}%)")
        
        return labeled_df
    
    def extract_features_by_precise_labels_fixed(self, labeled_df, feature_extraction_func, 
                                               segment_length_minutes=5):
        """Extract features using precise event labels."""
        print(f"Extracting features using precise labels...")
        
        all_features = []
        
        # Group consecutive samples with the same label
        labeled_df['label_group'] = (labeled_df['event_label'] != labeled_df['event_label'].shift()).cumsum()
        
        event_periods = []
        for group_id, group_data in labeled_df.groupby('label_group'):
            if len(group_data) < 50:  # Skip very short segments
                continue
                
            event_type = group_data['event_label'].iloc[0]
            start_time = group_data.index.min()
            end_time = group_data.index.max()
            duration = (end_time - start_time).total_seconds()
            
            event_periods.append({
                'event_type': event_type,
                'start_time': start_time,
                'end_time': end_time,
                'duration': duration,
                'sample_count': len(group_data)
            })
        
        print(f"Found {len(event_periods)} distinct event periods")
        
        # Show breakdown by event type
        period_counts = {}
        for period in event_periods:
            event_type = period['event_type']
            period_counts[event_type] = period_counts.get(event_type, 0) + 1
        
        print("Periods by event type:")
        for event_type, count in sorted(period_counts.items()):
            print(f"  {event_type}: {count} periods")
        
        successful_extractions = 0
        for i, period in enumerate(event_periods):
            try:
                period_data = labeled_df[
                    (labeled_df.index >= period['start_time']) & 
                    (labeled_df.index <= period['end_time'])
                ]
                
                if 'breathingSignal' not in period_data.columns or len(period_data) < 50:
                    continue
                
                # Extract features using the timestamp column
                features = feature_extraction_func(
                    period_data['timestamp'].to_numpy(),
                    period_data['breathingSignal'].values
                )
                
                if features is not None:
                    features['event_type'] = period['event_type']
                    features['event_time'] = period['start_time']
                    features['period_duration'] = period['duration']
                    features['sample_count'] = period['sample_count']
                    
                    all_features.append(features)
                    successful_extractions += 1
                    
                    if i % 100 == 0:
                        print(f"  Processed {i+1}/{len(event_periods)} periods... ({successful_extractions} successful)")
                    
            except Exception as e:
                if i < 5:
                    print(f"Error processing period {i} ({period['event_type']}): {e}")
                continue
        
        print(f"Successfully extracted features for {successful_extractions} out of {len(event_periods)} periods")
        
        # Show successful extractions by event type
        success_counts = {}
        for features in all_features:
            event_type = features['event_type']
            success_counts[event_type] = success_counts.get(event_type, 0) + 1
        
        print("Successful extractions by event type:")
        for event_type, count in sorted(success_counts.items()):
            print(f"  {event_type}: {count} periods")
        
        return all_features
    
    def aggregate_breath_features(self, features_list):
        """Convert list of feature dictionaries to aggregated DataFrame."""
        aggregated_features = []
        
        for features in features_list:
            event_features = {
                'event_type': features['event_type'],
                'event_time': features['event_time'],
                'period_duration': features.get('period_duration', np.nan),
                'sample_count': features.get('sample_count', np.nan)
            }
            
            for feature_name, values in features.items():
                if feature_name in ['event_type', 'event_time', 'period_duration', 'sample_count', 'timestamp', 'breathingSignal']:
                    continue
                
                if hasattr(values, 'values'):
                    values = values.values
                
                if isinstance(values, (list, np.ndarray, pd.Series)):
                    try:
                        values_array = np.array(values, dtype=float)
                        values_clean = values_array[np.isfinite(values_array)]
                        
                        if len(values_clean) > 0:
                            event_features[f'{feature_name}_mean'] = np.mean(values_clean)
                            event_features[f'{feature_name}_std'] = np.std(values_clean)
                            event_features[f'{feature_name}_median'] = np.median(values_clean)
                            event_features[f'{feature_name}_min'] = np.min(values_clean)
                            event_features[f'{feature_name}_max'] = np.max(values_clean)
                            event_features[f'{feature_name}_range'] = np.max(values_clean) - np.min(values_clean)
                            event_features[f'{feature_name}_q25'] = np.percentile(values_clean, 25)
                            event_features[f'{feature_name}_q75'] = np.percentile(values_clean, 75)
                            event_features[f'{feature_name}_iqr'] = np.percentile(values_clean, 75) - np.percentile(values_clean, 25)
                            event_features[f'{feature_name}_skew'] = pd.Series(values_clean).skew()
                            event_features[f'{feature_name}_kurtosis'] = pd.Series(values_clean).kurtosis()
                            
                            if np.mean(values_clean) != 0:
                                event_features[f'{feature_name}_cv'] = np.std(values_clean) / np.abs(np.mean(values_clean))
                            else:
                                event_features[f'{feature_name}_cv'] = 0
                                
                            event_features[f'{feature_name}_count'] = len(values_clean)
                            
                            if feature_name in ['auc_values', 'breath_durations', 'inhalation_durations', 'exhalation_durations']:
                                if len(values_clean) >= 2:
                                    x = np.arange(len(values_clean))
                                    slope = np.polyfit(x, values_clean, 1)[0] if len(values_clean) > 1 else 0
                                    event_features[f'{feature_name}_trend'] = slope
                                    
                                    rate_of_change = np.mean(np.diff(values_clean)) if len(values_clean) > 1 else 0
                                    event_features[f'{feature_name}_rate_change'] = rate_of_change
                                    
                    except (ValueError, TypeError) as e:
                        continue
                        
                elif isinstance(values, (int, float)) and np.isfinite(values):
                    event_features[feature_name] = values
            
            aggregated_features.append(event_features)
        
        return pd.DataFrame(aggregated_features)
    
    def create_osa_binary_classification(self, features_df):
        """Create binary OSA vs Non-OSA classification."""
        features_df_binary = features_df.copy()
        features_df_binary['is_osa'] = (features_df_binary['event_type'] == 'Obstructive Apnea').astype(int)
        features_df_binary['binary_label'] = features_df_binary['is_osa'].map({1: 'OSA', 0: 'Non-OSA'})
        return features_df_binary
    
    def process_single_night(self, respeck_data, events_data, night_id, feature_extraction_func):
        """Process a single night's data."""
        print(f"\n=== Processing Night: {night_id} ===")
        
        try:
            # Step 1: Process events
            processed_events_df = self.process_existing_events_df(events_data)
            
            # Step 2: Create labeled respiratory data
            labeled_df = self.create_labeled_respiratory_data(respeck_data, processed_events_df)
            
            # Step 3: Add timestamp column
            labeled_df['timestamp'] = labeled_df.index
            
            # Step 4: Extract features
            features_list = self.extract_features_by_precise_labels_fixed(
                labeled_df, feature_extraction_func
            )
            
            # Step 5: Aggregate features
            if len(features_list) > 0:
                features_df = self.aggregate_breath_features(features_list)
                features_df['night_id'] = night_id
                features_df_binary = self.create_osa_binary_classification(features_df)
                
                night_result = {
                    'night_id': night_id,
                    'labeled_df': labeled_df,
                    'features_df': features_df_binary,
                    'n_samples': len(features_df_binary),
                    'n_osa': len(features_df_binary[features_df_binary['binary_label'] == 'OSA']),
                    'n_non_osa': len(features_df_binary[features_df_binary['binary_label'] == 'Non-OSA']),
                    'success': True
                }
                
                print(f"✅ Night {night_id}: {len(features_df_binary)} samples "
                      f"({night_result['n_osa']} OSA, {night_result['n_non_osa']} Non-OSA)")
                
                return night_result
                
            else:
                print(f"❌ Night {night_id}: No features extracted")
                return {'night_id': night_id, 'success': False, 'error': 'No features extracted'}
                
        except Exception as e:
            print(f"❌ Night {night_id}: Error - {e}")
            import traceback
            traceback.print_exc()
            return {'night_id': night_id, 'success': False, 'error': str(e)}
    
    def load_and_process_all_nights(self, feature_extraction_func, max_nights=None):
        """Load and process all available nights."""
        print("=== AUTOMATED MULTI-NIGHT OSA ANALYSIS ===")
        
        # Find available dates
        available_dates = self.find_all_dates()
        if not available_dates:
            return {'successful_nights': 0, 'failed_nights': 0}
        
        if max_nights is not None:
            available_dates = available_dates[:max_nights]
            print(f"Loading first {len(available_dates)} nights (limited by max_nights={max_nights})")
        
        # Load and process each night
        successful_nights = []
        failed_nights = []
        
        for date in available_dates:
            # Load night data
            night_data = self.load_single_night(date)
            
            if night_data['success']:
                # Process night data
                result = self.process_single_night(
                    night_data['respeck_data'],
                    night_data['events_data'], 
                    night_data['night_id'],
                    feature_extraction_func
                )
                
                if result['success']:
                    successful_nights.append(result)
                    self.night_results[result['night_id']] = result
                else:
                    failed_nights.append(result)
            else:
                failed_nights.append(night_data)
        
        print(f"\n=== Processing Summary ===")
        print(f"Successful nights: {len(successful_nights)}")
        print(f"Failed nights: {len(failed_nights)}")
        
        if failed_nights:
            print(f"Failed night details:")
            for failed in failed_nights:
                print(f"  {failed['night_id']}: {failed['error']}")
        
        return {
            'successful_nights': len(successful_nights),
            'failed_nights': len(failed_nights),
            'successful_results': successful_nights,
            'failed_results': failed_nights
        }
    
    def aggregate_all_nights(self):
        """Aggregate features from all successfully processed nights."""
        print("\n=== Aggregating Multi-Night Data ===")
        
        if not self.night_results:
            print("❌ No night results to aggregate")
            return None
        
        all_features = []
        night_summary = []
        
        for night_id, night_result in self.night_results.items():
            features_df = night_result['features_df']
            all_features.append(features_df)
            
            night_summary.append({
                'night_id': night_id,
                'total_samples': night_result['n_samples'],
                'osa_samples': night_result['n_osa'],
                'non_osa_samples': night_result['n_non_osa'],
                'osa_percentage': (night_result['n_osa'] / night_result['n_samples']) * 100
            })
        
        self.aggregated_features = pd.concat(all_features, ignore_index=True)
        
        print(f"Aggregated data summary:")
        print(f"  Total nights: {len(self.night_results)}")
        print(f"  Total samples: {len(self.aggregated_features)}")
        
        class_counts = self.aggregated_features['binary_label'].value_counts()
        print(f"  OSA samples: {class_counts.get('OSA', 0)}")
        print(f"  Non-OSA samples: {class_counts.get('Non-OSA', 0)}")
        print(f"  OSA percentage: {(class_counts.get('OSA', 0) / len(self.aggregated_features) * 100):.1f}%")
        
        night_summary_df = pd.DataFrame(night_summary)
        print(f"\nPer-night breakdown:")
        print(night_summary_df.to_string(index=False))
        
        return self.aggregated_features

# Simple usage function
def run_complete_multi_night_analysis(base_directory='../data/bishkek_csr/03_train_ready', 
                                    max_nights=None):
    """
    Build an analyzer, process the available nights and aggregate their features.

    Parameters:
    - base_directory: Root folder passed to CompleteMultiNightOSAAnalyzer
    - max_nights: Optional cap on the number of nights to process

    Returns:
    - The analyzer instance; it is returned even when no night could be
      processed, in which case no aggregation is attempted

    Feature extraction uses calculate_TS_breathFeatures, which must already be
    defined in the notebook namespace.
    """
    separator = "=" * 50

    print("🌙 COMPLETE MULTI-NIGHT OSA ANALYSIS 🌙")
    print(separator)

    analyzer = CompleteMultiNightOSAAnalyzer(base_directory)

    processing_summary = analyzer.load_and_process_all_nights(
        calculate_TS_breathFeatures,
        max_nights=max_nights
    )
    nights_ok = processing_summary['successful_nights']

    if nights_ok == 0:
        print("❌ No nights processed successfully")
        return analyzer

    print("\n" + separator)
    aggregated_features = analyzer.aggregate_all_nights()
    if aggregated_features is None:
        return analyzer

    print("✅ MULTI-NIGHT ANALYSIS COMPLETED!")
    print(f"📊 Processed {nights_ok} nights successfully")

    # Overall class balance across all processed nights.
    class_counts = aggregated_features['binary_label'].value_counts()
    print(f"📈 OSA samples: {class_counts.get('OSA', 0)}")
    print(f"📈 Non-OSA samples: {class_counts.get('Non-OSA', 0)}")

    return analyzer

# Usage: run the pipeline once and keep the returned analyzer for inspection.

# Quick check limited to the first three dates (in sorted file-name order):
# analyzer = run_complete_multi_night_analysis(max_nights=3)

# Full run over every night found under the default base directory
analyzer = run_complete_multi_night_analysis()
