# Signal Analysis with PELT and HDBSCAN

This notebook provides an interactive environment for analyzing signals using:
1. **PELT (Pruned Exact Linear Time)** for change point detection
2. **HDBSCAN** for clustering and smoothing the detected transitions

## Setup

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import hdbscan
import ruptures as rpt

from datetime import datetime, date, timedelta
import ipywidgets as widgets
from IPython.display import display, HTML

# Import our custom modules
from lib.db_connection import DatabaseConnection
from lib.signal_utils import SignalProcessor
from lib.visualization import SignalVisualizer

# Plotting theme and pandas display limits used by the cells below
plt.style.use('seaborn-v0_8-darkgrid')
for display_option, display_limit in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(display_option, display_limit)

# Create the database handle that the later cells query through `db`
db = DatabaseConnection()
print("✅ Environment loaded successfully!")

: 

## Configure Analysis Parameters

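Use the widgets below to select the following, then re-run the analysis cells — each cell reads the widget values at the moment it executes: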
- Signal type to analyze
- Date range for analysis
- PELT and HDBSCAN parameters

In [None]:
# Build the (label, value) pairs offered by the signal dropdown
signal_configs = db.get_signal_configs()
signal_options = [
    (f"{cfg['display_name']} ({cfg['signal_name']})", cfg['signal_name'])
    for _, cfg in signal_configs.iterrows()
]

# One style definition reused by every control in this cell
WIDE_LABEL = {'description_width': 'initial'}

# --- Signal and date selection -------------------------------------------
default_signal = signal_options[0][1] if signal_options else None
signal_dropdown = widgets.Dropdown(
    options=signal_options,
    value=default_signal,
    description='Signal:',
    style=WIDE_LABEL,
)

yesterday = date.today() - timedelta(days=1)
date_picker = widgets.DatePicker(
    description='Analysis Date:',
    value=yesterday,
    style=WIDE_LABEL,
)

# --- PELT parameters -----------------------------------------------------
pelt_penalty = widgets.FloatSlider(
    value=1.0, min=0.1, max=5.0, step=0.1,
    description='PELT Penalty:',
    style=WIDE_LABEL,
)

min_segment_size = widgets.IntSlider(
    value=5, min=2, max=20, step=1,
    description='Min Segment Size:',
    style=WIDE_LABEL,
)

# --- HDBSCAN parameters --------------------------------------------------
min_cluster_size = widgets.IntSlider(
    value=3, min=2, max=10, step=1,
    description='Min Cluster Size:',
    style=WIDE_LABEL,
)

epsilon = widgets.FloatSlider(
    value=1800.0,  # 30 minutes in seconds
    min=300.0, max=7200.0, step=300.0,
    description='Epsilon (seconds):',
    style=WIDE_LABEL,
)

# Render each section heading followed by its controls
display(HTML("<h3>Signal Selection</h3>"), signal_dropdown, date_picker)
display(HTML("<h3>PELT Parameters</h3>"), pelt_penalty, min_segment_size)
display(HTML("<h3>HDBSCAN Parameters</h3>"), min_cluster_size, epsilon)

## Cell 1: Data Retrieval and Visualization

Fetch signal data from the database and visualize raw data points.

In [None]:
# Read the current widget selections (evaluated each time this cell runs)
selected_signal = signal_dropdown.value
selected_date = date_picker.value

# Fail early with a readable message instead of an opaque error further down
# when the dropdown has no options or the date picker has been cleared.
if selected_signal is None or selected_date is None:
    raise ValueError("Select a signal and an analysis date before running this cell")

# Look up the configuration row for the selected signal
matching_configs = signal_configs[signal_configs['signal_name'] == selected_signal]
if matching_configs.empty:
    raise ValueError(f"No configuration found for signal {selected_signal!r}")
signal_config = matching_configs.iloc[0]

print(f"📊 Analyzing: {signal_config['display_name']}")
print(f"📅 Date: {selected_date}")
print(f"📝 Description: {signal_config['description']}")
print(f"📏 Unit: {signal_config['unit']}")
print(f"🏗️ Archetype: {signal_config['archetype']}")

# Fetch data for the single selected day (start and end are the same date)
df = db.get_signals_for_date_range(
    signal_name=selected_signal,
    start_date=selected_date,
    end_date=selected_date
)

print(f"\n📊 Found {len(df)} signals for {selected_date}")

# Reset the derived arrays on every run. Without this, a selection that
# returns no rows would leave the arrays from a previous run in the kernel and
# the following cells would silently analyse stale data.
values, timestamps_numeric, timestamps = [], [], []

if not df.empty:
    # Show data sample
    print("\n🔍 Data Sample:")
    display(df.head())

    # Plot raw signals
    fig = SignalVisualizer.plot_raw_signals(
        df,
        signal_config['display_name'],
        title=f"{signal_config['display_name']} - Raw Data for {selected_date}"
    )
    fig.show()

    # Store processed data for next cells
    values, timestamps_numeric, timestamps = SignalProcessor.prepare_signal_data(df)
    print(f"\n✅ Data prepared for analysis: {len(values)} points")
else:
    print("⚠️ No data found for the selected date and signal")

## Cell 2: PELT Change Point Detection

Apply PELT algorithm to detect change points in the signal.

In [None]:
# Start every run from empty results so downstream cells never pick up change
# points or segments left over from a previous execution of this cell.
all_change_points = []
all_segments = []

# Require a non-empty frame as well as prepared values: `values` on its own may
# be a leftover from an earlier run when the current selection returned no rows.
has_signal_data = (
    'df' in locals() and not df.empty
    and 'values' in locals() and len(values) > 0
)

if has_signal_data:
    # Detect collection periods first
    collection_periods = SignalProcessor.detect_collection_periods(df)
    print(f"🔍 Found {len(collection_periods)} collection periods")

    # Read the slider once so every period is judged against the same setting
    segment_size = min_segment_size.value
    required_points = segment_size * 2

    # Process each collection period
    for i, (start_idx, end_idx) in enumerate(collection_periods):
        # The slice treats end_idx as inclusive, hence the +1 on the upper bound
        period_values = values[start_idx:end_idx+1]
        period_timestamps = timestamps[start_idx:end_idx+1]

        print(f"\n📍 Period {i+1}: {period_timestamps[0]} to {period_timestamps[-1]}")
        print(f"   Points: {len(period_values)}")

        # PELT is only run when the period holds at least two minimum-size
        # segments; report the skip instead of dropping the period silently.
        if len(period_values) < required_points:
            print(f"   Skipped: fewer than {required_points} points for PELT")
            continue

        change_points = SignalProcessor.run_pelt_detection(
            period_values,
            cost_function="l2",
            min_segment_size=segment_size,
            penalty_multiplier=pelt_penalty.value
        )

        # Change points are relative to the period; shift them to global indices
        global_change_points = [start_idx + cp for cp in change_points]
        all_change_points.extend(global_change_points)

        # Create segments (uses the period-local change points)
        segments = SignalProcessor.create_segments_from_changepoints(
            period_values,
            period_timestamps,
            change_points,
            selected_signal
        )
        all_segments.extend(segments)

        print(f"   Change points: {len(change_points)}")
        print(f"   Segments: {len(segments)}")

    # Visualize PELT results
    print(f"\n📊 Total change points detected: {len(all_change_points)}")
    print(f"📊 Total segments created: {len(all_segments)}")

    # Plot results
    fig = SignalVisualizer.plot_pelt_detection(
        df,
        all_change_points,
        all_segments,
        signal_config['display_name'],
        title=f"{signal_config['display_name']} - PELT Detection Results"
    )
    fig.show()

    # Show segment summary, aggregated per state
    if all_segments:
        segments_df = pd.DataFrame(all_segments)
        print("\n📋 Segment Summary:")
        display(segments_df[['state', 'duration_minutes', 'mean_value', 'std_value']]
                .groupby('state')
                .agg({
                    'duration_minutes': ['count', 'sum', 'mean'],
                    'mean_value': ['mean', 'std']
                })
                .round(2))
else:
    print("⚠️ No data available for PELT analysis")

## Cell 3: HDBSCAN Clustering and Smoothing

Apply HDBSCAN to cluster and smooth the PELT-detected segments into meaningful events.

In [None]:
if 'all_segments' in locals() and all_segments:
    # Prepare features for HDBSCAN, measured relative to midnight of the selected date
    start_of_day = datetime.combine(selected_date, datetime.min.time())
    features = SignalProcessor.prepare_hdbscan_features(all_segments, start_of_day)

    print(f"🔢 Feature matrix shape: {features.shape}")
    print(f"📊 Features: temporal position, duration, mean value (all normalized)")

    # NOTE(review): the epsilon slider is expressed in seconds, while the
    # message above describes the features as normalized. If that is accurate,
    # cluster_selection_epsilon should be given in the same normalized units —
    # confirm against SignalProcessor.prepare_hdbscan_features.
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size.value,
        min_samples=2,
        metric='manhattan',
        cluster_selection_epsilon=epsilon.value,
        algorithm='best'
    )

    cluster_labels = clusterer.fit_predict(features)

    # Analyze clustering results; label -1 marks noise and is not a cluster
    n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
    n_noise = list(cluster_labels).count(-1)

    print(f"\n🎯 HDBSCAN Results:")
    print(f"   Clusters found: {n_clusters}")
    print(f"   Noise points: {n_noise}")
    print(f"   Clustered points: {len(cluster_labels) - n_noise}")

    # Visualize clustering
    fig = SignalVisualizer.plot_hdbscan_clustering(
        all_segments,
        features,
        cluster_labels,
        signal_config['display_name'],
        title=f"{signal_config['display_name']} - HDBSCAN Event Detection"
    )
    fig.show()

    # Create one event per cluster, visiting clusters in ascending label order
    # so the resulting list is the same on every run.
    events = []
    for cluster_id in sorted(set(cluster_labels)):
        if cluster_id == -1:  # Skip noise
            continue

        # Segments and labels are aligned by position
        cluster_segments = [
            segment
            for segment, label in zip(all_segments, cluster_labels)
            if label == cluster_id
        ]

        if cluster_segments:
            # The event spans from the earliest start to the latest end in the cluster
            event_start = min(s['start_time'] for s in cluster_segments)
            event_end = max(s['end_time'] for s in cluster_segments)

            # Most frequent state wins. Candidates are taken in first-seen
            # order (dict.fromkeys) so ties resolve identically across kernel
            # sessions; iterating a set of strings does not guarantee that.
            states = [s['state'] for s in cluster_segments]
            dominant_state = max(dict.fromkeys(states), key=states.count)

            event = {
                'cluster_id': cluster_id,
                'start_time': event_start,
                'end_time': event_end,
                'duration_minutes': (event_end - event_start).total_seconds() / 60,
                'dominant_state': dominant_state,
                'n_segments': len(cluster_segments),
                'mean_value': np.mean([s['mean_value'] for s in cluster_segments])
            }
            events.append(event)

    # Display events summary
    if events:
        events_df = pd.DataFrame(events)
        print("\n📅 Detected Events:")
        display(events_df.sort_values('start_time'))

        # Summary statistics
        print("\n📊 Event Statistics:")
        print(f"   Total events: {len(events)}")
        print(f"   Total duration: {events_df['duration_minutes'].sum():.1f} minutes")
        print(f"   Average event duration: {events_df['duration_minutes'].mean():.1f} minutes")
        print(f"   Dominant states: {events_df['dominant_state'].value_counts().to_dict()}")
else:
    print("⚠️ No segments available for HDBSCAN clustering")

## Summary and Export

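Generate a combined visualization and summary statistics for the analysis, alongside any events already stored in the database for the selected date.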

In [None]:
if 'df' in locals() and not df.empty:
    # Get existing events from database (if any)
    db_events = db.get_events_for_date(selected_date)
    has_db_events = db_events is not None and not db_events.empty

    # Resolve the optional results of the earlier cells once
    summary_segments = all_segments if 'all_segments' in locals() else []

    # Only trust cluster labels that line up one-to-one with the current
    # segments; labels left in the kernel by an earlier run may describe a
    # different segment list and would mislabel the plot and the statistics.
    labels_match = (
        'cluster_labels' in locals()
        and len(cluster_labels) == len(summary_segments)
    )
    summary_labels = cluster_labels if labels_match else np.array([])

    # Create combined visualization
    fig = SignalVisualizer.plot_combined_analysis(
        df,
        summary_segments,
        summary_labels,
        db_events if has_db_events else None,
        signal_config['display_name']
    )
    fig.show()

    # Generate summary statistics when both PELT and HDBSCAN results exist
    if 'all_segments' in locals() and labels_match:
        summary_stats = SignalVisualizer.create_summary_stats(
            df,
            summary_segments,
            summary_labels
        )
        print("\n📊 Analysis Summary:")
        display(summary_stats)

    print("\n✅ Analysis complete!")
else:
    print("⚠️ No data available for summary")

## Cleanup

In [None]:
# Close the database connection created in the setup cell.
# Cells that query through `db` need the setup cell to be re-run afterwards.
db.close()
print("🔒 Database connection closed")