In [None]:
import pandas as pd
import os
import numpy as np
import pytz
import plotly.graph_objects as go
import plotly.express as px
import plotly.subplots as sp
import pytz as tz
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from scipy.signal import butter, filtfilt
from sklearn.discriminant_analysis import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from respiratoryFeatures import *
from calculateContinuousBreathFeatures import *


In [None]:
RESPECK_FILE = '../data/bishkek_csr/03_train_ready/respeck/08-05-2025_respeck.csv'
PSG_FILE = '../data/bishkek_csr/03_train_ready/nasal_files/08-05-2025_nasal.csv'
LABELS_FILE = '../data/bishkek_csr/03_train_ready/event_exports/08-05-2025_event_export.csv'
OUTPUT_FILE = './08-05-2025_respeck_features.csv'

# NOTE(review): binding `tz` here shadows the `import pytz as tz` alias from the
# imports cell. The name is kept (and bound exactly once) because later cells
# may rely on `tz` being the Asia/Bishkek timezone object.
tz = pytz.timezone('Asia/Bishkek')


def _read_with_local_timestamp(csv_path, epoch_ms_column, local_tz=tz):
    """
    Read a CSV and add a timezone-aware ``timestamp`` column.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to load.
    epoch_ms_column : str
        Name of the column holding epoch timestamps in milliseconds.
    local_tz : tzinfo
        Timezone the UTC instants are converted to.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with an extra ``timestamp`` column.
    """
    frame = pd.read_csv(csv_path)
    # Epoch values are interpreted as UTC instants, then expressed in local time.
    frame['timestamp'] = (
        pd.to_datetime(frame[epoch_ms_column], unit='ms', utc=True)
        .dt.tz_convert(local_tz)
    )
    return frame


# --- Load Data ---
print("Loading data...")

respeck_df = _read_with_local_timestamp(RESPECK_FILE, 'alignedTimestamp')
psg_df = _read_with_local_timestamp(PSG_FILE, 'UnixTimestamp')
# labels_df only gets the timestamp column; it is not trimmed to the overlap below.
labels_df = _read_with_local_timestamp(LABELS_FILE, 'UnixTimestamp')

# --- Restrict both recordings to the period covered by each of them ---
start_time_respeck = respeck_df['timestamp'].min()
end_time_respeck = respeck_df['timestamp'].max()

start_time_psg = psg_df['timestamp'].min()
end_time_psg = psg_df['timestamp'].max()

overlap_start = max(start_time_respeck, start_time_psg)
overlap_end = min(end_time_respeck, end_time_psg)

# Fail loudly here instead of handing empty frames to the feature extraction,
# where the resulting error would not point back at the real cause.
if overlap_start > overlap_end:
    raise ValueError(
        f"Respeck ({start_time_respeck} .. {end_time_respeck}) and PSG "
        f"({start_time_psg} .. {end_time_psg}) recordings do not overlap"
    )

# `between` is inclusive on both ends, i.e. the same as `>= start & <= end`.
respeck_df = respeck_df[respeck_df['timestamp'].between(overlap_start, overlap_end)]
psg_df = psg_df[psg_df['timestamp'].between(overlap_start, overlap_end)]

print(overlap_start)
print(overlap_end)

# Rows without a breathing sample cannot contribute to any breathing feature.
respeck_df = respeck_df.dropna(subset=['breathingSignal'])

In [None]:
import pandas as pd

def dict_to_dataframe(data_dict):
    """
    Convert a dictionary of scalars and 1-D sequences of possibly different
    lengths into a DataFrame with one column per key.

    Rules applied to each value:
      * scalars, strings/bytes, 0-d arrays and length-1 sequences are
        repeated over every row;
      * sequences shorter than the longest one (including empty ones) are
        padded at the end with NaN;
      * sequences that already have the maximum length are used unchanged.

    Parameters
    ----------
    data_dict : dict
        Mapping of column name to scalar or 1-D sequence.

    Returns
    -------
    pandas.DataFrame
        One column per key, with as many rows as the longest sequence
        (one row when every value is a scalar). An empty dictionary yields
        an empty DataFrame.
    """
    def _sequence_length(value):
        """Return len(value) for sized containers, or None for scalar-like values."""
        # Strings are sized but must not be exploded into characters.
        if isinstance(value, (str, bytes)) or not hasattr(value, '__len__'):
            return None
        try:
            return len(value)
        except TypeError:
            # 0-d numpy arrays expose __len__ but raise when it is called.
            return None

    # max() over an empty dict would raise; an empty frame is the natural result.
    if not data_dict:
        return pd.DataFrame()

    lengths = {key: _sequence_length(value) for key, value in data_dict.items()}
    max_length = max(1 if n is None else n for n in lengths.values())

    columns = {}
    for key, value in data_dict.items():
        n = lengths[key]
        if n is None or n == 1:
            # Scalar-like: repeat for the entire length.
            columns[key] = np.full(max_length, value)
        elif n < max_length or n == 0:
            # Shorter (or empty) sequence: pad the tail with NaN.
            columns[key] = np.concatenate([value, np.full(max_length - n, np.nan)])
        else:
            columns[key] = value

    return pd.DataFrame(columns)

# Run the breath-feature extraction over the whole trimmed Respeck recording.
respeck_timestamps = respeck_df['timestamp'].to_numpy()
respeck_breathing = respeck_df['breathingSignal'].to_numpy()
x = calculate_TS_breathFeatures(respeck_timestamps, respeck_breathing)

# Tabulate the returned dictionary; shorter entries are padded by dict_to_dataframe.
df_features = dict_to_dataframe(x)
print(df_features.head())


In [None]:
import pandas as pd
import numpy as np
import glob
import os
from scipy import stats, signal as scipy_signal
from scipy.stats import spearmanr, skew, kurtosis
from collections import Counter
import warnings

# XGBoost and ML imports
import xgboost as xgb
from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, balanced_accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from calculateContinuousBreathFeatures import *

# Visualization and interpretability
import matplotlib.pyplot as plt
import seaborn as sns
import shap
def _xgb_time_domain_features(signal):
    """Return amplitude statistics, zero-crossing rate and activity level of a non-empty 1-D float array."""
    n_samples = len(signal)
    centered = signal - np.mean(signal)
    return {
        # Basic statistical features
        'signal_mean': np.mean(signal),
        'signal_std': np.std(signal),
        'signal_var': np.var(signal),
        'signal_range': np.max(signal) - np.min(signal),
        'signal_skewness': skew(signal) if n_samples > 2 else 0,
        'signal_kurtosis': kurtosis(signal) if n_samples > 3 else 0,
        'rms': np.sqrt(np.mean(signal ** 2)),
        # Sign changes of the mean-removed signal, per sample
        'zero_crossing_rate': np.sum(np.diff(np.sign(centered)) != 0) / n_samples,
        # Mean absolute first difference; undefined for one sample, so report 0
        'activity_level': np.mean(np.abs(np.diff(signal))) if n_samples > 1 else 0.0,
    }


def _xgb_spectral_features(signal, sampling_rate, max_nperseg):
    """Return Welch band-power ratios and the dominant spectral peak (all 0 when the signal is too short)."""
    spectral = {
        'vlf_power_ratio': 0,
        'lf_power_ratio': 0,
        'normal_power_ratio': 0,
        'hf_power_ratio': 0,
        'dominant_frequency': 0,
        'dominant_power': 0,
    }

    nperseg = min(max_nperseg, len(signal) // 4)
    # Fewer than 8 samples leaves no usable segment (welch rejects nperseg=0).
    if nperseg < 2:
        return spectral

    freqs, psd = scipy_signal.welch(signal, fs=sampling_rate, nperseg=nperseg)
    total_power = np.sum(psd) + 1e-10

    # OSA-specific frequency bands as [low, high) in Hz
    bands = (
        ('vlf_power_ratio', 0.008, 0.04),     # Apnea cycling
        ('lf_power_ratio', 0.04, 0.15),       # Abnormal patterns
        ('normal_power_ratio', 0.15, 0.6),    # Normal breathing
        ('hf_power_ratio', 0.6, 2.0),         # Effort/artifacts
    )
    for key, low_hz, high_hz in bands:
        in_band = (freqs >= low_hz) & (freqs < high_hz)
        # A band with no frequency bin keeps its 0 default.
        if np.any(in_band):
            spectral[key] = np.sum(psd[in_band]) / total_power

    peak_idx = np.argmax(psd)
    spectral['dominant_frequency'] = freqs[peak_idx]
    spectral['dominant_power'] = psd[peak_idx]
    return spectral


def _xgb_breath_cycle_features(breath_features):
    """Summarise the 'amplitude', 'breath_durations' and 'rr' entries of a breath-feature dict, when present."""
    summary = {}
    if not breath_features:
        return summary

    if 'amplitude' in breath_features:
        amplitudes = np.array(breath_features['amplitude'])
        if len(amplitudes) > 0:
            summary['amplitude_mean'] = np.mean(amplitudes)
            summary['amplitude_std'] = np.std(amplitudes)
            summary['amplitude_cv'] = summary['amplitude_std'] / (summary['amplitude_mean'] + 1e-10)

            # Key OSA features
            summary['amplitude_p10'] = np.percentile(amplitudes, 10)
            summary['amplitude_p50'] = np.percentile(amplitudes, 50)
            summary['amplitude_p90'] = np.percentile(amplitudes, 90)
            summary['amplitude_reduction_ratio'] = 1 - (
                summary['amplitude_p10'] / (summary['amplitude_p90'] + 1e-10)
            )

    if 'breath_durations' in breath_features:
        durations = np.array(breath_features['breath_durations'])
        if len(durations) > 0:
            summary['breath_duration_mean'] = np.mean(durations)
            summary['breath_duration_std'] = np.std(durations)
            summary['breath_duration_cv'] = summary['breath_duration_std'] / (
                summary['breath_duration_mean'] + 1e-10
            )
            # NOTE(review): the threshold 20 is in the unit of `breath_durations`
            # (presumably seconds) - confirm against calculate_TS_breathFeatures.
            summary['long_breath_ratio'] = np.sum(durations > 20) / len(durations)

    if 'rr' in breath_features:
        rr = np.array(breath_features['rr'])
        rr = rr[~np.isnan(rr)]
        if len(rr) > 0:
            summary['respiratory_rate_mean'] = np.mean(rr)
            summary['respiratory_rate_std'] = np.std(rr)
            summary['respiratory_rate_cv'] = summary['respiratory_rate_std'] / (
                summary['respiratory_rate_mean'] + 1e-10
            )

    return summary


def extract_clinical_breath_features_xgb(timestamps, signal, window_size_sec=30,
                                         sampling_rate=12.5, max_nperseg=128):
    """
    Extract clinical breath features optimized for XGBoost training.

    Parameters
    ----------
    timestamps : array-like
        Sample timestamps; only forwarded to ``calculate_TS_breathFeatures``.
    signal : array-like
        Breathing signal samples. Converted to a float ndarray, so plain lists
        are accepted.
    window_size_sec : int, optional
        Currently unused; kept so existing callers keep working.
    sampling_rate : float, optional
        Sampling frequency in Hz used for the Welch spectrum and for the
        ~1 second decimation of the trend estimate. Default 12.5.
    max_nperseg : int, optional
        Upper bound for the Welch segment length (the segment length is
        ``min(max_nperseg, len(signal) // 4)``). Default 128.

    Returns
    -------
    dict
        Feature name -> value. Time-domain, spectral, envelope and variability
        keys are always present. The amplitude / breath-duration /
        respiratory-rate keys are only present when
        ``calculate_TS_breathFeatures`` returns non-empty ``'amplitude'``,
        ``'breath_durations'`` and ``'rr'`` entries respectively.

    Raises
    ------
    ValueError
        If ``signal`` is empty.

    Notes
    -----
    With the defaults (12.5 Hz, 128-sample segments) the spectral resolution
    is 12.5 / 128 ~= 0.098 Hz. No bin falls inside the 0.008-0.04 Hz band, so
    ``vlf_power_ratio`` is always 0, and the 0.04-0.15 Hz band holds a single
    bin. Pass a larger ``max_nperseg`` on long recordings to resolve them.
    """
    signal = np.asarray(signal, dtype=float)
    if signal.size == 0:
        raise ValueError("signal must contain at least one sample")

    features = _xgb_time_domain_features(signal)
    features.update(_xgb_spectral_features(signal, sampling_rate, max_nperseg))

    # Breathing-specific features derived from the per-breath analysis
    features.update(_xgb_breath_cycle_features(calculate_TS_breathFeatures(timestamps, signal)))

    # Additional clinical features: variability of the analytic-signal envelope
    signal_envelope = np.abs(scipy_signal.hilbert(signal - np.mean(signal)))
    features['envelope_std'] = np.std(signal_envelope)
    features['envelope_cv'] = features['envelope_std'] / (np.mean(signal_envelope) + 1e-10)

    # Effort-flow mismatch proxy: high-frequency power relative to normal-breathing power
    features['effort_flow_mismatch'] = features['hf_power_ratio'] / (features['normal_power_ratio'] + 1e-10)

    # Short/long term variability (0 when the signal is too short to estimate them)
    features['short_term_variability'] = 0
    features['long_term_trend'] = 0
    if len(signal) >= 20:
        features['short_term_variability'] = np.std(signal[::5])  # Every 5th sample

        if len(signal) >= 60:
            # About one sample per second (every 12th sample at 12.5 Hz).
            step = max(1, int(sampling_rate))
            decimated = signal[::step]
            # A line fit needs at least two points.
            if len(decimated) >= 2:
                slope = np.polyfit(np.arange(len(decimated)), decimated, 1)[0]
                features['long_term_trend'] = np.abs(slope)

    return features

# Summarise the whole trimmed Respeck recording as a single feature dictionary.
breath_features_dict = extract_clinical_breath_features_xgb(
    timestamps=respeck_df['timestamp'].to_numpy(),
    signal=respeck_df['breathingSignal'].to_numpy(),
)
print(breath_features_dict)