In [4]:
import pandas as pd
import os
import numpy as np
import pytz
import plotly.graph_objects as go
import plotly.express as px
import plotly.subplots as sp
import pytz as tz
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn.discriminant_analysis import StandardScaler
import matplotlib.pyplot as plt
from scipy.signal import correlate, butter, filtfilt
from scipy.signal.windows import hann
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

In [5]:
# Load Cheyne-stokes

PSG = '../data/bishkek_csr/03_train_ready/nasal_files/25-04-2025_nasal.csv'
RESPECK = '../data/bishkek_csr/03_train_ready/respeck/25-04-2025_respeck.csv'

# --- Load Data ---
respeck_df = pd.read_csv(RESPECK)
respeck_df['timestamp'] = pd.to_datetime(respeck_df['interpolatedPhoneTimestamp'], unit='ms')
tz = pytz.timezone('Asia/Bishkek')
respeck_df['timestamp'] = respeck_df['timestamp'].dt.tz_localize('UTC').dt.tz_convert(tz)

psg_df = pd.read_csv(PSG)
psg_df['timestamp'] = pd.to_datetime(psg_df['UnixTimestamp'], unit='ms')
tz = pytz.timezone('Asia/Bishkek')
psg_df['timestamp'] = psg_df['timestamp'].dt.tz_localize('UTC').dt.tz_convert(tz)




In [None]:
from pylab import *
from memoize import *
from .util import *
import scipy.signal
from scipy.signal import filtfilt, lfilter
from scipy.signal.filter_design import butter
from .quat import Quat
from .FixedPoint import FXfamily, FXnum
import os
from mpl_toolkits.mplot3d import *
import logging
from .graph import *


def calculateThresholdLevelsSlow(signal, rmsBackwardLength, rmsForwardLength, rmsMultiplier, hamming=False):

    def threshold(i):
        rmsWindowStart = max(i-rmsBackwardLength, 0)
        rmsWindowEnd = min(i+rmsForwardLength, len(signal))
        sig = signal[rmsWindowStart:rmsWindowEnd]
        if any(isnan(sig)):
            return NaN
        if (hamming):
            rmsJoint = rmsHamming(sig)
        else:
            rmsJoint = rms(sig)
        result = rmsJoint * rmsMultiplier

        return result

    result = amap(threshold, range(len(signal)))
    result[:rmsBackwardLength] = nan
    result[-rmsForwardLength:] = nan
    return result

def calculateThresholdLevels(signal, rmsBackwardLength, rmsForwardLength, rmsMultiplier, symmetrical):
    result = nans((len(signal), 2))
    
    if not symmetrical:
        
        #fill sum of squares buffers
        posValues = []
        negValues = []
        windowLength = rmsBackwardLength + rmsForwardLength
        if len(signal) < windowLength:
            return result
        
        lastBananaIndex = nan
            
        for i in range(windowLength - 1):
            if signal[i] >= 0:
                posValues.append(signal[i])
            elif signal[i] < 0:
                negValues.append(signal[i])
            else: # if nan
                lastBananaIndex = i
                
        posArray = array(posValues)
        negArray = array(negValues)
        
        sumOfSquaresPos = sum(posArray**2)
        posCount = len(posArray)
        sumOfSquaresNeg = sum(negArray**2)
        negCount = len(negArray)
        
        for i in range(0, len(signal)):
            if i < rmsBackwardLength or i >= len(signal) - rmsForwardLength:
                posResult = nan
                negResult = nan
            else:
                newValue = signal[i+rmsForwardLength-1]
                if isnan(newValue):
                    lastBananaIndex = i+rmsForwardLength-1
                else:
                    if newValue >= 0:
                        sumOfSquaresPos += newValue**2
                        posCount += 1
                    elif newValue < 0:
                        sumOfSquaresNeg += newValue**2
                        negCount += 1
                
                if not isnan(lastBananaIndex) and i - lastBananaIndex <= rmsBackwardLength:
                    posResult = nan
                    negResult = nan
                else:
                    posResult = sqrt(sumOfSquaresPos / posCount) * rmsMultiplier
                    negResult = -sqrt(sumOfSquaresNeg / negCount) * rmsMultiplier
                
                oldValue = signal[i-rmsBackwardLength]
                
                if oldValue >= 0:
                    sumOfSquaresPos -= oldValue**2
                    posCount -= 1
                elif oldValue < 0:
                    sumOfSquaresNeg -= oldValue**2
                    negCount -=1
            result[i,0] = posResult
            result[i,1] = negResult
            
        return result
    
    else:
        #fill sum of squares buffers
        allValues = []
        windowLength = rmsBackwardLength + rmsForwardLength
        if len(signal) < windowLength:
            return result
        
        #print "signal length: " + str(len(signal))
        #print "windowLength: " + str(windowLength)
        #print "backward length: " + str(rmsBackwardLength)
        #print "forward length: " + str(rmsForwardLength)
        
        lastBananaIndex = nan
        
        for i in range(windowLength - 1):
            if not isnan(signal[i]):
                allValues.append(signal[i])
            else:
                lastBananaIndex = i
        allArray = array(allValues)
        
        sumOfSquaresAll = sum(allArray**2)
        allCount = len(allArray)
        
        for i in range(0, len(signal)):
            if i < rmsBackwardLength or i >= len(signal) - rmsForwardLength:
                allResult = nan
            else:
                newValue = signal[i+rmsForwardLength-1]
                if isnan(newValue):
                    lastBananaIndex = i+rmsForwardLength-1
                else:
                    sumOfSquaresAll += newValue**2
                    allCount += 1
                
                if not isnan(lastBananaIndex) and i - lastBananaIndex <= rmsBackwardLength:
                    allResult = nan
                else:
                    allResult = sqrt(sumOfSquaresAll / allCount) * rmsMultiplier
                
                oldValue = signal[i-rmsBackwardLength]
                if not isnan(oldValue):
                    sumOfSquaresAll -= oldValue**2
                    allCount -= 1
                    
            result[i,0] = allResult
            result[i,1] = -allResult
        #figure()
        #plot(signal)
        #plot(result)
        #show()
        return result

def calculateBreathTimes(signal, posThresholds, negThresholds, minThreshold, zeroCrossingBreathStart):
    
    def breathTimes(startIndex, endIndex):

        def setInitialState(startValue, posThreshold, negThreshold):
            if startValue < negThreshold:
                state = LOW
            elif startValue > posThreshold:
                state = HIGH
            else:
                state = MID_UNKNOWN
            return state
    
        state = setInitialState(signal[startIndex], posThresholds[startIndex], negThresholds[startIndex])
        times = []
    
        for i in range(startIndex + 1, endIndex + 1):
            posThreshold = posThresholds[i]
            negThreshold = negThresholds[i]
            if state == LOW and signal[i] > negThreshold:
                state = MID_RISING
            elif state == HIGH and signal[i] < posThreshold:
                state = MID_FALLING
            elif (state == MID_RISING or state == MID_UNKNOWN) and signal[i] > posThreshold:
                state = HIGH
            elif (state == MID_FALLING or state == MID_UNKNOWN) and signal[i] < negThreshold:
                state = LOW
                times.append(i)

        if zeroCrossingBreathStart:
            zeroCrossingBreathTimes = []
            for t in times:
                for i in range(t,-1,-1):
                    if signal[i] >= 0:
                        zeroCrossingBreathTimes.append(i)
                        break
            return zeroCrossingBreathTimes
        else:
            return times

    LOW, MID_FALLING, MID_UNKNOWN, MID_RISING, HIGH = range(5)

    
    invalidated = ones(shape(signal), dtype=bool)
    for i in range(len(invalidated)):
        if posThresholds[i] > minThreshold or negThresholds[i] < -minThreshold:
            invalidated[i] = False
    

    minIslandLength = 0
    islandLimits = findIslandLimits(invalidated, minIslandLength)
    
    times = []
    for (start, end) in islandLimits:
        bt = breathTimes(start, end)
        if len(bt) > 0:
            times.append(bt)

    return times

def breathTimesTrace(breathTimes, signalLength):

    trace = zeros(signalLength, dtype='bool')

    for i in range(len(breathTimes)):
        trace[breathTimes[i]] = 1

    return trace

def calculateBreathingRates(breathTimes, signalLength, sampleRate):
    #print "sample rate: ", sampleRate
    if len(breathTimes) == 0:
        return nans(signalLength)

    trace = nans(signalLength)
    rates = []

    #rates[:breathTimes[0]] = nan

    for bt in breathTimes:
        islandRates = []
        for i in range(1, len(bt)):
            rate = 60.0 * float(sampleRate) / float((bt[i] - bt[i-1]))
            trace[bt[i-1]:bt[i]] = rate
            islandRates.append(rate)
            
        rates.append(islandRates)

    #rates[breathTimes[-1]:] = nan

    return rates, trace

def calculateAvgBreathingRates(breathTimesList, signalLength, sampleRate, windowTime, backwardsLength, forwardsLength):

    if len(breathTimesList) == 0:
        return nans(signalLength)

    #print breathTimesList

    avgRates = nans(signalLength)

    for breathTimes in breathTimesList:
        for i in range(breathTimes[0] + backwardsLength, breathTimes[-1] - forwardsLength):
            breathCount = 0
            startIndex = i - backwardsLength
            endIndex = i + forwardsLength
            for b in breathTimes:
                if b >= startIndex and b <= endIndex:
                    breathCount += 1
            avgRates[i] = breathCount * (60 / windowTime)

    return avgRates

def calculateSmoothedBreathingRates(rateSignal, smoothness):
    from scipy.interpolate import UnivariateSpline
    first = firstValidIndex(rateSignal)
    last = lastValidIndex(rateSignal)
    validRateSignal = rateSignal[first:last]
    x = range(len(validRateSignal))
    y = validRateSignal
    sm = smoothness * len(validRateSignal)
    #print "smoothness: " + str(sm)
    s = UnivariateSpline(x,y,s=sm)
    smoothedRates = nans(len(rateSignal))
    smoothedRates[first:last] = s(x)
    return smoothedRates

def breathingVolumes(breathTimes, signal, sampleRate):

    if len(breathTimes) == 0:
        return nans(len(signal))

    volumes = empty(len(signal))

    volumes[:breathTimes[0]] = nan

    for i in range(1, len(breathTimes)):
        volume = trapz(abs(signal[breathTimes[i-1]:breathTimes[i]]))
        volumes[breathTimes[i-1]:breathTimes[i]] = volume / sampleRate

    volumes[breathTimes[-1]:] = nan

    return volumes


In [None]:
def countLocalMaximas(values):
    count = 0
    if len(values) < 3:
        return 1
    if len(values) > 1 and values[0] > values[1]:
        count += 1
    if len(values) > 1 and values[-1] > values[-2]:
        count += 1
    for i in range(1, len(values) - 1):
        if values[i] > values[i - 1] and values[i] > values[i + 1]:
            count += 1
    return count

def countLocalMinimas(values):
    count = 0
    if len(values) < 3:
        return 1
    if len(values) > 1 and values[0] < values[1]:
        count += 1
    if len(values) > 1 and values[-1] < values[-2]:
        count += 1
    for i in range(1, len(values) - 1):
        if values[i] < values[i - 1] and values[i] < values[i + 1]:
            count += 1
    return count


# Zac's code: Generating Respiratory Rate Variability (RRV)
def generate_RRV(sliced):
    sliced = sliced.dropna()
    if sliced.size == 0:
        return np.nan
    breathingSignal = sliced.values
    N = breathingSignal.shape[-1]
    y = breathingSignal
    yf = np.fft.fft(y)
    yff = 2.0/N * np.abs(yf[:N//2])
    temp_DCnotremov = yff
    if len(temp_DCnotremov) == 0 or len(temp_DCnotremov) == 1: 
        return 0.0
    else:
        DC = np.amax(temp_DCnotremov)
        maxi = np.argmax(temp_DCnotremov)
        temp_DCremov = np.delete(temp_DCnotremov, maxi)
        H1 = np.amax(temp_DCremov)
        return 100-(H1/DC)*100


def getBreaths(df, patient):
    minThreshold = 0.001
    mult = 1e-2
    
    signal = list(df.breathingSignal)
    
    time_diff = df['phoneTimestampBishkek'].diff()
    time_diff.map(lambda x: x.total_seconds()).mean()
    
    window_size = int((30 / time_diff.dropna().apply(lambda x: x.total_seconds()).mean()) // 2)
    threshs = calculateThresholdLevels(list(signal), window_size, window_size, mult, False)
    posThresh = threshs[:, 0]
    negThresh = threshs[:, 1]

    times = calculateBreathTimes(list(signal), posThresh, negThresh, minThreshold, False)

    total = set()
    minBreathLength = float("inf")
    maxBreathLength = float("-inf")
    for i in range(0, len(times)):
        vals = times[i]
        for j in range(0, len(vals)-1):
            start, end = vals[j], vals[j+1]
            minBreathLength = min(minBreathLength, end-start+1)
            maxBreathLength = max(maxBreathLength, end-start+1)
            for k in range(start, end+1):
                total.add(k)

    f = list(df.breathingSignal.dropna())
    a = f"{patient}: Uses Breath From {len(total)}/{len(f)} = {round((len(total)/len(f)) * 100, 2)}% Signal"
    b = f"{patient}: Max Breath Length: {maxBreathLength} points. Min Breath Length: {minBreathLength} points"
    print(a)
    print(b)
        
    return times

def extractFeatures(df, patient):
    times = getBreaths(df, patient)

    areas = []
    extremas = []
    peakRespiratoryFlows = []
    types = []
    durations = []
    activityLevels = []
    activityTypes = []
    starts = []
    ends = []

    activityLevel = np.array(df.activityLevel)
    activityType = np.array(df.activityType)
    signal = np.array(df.breathingSignal)
    timestamps = list(df.phoneTimestampBishkek)

    for i in range(0, len(times)):
        if i % 25 == 0:
            print(f"{i}/{len(times)}... ", end=" ")
        vals = times[i]
        
        for j in range(0, len(vals)-1):
            start, end = vals[j], vals[j+1]
            flag = False
            breath = signal[start:end+1]
            breakPoint = start
            for k, val in enumerate(breath):
                if val >= 0.005: # arbitrary but to remove noise...
                    breakPoint = start + k
                    break

            # compute inhalation
            inhalation, inhalation_times = signal[start:breakPoint], timestamps[start:breakPoint]
            exhalation, exhalation_times = signal[breakPoint:end+1], timestamps[breakPoint:end+1]
                    
            level = activityLevel[start:end+1].mean()
            modeType = np.mode(activityType[start:end+1])[0]
            
            # compute inhalation
            if len(inhalation) > 1:
                peak = max(abs(np.array(inhalation)))
                extrema = countLocalMaximas(inhalation)
                dx = (inhalation_times[-1]-inhalation_times[0]).total_seconds() / len(inhalation)
                area = abs(np.trapz(y=inhalation,dx=dx))
                duration = (inhalation_times[-1]-inhalation_times[0]).total_seconds()
                
                areas.append(area)
                extremas.append(extrema)
                peakRespiratoryFlows.append(peak)
                types.append("Inhalation")
                durations.append(duration)
                activityLevels.append(level)
                activityTypes.append(modeType)
                starts.append(inhalation_times[0])
                ends.append(inhalation_times[-1])

            # compute exhalation
            if len(exhalation) > 1:
                peak = max(abs(np.array(exhalation)))
                extrema = countLocalMinimas(exhalation)    
                dx = (exhalation_times[-1]-exhalation_times[0]).total_seconds() / len(exhalation)
                area = abs(np.trapz(y=exhalation,dx=dx))  
                duration = (exhalation_times[-1]-exhalation_times[0]).total_seconds()
                
                areas.append(area)
                extremas.append(extrema)
                peakRespiratoryFlows.append(peak)
                types.append("Exhalation")
                durations.append(duration)
                activityLevels.append(level)
                activityTypes.append(modeType)
                starts.append(exhalation_times[0])
                ends.append(exhalation_times[-1])

    return pd.DataFrame(data={"type": types, "area": areas, "peakRespiratoryFlow": peakRespiratoryFlows, "extremas": extremas, "duration": durations, "meanActivityLevel": activityLevels, "modeActivityType": activityTypes, "startTimestamp": starts, "endTimestamp": ends})

In [7]:
# === Define Flow Regularity Function ===

def getRegularity(df):

    scaler = MinMaxScaler()

    # 1st PCA: ['area', 'peakRespiratoryFlow']

    df_norm_2d = scaler.fit_transform(df[['area', 'peakRespiratoryFlow']])

    pca_2d = PCA(n_components=1)

    pca_2d.fit(df_norm_2d)

    pc1 = pca_2d.components_[0]

    te = np.linalg.norm(df_norm_2d - np.outer(df_norm_2d @ pc1, pc1), axis=1)

 

    # 2nd PCA: ['area', 'peakRespiratoryFlow', 'BR_mean']

    df_norm_3d = scaler.fit_transform(df[['area', 'peakRespiratoryFlow', 'BR_mean']])

    pca_3d = PCA(n_components=3)

    pca_3d.fit(df_norm_3d)

    pcs = pca_3d.components_

 

    d1 = np.linalg.norm(df_norm_3d - np.outer(df_norm_3d @ pcs[0], pcs[0]), axis=1)

    d2 = np.linalg.norm(df_norm_3d - np.outer(df_norm_3d @ pcs[1], pcs[1]), axis=1)

    d3 = np.linalg.norm(df_norm_3d - np.outer(df_norm_3d @ pcs[2], pcs[2]), axis=1)

 

    score = te + (d1 - d2 + d3)

    score = (score - score.min()) / (score.max() - score.min())

    return 1 - score

 

# === Clean & Process All Files ===

input_folder = "../data/bishkek_csr/03_train_ready/respeck"

output_folder = "./features2"

os.makedirs(output_folder, exist_ok=True)

 

files = [f for f in os.listdir(input_folder) if f.endswith("_respeck.csv")]

 

for file in files:

    print(f"Processing {file}")

    path = os.path.join(input_folder, file)

    features_df = pd.read_csv(path)

 

    # Drop missing values

    features_df = features_df.dropna()

    print("After dropna:", len(features_df))

 

    # Remove invalid values

    for col in ['area', 'peakRespiratoryFlow', 'extremas']:

        features_df = features_df[features_df[col] >= 0]

    print("After removing invalid values:", len(features_df))

 

    # Optional: switch inhalation/exhalation if needed

    mean_exh = features_df[features_df['type'] == 'Exhalation']['duration'].mean()

    mean_inh = features_df[features_df['type'] == 'Inhalation']['duration'].mean()

    if mean_exh < mean_inh:

        features_df['type'] = features_df['type'].map(lambda x: "Inhalation" if x == "Exhalation" else "Exhalation")

    print("After type switch (if needed):", len(features_df))

 

    # Filter too long/short durations

    features_df = features_df[features_df['duration'] < 5]

    print("After duration filter:", len(features_df))

 

    # Remove outliers with IQR rule

    cols = ['extremas', 'area', 'peakRespiratoryFlow']

    Q3 = features_df[cols].quantile(0.75)

    IQR = Q3 - features_df[cols].quantile(0.25)

    threshold = Q3 + 1.5 * IQR

    mask = (features_df[cols] <= threshold).all(axis=1)

    processed_df = features_df[mask]

    print("After outlier removal:", len(processed_df))

 

    # Only add regularity if enough rows exist and required cols are present

    if len(processed_df) > 10 and all(c in processed_df.columns for c in ['area', 'peakRespiratoryFlow', 'BR_mean']):

        processed_df['flowRegularity'] = getRegularity(processed_df)

    else:

        processed_df['flowRegularity'] = np.nan

        print(" Not enough rows or missing columns — regularity set to NaN")

 

    # Save cleaned file

    out_path = os.path.join(output_folder, file.replace("_merged", "_cleaned"))

    processed_df.to_csv(out_path, index=False)

    print(f"Saved: {out_path}\n")

Processing 04-04-2025_respeck.csv
After dropna: 7895


KeyError: 'area'

In [None]:
folder_path = "MC_sample/cleaned_with_regularity"

csv_files = [f for f in os.listdir(folder_path) if f.endswith("_cleaned.csv")]

patient_stats = []


for file in csv_files:

    path = os.path.join(folder_path, file)

    df = pd.read_csv(path)

    if 'startTimestamp' not in df.columns: continue

    df['startTimestamp'] = pd.to_datetime(df['startTimestamp'], errors='coerce')

    df['date'] = df['startTimestamp'].dt.date

    num_days = df['date'].nunique()

    num_breaths = len(df)

    avg_per_day = num_breaths / num_days if num_days > 0 else 0

    patient_stats.append((file, num_days, num_breaths, avg_per_day))

 

# Sort and display

sorted_patients = sorted(patient_stats, key=lambda x: (-x[1], -x[3]))

for file, days, total, avg in sorted_patients:

    print(f"{file}: Days = {days}, Total Breaths = {total}, Avg/Day = {avg:.1f}")

 