In [1]:
import tempfile
import pandas as pd
pd.set_option('display.max_rows', 100)

import numpy as np
import os
from tqdm import tqdm
from glob import glob
from datetime import timedelta

In [3]:
df = pd.read_csv("./data/collection_mesa_hr_30_240/0001_combined.csv.gz")
df["sleep"] = df["stages"] > 0
df["linetime"] = pd.to_datetime(df["linetime"])

In [4]:
def sadeh_algorithm(activity, min_threshold=0, minNat=50, maxNat=100, window_past=6,
                    window_nat=11, window_centered=11):
    """
    Sadeh model for classifying sleep vs active
    """
    act = activity.copy()

    _mean = act.rolling(window=window_centered, center=True, min_periods=1).mean()
    _std = act.rolling(window=window_past, min_periods=1).std()
    _nat = ((act >= minNat) & (act <= maxNat)).rolling(window=window_nat, center=True, min_periods=1).sum()

    _LocAct = (act + 1.).apply(np.log)

    sadeh = (7.601 - 0.065 * _mean - 0.056 * _std - 0.0703 * _LocAct - 1.08 * _nat)
    sadeh.name = "Sadeh"

    # Returns a series with binary values: 1 for sleep, 0 for awake
    return (sadeh > min_threshold).astype(int)

df["sadeh"] = sadeh_algorithm(df["activity"])

In [5]:
def scripps_clinic_algorithm(activity, scaler=0.204):

    act_series = dict()
    act_series["_a0"] = activity.fillna(0.0)

    # Enrich the dataframe with temporary values
    for i in range(1, 11):
        act_series["_a-%d" % (i)] = activity.shift(i).fillna(0.0)
        act_series["_a+%d" % (i)] = activity.shift(-i).fillna(0.0)

    # Calculates Scripps clinic algorithm
    scripps = scaler * (0.0064 * act_series["_a-10"] + 0.0074 * act_series["_a-9"] +
                        0.0112 * act_series["_a-8"] + 0.0112 * act_series["_a-7"] +
                        0.0118 * act_series["_a-6"] + 0.0118 * act_series["_a-5"] +
                        0.0128 * act_series["_a-4"] + 0.0188 * act_series["_a-3"] +
                        0.0280 * act_series["_a-2"] + 0.0664 * act_series["_a-1"] +
                        0.0300 * act_series["_a0"] + 0.0112 * act_series["_a+1"] +
                        0.0100 * act_series["_a+2"])

    # Returns a series with binary values: 1 for sleep, 0 for awake
    return (scripps < 1.0).astype(int)
df["scripps"] = scripps_clinic_algorithm(df["activity"])

In [6]:
def ground_truth_sleeponset_prediction(df, col="sleep", number_consec_epochs=10):
    """ Sleep col is 1 if subject is sleeping and 0 otherwise.
        The ground truth would be at least 10 awakes, then sleep onset, and at least 10 sleeps.
        Or.... 10 Falses followed by 10 Trues
    
    """
    # Gets the average label in the past X (=10) epochs. 
    # A mean value of 10 here means that the subject was sleeping for at least 10 consecutive epochs
    rolling_mean = df[col].rolling(number_consec_epochs).mean()
    
    rolling_mean_p10  = rolling_mean.shift(-(number_consec_epochs))
    
    return (rolling_mean <= 0) & (rolling_mean_p10 >= 1)

gt = df[ground_truth_sleeponset_prediction(df, "sleep", 10)]
gt

Unnamed: 0,activity,linetime,mean_hr,stages,mesaid,sleep,sadeh,scripps
596,0.0,2017-01-06 01:33:30,64.70877,0,1,False,1,1
705,0.0,2017-01-06 02:32:30,63.972163,0,1,False,1,1
1197,0.0,2017-01-06 06:42:00,74.989511,0,1,False,1,1


In [7]:
pred_scripps = df[ground_truth_sleeponset_prediction(df, "scripps", 10)]
pred_scripps

Unnamed: 0,activity,linetime,mean_hr,stages,mesaid,sleep,sadeh,scripps
173,0.0,2017-01-05 21:56:30,71.845511,1,1,True,1,0
248,0.0,2017-01-05 22:35:00,70.047317,0,1,False,1,0
304,0.0,2017-01-05 23:04:30,73.59018,0,1,False,1,0
588,0.0,2017-01-06 01:29:30,68.790494,0,1,False,1,0
813,0.0,2017-01-06 03:27:00,63.023003,4,1,True,1,0
1143,0.0,2017-01-06 06:15:00,67.434649,0,1,False,1,0


In [8]:
pred_sadeh = df[ground_truth_sleeponset_prediction(df, "sadeh", 10)]
pred_sadeh 

Unnamed: 0,activity,linetime,mean_hr,stages,mesaid,sleep,sadeh,scripps
107,29.0,2017-01-05 21:23:30,73.717111,0,1,False,0,0
583,5.0,2017-01-06 01:27:00,68.183903,0,1,False,0,0


In [9]:
def calc_eval_metrics(gt, pred, tolerance_in_minutes=5):
    
    if len(pred) == 0:
        return 0, 0, 0
    
    # Checks the recall: from all possible events how many I could find?
    okay, wrong = 0, 0
    for (_, gt_row) in gt.iterrows():

        found_window = False
        for (_, pred_row) in pred.iterrows():
            t_diff = gt_row["linetime"] - pred_row["linetime"]

            if timedelta(minutes=-tolerance_in_minutes) <= t_diff <= timedelta(minutes=tolerance_in_minutes):
                found_window = True
                break

        if found_window:
            okay += 1
        else:
            wrong += 1

    recall = okay / (okay+wrong)

    # Checks the precision: from all my guesses how many are right?
    okay, wrong = 0, 0
    for (_, pred_row) in pred.iterrows():
        found_window = False
        for (_, gt_row) in gt.iterrows():
            t_diff = gt_row["linetime"] - pred_row["linetime"]
            if timedelta(minutes=-tolerance_in_minutes) <= t_diff <= timedelta(minutes=tolerance_in_minutes):
                found_window = True
                break
            #TODO: take into account when many sleep onset events are close to each other

        if found_window:
            okay += 1
        else:
            wrong += 1

    precision = okay / (okay+wrong)

    if (precision + recall) > 0:
        f1 = (2. * precision * recall) / (precision + recall)
    else:
        f1 = 0

    return precision, recall, f1

In [10]:
calc_eval_metrics(gt, pred_sadeh, 60)

(0.5, 0.3333333333333333, 0.4)

In [11]:
calc_eval_metrics(gt, pred_scripps, 60)

(0.5, 1.0, 0.6666666666666666)

In [12]:
input_files = glob("./data/collection_mesa_hr_30_240/*.csv.gz")

ps, rs, f1s = {"sadeh": [], "scripps": []}, {"sadeh": [], "scripps": []}, {"sadeh": [], "scripps": []}
for file in input_files:

    df = pd.read_csv(file)
    df["sleep"] = df["stages"] > 0
    df["linetime"] = pd.to_datetime(df["linetime"])
    
    df["sadeh"] = sadeh_algorithm(df["activity"])
    df["scripps"] = scripps_clinic_algorithm(df["activity"])
    
    for sleep_alg in ["scripps", "sadeh"]:
        pred = df[ground_truth_sleeponset_prediction(df, sleep_alg, 10)]
        p, r, f1 = calc_eval_metrics(gt, pred, 20)
        ps[sleep_alg].append(p)
        rs[sleep_alg].append(r)
        f1s[sleep_alg].append(f1)
        

In [13]:
for sleep_alg in ["scripps", "sadeh"]:
    print("Alg: %s, Precision: %.3f, Recall: %.3f, F1: %.3f" % (sleep_alg, np.array(ps[sleep_alg]).mean(),
                                                       np.array(rs[sleep_alg]).mean(),
                                                       np.array(f1s[sleep_alg]).mean())
         )

Alg: scripps, Precision: 0.028, Recall: 0.032, F1: 0.028
Alg: sadeh, Precision: 0.019, Recall: 0.021, F1: 0.019


In [None]:
# ML NN algorithm
# TODO ...
