In [1]:
import os
import tempfile
from datetime import timedelta
from glob import glob

import numpy as np
import pandas as pd
from tqdm import tqdm
from tsfresh import extract_features

pd.set_option('display.max_rows', 100)

In [57]:
def read_file(filename):
    """Load one recording CSV and derive the columns used downstream.

    Parameters
    ----------
    filename : str or path-like
        CSV file (anything ``pandas.read_csv`` accepts) that contains at
        least a ``stages`` and a ``linetime`` column.

    Returns
    -------
    pandas.DataFrame
        The file contents with ``linetime`` parsed to datetimes and an extra
        boolean ``sleep`` column that is True wherever ``stages`` is above 0.
    """
    raw = pd.read_csv(filename)
    return raw.assign(
        sleep=raw["stages"].gt(0),
        linetime=pd.to_datetime(raw["linetime"]),
    )

# Load a single recording through read_file() into the notebook-level `df`.
df = read_file("./data/collection_mesa_hr_30_240/0001_combined.csv.gz")

In [58]:
def generate_slide_wins(df, winsize=11):
    """Cut ``df`` into overlapping sliding windows of ``winsize`` rows.

    Every run of ``winsize`` consecutive rows becomes one window; windows
    advance one row at a time, so a frame with ``n`` rows produces
    ``n - winsize + 1`` windows. Rows are addressed purely by position, so the
    result does not depend on what kind of index ``df`` carries.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows in the order they should be windowed. Must contain a
        ``"sleep"`` column, which supplies the labels.
    winsize : int, default 11
        Number of rows per window.

    Returns
    -------
    windows : pandas.DataFrame
        All windows stacked vertically (each row keeps its original index
        label, so labels repeat across windows) plus a ``seq_id`` column
        numbering the windows 0, 1, 2, ...
    labels : pandas.Series
        One entry per window: the ``"sleep"`` value of the row at offset
        ``winsize // 2`` inside that window. Indexed 0..n_windows-1, i.e.
        aligned with ``seq_id``.

        When ``df`` has fewer than ``winsize`` rows both results are empty.

    Raises
    ------
    ValueError
        If ``winsize`` is smaller than 1.
    """
    if winsize < 1:
        raise ValueError("winsize must be a positive integer")

    n_windows = len(df) - winsize + 1
    if n_windows <= 0:
        # Not enough rows for a single full window: return empty results with
        # the same columns/dtypes instead of failing inside pd.concat.
        no_windows = df.iloc[0:0].copy()
        no_windows["seq_id"] = np.array([], dtype="int64")
        return no_windows, pd.Series(dtype=df["sleep"].dtype)

    # Row positions of every window, laid out window after window:
    # [0..winsize-1, 1..winsize, 2..winsize+1, ...]
    starts = np.arange(n_windows)
    positions = (starts[:, None] + np.arange(winsize)).ravel()

    # One positional take replaces building and concatenating one small
    # frame per window.
    windows = df.iloc[positions].copy()
    windows["seq_id"] = np.repeat(starts, winsize).astype("int64")

    # The label of window i is the "sleep" value at position i + winsize // 2.
    center = winsize // 2
    labels = pd.Series(df["sleep"].values[center:center + n_windows])

    return windows, labels

In [None]:
# Per-recording results, kept as three parallel lists: entry i of each list
# belongs to the i-th recording that produced at least one window.
all_features, all_ys, all_ids = [], [], []

# Sorted so that recordings are processed in the same order on every run;
# the order returned by glob() depends on the file system.
input_files = sorted(glob("./data/collection_mesa_hr_30_240/*.csv.gz"))

for file in input_files:
    df = read_file(file)
    transformed_df, labels = generate_slide_wins(df, 21) # -10, Epoch, +10

    # Nothing to extract when the recording produced no windows.
    if labels.empty:
        continue

    # One feature row per window: windows are identified by "seq_id" and
    # ordered in time by "linetime".
    extracted_features = extract_features(
        transformed_df[["activity", "mean_hr", "linetime", "seq_id"]],
        column_id="seq_id",
        column_sort="linetime",
    )

    # Every window must have exactly one feature row and one label.
    assert extracted_features.shape[0] == labels.shape[0]

    # Tag each window with the first "mesaid" value found in this file.
    ids = pd.Series([df["mesaid"].unique()[0]] * labels.shape[0])

    all_features.append(extracted_features)
    all_ys.append(labels)
    all_ids.append(ids)


Feature Extraction: 100%|██████████| 30/30 [00:13<00:00,  2.29it/s]
Feature Extraction: 100%|██████████| 30/30 [00:17<00:00,  1.67it/s]
Feature Extraction: 100%|██████████| 30/30 [00:16<00:00,  1.79it/s]
Feature Extraction: 100%|██████████| 30/30 [00:20<00:00,  1.50it/s]
Feature Extraction: 100%|██████████| 30/30 [00:18<00:00,  1.67it/s]
Feature Extraction: 100%|██████████| 30/30 [00:17<00:00,  1.73it/s]
Feature Extraction: 100%|██████████| 30/30 [00:16<00:00,  1.82it/s]
Feature Extraction: 100%|██████████| 30/30 [00:18<00:00,  1.58it/s]
Feature Extraction: 100%|██████████| 30/30 [00:15<00:00,  1.91it/s]
Feature Extraction: 100%|██████████| 30/30 [00:16<00:00,  1.77it/s]
Feature Extraction: 100%|██████████| 30/30 [00:14<00:00,  2.06it/s]
Feature Extraction: 100%|██████████| 30/30 [00:19<00:00,  1.54it/s]
Feature Extraction: 100%|██████████| 30/30 [00:18<00:00,  1.63it/s]
Feature Extraction: 100%|██████████| 30/30 [00:17<00:00,  1.71it/s]
Feature Extraction: 100%|██████████| 30/30 [00:1

In [None]:
# This process takes a long time, so I am saving the final dataframes to files here...
# Afterwards they just need to be loaded back into memory...

In [None]:
# TODO: use sklearn pipelines to run a 5-fold cross-validation (5-CV) evaluation and predict the labels


In [None]:
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute

# `all_features` / `all_ys` are lists with one entry per recording. Stack them
# into a single feature matrix and label vector; ignore_index gives both the
# same fresh 0..n-1 index so that rows and labels line up one-to-one.
features_all = pd.concat(all_features, ignore_index=True)
labels_all = pd.concat(all_ys, ignore_index=True)
assert features_all.shape[0] == labels_all.shape[0]

# impute() modifies the frame it is given (replaces NaN / +-inf values).
impute(features_all)

# NOTE: the selection looks at the labels, so for an honest evaluation it
# needs to be done in the training set only.
features_filtered = select_features(features_all, labels_all)