# Please note: this notebook is not intended to be used in this tutorial.
It just has a few commands used to preprocess and mask the raw MESA dataset.

In [1]:
import os
import numpy as np
import pandas as pd
from glob import glob

In [2]:
# Folder holding the raw per-participant files (named "mesa-sleep-<pid>_<suffix>").
DATA_FOLDER = "/Users/joao/Dropbox/transfer/original_mesa"
# Number of participants (taken from the start of the sorted pid list) used for training.
NTRAIN = 100
# NOTE(review): NTEST is not referenced by any cell visible here — presumably the
# size of a test split built elsewhere; confirm.
NTEST = 100

In [3]:
def get_pids(folder=None):
    """Collect the participant IDs found in the raw data folder.

    Every entry in the folder is expected to be named
    ``mesa-sleep-<pid>_<suffix>``; the ID is the text between the
    ``mesa-sleep-`` prefix and the first underscore.

    Args:
        folder: Directory to scan. Defaults to the notebook-level
            ``DATA_FOLDER`` when omitted.

    Returns:
        A set with one string per distinct participant ID.
    """
    if folder is None:
        folder = DATA_FOLDER

    prefix = "mesa-sleep-"
    pids = set()
    for path in glob(os.path.join(folder, "*")):
        name = os.path.basename(path)
        # Remove the exact prefix. ``str.strip(prefix)`` would instead strip any
        # of the prefix's *characters* from both ends, which mangles IDs that
        # start or end with one of those letters.
        if name.startswith(prefix):
            name = name[len(prefix):]
        pids.add(name.split("_")[0])

    return pids
    
# Put the IDs in a list and sort it so that slicing it later always selects
# the same participants.
pids = list(get_pids())
pids.sort()
# Show the first ten IDs as a quick sanity check.
pids[:10]

['0001',
 '0016',
 '0021',
 '0028',
 '0033',
 '0036',
 '0046',
 '0050',
 '0052',
 '0074']

In [4]:
def get_data(pid, data_folder=None):
    """Load and merge the raw per-participant files into one DataFrame.

    Reads four single-column, header-less files for ``pid``
    (``_time.txt``, ``_count_feature.txt``, ``_sleep_labels.txt`` and
    ``_hr_feature.csv``) and outer-joins the heart-rate rows onto the
    time-indexed counts/labels, so rows present in only one source are kept
    with NaN in the other columns.

    Args:
        pid: Participant ID as it appears in the file names (e.g. ``"0001"``).
        data_folder: Directory containing the files. Defaults to the
            notebook-level ``DATA_FOLDER`` when omitted.

    Returns:
        DataFrame with columns ``time``, ``counts``, ``label``, ``hr`` and
        ``pid`` (``pid`` stored as a string); ``time`` is the joined index
        shifted down by one.
    """
    if data_folder is None:
        data_folder = DATA_FOLDER

    def read_column(suffix, column):
        # Each raw file holds a single unnamed column; give it a name on load.
        path = os.path.join(data_folder, "mesa-sleep-%s_%s" % (pid, suffix))
        return pd.read_csv(path, names=[column])

    time = read_column("time.txt", "time")
    counts = read_column("count_feature.txt", "counts")
    labels = read_column("sleep_labels.txt", "label")
    hr = read_column("hr_feature.csv", "hr")

    # Shift the hr row index to start at 1 before joining it against the
    # values of the time file.
    # NOTE(review): presumably the time file is 1-based — confirm.
    hr.index += 1

    df = pd.concat([time, counts, labels], axis=1).set_index("time")
    df = pd.merge(df, hr, left_index=True, right_index=True, how="outer")
    df["pid"] = str(pid)

    # Name the joined index explicitly so reset_index() yields a "time" column.
    df.index.name = "time"
    df = df.reset_index()
    # Convert the 1-based join key back to a 0-based time axis.
    df["time"] = df["time"] - 1

    return df

In [5]:
# Sanity check: show the first 50 merged rows for participant "0001".
get_data("0001").head(50)

Unnamed: 0,time,counts,label,hr,pid
0,0,,,75.0,1
1,1,,,75.0,1
2,2,,,76.0,1
3,3,,,76.0,1
4,4,,,76.0,1
5,5,,,75.0,1
6,6,,,75.0,1
7,7,,,75.0,1
8,8,,,76.0,1
9,9,,,75.0,1


In [6]:
# Number of distinct participant IDs collected by get_pids().
len(pids)

867

In [7]:
# Load the first NTRAIN participants one at a time, printing each ID as a
# progress indicator, then stack the per-participant frames into one table
# with a fresh 0..n-1 row index.
frames = []
for pid in pids[:NTRAIN]:
    print(pid)
    frames.append(get_data(pid))

dfs = pd.concat(frames).reset_index(drop=True)

0001
0016
0021
0028
0033
0036
0046
0050
0052
0074
0107
0111
0120
0121
0125
0133
0138
0144
0152
0155
0159
0167
0171
0193
0197
0220
0251
0271
0275
0282
0286
0292
0295
0299
0301
0306
0318
0323
0332
0339
0374
0380
0382
0386
0392
0393
0402
0423
0427
0435
0443
0445
0459
0470
0474
0476
0495
0499
0501
0509
0518
0522
0526
0528
0529
0534
0545
0550
0554
0555
0558
0589
0604
0612
0626
0632
0640
0657
0664
0677
0686
0688
0694
0702
0711
0712
0715
0716
0727
0728
0762
0768
0782
0784
0791
0796
0801
0804
0807
0811


In [8]:
# Replace the original participant IDs with consecutive integers (0, 1, 2, ...)
# assigned in order of first appearance in `dfs`. pd.factorize does this
# directly, without a temporary column or a groupby.apply over the grouping
# column.
dfs["pid"] = pd.factorize(dfs["pid"])[0]

In [9]:
# Inspect the first 40 rows of the combined table.
dfs.head(40)

Unnamed: 0,time,counts,label,hr,pid
0,0,,,75.0,0
1,1,,,75.0,0
2,2,,,76.0,0
3,3,,,76.0,0
4,4,,,76.0,0
5,5,,,75.0,0
6,6,,,75.0,0
7,7,,,75.0,0
8,8,,,76.0,0
9,9,,,75.0,0


In [10]:
# Fix NumPy's global random seed so the np.random draws below are reproducible.
np.random.seed(0)

In [11]:
# Add noise to activity counts
# Work on a float copy of the column; missing values stay NaN and get no noise.
counts = dfs["counts"].to_numpy(dtype=float, copy=True)
observed = ~np.isnan(counts)

# One N(0, 1) draw per observed value, in row order.
nvalues = int(observed.sum())
noise = np.random.normal(0, 1, nvalues)

# Perturb, floor at zero and round back to whole numbers.
counts[observed] = np.round(np.clip(counts[observed] + noise, 0, None))
dfs["counts"] = counts

In [12]:
# Add noise to heart rate (hr)
# Work on a float copy of the column; missing values stay NaN and get no noise.
hr = dfs["hr"].to_numpy(dtype=float, copy=True)
observed = ~np.isnan(hr)

# One N(0, 1) draw per observed value, in row order.
nvalues = int(observed.sum())
noise = np.random.normal(0, 1, nvalues)

# Perturb, floor at zero and round back to whole numbers.
hr[observed] = np.round(np.clip(hr[observed] + noise, 0, None))
dfs["hr"] = hr


In [13]:
# Inspect the first 40 rows of the combined table again.
dfs.head(40)

Unnamed: 0,time,counts,label,hr,pid
0,0,,,74.0,0
1,1,,,74.0,0
2,2,,,75.0,0
3,3,,,77.0,0
4,4,,,76.0,0
5,5,,,76.0,0
6,6,,,75.0,0
7,7,,,76.0,0
8,8,,,76.0,0
9,9,,,74.0,0


In [14]:
# Give the columns their tutorial-facing names (assigned positionally, so the
# frame must have exactly these five columns in this order), then write the
# result without the row index.
output_path = "datasets/tutorial_sleep_training_data.csv.gz"
tutorial_columns = ["time", "act", "sleep_phase", "hr", "pid"]

dfs.columns = tutorial_columns
dfs.to_csv(output_path, index=False)