In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv("./datasets/tutorial_sleep_training_data.csv.gz")

In [4]:
df1 = df[df["pid"] == 1].dropna()
df1

Unnamed: 0,time,act,sleep_phase,hr,pid
29,29,0.0,0.0,73.0,1
59,59,0.0,0.0,75.0,1
89,89,0.0,0.0,76.0,1
119,119,0.0,0.0,75.0,1
149,149,85.0,0.0,80.0,1
...,...,...,...,...,...
38249,38249,41.0,0.0,69.0,1
38279,38279,59.0,0.0,70.0,1
38309,38309,6.0,0.0,70.0,1
38339,38339,0.0,0.0,74.0,1


In [5]:
df1["act"].mean()

12.388584831899921

In [6]:
df1[0:10]["act"].mean(), df1[1:11]["act"].mean(), df1[2:12]["act"].mean()

(8.5, 8.5, 8.5)

In [7]:
# Centered window of size 11 (5 + 1 + 5)
i = 5
winsize = 10
df_slice = df1[i-(winsize//2):i+(winsize//2)+1]

print("Shape: ", df_slice.shape)
df_slice

Shape:  (11, 5)


Unnamed: 0,time,act,sleep_phase,hr,pid
29,29,0.0,0.0,73.0,1
59,59,0.0,0.0,75.0,1
89,89,0.0,0.0,76.0,1
119,119,0.0,0.0,75.0,1
149,149,85.0,0.0,80.0,1
179,179,0.0,0.0,77.0,1
209,209,0.0,0.0,77.0,1
239,239,0.0,0.0,77.0,1
269,269,0.0,0.0,77.0,1
299,299,0.0,0.0,77.0,1


In [8]:
# %%timeit
n = df1.shape[0]
winsize = 11

result_loop = []
for i in range(0, n):
    r = df1[i-(winsize//2):i+(winsize//2)+1]["act"].mean()
    result_loop.append(r)


In [9]:
result_loop = pd.Series(result_loop)
result_loop[:20]

0           NaN
1           NaN
2           NaN
3           NaN
4           NaN
5      7.727273
6      7.727273
7      7.727273
8     13.727273
9     13.727273
10     6.000000
11     6.181818
12     6.181818
13     6.636364
14     9.818182
15    12.454545
16    12.454545
17    12.454545
18    12.636364
19     6.636364
dtype: float64

# Pandas windowing
https://pandas.pydata.org/docs/user_guide/window.html

In [10]:
result_pdw = df1["act"].rolling(window=11, center=True, min_periods=11).mean()
result_pdw[:20]

29           NaN
59           NaN
89           NaN
119          NaN
149          NaN
179     7.727273
209     7.727273
239     7.727273
269    13.727273
299    13.727273
329     6.000000
359     6.181818
389     6.181818
419     6.636364
449     9.818182
479    12.454545
509    12.454545
539    12.454545
569    12.636364
599     6.636364
Name: act, dtype: float64

In [11]:
result_pdw = df1["act"].rolling(window=11, center=False, min_periods=11).mean()
result_pdw[:20]

29           NaN
59           NaN
89           NaN
119          NaN
149          NaN
179          NaN
209          NaN
239          NaN
269          NaN
299          NaN
329     7.727273
359     7.727273
389     7.727273
419    13.727273
449    13.727273
479     6.000000
509     6.181818
539     6.181818
569     6.636364
599     9.818182
Name: act, dtype: float64

In [12]:
result_pdw = df1["act"].rolling(window=11, center=True, min_periods=1).mean()
result_pdw[:20]

29     14.166667
59     12.142857
89     10.625000
119     9.444444
149     8.500000
179     7.727273
209     7.727273
239     7.727273
269    13.727273
299    13.727273
329     6.000000
359     6.181818
389     6.181818
419     6.636364
449     9.818182
479    12.454545
509    12.454545
539    12.454545
569    12.636364
599     6.636364
Name: act, dtype: float64

## A Few Question:

- How about the last 20 numbers? I.e.,: result_pdw[-20:] Vs result_loop[-20:]
- How about the execution time? (use %%timeit to check it)


In [13]:
Centered = True
Wsize = 11

df1["act"].rolling(window=Wsize, center=Centered, min_periods=1).agg(["mean", "median", "std", "var", "skew", "kurt"])

Unnamed: 0,mean,median,std,var,skew,kurt
29,14.166667,0.0,34.701105,1204.166667,2.449490,6.000000
59,12.142857,0.0,32.126980,1032.142857,2.645751,7.000000
89,10.625000,0.0,30.052038,903.125000,2.828427,8.000000
119,9.444444,0.0,28.333333,802.777778,3.000000,9.000000
149,8.500000,0.0,26.879360,722.500000,3.162278,10.000000
...,...,...,...,...,...,...
38249,36.900000,3.5,61.321466,3760.322222,2.193355,5.234941
38279,41.000000,6.0,63.570827,4041.250000,2.060310,4.632116
38309,46.125000,23.5,65.942480,4348.410714,1.921934,4.061232
38339,52.714286,41.0,68.322066,4667.904762,1.782101,3.556218


## How to apply rolling windows to the whole dataset?

In [14]:
X_grped = df.dropna().groupby(["pid"])["act"].rolling(window=Wsize, center=Centered, min_periods=1).agg(["mean", "median", "std", "var", "skew", "kurt", "max", "min", "count", "sum"])
X_grped

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,median,std,var,skew,kurt,max,min,count,sum
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,29,14.166667,0.0,34.701105,1204.166667,2.449490,6.000000,85.0,0.0,6.0,85.0
1,59,12.142857,0.0,32.126980,1032.142857,2.645751,7.000000,85.0,0.0,7.0,85.0
1,89,10.625000,0.0,30.052038,903.125000,2.828427,8.000000,85.0,0.0,8.0,85.0
1,119,9.444444,0.0,28.333333,802.777778,3.000000,9.000000,85.0,0.0,9.0,85.0
1,149,8.500000,0.0,26.879360,722.500000,3.162278,10.000000,85.0,0.0,10.0,85.0
...,...,...,...,...,...,...,...,...,...,...,...
1647,6231234,3.500000,0.0,6.819091,46.500000,2.262781,5.278992,21.0,0.0,10.0,35.0
1647,6231264,3.888889,0.0,7.114149,50.611111,2.113952,4.568488,21.0,0.0,9.0,35.0
1647,6231294,4.375000,0.0,7.443837,55.410714,1.952574,3.855072,21.0,0.0,8.0,35.0
1647,6231324,5.000000,0.0,7.810250,61.000000,1.774885,3.142972,21.0,0.0,7.0,35.0


In [14]:
X = X_grped.reset_index().rename(columns={"level_1": "time"})
X

Unnamed: 0,pid,time,mean,median,std,var,skew,kurt,max,min,count,sum
0,1,29,14.166667,0.0,34.701105,1204.166667,2.449490,6.000000,85.0,0.0,6.0,85.0
1,1,59,12.142857,0.0,32.126980,1032.142857,2.645751,7.000000,85.0,0.0,7.0,85.0
2,1,89,10.625000,0.0,30.052038,903.125000,2.828427,8.000000,85.0,0.0,8.0,85.0
3,1,119,9.444444,0.0,28.333333,802.777778,3.000000,9.000000,85.0,0.0,9.0,85.0
4,1,149,8.500000,0.0,26.879360,722.500000,3.162278,10.000000,85.0,0.0,10.0,85.0
...,...,...,...,...,...,...,...,...,...,...,...,...
205356,1647,6231234,3.500000,0.0,6.819091,46.500000,2.262781,5.278992,21.0,0.0,10.0,35.0
205357,1647,6231264,3.888889,0.0,7.114149,50.611111,2.113952,4.568488,21.0,0.0,9.0,35.0
205358,1647,6231294,4.375000,0.0,7.443837,55.410714,1.952574,3.855072,21.0,0.0,8.0,35.0
205359,1647,6231324,5.000000,0.0,7.810250,61.000000,1.774885,3.142972,21.0,0.0,7.0,35.0


In [15]:
pd.merge(X, df[["time", "pid", "act"]], on=["time", "pid"]).head(20)


Unnamed: 0,pid,time,mean,median,std,var,skew,kurt,max,min,count,sum,act
0,1,29,14.166667,0.0,34.701105,1204.166667,2.44949,6.0,85.0,0.0,6.0,85.0,0.0
1,1,59,12.142857,0.0,32.12698,1032.142857,2.645751,7.0,85.0,0.0,7.0,85.0,0.0
2,1,89,10.625,0.0,30.052038,903.125,2.828427,8.0,85.0,0.0,8.0,85.0,0.0
3,1,119,9.444444,0.0,28.333333,802.777778,3.0,9.0,85.0,0.0,9.0,85.0,0.0
4,1,149,8.5,0.0,26.87936,722.5,3.162278,10.0,85.0,0.0,10.0,85.0,85.0
5,1,179,7.727273,0.0,25.628464,656.818182,3.316625,11.0,85.0,0.0,11.0,85.0,0.0
6,1,209,7.727273,0.0,25.628464,656.818182,3.316625,11.0,85.0,0.0,11.0,85.0,0.0
7,1,239,7.727273,0.0,25.628464,656.818182,3.316625,11.0,85.0,0.0,11.0,85.0,0.0
8,1,269,13.727273,0.0,30.83534,950.818182,2.007672,2.645086,85.0,0.0,11.0,151.0,0.0
9,1,299,13.727273,0.0,30.83534,950.818182,2.007672,2.645086,85.0,0.0,11.0,151.0,0.0


In [16]:
Y = df.dropna()[["pid", "sleep_phase"]].reset_index().rename(columns={"index": "time"})
Y["sleep"] = Y["sleep_phase"] > 0
Y

Unnamed: 0,time,pid,sleep_phase,sleep
0,29,1,0.0,False
1,59,1,0.0,False
2,89,1,0.0,False
3,119,1,0.0,False
4,149,1,0.0,False
...,...,...,...,...
205356,6231234,1647,0.0,False
205357,6231264,1647,0.0,False
205358,6231294,1647,0.0,False
205359,6231324,1647,0.0,False


### Are there NAs?

In [17]:
X[X.isna().values.any(axis=1)]

Unnamed: 0,pid,time,mean,median,std,var,skew,kurt,max,min,count,sum
39,1,1199,0.0,0.0,0.000000e+00,-3.096745e-13,,,0.0,0.0,11.0,0.0
40,1,1229,0.0,0.0,0.000000e+00,-3.096745e-13,,,0.0,0.0,11.0,0.0
70,1,2129,0.0,0.0,1.921614e-07,3.692602e-14,,,0.0,0.0,11.0,0.0
71,1,2159,0.0,0.0,1.921614e-07,3.692602e-14,,,0.0,0.0,11.0,0.0
156,1,4709,0.0,0.0,2.073008e-06,4.297362e-12,,,0.0,0.0,11.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
205330,1647,6230454,0.0,0.0,0.000000e+00,-1.644196e-12,,,0.0,0.0,11.0,0.0
205331,1647,6230484,0.0,0.0,0.000000e+00,-1.644196e-12,,,0.0,0.0,11.0,0.0
205332,1647,6230514,0.0,0.0,0.000000e+00,-1.644196e-12,,,0.0,0.0,11.0,0.0
205333,1647,6230544,0.0,0.0,0.000000e+00,-1.644196e-12,,,0.0,0.0,11.0,0.0


In [18]:
X[20:45]

Unnamed: 0,pid,time,mean,median,std,var,skew,kurt,max,min,count,sum
20,1,629,6.636364,0.0,12.706477,161.4545,1.909179,2.232769,35.0,0.0,11.0,73.0
21,1,659,6.636364,0.0,12.706477,161.4545,1.909179,2.232769,35.0,0.0,11.0,73.0
22,1,689,6.454545,0.0,12.793464,163.6727,1.91137,2.231362,35.0,0.0,11.0,71.0
23,1,719,11.818182,0.0,20.098847,403.9636,1.676889,1.967497,59.0,0.0,11.0,130.0
24,1,749,11.363636,0.0,20.323743,413.0545,1.684439,1.93963,59.0,0.0,11.0,125.0
25,1,779,8.181818,0.0,18.946336,358.9636,2.437634,5.642589,59.0,0.0,11.0,90.0
26,1,809,5.545455,0.0,17.739017,314.6727,3.309797,10.966251,59.0,0.0,11.0,61.0
27,1,839,6.0,0.0,17.646529,311.4,3.27143,10.771321,59.0,0.0,11.0,66.0
28,1,869,6.090909,0.0,17.615076,310.2909,3.272098,10.775408,59.0,0.0,11.0,67.0
29,1,899,5.909091,0.0,17.671755,312.2909,3.274953,10.787699,59.0,0.0,11.0,65.0


In [19]:
X = X.fillna(0.0)

# First ML model

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, matthews_corrcoef

In [21]:
model = LogisticRegression()
model.fit(X.values[:1000], Y["sleep"].values[:1000])

pred = model.predict(X.values[5000:10000])
f1_score(Y["sleep"].values[5000:10000], pred)

0.8585648148148148

---
# Open Parenthesis
- Is F1 a good metric to use here?
- Is F1 score a good metric in general?

See https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-019-6413-7

In [22]:
f1_score?

In [23]:
f1_score(~Y["sleep"].values[5000:10000], ~pred)

0.10147058823529412

In [24]:
matthews_corrcoef(~Y["sleep"].values[5000:10000], ~pred), matthews_corrcoef(Y["sleep"].values[5000:10000], pred)

(0.20050367614978312, 0.20050367614978312)

# Close parenthesis
---

In [25]:
ngrps = 5
pid_grp = {}

i = 0
for pid in X["pid"].unique():
    pid_grp[pid] = i
    i = (i+1) % ngrps

pid_grp

{1: 0,
 16: 1,
 21: 2,
 28: 3,
 33: 4,
 36: 0,
 46: 1,
 50: 2,
 52: 3,
 74: 4,
 107: 0,
 111: 1,
 120: 2,
 121: 3,
 125: 4,
 133: 0,
 138: 1,
 144: 2,
 152: 3,
 155: 4,
 159: 0,
 167: 1,
 171: 2,
 193: 3,
 197: 4,
 220: 0,
 251: 1,
 271: 2,
 275: 3,
 282: 4,
 286: 0,
 292: 1,
 295: 2,
 299: 3,
 301: 4,
 306: 0,
 318: 1,
 323: 2,
 332: 3,
 339: 4,
 374: 0,
 380: 1,
 382: 2,
 386: 3,
 392: 4,
 393: 0,
 402: 1,
 423: 2,
 427: 3,
 435: 4,
 443: 0,
 445: 1,
 459: 2,
 470: 3,
 474: 4,
 476: 0,
 495: 1,
 499: 2,
 501: 3,
 509: 4,
 518: 0,
 522: 1,
 526: 2,
 528: 3,
 529: 4,
 534: 0,
 545: 1,
 550: 2,
 554: 3,
 555: 4,
 558: 0,
 589: 1,
 604: 2,
 612: 3,
 626: 4,
 632: 0,
 640: 1,
 657: 2,
 664: 3,
 677: 4,
 686: 0,
 688: 1,
 694: 2,
 702: 3,
 711: 4,
 712: 0,
 715: 1,
 716: 2,
 727: 3,
 728: 4,
 762: 0,
 768: 1,
 782: 2,
 784: 3,
 791: 4,
 796: 0,
 801: 1,
 804: 2,
 807: 3,
 811: 4,
 812: 0,
 813: 1,
 852: 2,
 860: 3,
 864: 4,
 884: 0,
 889: 1,
 892: 2,
 893: 3,
 899: 4,
 908: 0,
 912: 1,
 91

In [26]:
X["grp"] = X["pid"].apply(lambda x: pid_grp[x])
X

Unnamed: 0,pid,time,mean,median,std,var,skew,kurt,max,min,count,sum,grp
0,1,29,14.166667,0.0,34.701105,1204.166667,2.449490,6.000000,85.0,0.0,6.0,85.0,0
1,1,59,12.142857,0.0,32.126980,1032.142857,2.645751,7.000000,85.0,0.0,7.0,85.0,0
2,1,89,10.625000,0.0,30.052038,903.125000,2.828427,8.000000,85.0,0.0,8.0,85.0,0
3,1,119,9.444444,0.0,28.333333,802.777778,3.000000,9.000000,85.0,0.0,9.0,85.0,0
4,1,149,8.500000,0.0,26.879360,722.500000,3.162278,10.000000,85.0,0.0,10.0,85.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
205356,1647,6231234,3.500000,0.0,6.819091,46.500000,2.262781,5.278992,21.0,0.0,10.0,35.0,4
205357,1647,6231264,3.888889,0.0,7.114149,50.611111,2.113952,4.568488,21.0,0.0,9.0,35.0,4
205358,1647,6231294,4.375000,0.0,7.443837,55.410714,1.952574,3.855072,21.0,0.0,8.0,35.0,4
205359,1647,6231324,5.000000,0.0,7.810250,61.000000,1.774885,3.142972,21.0,0.0,7.0,35.0,4


In [27]:
cross_val_score(LogisticRegression(), X.values, Y["sleep"].values, groups=X["grp"], scoring="f1")

array([0.84624756, 0.86708327, 0.86441916, 0.85149739, 0.84328028])

In [28]:
cross_val_score(LogisticRegression(), X.values, Y["sleep"].values, groups=X["grp"], scoring="matthews_corrcoef")

array([0.43722301, 0.46052623, 0.43611178, 0.35579605, 0.28850433])

# Ideas to improve??

In [29]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


pipe = make_pipeline(StandardScaler(), LogisticRegression())
cross_val_score(pipe, X.values, Y["sleep"].values, groups=X["grp"], scoring="matthews_corrcoef")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

array([0.4847928 , 0.53811454, 0.58032939, 0.53635802, 0.49203124])