In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("./datasets/tutorial_sleep_training_data.csv.gz")

In [3]:
df1 = df[df["pid"] == 10].dropna()
df1

Unnamed: 0,time,act,sleep_phase,hr,pid
289664,29,38.0,0.0,71.0,10
289694,59,0.0,0.0,69.0,10
289724,89,23.0,0.0,73.0,10
289754,119,17.0,0.0,59.0,10
289784,149,8.0,0.0,71.0,10
...,...,...,...,...,...
323654,34019,64.0,0.0,100.0,10
323684,34049,42.0,0.0,31.0,10
323714,34079,45.0,0.0,34.0,10
323744,34109,105.0,0.0,33.0,10


In [4]:
df1.dtypes

time             int64
act            float64
sleep_phase    float64
hr             float64
pid              int64
dtype: object

In [5]:
df1["act"].mean()

8.06414762741652

In [6]:
df1[0:10]["act"].mean(), df1[1:11]["act"].mean(), df1[2:12]["act"].mean()

(10.6, 8.8, 13.2)

In [7]:
# Centered window of size 11 (5 + 1 + 5)
i = 5
winsize = 10
df_slice = df1[i-(winsize//2):i+(winsize//2)+1]

print("Shape: ", df_slice.shape)
df_slice

Shape:  (11, 5)


Unnamed: 0,time,act,sleep_phase,hr,pid
289664,29,38.0,0.0,71.0,10
289694,59,0.0,0.0,69.0,10
289724,89,23.0,0.0,73.0,10
289754,119,17.0,0.0,59.0,10
289784,149,8.0,0.0,71.0,10
289814,179,12.0,0.0,61.0,10
289844,209,0.0,0.0,73.0,10
289874,239,0.0,0.0,73.0,10
289904,269,0.0,0.0,68.0,10
289934,299,8.0,0.0,80.0,10


In [8]:
# %%timeit
n = df1.shape[0]
winsize = 11

result_loop = []
for i in range(0, n):
    r = df1[i-(winsize//2):i+(winsize//2)+1]["act"].mean()
    result_loop.append(r)


In [9]:
result_loop = pd.Series(result_loop)
result_loop[:20]

0           NaN
1           NaN
2           NaN
3           NaN
4           NaN
5     11.454545
6     12.000000
7     12.181818
8     12.818182
9     11.727273
10    11.000000
11    11.363636
12    13.545455
13    25.909091
14    39.181818
15    45.454545
16    52.818182
17    67.727273
18    72.363636
19    83.727273
dtype: float64

# Pandas windowing
https://pandas.pydata.org/docs/user_guide/window.html

In [10]:
result_pdw = df1["act"].rolling(window=11, center=True, min_periods=11).mean()
result_pdw[:20]

289664          NaN
289694          NaN
289724          NaN
289754          NaN
289784          NaN
289814    11.454545
289844    12.000000
289874    12.181818
289904    12.818182
289934    11.727273
289964    11.000000
289994    11.363636
290024    13.545455
290054    25.909091
290084    39.181818
290114    45.454545
290144    52.818182
290174    67.727273
290204    72.363636
290234    83.727273
Name: act, dtype: float64

In [11]:
result_pdw = df1["act"].rolling(window=11, center=False, min_periods=11).mean()
result_pdw[:20]

289664          NaN
289694          NaN
289724          NaN
289754          NaN
289784          NaN
289814          NaN
289844          NaN
289874          NaN
289904          NaN
289934          NaN
289964    11.454545
289994    12.000000
290024    12.181818
290054    12.818182
290084    11.727273
290114    11.000000
290144    11.363636
290174    13.545455
290204    25.909091
290234    39.181818
Name: act, dtype: float64

In [12]:
result_pdw = df1["act"].rolling(window=11, center=True, min_periods=1).mean()
result_pdw[:20]

289664    16.333333
289694    14.000000
289724    12.250000
289754    10.888889
289784    10.600000
289814    11.454545
289844    12.000000
289874    12.181818
289904    12.818182
289934    11.727273
289964    11.000000
289994    11.363636
290024    13.545455
290054    25.909091
290084    39.181818
290114    45.454545
290144    52.818182
290174    67.727273
290204    72.363636
290234    83.727273
Name: act, dtype: float64

## A Few Question:

- How about the last 20 numbers? I.e.,: result_pdw[-20:] Vs result_loop[-20:]
- How about the execution time? (use %%timeit to check it)


In [13]:
Centered = True
Wsize = 11

df1["act"].rolling(window=Wsize, center=Centered, min_periods=1).agg(["mean", "median", "std", "var", "skew", "kurt"])

Unnamed: 0,mean,median,std,var,skew,kurt
289664,16.333333,14.5,13.185851,173.866667,0.713228,0.666706
289694,14.000000,12.0,13.527749,183.000000,0.835845,0.404760
289724,12.250000,10.0,13.466891,181.357143,1.000748,0.535173
289754,10.888889,8.0,13.242398,175.361111,1.158859,0.824273
289784,10.600000,8.0,12.518431,156.711111,1.270328,1.344534
...,...,...,...,...,...,...
323654,61.300000,54.0,43.415435,1884.900000,0.938059,1.207229
323684,62.111111,54.0,45.968588,2113.111111,0.847039,0.740940
323714,68.625000,59.0,44.480935,1978.553571,0.823750,1.031684
323744,56.571429,54.0,30.859127,952.285714,-0.009530,0.702244


## How to apply rolling windows to the whole dataset?

In [14]:
df_indexed = df.set_index(["pid", "time"])
df_indexed

Unnamed: 0_level_0,Unnamed: 1_level_0,act,sleep_phase,hr
pid,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,,,74.0
0,1,,,74.0
0,2,,,75.0
0,3,,,77.0
0,4,,,76.0
...,...,...,...,...
99,27685,,,70.0
99,27686,,,69.0
99,27687,,,68.0
99,27688,,,68.0


In [15]:
X_grped = df_indexed.dropna()["act"].rolling(window=Wsize, center=Centered, min_periods=1).agg(["mean", "median", "std", "var", "skew", "kurt", "max", "min", "count", "sum"])
X_grped

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,median,std,var,skew,kurt,max,min,count,sum
pid,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,29,15.333333,1.5,35.120744,1233.466667,2.445916,5.986385,87.0,0.0,6.0,92.0
0,59,13.285714,1.0,32.515198,1057.238095,2.642249,6.985881,87.0,0.0,7.0,93.0
0,89,11.625000,1.0,30.467489,928.267857,2.824207,7.982049,87.0,0.0,8.0,93.0
0,119,10.333333,1.0,28.761954,827.250000,2.995208,8.978607,87.0,0.0,9.0,93.0
0,149,9.300000,0.5,27.313204,746.011111,3.157003,9.975393,87.0,0.0,10.0,93.0
...,...,...,...,...,...,...,...,...,...,...,...
99,27569,32.600000,35.0,28.336078,802.933333,0.034822,-1.788971,74.0,0.0,10.0,326.0
99,27599,36.000000,41.0,27.807373,773.250000,-0.217459,-1.604856,74.0,0.0,9.0,324.0
99,27629,33.125000,35.0,28.261218,798.696429,0.060339,-1.596417,74.0,0.0,8.0,265.0
99,27659,29.428571,29.0,28.359931,804.285714,0.446974,-1.109392,74.0,0.0,7.0,206.0


<hr>
<h3> That is it!!! </h3>
<h3> We already have the minimal minimal minimal necessary to run our first ML model! </h3>

So lets do it....
<hr>

In [16]:
# Reset index to get pid and time cols back to the dataframe
X_reseted = X_grped.reset_index()
X_reseted

Unnamed: 0,pid,time,mean,median,std,var,skew,kurt,max,min,count,sum
0,0,29,15.333333,1.5,35.120744,1233.466667,2.445916,5.986385,87.0,0.0,6.0,92.0
1,0,59,13.285714,1.0,32.515198,1057.238095,2.642249,6.985881,87.0,0.0,7.0,93.0
2,0,89,11.625000,1.0,30.467489,928.267857,2.824207,7.982049,87.0,0.0,8.0,93.0
3,0,119,10.333333,1.0,28.761954,827.250000,2.995208,8.978607,87.0,0.0,9.0,93.0
4,0,149,9.300000,0.5,27.313204,746.011111,3.157003,9.975393,87.0,0.0,10.0,93.0
...,...,...,...,...,...,...,...,...,...,...,...,...
102854,99,27569,32.600000,35.0,28.336078,802.933333,0.034822,-1.788971,74.0,0.0,10.0,326.0
102855,99,27599,36.000000,41.0,27.807373,773.250000,-0.217459,-1.604856,74.0,0.0,9.0,324.0
102856,99,27629,33.125000,35.0,28.261218,798.696429,0.060339,-1.596417,74.0,0.0,8.0,265.0
102857,99,27659,29.428571,29.0,28.359931,804.285714,0.446974,-1.109392,74.0,0.0,7.0,206.0


In [17]:
df.dropna() # [["pid", "time", "act"]]

Unnamed: 0,time,act,sleep_phase,hr,pid
29,29,2.0,0.0,71.0,0
59,59,0.0,0.0,76.0,0
89,89,1.0,0.0,78.0,0
119,119,2.0,0.0,73.0,0
149,149,87.0,0.0,80.0,0
...,...,...,...,...,...
3116246,27569,9.0,0.0,67.0,99
3116276,27599,29.0,0.0,65.0,99
3116306,27629,53.0,0.0,64.0,99
3116336,27659,74.0,0.0,67.0,99


In [18]:
# Merge X to the original dataframe to append the original activity values
X = pd.merge(X_reseted, df[["time", "pid", "act"]].dropna(), on=["time", "pid"])
X

Unnamed: 0,pid,time,mean,median,std,var,skew,kurt,max,min,count,sum,act
0,0,29,15.333333,1.5,35.120744,1233.466667,2.445916,5.986385,87.0,0.0,6.0,92.0,2.0
1,0,59,13.285714,1.0,32.515198,1057.238095,2.642249,6.985881,87.0,0.0,7.0,93.0,0.0
2,0,89,11.625000,1.0,30.467489,928.267857,2.824207,7.982049,87.0,0.0,8.0,93.0,1.0
3,0,119,10.333333,1.0,28.761954,827.250000,2.995208,8.978607,87.0,0.0,9.0,93.0,2.0
4,0,149,9.300000,0.5,27.313204,746.011111,3.157003,9.975393,87.0,0.0,10.0,93.0,87.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
102854,99,27569,32.600000,35.0,28.336078,802.933333,0.034822,-1.788971,74.0,0.0,10.0,326.0,9.0
102855,99,27599,36.000000,41.0,27.807373,773.250000,-0.217459,-1.604856,74.0,0.0,9.0,324.0,29.0
102856,99,27629,33.125000,35.0,28.261218,798.696429,0.060339,-1.596417,74.0,0.0,8.0,265.0,53.0
102857,99,27659,29.428571,29.0,28.359931,804.285714,0.446974,-1.109392,74.0,0.0,7.0,206.0,74.0


In [19]:
# Get Ys (i.e., sleep labels)
Y = df.dropna()[["pid", "sleep_phase"]].reset_index().rename(columns={"index": "time"})
Y["sleep"] = Y["sleep_phase"] > 0
Y

Unnamed: 0,time,pid,sleep_phase,sleep
0,29,0,0.0,False
1,59,0,0.0,False
2,89,0,0.0,False
3,119,0,0.0,False
4,149,0,0.0,False
...,...,...,...,...
102854,3116246,99,0.0,False
102855,3116276,99,0.0,False
102856,3116306,99,0.0,False
102857,3116336,99,0.0,False


### Are there NAs?

In [20]:
X[X.isna().values.any(axis=1)]

Unnamed: 0,pid,time,mean,median,std,var,skew,kurt,max,min,count,sum,act
601,0,18059,0.0,0.0,0.000002,3.219017e-12,,,0.0,0.0,11.0,0.0,0.0
602,0,18089,0.0,0.0,0.000002,3.219017e-12,,,0.0,0.0,11.0,0.0,0.0
603,0,18119,0.0,0.0,0.000002,3.219017e-12,,,0.0,0.0,11.0,0.0,0.0
604,0,18149,0.0,0.0,0.000002,3.219017e-12,,,0.0,0.0,11.0,0.0,0.0
605,0,18179,0.0,0.0,0.000002,3.219017e-12,,,0.0,0.0,11.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
102274,99,10169,0.0,0.0,0.000002,5.025142e-12,,,0.0,0.0,11.0,0.0,0.0
102471,99,16079,0.0,0.0,0.000004,1.232615e-11,,,0.0,0.0,11.0,0.0,0.0
102490,99,16649,0.0,0.0,0.000004,1.232629e-11,,,0.0,0.0,11.0,0.0,0.0
102491,99,16679,0.0,0.0,0.000004,1.232629e-11,,,0.0,0.0,11.0,0.0,0.0


Lets have a look at them:

In [21]:
X[590:620]

Unnamed: 0,pid,time,mean,median,std,var,skew,kurt,max,min,count,sum,act
590,0,17729,50.181818,17.0,73.730344,5436.164,1.703638,1.619478,207.0,0.0,11.0,552.0,1.0
591,0,17759,49.0,17.0,74.48624,5548.2,1.688307,1.572494,207.0,0.0,11.0,539.0,0.0
592,0,17789,47.454545,5.0,75.387484,5683.273,1.679443,1.531653,207.0,0.0,11.0,522.0,5.0
593,0,17819,28.636364,1.0,54.529392,2973.455,2.616569,7.399291,182.0,0.0,11.0,315.0,48.0
594,0,17849,12.090909,0.0,20.057191,402.2909,1.363299,0.100988,49.0,0.0,11.0,133.0,30.0
595,0,17879,12.090909,0.0,20.057191,402.2909,1.363299,0.100988,49.0,0.0,11.0,133.0,49.0
596,0,17909,12.0,0.0,20.114671,404.6,1.36216,0.096118,49.0,0.0,11.0,132.0,0.0
597,0,17939,12.0,0.0,20.114671,404.6,1.36216,0.096118,49.0,0.0,11.0,132.0,0.0
598,0,17969,11.545455,0.0,20.343862,413.8727,1.378474,0.106173,49.0,0.0,11.0,127.0,0.0
599,0,17999,7.181818,0.0,16.533712,273.3636,2.208707,4.059921,49.0,0.0,11.0,79.0,0.0


In [22]:
X = X.fillna(0.0)

# First ML model

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, matthews_corrcoef, accuracy_score

In [25]:
model = LogisticRegression()
model.fit(X.values[:1000], Y["sleep"].values[:1000])

pred = model.predict(X.values[5000:10000])
f1_score(Y["sleep"].values[5000:10000], pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.749850209706411

---
# Open Parenthesis
- Is F1 a good metric to use here?
- Is F1 score a good metric in general?

See https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-019-6413-7

In [26]:
f1_score(Y["sleep"].values[5000:10000], pred)

0.749850209706411

In [27]:
f1_score(~Y["sleep"].values[5000:10000], ~pred)

0.4975932611311673

In [28]:
f1_score?

In [29]:
matthews_corrcoef(~Y["sleep"].values[5000:10000], ~pred), matthews_corrcoef(Y["sleep"].values[5000:10000], pred)

(0.28104257378719294, 0.28104257378719294)

# Close parenthesis
---

In [30]:
ngrps = 5
pid_grp = {}

i = 0
for pid in X["pid"].unique():
    pid_grp[pid] = i
    i = (i+1) % ngrps

pid_grp

{0: 0,
 1: 1,
 2: 2,
 3: 3,
 4: 4,
 5: 0,
 6: 1,
 7: 2,
 8: 3,
 9: 4,
 10: 0,
 11: 1,
 12: 2,
 13: 3,
 14: 4,
 15: 0,
 16: 1,
 17: 2,
 18: 3,
 19: 4,
 20: 0,
 21: 1,
 22: 2,
 23: 3,
 24: 4,
 25: 0,
 26: 1,
 27: 2,
 28: 3,
 29: 4,
 30: 0,
 31: 1,
 32: 2,
 33: 3,
 34: 4,
 35: 0,
 36: 1,
 37: 2,
 38: 3,
 39: 4,
 40: 0,
 41: 1,
 42: 2,
 43: 3,
 44: 4,
 45: 0,
 46: 1,
 47: 2,
 48: 3,
 49: 4,
 50: 0,
 51: 1,
 52: 2,
 53: 3,
 54: 4,
 55: 0,
 56: 1,
 57: 2,
 58: 3,
 59: 4,
 60: 0,
 61: 1,
 62: 2,
 63: 3,
 64: 4,
 65: 0,
 66: 1,
 67: 2,
 68: 3,
 69: 4,
 70: 0,
 71: 1,
 72: 2,
 73: 3,
 74: 4,
 75: 0,
 76: 1,
 77: 2,
 78: 3,
 79: 4,
 80: 0,
 81: 1,
 82: 2,
 83: 3,
 84: 4,
 85: 0,
 86: 1,
 87: 2,
 88: 3,
 89: 4,
 90: 0,
 91: 1,
 92: 2,
 93: 3,
 94: 4,
 95: 0,
 96: 1,
 97: 2,
 98: 3,
 99: 4}

In [31]:
X["grp"] = X["pid"].apply(lambda x: pid_grp[x])
X

Unnamed: 0,pid,time,mean,median,std,var,skew,kurt,max,min,count,sum,act,grp
0,0,29,15.333333,1.5,35.120744,1233.466667,2.445916,5.986385,87.0,0.0,6.0,92.0,2.0,0
1,0,59,13.285714,1.0,32.515198,1057.238095,2.642249,6.985881,87.0,0.0,7.0,93.0,0.0,0
2,0,89,11.625000,1.0,30.467489,928.267857,2.824207,7.982049,87.0,0.0,8.0,93.0,1.0,0
3,0,119,10.333333,1.0,28.761954,827.250000,2.995208,8.978607,87.0,0.0,9.0,93.0,2.0,0
4,0,149,9.300000,0.5,27.313204,746.011111,3.157003,9.975393,87.0,0.0,10.0,93.0,87.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102854,99,27569,32.600000,35.0,28.336078,802.933333,0.034822,-1.788971,74.0,0.0,10.0,326.0,9.0,4
102855,99,27599,36.000000,41.0,27.807373,773.250000,-0.217459,-1.604856,74.0,0.0,9.0,324.0,29.0,4
102856,99,27629,33.125000,35.0,28.261218,798.696429,0.060339,-1.596417,74.0,0.0,8.0,265.0,53.0,4
102857,99,27659,29.428571,29.0,28.359931,804.285714,0.446974,-1.109392,74.0,0.0,7.0,206.0,74.0,4


In [32]:
scores = cross_val_score(LogisticRegression(), X.values, Y["sleep"].values, groups=X["grp"], scoring="f1_weighted")
print(scores.mean())
scores

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.8046077014341553


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([0.78863131, 0.78114769, 0.81538766, 0.81416144, 0.8237104 ])

In [33]:
scores = cross_val_score(LogisticRegression(), X.values, Y["sleep"].values, groups=X["grp"], scoring="matthews_corrcoef")
print(scores.mean())
scores

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.5505193467596092


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([0.49631613, 0.49896126, 0.57166581, 0.57230831, 0.61334522])

# Ideas to improve??

In [33]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler


pipe = make_pipeline(StandardScaler(), LogisticRegression())
scores = cross_val_score(pipe, X.values, Y["sleep"].values, groups=X["grp"], scoring="matthews_corrcoef")
print(scores.mean())
scores

0.5483737715557196


array([0.49868795, 0.56405213, 0.60067951, 0.56601866, 0.5124306 ])

# Comparison with different ML techniques -- hand-out test set

In [34]:
test_ids = range(151, 201)

In [35]:
X_test = X[X["pid"].isin(test_ids)]
X_test

Unnamed: 0,pid,time,mean,median,std,var,skew,kurt,max,min,count,sum,act,grp
154094,151,29,0.454545,0.0,0.522233,0.272727,0.212762,-2.444444,1.0,0.0,11.0,5.0,1.0,0
154095,151,59,3.727273,1.0,10.715325,114.818182,3.303394,10.936207,36.0,0.0,11.0,41.0,1.0,0
154096,151,89,6.636364,1.0,13.566670,184.054545,1.936871,2.165704,36.0,0.0,11.0,73.0,1.0,0
154097,151,119,8.272727,1.0,13.900294,193.218182,1.461043,0.504286,36.0,0.0,11.0,91.0,1.0,0
154098,151,149,8.272727,1.0,13.900294,193.218182,1.461043,0.504286,36.0,0.0,11.0,91.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205356,200,31199,3.700000,1.0,6.634087,44.011111,2.377384,5.895031,21.0,0.0,10.0,37.0,0.0,4
205357,200,31229,4.111111,1.0,6.900081,47.611111,2.239598,5.203851,21.0,0.0,9.0,37.0,0.0,4
205358,200,31259,4.625000,1.0,7.190023,51.696429,2.095160,4.524464,21.0,0.0,8.0,37.0,1.0,4
205359,200,31289,5.142857,1.0,7.603257,57.809524,1.909136,3.715867,21.0,0.0,7.0,36.0,8.0,4


In [36]:
Y_test = Y[Y["pid"].isin(test_ids)]
Y_test

Unnamed: 0,time,pid,sleep_phase,sleep
154094,4677905,151,0.0,False
154095,4677935,151,0.0,False
154096,4677965,151,0.0,False
154097,4677995,151,0.0,False
154098,4678025,151,0.0,False
...,...,...,...,...
205356,6231234,200,0.0,False
205357,6231264,200,0.0,False
205358,6231294,200,0.0,False
205359,6231324,200,0.0,False


In [37]:
X_train = X[~X["pid"].isin(test_ids)]
X_train

Unnamed: 0,pid,time,mean,median,std,var,skew,kurt,max,min,count,sum,act,grp
0,1,29,15.333333,1.5,35.120744,1233.466667,2.445916,5.986385,87.0,0.0,6.0,92.0,2.0,0
1,1,59,13.285714,1.0,32.515198,1057.238095,2.642249,6.985881,87.0,0.0,7.0,93.0,0.0,0
2,1,89,11.625000,1.0,30.467489,928.267857,2.824207,7.982049,87.0,0.0,8.0,93.0,1.0,0
3,1,119,10.333333,1.0,28.761954,827.250000,2.995208,8.978607,87.0,0.0,9.0,93.0,2.0,0
4,1,149,9.300000,0.5,27.313204,746.011111,3.157003,9.975393,87.0,0.0,10.0,93.0,87.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154089,150,28919,2.545455,0.0,7.789270,60.672727,3.300851,10.922336,26.0,0.0,11.0,28.0,0.0,4
154090,150,28949,2.636364,0.0,7.762380,60.254545,3.295674,10.897582,26.0,0.0,11.0,29.0,0.0,4
154091,150,28979,2.727273,0.0,7.734221,59.818182,3.292385,10.882344,26.0,0.0,11.0,30.0,1.0,4
154092,150,29009,2.818182,1.0,7.704780,59.363636,3.291050,10.876751,26.0,0.0,11.0,31.0,0.0,4


In [38]:
Y_train = Y[~Y["pid"].isin(test_ids)]
Y_train

Unnamed: 0,time,pid,sleep_phase,sleep
0,29,1,0.0,False
1,59,1,0.0,False
2,89,1,0.0,False
3,119,1,0.0,False
4,149,1,0.0,False
...,...,...,...,...
154089,4677755,150,5.0,True
154090,4677785,150,5.0,True
154091,4677815,150,5.0,True
154092,4677845,150,5.0,True


In [39]:
pipe = make_pipeline(StandardScaler(), LogisticRegression())
pipe.fit(X_train[['mean', 'median', 'std', 'var', 'skew', 'kurt', 'max', 'min', 'count', 'sum', 'act']].values,
         Y_train["sleep"].values)


In [40]:
y_hat = pipe.predict(X_test[['mean', 'median', 'std', 'var', 'skew', 'kurt', 'max', 'min', 'count', 'sum', 'act']].values)
acc = accuracy_score(Y_test["sleep"].values, y_hat)
f1 = f1_score(Y_test["sleep"].values, y_hat, average="weighted")
mcc = matthews_corrcoef(Y_test["sleep"].values, y_hat)
print("StandardScale + LR: Acc: %.3f, F1 %.3f, MCC: %.3f" % (acc, f1, mcc))

StandardScale + LR: Acc: 0.802, F1 0.781, MCC: 0.505


# Only Logistic Regression

In [41]:
pipe = LogisticRegression()
pipe.fit(X_train[['mean', 'median', 'std', 'var', 'skew', 'kurt', 'max', 'min', 'count', 'sum', 'act']].values,
         Y_train["sleep"].values)
y_hat = pipe.predict(X_test[['mean', 'median', 'std', 'var', 'skew', 'kurt', 'max', 'min', 'count', 'sum', 'act']].values)
acc = accuracy_score(Y_test["sleep"].values, y_hat)
f1 = f1_score(Y_test["sleep"].values, y_hat, average="weighted")
mcc = matthews_corrcoef(Y_test["sleep"].values, y_hat)
print("Only LR: Acc: %.3f, F1 %.3f, MCC: %.3f" % (acc, f1, mcc))

Only LR: Acc: 0.802, F1 0.780, MCC: 0.504


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
