In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("./datasets/tutorial_sleep_training_data.csv.gz")

In [3]:
df1 = df[df["pid"] == 10].dropna()
df1

Unnamed: 0,time,act,sleep_phase,hr,pid
255104,29,0.0,1.0,58.0,10
255134,59,2.0,0.0,55.0,10
255164,89,0.0,0.0,56.0,10
255194,119,0.0,1.0,56.0,10
255224,149,0.0,2.0,57.0,10
...,...,...,...,...,...
289514,34439,204.0,0.0,50.0,10
289544,34469,30.0,0.0,51.0,10
289574,34499,38.0,0.0,50.0,10
289604,34529,87.0,0.0,51.0,10


In [4]:
df1.dtypes

time             int64
act            float64
sleep_phase    float64
hr             float64
pid              int64
dtype: object

In [5]:
df1["act"].mean()

13.97482638888889

In [6]:
df1[0:10]["act"].mean(), df1[1:11]["act"].mean(), df1[2:12]["act"].mean()

(1.6, 1.8, 1.6)

In [7]:
# Centered window of size 11 (5 + 1 + 5)
i = 5
winsize = 10
df_slice = df1[i-(winsize//2):i+(winsize//2)+1]

print("Shape: ", df_slice.shape)
df_slice

Shape:  (11, 5)


Unnamed: 0,time,act,sleep_phase,hr,pid
255104,29,0.0,1.0,58.0,10
255134,59,2.0,0.0,55.0,10
255164,89,0.0,0.0,56.0,10
255194,119,0.0,1.0,56.0,10
255224,149,0.0,2.0,57.0,10
255254,179,5.0,2.0,58.0,10
255284,209,1.0,2.0,58.0,10
255314,239,2.0,2.0,55.0,10
255344,269,0.0,2.0,57.0,10
255374,299,6.0,2.0,58.0,10


In [8]:
# %%timeit
n = df1.shape[0]
winsize = 11

result_loop = []
for i in range(0, n):
    r = df1[i-(winsize//2):i+(winsize//2)+1]["act"].mean()
    result_loop.append(r)


In [9]:
result_loop = pd.Series(result_loop)
result_loop[:20]

0           NaN
1           NaN
2           NaN
3           NaN
4           NaN
5      1.636364
6      1.636364
7      1.454545
8      1.454545
9      1.636364
10     1.727273
11     1.272727
12     1.181818
13     9.363636
14    13.545455
15    13.000000
16    12.818182
17    12.909091
18    25.909091
19    44.363636
dtype: float64

# Pandas windowing
https://pandas.pydata.org/docs/user_guide/window.html

In [10]:
result_pdw = df1["act"].rolling(window=11, center=True, min_periods=11).mean()
result_pdw[:20]

255104          NaN
255134          NaN
255164          NaN
255194          NaN
255224          NaN
255254     1.636364
255284     1.636364
255314     1.454545
255344     1.454545
255374     1.636364
255404     1.727273
255434     1.272727
255464     1.181818
255494     9.363636
255524    13.545455
255554    13.000000
255584    12.818182
255614    12.909091
255644    25.909091
255674    44.363636
Name: act, dtype: float64

In [11]:
result_pdw = df1["act"].rolling(window=11, center=False, min_periods=11).mean()
result_pdw[:20]

255104          NaN
255134          NaN
255164          NaN
255194          NaN
255224          NaN
255254          NaN
255284          NaN
255314          NaN
255344          NaN
255374          NaN
255404     1.636364
255434     1.636364
255464     1.454545
255494     1.454545
255524     1.636364
255554     1.727273
255584     1.272727
255614     1.181818
255644     9.363636
255674    13.545455
Name: act, dtype: float64

In [12]:
result_pdw = df1["act"].rolling(window=11, center=True, min_periods=1).mean()
result_pdw[:20]

255104     1.166667
255134     1.142857
255164     1.250000
255194     1.111111
255224     1.600000
255254     1.636364
255284     1.636364
255314     1.454545
255344     1.454545
255374     1.636364
255404     1.727273
255434     1.272727
255464     1.181818
255494     9.363636
255524    13.545455
255554    13.000000
255584    12.818182
255614    12.909091
255644    25.909091
255674    44.363636
Name: act, dtype: float64

## A Few Question:

- How about the last 20 numbers? I.e.,: result_pdw[-20:] Vs result_loop[-20:]
- How about the execution time? (use %%timeit to check it)


In [13]:
Centered = True
Wsize = 11

df1["act"].rolling(window=Wsize, center=Centered, min_periods=1).agg(["mean", "median", "std", "var", "skew", "kurt"])

Unnamed: 0,mean,median,std,var,skew,kurt
255104,1.166667,0.0,2.041241,4.166667,1.783229,2.774400
255134,1.142857,0.0,1.864454,3.476190,1.873551,3.432351
255164,1.250000,0.5,1.752549,3.071429,1.618904,2.665224
255194,1.111111,0.0,1.691482,2.861111,1.771953,3.202833
255224,1.600000,0.5,2.221111,4.933333,1.317213,0.515757
...,...,...,...,...,...,...
289514,87.400000,34.0,113.135317,12799.600000,1.227560,0.476469
289544,97.111111,38.0,115.492905,13338.611111,1.079703,0.086695
289574,109.000000,62.5,117.432047,13790.285714,0.911952,-0.281947
289604,124.571429,87.0,117.582393,13825.619048,0.738741,-0.569315


## How to apply rolling windows to the whole dataset?

In [14]:
df_indexed = df.set_index(["pid", "time"])
df_indexed

Unnamed: 0_level_0,Unnamed: 1_level_0,act,sleep_phase,hr
pid,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,,,75.0
1,1,,,75.0
1,2,,,77.0
1,3,,,78.0
1,4,,,77.0
...,...,...,...,...
200,31315,,,62.0
200,31316,,,60.0
200,31317,,,61.0
200,31318,,,62.0


In [15]:
X_grped = df_indexed.dropna()["act"].rolling(window=Wsize, center=Centered, min_periods=1).agg(["mean", "median", "std", "var", "skew", "kurt", "max", "min", "count", "sum"])
X_grped

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,median,std,var,skew,kurt,max,min,count,sum
pid,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,29,15.333333,1.5,35.120744,1233.466667,2.445916,5.986385,87.0,0.0,6.0,92.0
1,59,13.285714,1.0,32.515198,1057.238095,2.642249,6.985881,87.0,0.0,7.0,93.0
1,89,11.625000,1.0,30.467489,928.267857,2.824207,7.982049,87.0,0.0,8.0,93.0
1,119,10.333333,1.0,28.761954,827.250000,2.995208,8.978607,87.0,0.0,9.0,93.0
1,149,9.300000,0.5,27.313204,746.011111,3.157003,9.975393,87.0,0.0,10.0,93.0
...,...,...,...,...,...,...,...,...,...,...,...
200,31199,3.700000,1.0,6.634087,44.011111,2.377384,5.895031,21.0,0.0,10.0,37.0
200,31229,4.111111,1.0,6.900081,47.611111,2.239598,5.203851,21.0,0.0,9.0,37.0
200,31259,4.625000,1.0,7.190023,51.696429,2.095160,4.524464,21.0,0.0,8.0,37.0
200,31289,5.142857,1.0,7.603257,57.809524,1.909136,3.715867,21.0,0.0,7.0,36.0


<hr>
<h3> That is it!!! </h3>
<h3> We already have the minimal minimal minimal necessary to run our first ML model! </h3>

So lets do it....
<hr>

In [16]:
# Reset index to get pid and time cols back to the dataframe
X_reseted = X_grped.reset_index()
X_reseted

Unnamed: 0,pid,time,mean,median,std,var,skew,kurt,max,min,count,sum
0,1,29,15.333333,1.5,35.120744,1233.466667,2.445916,5.986385,87.0,0.0,6.0,92.0
1,1,59,13.285714,1.0,32.515198,1057.238095,2.642249,6.985881,87.0,0.0,7.0,93.0
2,1,89,11.625000,1.0,30.467489,928.267857,2.824207,7.982049,87.0,0.0,8.0,93.0
3,1,119,10.333333,1.0,28.761954,827.250000,2.995208,8.978607,87.0,0.0,9.0,93.0
4,1,149,9.300000,0.5,27.313204,746.011111,3.157003,9.975393,87.0,0.0,10.0,93.0
...,...,...,...,...,...,...,...,...,...,...,...,...
205356,200,31199,3.700000,1.0,6.634087,44.011111,2.377384,5.895031,21.0,0.0,10.0,37.0
205357,200,31229,4.111111,1.0,6.900081,47.611111,2.239598,5.203851,21.0,0.0,9.0,37.0
205358,200,31259,4.625000,1.0,7.190023,51.696429,2.095160,4.524464,21.0,0.0,8.0,37.0
205359,200,31289,5.142857,1.0,7.603257,57.809524,1.909136,3.715867,21.0,0.0,7.0,36.0


In [17]:
df.dropna() # [["pid", "time", "act"]]

Unnamed: 0,time,act,sleep_phase,hr,pid
29,29,2.0,0.0,73.0,1
59,59,0.0,0.0,75.0,1
89,89,1.0,0.0,77.0,1
119,119,2.0,0.0,73.0,1
149,149,87.0,0.0,80.0,1
...,...,...,...,...,...
6231234,31199,0.0,0.0,57.0,200
6231264,31229,0.0,0.0,60.0,200
6231294,31259,1.0,0.0,60.0,200
6231324,31289,8.0,0.0,66.0,200


In [18]:
# Merge X to the original dataframe to append the original activity values
X = pd.merge(X_reseted, df[["time", "pid", "act"]].dropna(), on=["time", "pid"])
X

Unnamed: 0,pid,time,mean,median,std,var,skew,kurt,max,min,count,sum,act
0,1,29,15.333333,1.5,35.120744,1233.466667,2.445916,5.986385,87.0,0.0,6.0,92.0,2.0
1,1,59,13.285714,1.0,32.515198,1057.238095,2.642249,6.985881,87.0,0.0,7.0,93.0,0.0
2,1,89,11.625000,1.0,30.467489,928.267857,2.824207,7.982049,87.0,0.0,8.0,93.0,1.0
3,1,119,10.333333,1.0,28.761954,827.250000,2.995208,8.978607,87.0,0.0,9.0,93.0,2.0
4,1,149,9.300000,0.5,27.313204,746.011111,3.157003,9.975393,87.0,0.0,10.0,93.0,87.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
205356,200,31199,3.700000,1.0,6.634087,44.011111,2.377384,5.895031,21.0,0.0,10.0,37.0,0.0
205357,200,31229,4.111111,1.0,6.900081,47.611111,2.239598,5.203851,21.0,0.0,9.0,37.0,0.0
205358,200,31259,4.625000,1.0,7.190023,51.696429,2.095160,4.524464,21.0,0.0,8.0,37.0,1.0
205359,200,31289,5.142857,1.0,7.603257,57.809524,1.909136,3.715867,21.0,0.0,7.0,36.0,8.0


In [19]:
# Get Ys (i.e., sleep labels)
Y = df.dropna()[["pid", "sleep_phase"]].reset_index().rename(columns={"index": "time"})
Y["sleep"] = Y["sleep_phase"] > 0
Y

Unnamed: 0,time,pid,sleep_phase,sleep
0,29,1,0.0,False
1,59,1,0.0,False
2,89,1,0.0,False
3,119,1,0.0,False
4,149,1,0.0,False
...,...,...,...,...
205356,6231234,200,0.0,False
205357,6231264,200,0.0,False
205358,6231294,200,0.0,False
205359,6231324,200,0.0,False


### Are there NAs?

In [20]:
X[X.isna().values.any(axis=1)]

Unnamed: 0,pid,time,mean,median,std,var,skew,kurt,max,min,count,sum,act
601,1,18059,0.0,0.0,0.000002,3.219017e-12,,,0.0,0.0,11.0,0.0,0.0
602,1,18089,0.0,0.0,0.000002,3.219017e-12,,,0.0,0.0,11.0,0.0,0.0
603,1,18119,0.0,0.0,0.000002,3.219017e-12,,,0.0,0.0,11.0,0.0,0.0
604,1,18149,0.0,0.0,0.000002,3.219017e-12,,,0.0,0.0,11.0,0.0,0.0
605,1,18179,0.0,0.0,0.000002,3.219017e-12,,,0.0,0.0,11.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
202987,198,28259,0.0,0.0,0.000000,-2.299431e-10,,,0.0,0.0,11.0,0.0,0.0
202988,198,28289,0.0,0.0,0.000000,-2.299431e-10,,,0.0,0.0,11.0,0.0,0.0
203301,199,3359,0.0,0.0,0.000000,-2.217682e-10,,,0.0,0.0,11.0,0.0,0.0
203899,199,21299,0.0,0.0,0.000000,-2.172086e-10,,,0.0,0.0,11.0,0.0,0.0


Lets have a look at them:

In [21]:
X[590:620]

Unnamed: 0,pid,time,mean,median,std,var,skew,kurt,max,min,count,sum,act
590,1,17729,50.181818,17.0,73.730344,5436.164,1.703638,1.619478,207.0,0.0,11.0,552.0,1.0
591,1,17759,49.0,17.0,74.48624,5548.2,1.688307,1.572494,207.0,0.0,11.0,539.0,0.0
592,1,17789,47.454545,5.0,75.387484,5683.273,1.679443,1.531653,207.0,0.0,11.0,522.0,5.0
593,1,17819,28.636364,1.0,54.529392,2973.455,2.616569,7.399291,182.0,0.0,11.0,315.0,48.0
594,1,17849,12.090909,0.0,20.057191,402.2909,1.363299,0.100988,49.0,0.0,11.0,133.0,30.0
595,1,17879,12.090909,0.0,20.057191,402.2909,1.363299,0.100988,49.0,0.0,11.0,133.0,49.0
596,1,17909,12.0,0.0,20.114671,404.6,1.36216,0.096118,49.0,0.0,11.0,132.0,0.0
597,1,17939,12.0,0.0,20.114671,404.6,1.36216,0.096118,49.0,0.0,11.0,132.0,0.0
598,1,17969,11.545455,0.0,20.343862,413.8727,1.378474,0.106173,49.0,0.0,11.0,127.0,0.0
599,1,17999,7.181818,0.0,16.533712,273.3636,2.208707,4.059921,49.0,0.0,11.0,79.0,0.0


In [22]:
X = X.fillna(0.0)

# First ML model

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, matthews_corrcoef, accuracy_score

In [24]:
model = LogisticRegression()
model.fit(X.values[:1000], Y["sleep"].values[:1000])

pred = model.predict(X.values[5000:10000])
f1_score(Y["sleep"].values[5000:10000], pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7395644283121596

---
# Open Parenthesis
- Is F1 a good metric to use here?
- Is F1 score a good metric in general?

See https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-019-6413-7

In [25]:
f1_score(Y["sleep"].values[5000:10000], pred)

0.7395644283121596

In [26]:
f1_score(~Y["sleep"].values[5000:10000], ~pred)

0.4917355371900826

In [27]:
f1_score?

In [28]:
matthews_corrcoef(~Y["sleep"].values[5000:10000], ~pred), matthews_corrcoef(Y["sleep"].values[5000:10000], pred)

(0.27000737786483253, 0.27000737786483253)

# Close parenthesis
---

In [29]:
ngrps = 5
pid_grp = {}

i = 0
for pid in X["pid"].unique():
    pid_grp[pid] = i
    i = (i+1) % ngrps

pid_grp

{1: 0,
 2: 1,
 3: 2,
 4: 3,
 5: 4,
 6: 0,
 7: 1,
 8: 2,
 9: 3,
 10: 4,
 11: 0,
 12: 1,
 13: 2,
 14: 3,
 15: 4,
 16: 0,
 17: 1,
 18: 2,
 19: 3,
 20: 4,
 21: 0,
 22: 1,
 23: 2,
 24: 3,
 25: 4,
 26: 0,
 27: 1,
 28: 2,
 29: 3,
 30: 4,
 31: 0,
 32: 1,
 33: 2,
 34: 3,
 35: 4,
 36: 0,
 37: 1,
 38: 2,
 39: 3,
 40: 4,
 41: 0,
 42: 1,
 43: 2,
 44: 3,
 45: 4,
 46: 0,
 47: 1,
 48: 2,
 49: 3,
 50: 4,
 51: 0,
 52: 1,
 53: 2,
 54: 3,
 55: 4,
 56: 0,
 57: 1,
 58: 2,
 59: 3,
 60: 4,
 61: 0,
 62: 1,
 63: 2,
 64: 3,
 65: 4,
 66: 0,
 67: 1,
 68: 2,
 69: 3,
 70: 4,
 71: 0,
 72: 1,
 73: 2,
 74: 3,
 75: 4,
 76: 0,
 77: 1,
 78: 2,
 79: 3,
 80: 4,
 81: 0,
 82: 1,
 83: 2,
 84: 3,
 85: 4,
 86: 0,
 87: 1,
 88: 2,
 89: 3,
 90: 4,
 91: 0,
 92: 1,
 93: 2,
 94: 3,
 95: 4,
 96: 0,
 97: 1,
 98: 2,
 99: 3,
 100: 4,
 101: 0,
 102: 1,
 103: 2,
 104: 3,
 105: 4,
 106: 0,
 107: 1,
 108: 2,
 109: 3,
 110: 4,
 111: 0,
 112: 1,
 113: 2,
 114: 3,
 115: 4,
 116: 0,
 117: 1,
 118: 2,
 119: 3,
 120: 4,
 121: 0,
 122: 1,
 123: 2,
 

In [30]:
X["grp"] = X["pid"].apply(lambda x: pid_grp[x])
X

Unnamed: 0,pid,time,mean,median,std,var,skew,kurt,max,min,count,sum,act,grp
0,1,29,15.333333,1.5,35.120744,1233.466667,2.445916,5.986385,87.0,0.0,6.0,92.0,2.0,0
1,1,59,13.285714,1.0,32.515198,1057.238095,2.642249,6.985881,87.0,0.0,7.0,93.0,0.0,0
2,1,89,11.625000,1.0,30.467489,928.267857,2.824207,7.982049,87.0,0.0,8.0,93.0,1.0,0
3,1,119,10.333333,1.0,28.761954,827.250000,2.995208,8.978607,87.0,0.0,9.0,93.0,2.0,0
4,1,149,9.300000,0.5,27.313204,746.011111,3.157003,9.975393,87.0,0.0,10.0,93.0,87.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205356,200,31199,3.700000,1.0,6.634087,44.011111,2.377384,5.895031,21.0,0.0,10.0,37.0,0.0,4
205357,200,31229,4.111111,1.0,6.900081,47.611111,2.239598,5.203851,21.0,0.0,9.0,37.0,0.0,4
205358,200,31259,4.625000,1.0,7.190023,51.696429,2.095160,4.524464,21.0,0.0,8.0,37.0,1.0,4
205359,200,31289,5.142857,1.0,7.603257,57.809524,1.909136,3.715867,21.0,0.0,7.0,36.0,8.0,4


In [31]:
scores = cross_val_score(LogisticRegression(), X.values, Y["sleep"].values, groups=X["grp"], scoring="f1_weighted")
print(scores.mean())
scores

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.8110031506845493


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([0.80116425, 0.81861053, 0.82755386, 0.82217742, 0.78550969])

In [32]:
scores = cross_val_score(LogisticRegression(), X.values, Y["sleep"].values, groups=X["grp"], scoring="matthews_corrcoef")
print(scores.mean())
scores

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.5536377316045329


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([0.51672489, 0.56756254, 0.60180957, 0.57542451, 0.50666715])

# Ideas to improve??

In [33]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler


pipe = make_pipeline(StandardScaler(), LogisticRegression())
scores = cross_val_score(pipe, X.values, Y["sleep"].values, groups=X["grp"], scoring="matthews_corrcoef")
print(scores.mean())
scores

0.5483737715557196


array([0.49868795, 0.56405213, 0.60067951, 0.56601866, 0.5124306 ])

# Comparison with different ML techniques -- hand-out test set

In [34]:
test_ids = range(151, 201)

In [35]:
X_test = X[X["pid"].isin(test_ids)]
X_test

Unnamed: 0,pid,time,mean,median,std,var,skew,kurt,max,min,count,sum,act,grp
154094,151,29,0.454545,0.0,0.522233,0.272727,0.212762,-2.444444,1.0,0.0,11.0,5.0,1.0,0
154095,151,59,3.727273,1.0,10.715325,114.818182,3.303394,10.936207,36.0,0.0,11.0,41.0,1.0,0
154096,151,89,6.636364,1.0,13.566670,184.054545,1.936871,2.165704,36.0,0.0,11.0,73.0,1.0,0
154097,151,119,8.272727,1.0,13.900294,193.218182,1.461043,0.504286,36.0,0.0,11.0,91.0,1.0,0
154098,151,149,8.272727,1.0,13.900294,193.218182,1.461043,0.504286,36.0,0.0,11.0,91.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205356,200,31199,3.700000,1.0,6.634087,44.011111,2.377384,5.895031,21.0,0.0,10.0,37.0,0.0,4
205357,200,31229,4.111111,1.0,6.900081,47.611111,2.239598,5.203851,21.0,0.0,9.0,37.0,0.0,4
205358,200,31259,4.625000,1.0,7.190023,51.696429,2.095160,4.524464,21.0,0.0,8.0,37.0,1.0,4
205359,200,31289,5.142857,1.0,7.603257,57.809524,1.909136,3.715867,21.0,0.0,7.0,36.0,8.0,4


In [36]:
Y_test = Y[Y["pid"].isin(test_ids)]
Y_test

Unnamed: 0,time,pid,sleep_phase,sleep
154094,4677905,151,0.0,False
154095,4677935,151,0.0,False
154096,4677965,151,0.0,False
154097,4677995,151,0.0,False
154098,4678025,151,0.0,False
...,...,...,...,...
205356,6231234,200,0.0,False
205357,6231264,200,0.0,False
205358,6231294,200,0.0,False
205359,6231324,200,0.0,False


In [37]:
X_train = X[~X["pid"].isin(test_ids)]
X_train

Unnamed: 0,pid,time,mean,median,std,var,skew,kurt,max,min,count,sum,act,grp
0,1,29,15.333333,1.5,35.120744,1233.466667,2.445916,5.986385,87.0,0.0,6.0,92.0,2.0,0
1,1,59,13.285714,1.0,32.515198,1057.238095,2.642249,6.985881,87.0,0.0,7.0,93.0,0.0,0
2,1,89,11.625000,1.0,30.467489,928.267857,2.824207,7.982049,87.0,0.0,8.0,93.0,1.0,0
3,1,119,10.333333,1.0,28.761954,827.250000,2.995208,8.978607,87.0,0.0,9.0,93.0,2.0,0
4,1,149,9.300000,0.5,27.313204,746.011111,3.157003,9.975393,87.0,0.0,10.0,93.0,87.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154089,150,28919,2.545455,0.0,7.789270,60.672727,3.300851,10.922336,26.0,0.0,11.0,28.0,0.0,4
154090,150,28949,2.636364,0.0,7.762380,60.254545,3.295674,10.897582,26.0,0.0,11.0,29.0,0.0,4
154091,150,28979,2.727273,0.0,7.734221,59.818182,3.292385,10.882344,26.0,0.0,11.0,30.0,1.0,4
154092,150,29009,2.818182,1.0,7.704780,59.363636,3.291050,10.876751,26.0,0.0,11.0,31.0,0.0,4


In [38]:
Y_train = Y[~Y["pid"].isin(test_ids)]
Y_train

Unnamed: 0,time,pid,sleep_phase,sleep
0,29,1,0.0,False
1,59,1,0.0,False
2,89,1,0.0,False
3,119,1,0.0,False
4,149,1,0.0,False
...,...,...,...,...
154089,4677755,150,5.0,True
154090,4677785,150,5.0,True
154091,4677815,150,5.0,True
154092,4677845,150,5.0,True


In [39]:
pipe = make_pipeline(StandardScaler(), LogisticRegression())
pipe.fit(X_train[['mean', 'median', 'std', 'var', 'skew', 'kurt', 'max', 'min', 'count', 'sum', 'act']].values,
         Y_train["sleep"].values)


In [40]:
y_hat = pipe.predict(X_test[['mean', 'median', 'std', 'var', 'skew', 'kurt', 'max', 'min', 'count', 'sum', 'act']].values)
acc = accuracy_score(Y_test["sleep"].values, y_hat)
f1 = f1_score(Y_test["sleep"].values, y_hat, average="weighted")
mcc = matthews_corrcoef(Y_test["sleep"].values, y_hat)
print("StandardScale + LR: Acc: %.3f, F1 %.3f, MCC: %.3f" % (acc, f1, mcc))

StandardScale + LR: Acc: 0.802, F1 0.781, MCC: 0.505


# Only Logistic Regression

In [41]:
pipe = LogisticRegression()
pipe.fit(X_train[['mean', 'median', 'std', 'var', 'skew', 'kurt', 'max', 'min', 'count', 'sum', 'act']].values,
         Y_train["sleep"].values)
y_hat = pipe.predict(X_test[['mean', 'median', 'std', 'var', 'skew', 'kurt', 'max', 'min', 'count', 'sum', 'act']].values)
acc = accuracy_score(Y_test["sleep"].values, y_hat)
f1 = f1_score(Y_test["sleep"].values, y_hat, average="weighted")
mcc = matthews_corrcoef(Y_test["sleep"].values, y_hat)
print("Only LR: Acc: %.3f, F1 %.3f, MCC: %.3f" % (acc, f1, mcc))

Only LR: Acc: 0.802, F1 0.780, MCC: 0.504


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
