In [2]:
import numpy as np
import pandas as pd
from itertools import groupby

In [3]:
# Both splits live in the same competition folder; only the file prefix differs.
DATA_DIR = 'D:/child-mind-institute-detect-sleep-states'
train, test = (
    pd.read_parquet(f'{DATA_DIR}/{split}_series.parquet')
    for split in ('train', 'test')
)

In [4]:
# Print the first rows of each split, train first, as plain text.
for frame in (train, test):
    print(frame.head())

      series_id  step                 timestamp  anglez    enmo
0  038441c925bb     0  2018-08-14T15:30:00-0400  2.6367  0.0217
1  038441c925bb     1  2018-08-14T15:30:05-0400  2.6368  0.0215
2  038441c925bb     2  2018-08-14T15:30:10-0400  2.6370  0.0216
3  038441c925bb     3  2018-08-14T15:30:15-0400  2.6368  0.0213
4  038441c925bb     4  2018-08-14T15:30:20-0400  2.6368  0.0215
      series_id  step                 timestamp  anglez    enmo
0  038441c925bb     0  2018-08-14T15:30:00-0400  2.6367  0.0217
1  038441c925bb     1  2018-08-14T15:30:05-0400  2.6368  0.0215
2  038441c925bb     2  2018-08-14T15:30:10-0400  2.6370  0.0216
3  038441c925bb     3  2018-08-14T15:30:15-0400  2.6368  0.0213
4  038441c925bb     4  2018-08-14T15:30:20-0400  2.6368  0.0215


In [5]:
# Frame overview: dtypes/memory, then the percentage of nulls per column.
train.info()
print()
null_share = train.isnull().sum() / len(train) * 100
print(null_share)
print()

# Per-column report: value counts, dtype, distinct values and null count,
# with ANSI colour codes to separate the sections visually.
for column_name in train.columns:
    column = train[column_name]
    print('\33[103m', column_name, '\33[0m')
    print(column.value_counts())
    distinct_repr = str(column.unique())
    print(column.dtype)
    print('\33[91m' + distinct_repr + '\033[0m')
    print('\33[94m' + 'Null Count :' + '\033[0m', column.isnull().sum())
    print('🌈')
    print()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127946340 entries, 0 to 127946339
Data columns (total 5 columns):
 #   Column     Dtype  
---  ------     -----  
 0   series_id  object 
 1   step       uint32 
 2   timestamp  object 
 3   anglez     float32
 4   enmo       float32
dtypes: float32(2), object(2), uint32(1)
memory usage: 3.3+ GB

series_id    0.0
step         0.0
timestamp    0.0
anglez       0.0
enmo         0.0
dtype: float64

[103m series_id [0m
78569a801a38    1433880
f564985ab692    1052820
fb223ed2278c     918360
f56824b503a0     846360
cfeb11428dd7     809820
                 ...   
c535634d7dcd     136080
1c7c0bad1263     115380
60e51cad2ffb     113940
3a9a9dc2cbd9     103500
349c5562ee2c      37080
Name: series_id, Length: 277, dtype: int64
object
[91m['038441c925bb' '03d92c9f6f8a' '0402a003dae9' '04f547b8017d'
 '05e1944c3818' '062cae666e2a' '062dbd4c95e6' '08db4255286f'
 '0a96f4993bd7' '0cd1e3d0ed95' '0ce74d6d2106' '0cfc06c129cc'
 '0d0ad1e77851' '0dee4fda51

Feature Engineering

In [6]:
# Window (in rows) used for both the lagged differences and the rolling means.
periods = 50


def add_features(df, periods=50):
    """Add the model features to ``df`` in place and return it.

    Every lagged/rolling feature is computed *within* each ``series_id``.
    The frame is a concatenation of independent recordings, so a window (or an
    edge fill) that runs over the whole column would blend the end of one
    series with the start of the next.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``series_id``, ``timestamp``, ``anglez`` and ``enmo``.
    periods : int, default 50
        Lag of the differences and width of the centred rolling windows.

    Returns
    -------
    pandas.DataFrame
        The same object, with the new feature columns added.
    """

    def per_series(column, func):
        # Build the groupby on demand so columns added earlier in this
        # function (e.g. "anglez_diff") are visible to later steps.
        return df.groupby("series_id", sort=False)[column].transform(func)

    def lagged_diff(values):
        # The first `periods` rows of a series have no lagged partner;
        # back-fill them from the first defined difference of the same series.
        return values.diff(periods=periods).bfill()

    def centred_mean(values):
        # A centred window leaves NaN at both ends of the series; fill each
        # end from the nearest complete window of the same series.
        return values.rolling(periods, center=True).mean().bfill().ffill()

    # parse the timestamp and create an "hour" feature
    # NOTE(review): utc=True means "hour" is the UTC hour, not the local hour
    # in the raw strings (e.g. 15:30-0400 becomes 19) — confirm this is intended.
    df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True)
    df["hour"] = df["timestamp"].dt.hour

    # feature cross
    df["anglez_times_enmo"] = df["anglez"].abs() * df["enmo"]

    # "rolling" features (column creation order matches the original cell)
    signals = ("anglez", "enmo")
    for signal in signals:
        df[f"{signal}_diff"] = per_series(signal, lagged_diff)
    for signal in signals:
        df[f"{signal}_rolling"] = per_series(signal, centred_mean)
    for signal in signals:
        df[f"{signal}_diff_rolling"] = per_series(f"{signal}_diff", centred_mean)

    return df


for df in (train, test):
    add_features(df, periods=periods)

features = ["hour","anglez_times_enmo",
           "anglez","anglez_diff","anglez_rolling","anglez_diff_rolling",
           "enmo","enmo_diff","enmo_rolling","enmo_diff_rolling"]

In [14]:
# Show the engineered training frame (the display is truncated to the first and last rows).
train

Unnamed: 0,series_id,step,timestamp,anglez,enmo,hour,anglez_times_enmo,anglez_diff,enmo_diff,anglez_rolling,enmo_rolling,anglez_diff_rolling,enmo_diff_rolling
0,038441c925bb,0,2018-08-14 19:30:00+00:00,2.636700,0.0217,19,0.057216,-82.651405,-0.0079,-26.215138,0.018090,-82.651405,-0.007900
1,038441c925bb,1,2018-08-14 19:30:05+00:00,2.636800,0.0215,19,0.056691,-82.651405,-0.0079,-26.215138,0.018090,-82.651405,-0.007900
2,038441c925bb,2,2018-08-14 19:30:10+00:00,2.637000,0.0216,19,0.056959,-82.651405,-0.0079,-26.215138,0.018090,-82.651405,-0.007900
3,038441c925bb,3,2018-08-14 19:30:15+00:00,2.636800,0.0213,19,0.056164,-82.651405,-0.0079,-26.215138,0.018090,-82.651405,-0.007900
4,038441c925bb,4,2018-08-14 19:30:20+00:00,2.636800,0.0215,19,0.056691,-82.651405,-0.0079,-26.215138,0.018090,-82.651405,-0.007900
...,...,...,...,...,...,...,...,...,...,...,...,...,...
127946335,fe90110788d2,592375,2017-09-08 04:14:35+00:00,-27.277500,0.0204,4,0.556461,-42.998100,0.0030,-19.529788,0.014656,-34.198950,0.009474
127946336,fe90110788d2,592376,2017-09-08 04:14:40+00:00,-27.032499,0.0233,4,0.629857,-45.973801,-0.1238,-19.529788,0.014656,-34.198950,0.009474
127946337,fe90110788d2,592377,2017-09-08 04:14:45+00:00,-26.841200,0.0202,4,0.542192,-17.404999,-0.0709,-19.529788,0.014656,-34.198950,0.009474
127946338,fe90110788d2,592378,2017-09-08 04:14:50+00:00,-26.723900,0.0199,4,0.531806,-27.441000,0.0185,-19.529788,0.014656,-34.198950,0.009474


Modeling

In [8]:
# Preview the first rows of the training frame, including the engineered feature columns.
train.head()

Unnamed: 0,series_id,step,timestamp,anglez,enmo,hour,anglez_times_enmo,anglez_diff,enmo_diff,anglez_rolling,enmo_rolling,anglez_diff_rolling,enmo_diff_rolling
0,038441c925bb,0,2018-08-14 19:30:00+00:00,2.6367,0.0217,19,0.057216,-82.651405,-0.0079,-26.215138,0.01809,-82.651405,-0.0079
1,038441c925bb,1,2018-08-14 19:30:05+00:00,2.6368,0.0215,19,0.056691,-82.651405,-0.0079,-26.215138,0.01809,-82.651405,-0.0079
2,038441c925bb,2,2018-08-14 19:30:10+00:00,2.637,0.0216,19,0.056959,-82.651405,-0.0079,-26.215138,0.01809,-82.651405,-0.0079
3,038441c925bb,3,2018-08-14 19:30:15+00:00,2.6368,0.0213,19,0.056164,-82.651405,-0.0079,-26.215138,0.01809,-82.651405,-0.0079
4,038441c925bb,4,2018-08-14 19:30:20+00:00,2.6368,0.0215,19,0.056691,-82.651405,-0.0079,-26.215138,0.01809,-82.651405,-0.0079


In [7]:
%%time

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=50,
                                    max_depth=4,
                                    min_samples_leaf=50)

X_train = train[features]
y_train = train["awake"]

classifier.fit(X_train, y_train)

KeyError: 'awake'

In [None]:
# Score every test row; column 1 of predict_proba is stored as "score".
X_test = test[features]
class_probabilities = classifier.predict_proba(X_test)
test["score"] = class_probabilities[:, 1]

In [None]:
test["not_awake"] = 1 - test["score"]
# smoothing of the predictions
smoothing_length = 2 * 250


def _smooth_within_series(values):
    """Centred rolling mean of one series, with its edges filled from that same series."""
    return values.rolling(smoothing_length, center=True).mean().bfill().ffill()


# Smooth each series on its own: a window over the whole column would average
# the end of one recording with the start of the next.
test["smooth"] = (
    test.groupby("series_id", sort=False)["not_awake"].transform(_smooth_within_series)
)
# re-binarize
test["smooth"] = test["smooth"].round()

# https://stackoverflow.com/questions/73777727/how-to-mark-start-end-of-a-series-of-non-null-and-non-0-values-in-a-column-of-a
def get_event(df):
    """Label the first and last row of every run of non-zero, non-null ``smooth`` values.

    Rows are scanned in order and split into consecutive runs that share the
    same ``series_id`` and the same "active" state (``smooth`` is neither 0 nor
    null). For an active run longer than one row, the first row is labelled
    ``'onset'`` and the last ``'wakeup'``; every other row gets ``0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose ``series_id`` and ``smooth`` columns.

    Returns
    -------
    list
        One entry per row of ``df``: ``'onset'``, ``'wakeup'`` or ``0``.
    """
    def run_key(pair):
        series, value = pair
        return (series, value != 0 and not pd.isnull(value))

    markers = []
    for (_, active), run in groupby(zip(df.series_id, df.smooth), key=run_key):
        length = len(list(run))
        if active is not False and length > 1:
            markers += ['onset'] + [0] * (length - 2) + ['wakeup']
        else:
            # Inactive runs, and active runs of a single row, carry no event.
            markers += [0] * length
    return markers

# One marker per test row: 'onset', 'wakeup' or 0.
event_markers = get_event(test)
test["event"] = event_markers

In [None]:
# Keep only the rows flagged as an event and expose their new position as "row_id".
is_event = test["event"] != 0
submission_columns = ["series_id", "step", "event", "score"]
sample_submission = (
    test.loc[is_event, submission_columns]
    .copy()
    .reset_index(drop=True)
    .reset_index(names="row_id")
)
# sample_submission.to_csv('submission_rf.csv', index=False)