# Imports, Load data

In [1]:
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

# load in score function
from metric import score

# format display 
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.3f}'.format)

# global random state
random_state = 0

In [2]:
# read in the data
root = 'data/'
train_series = pd.read_parquet(root + 'train_series.parquet')
train_events = pd.read_csv(root + 'train_events.csv')
test_series = pd.read_parquet(root + 'test_series.parquet')

In [3]:
# datetime transforms
# train_series['timestamp'] = pd.to_datetime(train_series['timestamp'], utc=True)
# train_series['hour'] = train_series['timestamp'].dt.hour
# train_events['timestamp'] = pd.to_datetime(train_events['timestamp'], utc=True)
# train_events['hour'] = train_events['timestamp'].dt.hour
# test_series['timestamp'] = pd.to_datetime(test_series['timestamp'], utc=True)
# test_series['hour'] = test_series['timestamp'].dt.hour

# cast features to int16 and uint16
train_series['anglez'] = train_series['anglez'].astype('int16')
train_series['enmo'] = (train_series['enmo'] * 1000).astype('uint16')
test_series['anglez'] = test_series['anglez'].astype('int16')
test_series['enmo'] = (test_series['enmo'] * 1000).astype('uint16')

# Filter data, Create features, Split data

In [4]:
# getting series ids as a list
series_ids = train_events['series_id'].drop_duplicates().tolist()
print(f'Number of total series: {len(series_ids)}')

# get counts of onset and wakeup fro each user
onset_counts = train_events[train_events['event'] == 'onset'].groupby('series_id').size()
wakeup_counts = train_events[train_events['event'] == 'wakeup'].groupby('series_id').size()

# create counts df and get the mismatches
counts = pd.DataFrame({'onset_counts': onset_counts, 'wakeup_counts': wakeup_counts}).reset_index()
count_mismatches = counts[counts['onset_counts'] != counts['wakeup_counts']]

# removing users with mismatched onset/wakeup counts
train_series = train_series[~train_series['series_id'].isin(count_mismatches['series_id'])]
train_events = train_events[~train_events['series_id'].isin(count_mismatches['series_id'])]

# update list of series ids, not including series with no non-null values
series_ids = train_events.dropna()['series_id'].drop_duplicates().tolist()
print(f'Number of series after removing mismatched counts: {len(series_ids)}')

Number of total series: 277
Number of series after removing mismatched counts: 269


In [5]:
def create_features(series):
    # feature list
    feature_cols = ['enmo', 'anglez']

    # create rolling features for 5min, 30min, 2hr, 8hr windows
    for mins in [5, 30, 120, 480]:
        # multiply by 12 because of 5 second intervals
        window_size = mins * 12

        # calculate rolling features for 'enmo'
        series[f'enmo_{mins}m_mean'] = series['enmo'].rolling(window_size, center=True, min_periods=1).mean().abs().astype('uint16')
        series[f'enmo_{mins}m_max'] = series['enmo'].rolling(window_size, center=True, min_periods=1).max().abs().astype('uint16')

        feature_cols += [f'enmo_{mins}m_mean', f'enmo_{mins}m_max']

        # calculate first variations for 'enmo' and 'anglez'
        for var in ['enmo', 'anglez']:
            series[f'{var}_1v_{mins}m_mean'] = series[var].diff().abs().rolling(window_size, center=True, min_periods=1).mean() * 10
            series[f'{var}_1v_{mins}m_max'] = series[var].diff().abs().rolling(window_size, center=True, min_periods=1).max() * 10

            feature_cols += [f'{var}_1v_{mins}m_mean', f'{var}_1v_{mins}m_max']

    # ensure integer conversion
    series[feature_cols] = series[feature_cols].astype('uint32')

    return series

In [6]:
# split data. first 200 series_id are train_series, last 69 are val_series
training_series = train_series[train_series['series_id'].isin(series_ids[:200])]
val_series = train_series[train_series['series_id'].isin(series_ids[200:])]

# check
print(f'Training series shape: {training_series.shape}')
print(f'Validation series shape: {val_series.shape}')

Training series shape: (90809280, 5)
Validation series shape: (34012800, 5)


In [7]:
# create features
training_series = create_features(training_series)
val_series = create_features(val_series)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series[f'enmo_{mins}m_mean'] = series['enmo'].rolling(window_size, center=True, min_periods=1).mean().abs().astype('uint16')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series[f'enmo_{mins}m_max'] = series['enmo'].rolling(window_size, center=True, min_periods=1).max().abs().astype('uint16')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/inde

In [8]:
# look at new features
print(training_series.shape)
training_series.head()

(90809280, 29)


Unnamed: 0,series_id,step,timestamp,anglez,enmo,enmo_5m_mean,enmo_5m_max,enmo_1v_5m_mean,enmo_1v_5m_max,anglez_1v_5m_mean,anglez_1v_5m_max,enmo_30m_mean,enmo_30m_max,enmo_1v_30m_mean,enmo_1v_30m_max,anglez_1v_30m_mean,anglez_1v_30m_max,enmo_120m_mean,enmo_120m_max,enmo_1v_120m_mean,enmo_1v_120m_max,anglez_1v_120m_mean,anglez_1v_120m_max,enmo_480m_mean,enmo_480m_max,enmo_1v_480m_mean,enmo_1v_480m_max,anglez_1v_480m_mean,anglez_1v_480m_max
0,038441c925bb,0,2018-08-14T15:30:00-0400,2,21,20,69,135588,655350,81,730,14,69,106175,655350,14,730,19,440,154953,655350,30,880,75,1223,283632,655350,101,1040
1,038441c925bb,1,2018-08-14T15:30:05-0400,2,21,20,69,131069,655350,79,730,14,69,105585,655350,14,730,19,440,155648,655350,30,880,75,1223,283534,655350,101,1040
2,038441c925bb,2,2018-08-14T15:30:10-0400,2,21,20,69,126841,655350,76,730,14,69,108622,655350,14,730,19,440,155432,655350,30,880,75,1223,283663,655350,102,1040
3,038441c925bb,3,2018-08-14T15:30:15-0400,2,21,19,69,122877,655350,74,730,14,69,108025,655350,14,730,19,440,156124,655350,30,880,75,1223,283564,655350,102,1040
4,038441c925bb,4,2018-08-14T15:30:20-0400,2,21,19,69,119153,655350,71,730,14,69,107435,655350,14,730,19,440,155908,655350,30,880,75,1223,283693,655350,102,1040


27 features

In [None]:
# classify columns into id and features
id_cols = ['series_id', 'step', 'timestamp']
feature_cols = [col for col in train_data.columns if col not in id_cols]

In [12]:
def make_train_dataset(train_data, train_events, drop_nulls=False, feature_cols=feature_cols, id_cols=id_cols):
    # Get series ids
    series_ids = train_data['series_id'].unique().tolist()

    # Initialize the dataframe for features and labels
    X_full = pd.DataFrame()
    y_full = pd.DataFrame()

    for idx in tqdm(series_ids): 
        # Get data for the user
        sample = train_data[train_data['series_id'] == idx].copy()

        # Normalize features by standard deviation
        for col in feature_cols:
            if sample[col].std() != 0:
                sample[col] = sample[col] / sample[col].std()

        # Get events for the user
        events = train_events[train_events['series_id'] == idx]

        # Remove datapoints on dates where no data was recorded
        if drop_nulls:
            valid_dates = events['timestamp'].dt.date.unique()
            sample = sample[sample['timestamp'].dt.date.isin(valid_dates)]

        # Append user's data to the full dataset
        X_full = pd.concat([X_full, sample[id_cols + feature_cols]])

        # Get onsets and wakeups
        onsets = events[events['event'] == 'onset']['step'].tolist()
        wakeups = events[events['event'] == 'wakeup']['step'].tolist()

        # Create 'asleep' column
        y_series = pd.Series(0, index=sample.index)
        for onset, wakeup in zip(onsets, wakeups):
            # set asleep to 1 between onset and wakeup
            y_series[(sample['step'] >= onset) & (sample['step'] < wakeup)] = 1
        
        # append user's labels to previous labels
        y_full = pd.concat([y_full, y_series])

    # flatten y to a 1D array
    y = y_full.to_numpy().ravel()

    return X_full, y


In [13]:
# create features and target
X_train, y_train = make_train_dataset(training_series, train_events)
X_val, y_val = make_train_dataset(val_series, train_events)

100%|██████████| 200/200 [30:58<00:00,  9.29s/it]
100%|██████████| 69/69 [04:27<00:00,  3.87s/it]


In [32]:
# recover memory
# del train_series
# del training_series
# del val_series
# del train_events
# gc.collect()

1238

# Logistic Regression

In [None]:
# create logreg, fit
logreg = LogisticRegression(random_state=random_state, n_jobs=-1)
logreg.fit(X_train, y_train)

In [None]:
# look at training performance
plt.title('Confusion Matrix on Training Data')
ConfusionMatrixDisplay.from_estimator(logreg, X_train, y_train, cmap='Blues');

print(classification_report(y_train, logreg.predict(X_train)))

In [None]:
# look at validation performance
plt.title('Confusion Matrix on Validation Data')
ConfusionMatrixDisplay.from_estimator(logreg, X_val, y_val, cmap='Blues');

print(classification_report(y_val, logreg.predict(X_val)))

# Use Score method

In [33]:
def get_events(series, classifier):
    '''
    Takes a time series and a classifier and returns a formatted submission dataframe.
    '''

    # get unique series ids
    series_ids = series['series_id'].unique()

    # create empty events df
    events = pd.DataFrame(columns=['series_id', 'step', 'event', 'score'])

    # iterate through each user
    for idx in tqdm(series_ids):
        # normalization and feature selection
        scale_cols = [col for col in feature_cols if series[col].std() != 0]
        X = series[series['series_id'] == idx][id_cols + feature_cols]
        X[scale_cols] = X[scale_cols].apply(lambda col: col / series[col.name].std())

        # predictions
        preds, probs = classifier.predict(X[feature_cols]), classifier.predict_proba(X[feature_cols])[:, 1]

        X['prediction'] = preds
        X['probability'] = probs

        # identifying sleep events
        pred_onsets = X[X['prediction'].diff() > 0]['step']
        pred_wakeups = X[X['prediction'].diff() < 0]['step']

        # event processing logic
        if len(pred_onsets) > 0:
            # if first predicted wakeup is before first predicted onset, remove
            if pred_wakeups.iloc[0] < pred_onsets.iloc[0]:
                pred_wakeups = pred_wakeups.iloc[1:]

            # if last predicted onset is after last predicted wakeup, remove
            if pred_onsets.iloc[-1] > pred_wakeups.iloc[-1]:
                pred_onsets = pred_onsets.iloc[:-1]

            # create 'sleep_period' if predicted sleep is greter than 30 minutes
            sleep_periods = [(onset, wakeup) for onset, wakeup in zip(pred_onsets, pred_wakeups) if wakeup - onset >= (12 * 30)]

            # constructing events df
            for onset, wakeup in sleep_periods:
                score = X[(X['step'] >= onset) & (X['step'] < wakeup)]['probability'].mean()
                events = events.append({'series_id': idx, 'step': onset, 'event': 'onset', 'score': score}, ignore_index=True)
                events = events.append({'series_id': idx, 'step': wakeup, 'event': 'wakeup', 'score': score}, ignore_index=True)

    # reset index for row ID
    events.reset_index(inplace=True)
    events.rename(columns={'index': 'row_id'}, inplace=True)

    return events

In [None]:
# create and fit classifier
classifier = LogisticRegression(n_jobs=-1, random_state=random_state)
classifier.fit(X_train[feature_cols], y_train)

In [None]:
# get event predictions
submission = get_events(test_series, classifier)
submission.to_csv('submission.csv', index=False)