In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/preprocessing/cropedTrainSeries3.csv
/kaggle/input/scalers/enmoScaler.pkl
/kaggle/input/scalers/anglezScaler.pkl
/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet
/kaggle/input/child-mind-institute-detect-sleep-states/sample_submission.csv
/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv
/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet


In [2]:
from xgboost import XGBClassifier
import joblib
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.datasets import load_digits
from sklearn.preprocessing import LabelEncoder
from scipy.stats import randint, uniform

In [3]:
trainDF=pd.read_csv('/kaggle/input/preprocessing/cropedTrainSeries3.csv')

In [4]:
trainDF.head()

Unnamed: 0,series_id,step,timestamp,anglez,enmo,Date,Time,event,NewDate,Season,Summer,Autumn,Spring,Winter,anglezScaled,enmoScaled,weekday,time_in_seconds,sin_time,cos_time
0,038441c925bb,0,2018-08-14 19:30:00+00:00,2.6367,0.0217,2018-08-14,1900-01-01 19:30:00,onset,2018-08-14,Summer,1.0,,,,0.517288,0.502686,1,70200,-0.92388,0.382683
1,038441c925bb,1,2018-08-14 19:30:05+00:00,2.6368,0.0215,2018-08-14,1900-01-01 19:30:05,onset,2018-08-14,Summer,1.0,,,,0.517289,0.502685,1,70205,-0.92374,0.383019
2,038441c925bb,2,2018-08-14 19:30:10+00:00,2.637,0.0216,2018-08-14,1900-01-01 19:30:10,onset,2018-08-14,Summer,1.0,,,,0.51729,0.502685,1,70210,-0.923601,0.383355
3,038441c925bb,3,2018-08-14 19:30:15+00:00,2.6368,0.0213,2018-08-14,1900-01-01 19:30:15,onset,2018-08-14,Summer,1.0,,,,0.517289,0.502684,1,70215,-0.923462,0.383691
4,038441c925bb,4,2018-08-14 19:30:20+00:00,2.6368,0.0215,2018-08-14,1900-01-01 19:30:20,onset,2018-08-14,Summer,1.0,,,,0.517289,0.502685,1,70220,-0.923322,0.384027


In [5]:
trainFeatures=['sin_time','cos_time','Summer','Autumn','Spring','Winter','anglezScaled','enmoScaled','weekday']
X_train=trainDF[trainFeatures]

In [6]:
li=LabelEncoder()
y_train=li.fit_transform(trainDF['event'])
y_train

array([0, 0, 0, ..., 1, 1, 1])

In [7]:
xgb_classifier = XGBClassifier()

In [8]:
# Define the hyperparameter search space
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(1, 10),
    'learning_rate': uniform(0.01, 0.3),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'gamma': uniform(0, 0.5),
}

# Use time series cross-validation
tscv = TimeSeriesSplit(n_splits=5)

# RandomizedSearchCV for hyperparameter tuning
random_search = RandomizedSearchCV(
    xgb_classifier, param_distributions=param_dist, n_iter=10, scoring='accuracy', cv=tscv, verbose=2, n_jobs=-1
)

In [9]:
# Fit the model
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [10]:
bestParams=random_search.best_params_
bestXGboost=XGBClassifier(**bestParams)

In [11]:
bestParams

{'colsample_bytree': 0.6768884383109963,
 'gamma': 0.4279066146296105,
 'learning_rate': 0.20886819677051832,
 'max_depth': 1,
 'n_estimators': 175,
 'subsample': 0.9278030746723419}

In [12]:
bestXGboost.fit(X_train, y_train)

In [13]:
joblib.dump(bestXGboost, 'bestXGboost.pkl')
joblib.dump(li, 'labelEncoder.pkl')

['labelEncoder.pkl']

In [14]:
X_train.shape

(1279463, 9)

In [15]:
testDF=pd.read_parquet('/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet')

In [16]:
enmoScaler=joblib.load('/kaggle/input/scalers/enmoScaler.pkl')
anglezScaler=joblib.load('/kaggle/input/scalers/anglezScaler.pkl')

In [17]:
test_series_DF_CP=testDF.copy()

In [18]:
test_series_DF_CP['timestamp'] = pd.to_datetime(test_series_DF_CP['timestamp'],utc=True)
test_series_DF_CP['Date'] = test_series_DF_CP['timestamp'].dt.date
test_series_DF_CP['Time'] = test_series_DF_CP['timestamp'].dt.time
test_series_DF_CP['NewDate'] = pd.to_datetime(test_series_DF_CP['Date'])
# Create a new 'Season' column based on the month of each date
test_series_DF_CP['Season'] = test_series_DF_CP['NewDate'].dt.month.map(lambda x: {3: 'Spring', 4: 'Spring', 5: 'Spring',
                                                 6: 'Summer', 7: 'Summer', 8: 'Summer',
                                                 9: 'Autumn', 10: 'Autumn', 11: 'Autumn',
                                                 12: 'Winter', 1: 'Winter', 2: 'Winter'}.get(x))

# Use get_dummies to one-hot encode the 'Season' column
season_dummiesTest = test_series_DF_CP['Season'].apply(lambda x: pd.Series({x: 1}))
test_series_DF_CP = pd.concat([test_series_DF_CP, season_dummiesTest], axis=1)

seasons=['Summer','Autumn','Spring','Winter']
for season in seasons:
    if season not in test_series_DF_CP.columns:
        test_series_DF_CP[season]=float('nan')
        
        
test_series_DF_CP['Date'] = pd.to_datetime(test_series_DF_CP['Date'])

# Add a new column 'weekday' to the DataFrame
test_series_DF_CP['weekday'] = test_series_DF_CP['Date'].dt.dayofweek

# Map the weekdays to 1 and weekends to 0
test_series_DF_CP['weekday'] = test_series_DF_CP['weekday'].apply(lambda x: 1 if x < 5 else 0)

test_series_DF_CP['anglezScaled']=anglezScaler.transform(test_series_DF_CP['anglez'].values.reshape(-1, 1))
test_series_DF_CP['enmoScaled']=anglezScaler.transform(test_series_DF_CP['enmo'].values.reshape(-1, 1))
test_series_DF_CP['Date'] = pd.to_datetime(test_series_DF_CP['Date'])

# Add a new column 'weekday' to the DataFrame
test_series_DF_CP['weekday'] = test_series_DF_CP['Date'].dt.dayofweek

# Map the weekdays to 1 and weekends to 0
test_series_DF_CP['weekday'] = test_series_DF_CP['weekday'].apply(lambda x: 1 if x < 5 else 0)

test_series_DF_CP['Time'] = pd.to_datetime(test_series_DF_CP['Time'], format='%H:%M:%S')

#Extracting total seconds since midnight as a single feature
test_series_DF_CP['time_in_seconds'] = test_series_DF_CP['Time'].dt.hour * 3600 + test_series_DF_CP['Time'].dt.minute * 60 + test_series_DF_CP['Time'].dt.second

# Create empty columns for sin and cos
test_series_DF_CP['sin_time'] = np.nan
test_series_DF_CP['cos_time'] = np.nan

# Calculating sin and cos of time_in_seconds
test_series_DF_CP['sin_time'] = np.sin(2 * np.pi * test_series_DF_CP['time_in_seconds'] / 86400)  # 86400 seconds in a day
test_series_DF_CP['cos_time'] = np.cos(2 * np.pi * test_series_DF_CP['time_in_seconds'] / 86400)

In [19]:
trainFeatures=['sin_time','cos_time','Summer','Autumn','Spring','Winter','anglezScaled','enmoScaled','weekday']

In [20]:
X_test=test_series_DF_CP[trainFeatures]
X_test.head()

Unnamed: 0,sin_time,cos_time,Summer,Autumn,Spring,Winter,anglezScaled,enmoScaled,weekday
0,-0.92388,0.382683,1.0,,,,0.517288,0.502686,1
1,-0.92374,0.383019,1.0,,,,0.517289,0.502685,1
2,-0.923601,0.383355,1.0,,,,0.51729,0.502685,1
3,-0.923462,0.383691,1.0,,,,0.517289,0.502684,1
4,-0.923322,0.384027,1.0,,,,0.517289,0.502685,1


In [21]:
preditions=bestXGboost.predict(X_test)
preditions

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [22]:
preditions_lables=li.inverse_transform(preditions)

In [23]:
preditions_lables

array(['wakeup', 'wakeup', 'wakeup', 'wakeup', 'wakeup', 'wakeup',
       'wakeup', 'wakeup', 'wakeup', 'wakeup', 'wakeup', 'wakeup',
       'wakeup', 'wakeup', 'wakeup', 'wakeup', 'wakeup', 'wakeup',
       'wakeup', 'wakeup', 'wakeup', 'wakeup', 'wakeup', 'wakeup',
       'wakeup', 'wakeup', 'wakeup', 'wakeup', 'wakeup', 'wakeup',
       'wakeup', 'wakeup', 'wakeup', 'wakeup', 'wakeup', 'wakeup',
       'wakeup', 'wakeup', 'wakeup', 'wakeup', 'wakeup', 'wakeup',
       'wakeup', 'wakeup', 'wakeup', 'wakeup', 'wakeup', 'wakeup',
       'wakeup', 'wakeup', 'wakeup', 'wakeup', 'wakeup', 'wakeup',
       'wakeup', 'wakeup', 'wakeup', 'wakeup', 'wakeup', 'wakeup',
       'wakeup', 'wakeup', 'wakeup', 'wakeup', 'wakeup', 'wakeup',
       'wakeup', 'wakeup', 'wakeup', 'wakeup', 'wakeup', 'wakeup',
       'wakeup', 'wakeup', 'wakeup', 'wakeup', 'wakeup', 'wakeup',
       'wakeup', 'wakeup', 'wakeup', 'wakeup', 'wakeup', 'wakeup',
       'wakeup', 'wakeup', 'wakeup', 'wakeup', 'wakeup', 'wake

In [24]:
predict_proba=bestXGboost.predict_proba(X_test)[:,1]
predict_proba

array([0.6588544 , 0.6588544 , 0.6588544 , 0.6588544 , 0.6588544 ,
       0.6588544 , 0.6588544 , 0.6588544 , 0.6588544 , 0.6588544 ,
       0.6588544 , 0.6588544 , 0.6588544 , 0.6588544 , 0.6588544 ,
       0.6588544 , 0.6588544 , 0.6588544 , 0.72808146, 0.63991165,
       0.63991165, 0.63991165, 0.6588544 , 0.63991165, 0.63991165,
       0.63991165, 0.63991165, 0.72808146, 0.6047511 , 0.6047511 ,
       0.6047511 , 0.6047511 , 0.6047511 , 0.6047511 , 0.6047511 ,
       0.6047511 , 0.6047511 , 0.6047511 , 0.6047511 , 0.6047511 ,
       0.6047511 , 0.6047511 , 0.6047511 , 0.6047511 , 0.6047511 ,
       0.6047511 , 0.6047511 , 0.6047511 , 0.6047511 , 0.6047511 ,
       0.6047511 , 0.6047511 , 0.6047511 , 0.6047511 , 0.6047511 ,
       0.6047511 , 0.6047511 , 0.6047511 , 0.6047511 , 0.6047511 ,
       0.6047511 , 0.6047511 , 0.6047511 , 0.6047511 , 0.6047511 ,
       0.6047511 , 0.6047511 , 0.6047511 , 0.6047511 , 0.6047511 ,
       0.6047511 , 0.6047511 , 0.6047511 , 0.6047511 , 0.60475

In [25]:
submission=testDF[['series_id','step']]
submission['event']=preditions_lables
submission['score']=predict_proba

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission['event']=preditions_lables
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission['score']=predict_proba


In [26]:
submission['row_id'] = submission.index.astype(int)
submission = submission[['row_id','series_id','step','event','score']]

In [27]:
submission

Unnamed: 0,row_id,series_id,step,event,score
0,0,038441c925bb,0,wakeup,0.658854
1,1,038441c925bb,1,wakeup,0.658854
2,2,038441c925bb,2,wakeup,0.658854
3,3,038441c925bb,3,wakeup,0.658854
4,4,038441c925bb,4,wakeup,0.658854
...,...,...,...,...,...
445,445,0402a003dae9,145,wakeup,0.988896
446,446,0402a003dae9,146,wakeup,0.988051
447,447,0402a003dae9,147,wakeup,0.988051
448,448,0402a003dae9,148,wakeup,0.988051


In [28]:
submission.to_csv('submission.csv')