In [35]:
# Import Libraries
import pandas as pd
import seaborn as sns
import numpy as np

from ydata_profiling import ProfileReport

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

# from yellowbrick.regressor import PredictionError

In [2]:
# Loading Data
df = pd.read_csv('./data/Sleep_Efficiency.csv')
print(df.shape)
df.head()

(452, 15)


Unnamed: 0,ID,Age,Gender,Bedtime,Wakeup time,Sleep duration,Sleep efficiency,REM sleep percentage,Deep sleep percentage,Light sleep percentage,Awakenings,Caffeine consumption,Alcohol consumption,Smoking status,Exercise frequency
0,1,65,Female,2021-03-06 01:00:00,2021-03-06 07:00:00,6.0,0.88,18,70,12,0.0,0.0,0.0,Yes,3.0
1,2,69,Male,2021-12-05 02:00:00,2021-12-05 09:00:00,7.0,0.66,19,28,53,3.0,0.0,3.0,Yes,3.0
2,3,40,Female,2021-05-25 21:30:00,2021-05-25 05:30:00,8.0,0.89,20,70,10,1.0,0.0,0.0,No,3.0
3,4,40,Female,2021-11-03 02:30:00,2021-11-03 08:30:00,6.0,0.51,23,25,52,3.0,50.0,5.0,Yes,1.0
4,5,57,Male,2021-03-13 01:00:00,2021-03-13 09:00:00,8.0,0.76,27,55,18,3.0,0.0,3.0,No,3.0


In [3]:
df.isna().sum()[df.isna().sum() > 0]

Awakenings              20
Caffeine consumption    25
Alcohol consumption     14
Exercise frequency       6
dtype: int64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 452 entries, 0 to 451
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ID                      452 non-null    int64  
 1   Age                     452 non-null    int64  
 2   Gender                  452 non-null    object 
 3   Bedtime                 452 non-null    object 
 4   Wakeup time             452 non-null    object 
 5   Sleep duration          452 non-null    float64
 6   Sleep efficiency        452 non-null    float64
 7   REM sleep percentage    452 non-null    int64  
 8   Deep sleep percentage   452 non-null    int64  
 9   Light sleep percentage  452 non-null    int64  
 10  Awakenings              432 non-null    float64
 11  Caffeine consumption    427 non-null    float64
 12  Alcohol consumption     438 non-null    float64
 13  Smoking status          452 non-null    object 
 14  Exercise frequency      446 non-null    fl

In [5]:
profile = ProfileReport(df)
# profile.to_notebook_iframe()
# profile.to_file('./eda/sleep_efficiency.html')


In [6]:
# set aside and save unseen data set
data_unseen = df.sample(n=50, random_state=42)
data        = df.drop(data_unseen.index)
data.reset_index(inplace=True)
print(f'Data for model: {data.shape},\nData for unseen predictions: {data_unseen.shape}')
data_unseen.to_csv('./data/Sleep_Efficiency_unseen.csv', index=False)

Data for model: (402, 16),
Data for unseen predictions: (50, 15)


In [7]:
# splitting 'Exercise frequency' to dataset
X = data.loc[: , data.columns!='Sleep efficiency']
y = data.loc[: , data.columns=='Sleep efficiency']

In [8]:
# split the data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# encoding 
# get the categorical and numeric column names
num_cols = X_train.select_dtypes(exclude=['object']).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()
#print(num_cols, '\n', cat_cols)
print(f'num_cols: {num_cols}\ncat_cols: {cat_cols}')

['index', 'ID', 'Age', 'Sleep duration', 'REM sleep percentage', 'Deep sleep percentage', 'Light sleep percentage', 'Awakenings', 'Caffeine consumption', 'Alcohol consumption', 'Exercise frequency'] 
 ['Gender', 'Bedtime', 'Wakeup time', 'Smoking status']


In [10]:
# pipeline for numerical columns
num_pipe = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy='mean'),
    StandardScaler()
)
num_pipe

In [11]:
# pipeline for categorical columns
cat_pipe = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='N/A'),
    OneHotEncoder(handle_unknown='ignore', sparse=False)
)
cat_pipe

In [12]:
# combine both the pipelines
full_pipe = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols)
])
full_pipe

### Building model

In [13]:
# build the model GradientBoostingRegressor
gbr_efficiency = make_pipeline(full_pipe, GradientBoostingRegressor(random_state=42))
gbr_efficiency

In [14]:
# train the model GradientBoostingRegressor

gbr_efficiency.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [15]:
# build the model RandomForestRegressor
rf_efficiency = make_pipeline(full_pipe, RandomForestRegressor(random_state=42))
rf_efficiency

In [16]:
# train the model RandomForestRegressor

rf_efficiency.fit(X_train, y_train)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


In [17]:
# build the model ExtraTreesRegressor
et_efficiency = make_pipeline(full_pipe, ExtraTreesRegressor(random_state=42))
et_efficiency

In [18]:
# train the model ExtraTreesRegressor

et_efficiency.fit(X_train, y_train)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


### make predictions on the test set

In [19]:
# make predictions on the test set
gbr_y_pred = gbr_efficiency.predict(X_test)

In [20]:
# make predictions on the test set
rf_y_pred = rf_efficiency.predict(X_test)

In [21]:
# make predictions on the test set
et_y_pred = et_efficiency.predict(X_test)

### measure accuracy

In [22]:
# measure accuracy
print('R2_gbr:', r2_score(y_test, gbr_y_pred))

R2_gbr: 0.8959620882893106


In [23]:
# measure accuracy
print('R2_rf:', r2_score(y_test, rf_y_pred))

R2_rf: 0.8837776487794986


In [24]:
# measure accuracy
print('R2_et:', r2_score(y_test, et_y_pred))

R2_et: 0.8857051092801912


In [26]:
# comparison of sleep efficiency to gbr_y_pred
gbr_y_test = y_test.copy()
gbr_y_test['gbr_y_pred'] = gbr_y_pred
gbr_test_scores = gbr_y_test.copy()
gbr_test_scores

Unnamed: 0,Sleep efficiency,gbr_y_pred
285,0.60,0.567146
281,0.71,0.888105
33,0.64,0.641133
211,0.77,0.761987
93,0.90,0.896848
...,...,...
228,0.52,0.554283
371,0.87,0.920211
176,0.73,0.729336
272,0.90,0.877989


In [27]:
# comparison of sleep efficiency to rf_y_pred
rf_y_test = y_test.copy()
rf_y_test['rf_y_pred'] = rf_y_pred
rf_test_scores = rf_y_test.copy()
rf_test_scores

Unnamed: 0,Sleep efficiency,rf_y_pred
285,0.60,0.5418
281,0.71,0.9098
33,0.64,0.6524
211,0.77,0.7618
93,0.90,0.9108
...,...,...
228,0.52,0.5448
371,0.87,0.9258
176,0.73,0.7356
272,0.90,0.8785


In [28]:
# comparison of sleep efficiency to et_y_pred
et_y_test = y_test.copy()
et_y_test['et_y_pred'] = et_y_pred
et_test_scores = et_y_test.copy()
et_test_scores

Unnamed: 0,Sleep efficiency,et_y_pred
285,0.60,0.5158
281,0.71,0.8859
33,0.64,0.6499
211,0.77,0.7840
93,0.90,0.9146
...,...,...
228,0.52,0.5074
371,0.87,0.9351
176,0.73,0.7541
272,0.90,0.8777


### Models comparison

In [32]:
r2 = r2_score(gbr_test_scores['Sleep efficiency'], gbr_test_scores['gbr_y_pred'])
mae = mean_absolute_error(gbr_test_scores['Sleep efficiency'], gbr_test_scores['gbr_y_pred'])
mean_act = gbr_test_scores['Sleep efficiency'].mean()
mean_pred = gbr_test_scores['gbr_y_pred'].mean()
mape = mean_absolute_percentage_error(gbr_test_scores['Sleep efficiency'], gbr_test_scores['gbr_y_pred'])
print(f'R2_gbr: {r2}\nmae: {mae}\nact_mean: {mean_act}\npred_mean: {mean_pred}\nmape: {mape}')

R2_gbr: 0.8959620882893106
mae: 0.032006920207575534
act_mean: 0.7875308641975308
pred_mean: 0.7899702811344872
mape: 0.041885900372146366


In [33]:
r2 = r2_score(rf_test_scores['Sleep efficiency'], rf_test_scores['rf_y_pred'])
mae = mean_absolute_error(rf_test_scores['Sleep efficiency'], rf_test_scores['rf_y_pred'])
mean_act = rf_test_scores['Sleep efficiency'].mean()
mean_pred = rf_test_scores['rf_y_pred'].mean()
mape = mean_absolute_percentage_error(rf_test_scores['Sleep efficiency'], rf_test_scores['rf_y_pred'])
print(f'R2_rf: {r2}\nmae: {mae}\nact_mean: {mean_act}\npred_mean: {mean_pred}\nmape: {mape}')

R2_rf: 0.8837776487794986
mae: 0.032529629629629644
act_mean: 0.7875308641975308
pred_mean: 0.7916728395061731
mape: 0.04312116301028012


In [34]:
r2 = r2_score(et_test_scores['Sleep efficiency'], et_test_scores['et_y_pred'])
mae = mean_absolute_error(et_test_scores['Sleep efficiency'], et_test_scores['et_y_pred'])
mean_act = et_test_scores['Sleep efficiency'].mean()
mean_pred = et_test_scores['et_y_pred'].mean()
mape = mean_absolute_percentage_error(et_test_scores['Sleep efficiency'], et_test_scores['et_y_pred'])
print(f'R2_et: {r2}\nmae: {mae}\nact_mean: {mean_act}\npred_mean: {mean_pred}\nmape: {mape}')

R2_et: 0.8857051092801912
mae: 0.032607407407407446
act_mean: 0.7875308641975308
pred_mean: 0.7922962962962965
mape: 0.04286031238275778
