# Sleep Efficiency Dataset

## Day 1: EDA and Data Cleaning



In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import math
from sklearn.preprocessing import PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import statsmodels.api as sm
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
pd.options.display.max_rows = 50
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Loading the data base
sleep_df = pd.read_csv('Sleep_Efficiency.csv')

In [3]:
# Checking general info: data types, number of columns.
sleep_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 452 entries, 0 to 451
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ID                      452 non-null    int64  
 1   Age                     452 non-null    int64  
 2   Gender                  452 non-null    object 
 3   Bedtime                 452 non-null    object 
 4   Wakeup time             452 non-null    object 
 5   Sleep duration          452 non-null    float64
 6   Sleep efficiency        452 non-null    float64
 7   REM sleep percentage    452 non-null    int64  
 8   Deep sleep percentage   452 non-null    int64  
 9   Light sleep percentage  452 non-null    int64  
 10  Awakenings              432 non-null    float64
 11  Caffeine consumption    427 non-null    float64
 12  Alcohol consumption     438 non-null    float64
 13  Smoking status          452 non-null    object 
 14  Exercise frequency      446 non-null    fl

In [4]:
# Eliminating duplicates
sleep_df = sleep_df.drop_duplicates()
sleep_df.info()
# No row was droppped, therefore there are no duplicates

<class 'pandas.core.frame.DataFrame'>
Int64Index: 452 entries, 0 to 451
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ID                      452 non-null    int64  
 1   Age                     452 non-null    int64  
 2   Gender                  452 non-null    object 
 3   Bedtime                 452 non-null    object 
 4   Wakeup time             452 non-null    object 
 5   Sleep duration          452 non-null    float64
 6   Sleep efficiency        452 non-null    float64
 7   REM sleep percentage    452 non-null    int64  
 8   Deep sleep percentage   452 non-null    int64  
 9   Light sleep percentage  452 non-null    int64  
 10  Awakenings              432 non-null    float64
 11  Caffeine consumption    427 non-null    float64
 12  Alcohol consumption     438 non-null    float64
 13  Smoking status          452 non-null    object 
 14  Exercise frequency      446 non-null    fl

In [5]:
# Changing title of columns
sleep_df.rename(columns={'Wakeup time':'wakeup_time','Sleep Duration':'sleep_duration',
                          'Sleep efficiency':'sleep_efficiency','REM sleep percentage':'rem_sleep_percentage', 
                          'Deep sleep percentage':'deep_sleep_percentage','Light sleep percentage': 'light_sleep_percentage',
                          'Caffeine consumption':'caffeine_consumption','Alcohol consumption':'alcohol_consumption',
                         'Smoking status':'smoking_status','Exercise frequency':'exercise_frequency'}, inplace=True )
sleep_df

Unnamed: 0,ID,Age,Gender,Bedtime,wakeup_time,Sleep duration,sleep_efficiency,rem_sleep_percentage,deep_sleep_percentage,light_sleep_percentage,Awakenings,caffeine_consumption,alcohol_consumption,smoking_status,exercise_frequency
0,1,65,Female,2021-03-06 01:00:00,2021-03-06 07:00:00,6.0,0.88,18,70,12,0.0,0.0,0.0,Yes,3.0
1,2,69,Male,2021-12-05 02:00:00,2021-12-05 09:00:00,7.0,0.66,19,28,53,3.0,0.0,3.0,Yes,3.0
2,3,40,Female,2021-05-25 21:30:00,2021-05-25 05:30:00,8.0,0.89,20,70,10,1.0,0.0,0.0,No,3.0
3,4,40,Female,2021-11-03 02:30:00,2021-11-03 08:30:00,6.0,0.51,23,25,52,3.0,50.0,5.0,Yes,1.0
4,5,57,Male,2021-03-13 01:00:00,2021-03-13 09:00:00,8.0,0.76,27,55,18,3.0,0.0,3.0,No,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
447,448,27,Female,2021-11-13 22:00:00,2021-11-13 05:30:00,7.5,0.91,22,57,21,0.0,0.0,0.0,No,5.0
448,449,52,Male,2021-03-31 21:00:00,2021-03-31 03:00:00,6.0,0.74,28,57,15,4.0,25.0,0.0,No,3.0
449,450,40,Female,2021-09-07 23:00:00,2021-09-07 07:30:00,8.5,0.55,20,32,48,1.0,,3.0,Yes,0.0
450,451,45,Male,2021-07-29 21:00:00,2021-07-29 04:00:00,7.0,0.76,18,72,10,3.0,0.0,0.0,No,3.0


In [6]:
# Making all columns to be in lower case
sleep_df = sleep_df.rename(columns= lambda x: x.lower())
sleep_df

Unnamed: 0,id,age,gender,bedtime,wakeup_time,sleep duration,sleep_efficiency,rem_sleep_percentage,deep_sleep_percentage,light_sleep_percentage,awakenings,caffeine_consumption,alcohol_consumption,smoking_status,exercise_frequency
0,1,65,Female,2021-03-06 01:00:00,2021-03-06 07:00:00,6.0,0.88,18,70,12,0.0,0.0,0.0,Yes,3.0
1,2,69,Male,2021-12-05 02:00:00,2021-12-05 09:00:00,7.0,0.66,19,28,53,3.0,0.0,3.0,Yes,3.0
2,3,40,Female,2021-05-25 21:30:00,2021-05-25 05:30:00,8.0,0.89,20,70,10,1.0,0.0,0.0,No,3.0
3,4,40,Female,2021-11-03 02:30:00,2021-11-03 08:30:00,6.0,0.51,23,25,52,3.0,50.0,5.0,Yes,1.0
4,5,57,Male,2021-03-13 01:00:00,2021-03-13 09:00:00,8.0,0.76,27,55,18,3.0,0.0,3.0,No,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
447,448,27,Female,2021-11-13 22:00:00,2021-11-13 05:30:00,7.5,0.91,22,57,21,0.0,0.0,0.0,No,5.0
448,449,52,Male,2021-03-31 21:00:00,2021-03-31 03:00:00,6.0,0.74,28,57,15,4.0,25.0,0.0,No,3.0
449,450,40,Female,2021-09-07 23:00:00,2021-09-07 07:30:00,8.5,0.55,20,32,48,1.0,,3.0,Yes,0.0
450,451,45,Male,2021-07-29 21:00:00,2021-07-29 04:00:00,7.0,0.76,18,72,10,3.0,0.0,0.0,No,3.0


In [7]:
# Finding null entries in all columns
sleep_df.isnull().sum()

id                         0
age                        0
gender                     0
bedtime                    0
wakeup_time                0
sleep duration             0
sleep_efficiency           0
rem_sleep_percentage       0
deep_sleep_percentage      0
light_sleep_percentage     0
awakenings                20
caffeine_consumption      25
alcohol_consumption       14
smoking_status             0
exercise_frequency         6
dtype: int64

In [8]:
# Creating a function that finds the mean of a column, leaving out the null values
def mean_nonzero(column_name):
    mean_list_nonzero = []
    for i in sleep_df[column_name]:
        if i is not None and not pd.isna(i):
            mean_list_nonzero.append(i)
    if len(mean_list_nonzero) > 0:
        mean = sum(mean_list_nonzero) / len(mean_list_nonzero)
        rounded_mean = round(mean, 2) 
        return rounded_mean
    else:
        return None

In [9]:
# Appliying the function to find the mean in the columns where are null values
mean_awakenings = mean_nonzero('awakenings')
mean_caffeine_consumption = mean_nonzero('caffeine_consumption')
mean_alcohol_consumption = mean_nonzero('alcohol_consumption')
mean_exercise_frequency = mean_nonzero('exercise_frequency')
print ('mean_awakenings: ', mean_awakenings, \
      'mean_caffeine_consumption: ', mean_caffeine_consumption, \
      'mean_alcohol_consumption: ', mean_alcohol_consumption, \
      'mean_exercise_frequency: ', mean_exercise_frequency)

mean_awakenings:  1.64 mean_caffeine_consumption:  23.65 mean_alcohol_consumption:  1.17 mean_exercise_frequency:  1.79


In [10]:
# Using a lambda function to replace the null values for the mean
sleep_df['awakenings'] = \
sleep_df['awakenings'].apply(lambda x: mean_awakenings if x is None or pd.isna(x) else x)

In [11]:
sleep_df['caffeine_consumption'] = \
sleep_df['caffeine_consumption'].apply(lambda x: mean_alcohol_consumption if x is None or pd.isna(x) else x)

In [12]:
sleep_df['alcohol_consumption'] = \
sleep_df['alcohol_consumption'].apply(lambda x: mean_alcohol_consumption if x is None or pd.isna(x) else x)

In [13]:
sleep_df['exercise_frequency'] = \
sleep_df['exercise_frequency'].apply(lambda x: mean_exercise_frequency if x is None or pd.isna(x) else x)

In [14]:
# checking that all null values where replaced and there are not any more null values
sleep_df.isnull().sum()

id                        0
age                       0
gender                    0
bedtime                   0
wakeup_time               0
sleep duration            0
sleep_efficiency          0
rem_sleep_percentage      0
deep_sleep_percentage     0
light_sleep_percentage    0
awakenings                0
caffeine_consumption      0
alcohol_consumption       0
smoking_status            0
exercise_frequency        0
dtype: int64

In [15]:
# To do:
 # verify uniques in dummies
 # change categoricals to dummies
 # bins para wake up and bed time
 # change integers to floats