## Data Loading and Preprocessing

In [1]:
# Colab-only: opens the browser file-upload widget. Later cells index `uploaded`
# by filename and wrap each value in io.BytesIO, i.e. it maps filename -> raw bytes.
from google.colab import files
uploaded = files.upload()


Saving 01_Steps.csv to 01_Steps.csv
Saving 02_Sleep.csv to 02_Sleep.csv


In [9]:
import pandas as pd
import io

# Load the single-user steps and sleep datasets from the uploaded bytes
steps_single = pd.read_csv(io.BytesIO(uploaded['01_Steps.csv']))
sleep_single = pd.read_csv(io.BytesIO(uploaded['02_Sleep.csv']))

# Check the column names for both datasets ('date' is the column they share)
steps_single.columns, sleep_single.columns


(Index(['date', 'steps', 'distance', 'runDistance', 'calories'], dtype='object'),
 Index(['date', 'deepSleepTime', 'shallowSleepTime', 'wakeTime', 'start',
        'stop'],
       dtype='object'))

In [10]:
# Parse the 'date' column of each frame into datetimes so both sides of the
# join use the same key type (the converted column is written back in place).
for frame in (steps_single, sleep_single):
    frame['date'] = pd.to_datetime(frame['date'])

# Inner join on 'date': keep only the days present in both the steps and sleep data
merged_single_user = steps_single.merge(sleep_single, on='date', how='inner')

# Preview the merged dataset
merged_single_user.head()



Unnamed: 0,date,steps,distance,runDistance,calories,deepSleepTime,shallowSleepTime,wakeTime,start,stop
0,2016-04-27,4948,3242,46,281,0,0,0,1461708000,1461708000
1,2016-04-28,16573,12060,79,751,158,262,2,1461801240,1461826560
2,2016-04-29,18002,12916,29,737,234,241,0,1461883500,1461912000
3,2016-04-30,4126,2981,11,207,239,318,0,1461979500,1462012920
4,2016-05-01,3869,2651,0,244,180,333,3,1462065840,1462096800


In [12]:
# Save the merged dataset (a table, not a model) to CSV without the index column.
# NOTE(review): merged_single_user is already the DataFrame returned by pd.merge,
# so the pd.DataFrame(...) wrapper looks redundant; `df` is kept in case other cells use it.
df = pd.DataFrame(merged_single_user)
df.to_csv('merged_single_user.csv', index=False)

In [13]:
# Count the missing values in each column of the merged single-user dataset
missing_values_single_user = merged_single_user.isna().sum()

# Show the per-column counts
missing_values_single_user


date                0
steps               0
distance            0
runDistance         0
calories            0
deepSleepTime       0
shallowSleepTime    0
wakeTime            0
start               0
stop                0
dtype: int64

In [18]:
# NOTE: remove_outliers must already be defined when this cell runs; in this
# notebook its definition sits in the following cell, so run that cell first.

# Numeric columns of merged_single_user that are screened for outliers
columns_to_check_single_user = [
    'steps', 'distance', 'runDistance', 'calories',
    'deepSleepTime', 'shallowSleepTime', 'wakeTime',
]

# Drop every row that falls outside the IQR bounds in at least one of those columns
cleaned_single_user = remove_outliers(merged_single_user, columns_to_check_single_user)

# Shapes before and after outlier removal
original_shape_single_user, cleaned_shape_single_user = (
    merged_single_user.shape,
    cleaned_single_user.shape,
)

original_shape_single_user, cleaned_shape_single_user


((2454, 10), (1643, 10))

In [17]:
# Function to detect and remove outliers using IQR
def remove_outliers(df, column_list, iqr_multiplier: float = 1.5):
    """Return a copy of ``df`` without the rows that are IQR outliers.

    For every column in ``column_list`` the quartiles Q1 and Q3 are computed on
    the *original* ``df`` (not on the progressively filtered data), and a row is
    kept only if its value lies inside
    ``[Q1 - iqr_multiplier * IQR, Q3 + iqr_multiplier * IQR]`` (bounds inclusive)
    for all listed columns. Rows holding NaN in a listed column are dropped,
    because NaN never satisfies the range check.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is never modified.
    column_list : iterable of str
        Names of the numeric columns to screen. An empty list returns a plain copy.
    iqr_multiplier : float, default 1.5
        Width of the accepted band in units of the inter-quartile range.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the retained rows, with their original index labels.

    Raises
    ------
    KeyError
        If a name in ``column_list`` is not a column of ``df``.
    """
    keep = None  # combined boolean mask over df's rows; None until a column is processed
    for column in column_list:
        q1 = df[column].quantile(0.25)
        q3 = df[column].quantile(0.75)
        spread = iqr_multiplier * (q3 - q1)

        # between() is inclusive on both ends, matching the >= / <= comparisons
        in_range = df[column].between(q1 - spread, q3 + spread)
        keep = in_range if keep is None else keep & in_range

    if keep is None:
        return df.copy()
    # Explicit copy so callers can add columns without SettingWithCopyWarning
    return df[keep].copy()



In [21]:
# Weights of the score: deep sleep counts most, shallow sleep less,
# and wake time is subtracted as a penalty.
w1, w2, w3 = 0.6, 0.3, 0.1

# sleepQualityScore = w1*deepSleepTime + w2*shallowSleepTime - w3*wakeTime,
# stored as a new column on merged_single_user
merged_single_user['sleepQualityScore'] = (
    merged_single_user['deepSleepTime'].mul(w1)
    .add(merged_single_user['shallowSleepTime'].mul(w2))
    .sub(merged_single_user['wakeTime'].mul(w3))
)

# Preview the dataset with the new sleepQualityScore column
merged_single_user.head()


Unnamed: 0,date,steps,distance,runDistance,calories,deepSleepTime,shallowSleepTime,wakeTime,start,stop,sleepQualityScore
0,2016-04-27,4948,3242,46,281,0,0,0,1461708000,1461708000,0.0
1,2016-04-28,16573,12060,79,751,158,262,2,1461801240,1461826560,173.2
2,2016-04-29,18002,12916,29,737,234,241,0,1461883500,1461912000,212.7
3,2016-04-30,4126,2981,11,207,239,318,0,1461979500,1462012920,238.8
4,2016-05-01,3869,2651,0,244,180,333,3,1462065840,1462096800,207.6


## Feature Selection, Model Training and Evaluation



In [37]:
# Add new attribute 'bedTime': the row-wise total of the deep, shallow and wake columns
merged_single_user['bedTime'] = (
    merged_single_user['deepSleepTime']
    + merged_single_user['shallowSleepTime']
    + merged_single_user['wakeTime']
)

In [40]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np


# One seed for the split and for every stochastic estimator, so that re-running
# the notebook reproduces the same metrics.
RANDOM_STATE = 42

# NOTE(review): possible target leakage — sleepQualityScore is a weighted sum of
# deepSleepTime / shallowSleepTime / wakeTime, and bedTime is the plain sum of the
# same three columns, so the high R2 may largely reflect that construction.
features = ['steps', 'distance', 'calories', 'bedTime']
target = 'sleepQualityScore'
X = merged_single_user[features]
y = merged_single_user[target]


# Split data (80 % train / 20 % test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)


# Initialize models; the tree-based estimators are randomized, so they get the seed too
models = {
  'Linear Regression': LinearRegression(),
  'Decision Trees': DecisionTreeRegressor(random_state=RANDOM_STATE),
  'Random Forest': RandomForestRegressor(random_state=RANDOM_STATE),
  'Gradient Boosting': GradientBoostingRegressor(random_state=RANDOM_STATE)
}


# Initialize a dictionary to hold the results: model name -> {'RMSE', 'R2 Score'}
model_results = {}


# Train each model on the training split and score it on the held-out test split
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    model_results[model_name] = {'RMSE': rmse, 'R2 Score': r2}


print(model_results)

{'Linear Regression': {'RMSE': 15.165124957030196, 'R2 Score': 0.9608186283403994}, 'Decision Trees': {'RMSE': 21.95935655073253, 'R2 Score': 0.9178463487447467}, 'Random Forest': {'RMSE': 16.156267857569684, 'R2 Score': 0.955529732791895}, 'Gradient Boosting': {'RMSE': 14.935065786833874, 'R2 Score': 0.9619983958891949}}
