In [2]:
from google.colab import files
uploaded = files.upload()

Saving dailyActivity_merged.csv to dailyActivity_merged.csv
Saving dailyCalories_merged.csv to dailyCalories_merged.csv
Saving dailySteps_merged.csv to dailySteps_merged.csv
Saving sleepDay_merged.csv to sleepDay_merged.csv


In [7]:
import pandas as pd
import io

# Load the datasets for the 30-users
daily_steps = pd.read_csv(io.BytesIO(uploaded['dailySteps_merged.csv']))
sleep_day = pd.read_csv(io.BytesIO(uploaded['sleepDay_merged.csv']))
daily_calories = pd.read_csv(io.BytesIO(uploaded['dailyCalories_merged.csv']))
daily_activity = pd.read_csv(io.BytesIO(uploaded['dailyActivity_merged.csv']))

# Check the column names for both datasets
daily_steps.columns, sleep_day.columns, daily_calories.columns, daily_activity.columns

(Index(['Id', 'ActivityDay', 'StepTotal'], dtype='object'),
 Index(['Id', 'SleepDay', 'TotalSleepRecords', 'TotalMinutesAsleep',
        'TotalTimeInBed'],
       dtype='object'),
 Index(['Id', 'ActivityDay', 'Calories'], dtype='object'),
 Index(['Id', 'ActivityDate', 'TotalSteps', 'TotalDistance', 'TrackerDistance',
        'LoggedActivitiesDistance', 'VeryActiveDistance',
        'ModeratelyActiveDistance', 'LightActiveDistance',
        'SedentaryActiveDistance', 'VeryActiveMinutes', 'FairlyActiveMinutes',
        'LightlyActiveMinutes', 'SedentaryMinutes', 'Calories'],
       dtype='object'))

In [None]:
# Convert the date fields to datetime
daily_activity['ActivityDate'] = pd.to_datetime(daily_activity['ActivityDate'])
daily_calories['ActivityDay'] = pd.to_datetime(daily_calories['ActivityDay'])
daily_steps['ActivityDay'] = pd.to_datetime(daily_steps['ActivityDay'])
sleep_day['SleepDay'] = pd.to_datetime(sleep_day['SleepDay'].str.rstrip(), format='%m/%d/%Y %I:%M:%S %p')

In [18]:
# Merge the datasets on 'Id' and date field
merged_data = daily_activity.merge(daily_calories, left_on=['Id', 'ActivityDate'], right_on=['Id', 'ActivityDay'], how='inner')
merged_data = merged_data.merge(daily_steps, left_on=['Id', 'ActivityDate'], right_on=['Id', 'ActivityDay'], how='inner')
merged_data = merged_data.merge(sleep_day, left_on=['Id', 'ActivityDate'], right_on=['Id', 'SleepDay'], how='inner')

# Dropping duplicate columns from the merge (duplicate date columns)
merged_data = merged_data.loc[:,~merged_data.columns.duplicated()]

selected_columns = {
    'Id': 'id',
    'ActivityDate': 'date',
    'TotalSteps': 'steps',
    'TotalDistance': 'distance',
    'Calories_x': 'calories',
    'TotalMinutesAsleep': 'asleepTime',
    'TotalTimeInBed': 'bedTime'
}

# Creating the new dataset with the selected columns
merged_30_users = merged_data[list(selected_columns.keys())].rename(columns=selected_columns)

merged_30_users.head()


Unnamed: 0,id,date,steps,distance,calories,asleepTime,bedTime
0,1503960366,2016-04-12,13162,8.5,1985,327,346
1,1503960366,2016-04-13,10735,6.97,1797,384,407
2,1503960366,2016-04-15,9762,6.28,1745,412,442
3,1503960366,2016-04-16,12669,8.16,1863,340,367
4,1503960366,2016-04-17,9705,6.48,1728,700,712


In [19]:
# Save merged model
df = pd.DataFrame(merged_30_users)
df.to_csv('merged_30_users.csv', index=False)

In [20]:
# Checking for missing values in the merged_30_users dataset
missing_values_30_users = merged_30_users.isnull().sum()

# Display the number of missing values per column
missing_values_30_users


id            0
date          0
steps         0
distance      0
calories      0
asleepTime    0
bedTime       0
dtype: int64

In [21]:
# Function to detect and remove outliers using IQR
def remove_outliers(df, column_list):
    cleaned_df = df.copy()
    for column in column_list:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Filtering the non-outlier values
        cleaned_df = cleaned_df[(cleaned_df[column] >= lower_bound) & (cleaned_df[column] <= upper_bound)]

    return cleaned_df

# Columns to check for outliers in merged_30_users
columns_to_check_30_users = ['steps', 'distance', 'calories', 'asleepTime', 'bedTime']

# Removing outliers
cleaned_30_users = remove_outliers(merged_30_users, columns_to_check_30_users)

# Comparing the shape of the datasets before and after outlier removal
original_shape_30_users = merged_30_users.shape
cleaned_shape_30_users = cleaned_30_users.shape

original_shape_30_users, cleaned_shape_30_users


((413, 7), (377, 7))

In [22]:
# Set the weights
w1, w2, w3 = 0.6, 0.3, 0.1

# Calculate WakeTime for merged_30_users dataset
merged_30_users['wakeTime'] = merged_30_users['bedTime'] - merged_30_users['asleepTime']

# For merged_30_users dataset, assuming TotalMinutesAsleep as the sum of deep and shallow sleep times
merged_30_users['sleepQualityScore'] = (
    w1 * merged_30_users['asleepTime'] * 0.4 + # Assuming 50% of sleep is deep sleep
    w2 * merged_30_users['asleepTime'] * 0.6 - # Assuming 50% of sleep is shallow sleep
    w3 * merged_30_users['wakeTime']
)

# Display the first few rows of the datasets with sleepQualityScore
merged_30_users.head()


Unnamed: 0,id,date,steps,distance,calories,asleepTime,bedTime,wakeTime,sleepQualityScore
0,1503960366,2016-04-12,13162,8.5,1985,327,346,19,135.44
1,1503960366,2016-04-13,10735,6.97,1797,384,407,23,158.98
2,1503960366,2016-04-15,9762,6.28,1745,412,442,30,170.04
3,1503960366,2016-04-16,12669,8.16,1863,340,367,27,140.1
4,1503960366,2016-04-17,9705,6.48,1728,700,712,12,292.8


In [26]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

models = {
    'Linear Regression': LinearRegression(),
    'Decision Trees': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor()
}

features_multi = ['steps', 'distance', 'calories', 'bedTime']
target_multi = 'sleepQualityScore'
X_multi = merged_30_users[features_multi]
y_multi = merged_30_users[target_multi]
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(X_multi, y_multi, test_size=0.2, random_state=42)


# Train and evaluate the models
model_results = {}

for model_name, model in models.items():
    model.fit(X_train_multi, y_train_multi)
    y_pred_multi = model.predict(X_test_multi)
    rmse_multi = np.sqrt(mean_squared_error(y_test_multi, y_pred_multi))
    r2_multi = r2_score(y_test_multi, y_pred_multi)
    model_results[model_name] = {'RMSE': rmse_multi, 'R2 Score': r2_multi}


print(model_results)

{'Linear Regression': {'RMSE': 26.436514626639344, 'R2 Score': 0.6440710672778196}, 'Decision Trees': {'RMSE': 19.14783450589283, 'R2 Score': 0.813278542870863}, 'Random Forest': {'RMSE': 16.454432872384483, 'R2 Score': 0.8621138219625671}, 'Gradient Boosting': {'RMSE': 15.063663056551155, 'R2 Score': 0.8844377249694253}}
