# CS foreach Curriculum Workshop 10/24/2024: Introduction to AI/ML

This Jupyter Notebook is a supplemental demo for the Intro to AI/ML Workshop hosted on 10/24/2024.

In [128]:
# Import all relevant libraries
import pandas as pd
import numpy as np
import scipy as scp
from sklearn.preprocessing import OneHotEncoder

In [129]:
# Read the sleep statistics CSV into a DataFrame and preview its first five rows
sleep_data = pd.read_csv(filepath_or_buffer='Health_Sleep_Statistics.csv')
sleep_data.head(n=5)

Unnamed: 0,User ID,Age,Gender,Sleep Quality,Bedtime,Wake-up Time,Daily Steps,Calories Burned,Physical Activity Level,Dietary Habits,Sleep Disorders,Medication Usage
0,1,25,f,8,23:00,06:30,8000,2500,medium,healthy,no,no
1,2,34,m,7,00:30,07:00,5000,2200,low,unhealthy,yes,yes
2,3,29,f,9,22:45,06:45,9000,2700,high,healthy,no,no
3,4,41,m,5,01:00,06:30,4000,2100,low,unhealthy,yes,no
4,5,22,f,8,23:30,07:00,10000,2800,high,medium,no,no


# Data Encoding

We want to try and predict the Sleep Quality Score as found in the "Sleep Quality" column using the other variables that we have.

To start, let's perform One-Hot Encoding for the following columns: "Gender", "Physical Activity Level", "Dietary Habits", "Sleep Disorders", and "Medication Usage", so that all variables are represented by quantities.

In [130]:
# Initialize sklearn's One Hot Encoder with its default settings.
# This single `encoder` object is reused by the encoding cells that follow:
# each of them calls fit_transform again on a different column.
encoder = OneHotEncoder()

In [131]:
# Perform One-Hot Encoding on "Gender": re-fit the encoder on this one column
# and wrap the indicator matrix in a DataFrame with one column per category.
encoded_gender = encoder.fit_transform(sleep_data[['Gender']])
gender_df = pd.DataFrame(
    encoded_gender.toarray(),
    columns=encoder.get_feature_names_out(['Gender']),
    # Carry over sleep_data's row labels so index-aligned operations (such as
    # DataFrame.join) match rows correctly even when sleep_data does not have
    # a default 0..n-1 index.
    index=sleep_data.index,
)
# gender_df

In [132]:
# Perform One-Hot Encoding on "Physical Activity Level": re-fit the encoder on
# this one column and wrap the indicator matrix in a DataFrame with one column
# per category.
encoded_physical_activity = encoder.fit_transform(sleep_data[['Physical Activity Level']])
physical_activity_df = pd.DataFrame(
    encoded_physical_activity.toarray(),
    columns=encoder.get_feature_names_out(['Physical Activity Level']),
    # Carry over sleep_data's row labels so index-aligned operations (such as
    # DataFrame.join) match rows correctly even when sleep_data does not have
    # a default 0..n-1 index.
    index=sleep_data.index,
)
# physical_activity_df

In [133]:
# Perform One-Hot Encoding on "Dietary Habits": re-fit the encoder on this one
# column and wrap the indicator matrix in a DataFrame with one column per
# category.
encoded_dietary_habits = encoder.fit_transform(sleep_data[['Dietary Habits']])
dietary_habits_df = pd.DataFrame(
    encoded_dietary_habits.toarray(),
    columns=encoder.get_feature_names_out(['Dietary Habits']),
    # Carry over sleep_data's row labels so index-aligned operations (such as
    # DataFrame.join) match rows correctly even when sleep_data does not have
    # a default 0..n-1 index.
    index=sleep_data.index,
)
# dietary_habits_df

In [134]:
# Perform One-Hot Encoding on "Sleep Disorders": re-fit the encoder on this one
# column and wrap the indicator matrix in a DataFrame with one column per
# category.
encoded_sleep_disorders = encoder.fit_transform(sleep_data[['Sleep Disorders']])
sleep_disorders_df = pd.DataFrame(
    encoded_sleep_disorders.toarray(),
    columns=encoder.get_feature_names_out(['Sleep Disorders']),
    # Carry over sleep_data's row labels so index-aligned operations (such as
    # DataFrame.join) match rows correctly even when sleep_data does not have
    # a default 0..n-1 index.
    index=sleep_data.index,
)
# sleep_disorders_df

In [135]:
# Perform One-Hot Encoding on "Medication Usage": re-fit the encoder on this
# one column and wrap the indicator matrix in a DataFrame with one column per
# category.
encoded_medication_usage = encoder.fit_transform(sleep_data[['Medication Usage']])
medication_usage_df = pd.DataFrame(
    encoded_medication_usage.toarray(),
    columns=encoder.get_feature_names_out(['Medication Usage']),
    # Carry over sleep_data's row labels so index-aligned operations (such as
    # DataFrame.join) match rows correctly even when sleep_data does not have
    # a default 0..n-1 index.
    index=sleep_data.index,
)
# medication_usage_df

In [136]:
# Attach each one-hot DataFrame to the original data, one index-based join at a
# time, in the same order the source columns were encoded.
one_hot_frames = [
    gender_df,
    physical_activity_df,
    dietary_habits_df,
    sleep_disorders_df,
    medication_usage_df,
]
encoded_sleep_data = sleep_data
for one_hot_frame in one_hot_frames:
    encoded_sleep_data = encoded_sleep_data.join(one_hot_frame)
# encoded_sleep_data

Let's drop the original columns that we just performed One-Hot Encoding on, since we won't really need them anymore.

In [137]:
# Remove the categorical source columns that were one-hot encoded above
one_hot_source_columns = [
    'Gender',
    'Physical Activity Level',
    'Dietary Habits',
    'Sleep Disorders',
    'Medication Usage',
]
encoded_sleep_data = encoded_sleep_data.drop(labels=one_hot_source_columns, axis='columns')
# encoded_sleep_data

Let's also index by User ID:

In [138]:
# Use "User ID" as the row index; the column is moved out of the data columns
encoded_sleep_data = encoded_sleep_data.set_index(keys='User ID', drop=True)
# encoded_sleep_data

The only columns that still need to be converted to numeric quantities are "Bedtime" and "Wake-up Time". We can convert these clock times to minutes since midnight and build a model on that. Later, we can also try converting them to seconds to see whether the model improves.

In [139]:
def convert_to_minutes(time):
    """Convert a colon-separated clock time string to minutes since midnight.

    Parameters
    ----------
    time : str
        Time text whose first two ':'-separated fields are the hour and the
        minute, e.g. '23:00'. Any further fields are ignored.

    Returns
    -------
    int
        hour * 60 + minute, e.g. '06:45' -> 405.
    """
    pieces = time.split(':')
    hours = int(pieces[0])
    leftover_minutes = int(pieces[1])
    return hours * 60 + leftover_minutes

In [142]:
# Convert the "Bedtime" and "Wake-up Time" strings to minutes since midnight.
# NOTE(review): on this scale a bedtime just after midnight (e.g. 00:30 -> 30)
# lands far away from a late-evening one (e.g. 23:00 -> 1380); confirm that is
# acceptable for the model, or consider a wrap-around encoding.
bedtime_strings = encoded_sleep_data['Bedtime']
wakeup_strings = encoded_sleep_data['Wake-up Time']
bedtime_in_minutes = bedtime_strings.apply(convert_to_minutes)
wakeuptime_in_minutes = wakeup_strings.apply(convert_to_minutes)

In [143]:
# Store the minute-based times as new columns on the encoded DataFrame
minute_columns = {
    'Bedtime - Min': bedtime_in_minutes,
    'Wake-up Time - Min': wakeuptime_in_minutes,
}
for column_name, minute_values in minute_columns.items():
    encoded_sleep_data[column_name] = minute_values
# encoded_sleep_data

We'll also drop the original columns for Bedtime and Wake-up Time:

In [144]:
# Remove the original clock-time string columns, then display the result
time_string_columns = ['Bedtime', 'Wake-up Time']
encoded_sleep_data = encoded_sleep_data.drop(labels=time_string_columns, axis='columns')
encoded_sleep_data

Unnamed: 0_level_0,Age,Sleep Quality,Daily Steps,Calories Burned,Gender_f,Gender_m,Physical Activity Level_high,Physical Activity Level_low,Physical Activity Level_medium,Dietary Habits_healthy,Dietary Habits_medium,Dietary Habits_unhealthy,Sleep Disorders_no,Sleep Disorders_yes,Medication Usage_no,Medication Usage_yes,Bedtime - Min,Wake-up Time - Min
User ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,25,8,8000,2500,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1380,390
2,34,7,5000,2200,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,30,420
3,29,9,9000,2700,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1365,405
4,41,5,4000,2100,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,60,390
5,22,8,10000,2800,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1410,420
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,43,7,6500,2400,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,45,435
97,33,8,8500,2600,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1395,375
98,46,4,3000,2000,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,90,420
99,25,9,9500,2700,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1335,405


Everything is quantified now! We can start creating the model.