# CS foreach Curriculum Workshop 10/24/2024: Introduction to AI/ML

This Jupyter Notebook is a supplemental demo to the Intro to AI/ML Workshop hosted on 10/24/2024. It aims to create a simple linear regression model to predict sleep quality on a scale of 1-10 based on a number of factors.

In [355]:
# Import all relevant libraries
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import OneHotEncoder

If you are using Google Colab to run this notebook, please un-comment the code cell below. Download the `Health_Sleep_Statistics.csv` file to your computer, run the cell below, and you will have the option to upload the file to this notebook.

In [335]:
# from google.colab import files
# uploaded = files.upload()

In [336]:
# Load the sleep data
sleep_data = pd.read_csv('Health_Sleep_Statistics.csv')
sleep_data.head()

Unnamed: 0,User ID,Age,Gender,Sleep Quality,Bedtime,Wake-up Time,Daily Steps,Calories Burned,Physical Activity Level,Dietary Habits,Sleep Disorders,Medication Usage
0,1,25,f,8,23:00,06:30,8000,2500,medium,healthy,no,no
1,2,34,m,7,00:30,07:00,5000,2200,low,unhealthy,yes,yes
2,3,29,f,9,22:45,06:45,9000,2700,high,healthy,no,no
3,4,41,m,5,01:00,06:30,4000,2100,low,unhealthy,yes,no
4,5,22,f,8,23:30,07:00,10000,2800,high,medium,no,no


# Data Encoding

We want to try and predict the Sleep Quality Score as found in the "Sleep Quality" column using the other variables that we have.

To start, let's perform One-Hot Encoding for the following columns: "Gender", "Physical Activity Level", "Dietary Habits", "Sleep Disorders", and "Medication Usage", so that all variables are represented by quantities. 

In [337]:
# Initialize sklearn's One Hot Encoder
encoder = OneHotEncoder()

In [338]:
# Perform One-Hot Encoding on "Gender"
encoded_gender = encoder.fit_transform(sleep_data[['Gender']])
gender_df = pd.DataFrame(encoded_gender.toarray(), columns=encoder.get_feature_names_out(['Gender']))
# gender_df

In [339]:
# Perform One-Hot Encoding on "Physical Activity Level"
encoded_physical_activity = encoder.fit_transform(sleep_data[['Physical Activity Level']])
physical_activity_df = pd.DataFrame(encoded_physical_activity.toarray(), columns=encoder.get_feature_names_out(['Physical Activity Level']))
# physical_activities_df

In [340]:
# Perform One-Hot Encoding on "Dietary Habits"
encoded_dietary_habits = encoder.fit_transform(sleep_data[['Dietary Habits']])
dietary_habits_df = pd.DataFrame(encoded_dietary_habits.toarray(), columns=encoder.get_feature_names_out(['Dietary Habits']))
# dietary_habits_df

In [341]:
# Perform One-Hot Encoding on "Sleep Disorders"
encoded_sleep_disorders = encoder.fit_transform(sleep_data[['Sleep Disorders']])
sleep_disorders_df = pd.DataFrame(encoded_sleep_disorders.toarray(), columns=encoder.get_feature_names_out(['Sleep Disorders']))

In [342]:
# Perform One-Hot Encoding on "Medication Usage"
encoded_medication_usage = encoder.fit_transform(sleep_data[['Medication Usage']])
medication_usage_df = pd.DataFrame(encoded_medication_usage.toarray(), columns=encoder.get_feature_names_out(['Medication Usage']))
# medication_usage_df

In [343]:
# Join all of the One-Hot encoded data together
encoded_sleep_data = (sleep_data
                      .join(gender_df)
                      .join(physical_activity_df)
                      .join(dietary_habits_df)
                      .join(sleep_disorders_df)
                      .join(medication_usage_df))
# encoded_sleep_data

Let's drop the names of the columns that we just performed One-Hot Encoding on, since we won't really need them anymore.

In [344]:
encoded_sleep_data = encoded_sleep_data.drop(columns=['Gender', 'Physical Activity Level', 'Dietary Habits', 'Sleep Disorders', 'Medication Usage'])
# encoded_sleep_data

Let's also index by User ID:

In [345]:
encoded_sleep_data = encoded_sleep_data.set_index('User ID')
# encoded_sleep_data

The only columns that need to be converted to viable quantities are "Bedtime" and "Wake-up Time". We can convert these times to minutes and create a model based on that.

In [346]:
def convert_to_minutes(time):
    time_components = time.split(':')
    minutes = int(time_components[0]) * 60 + int(time_components[1])
    return minutes

In [347]:
# Convert bedtimes and wake-up times to minutes
bedtime_in_minutes = encoded_sleep_data['Bedtime'].apply(convert_to_minutes)
wakeuptime_in_minutes = encoded_sleep_data['Wake-up Time'].apply(convert_to_minutes)

In [348]:
# Add them to the encoded sleep data DataFrame
encoded_sleep_data['Bedtime - Min'] = bedtime_in_minutes
encoded_sleep_data['Wake-up Time - Min'] = wakeuptime_in_minutes

We'll also drop the original columns for Bedtime and Wake-up Time:

In [349]:
encoded_sleep_data = encoded_sleep_data.drop(columns=['Bedtime', 'Wake-up Time'])
encoded_sleep_data

Unnamed: 0_level_0,Age,Sleep Quality,Daily Steps,Calories Burned,Gender_f,Gender_m,Physical Activity Level_high,Physical Activity Level_low,Physical Activity Level_medium,Dietary Habits_healthy,Dietary Habits_medium,Dietary Habits_unhealthy,Sleep Disorders_no,Sleep Disorders_yes,Medication Usage_no,Medication Usage_yes,Bedtime - Min,Wake-up Time - Min
User ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,25,8,8000,2500,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1380,390
2,34,7,5000,2200,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,30,420
3,29,9,9000,2700,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1365,405
4,41,5,4000,2100,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,60,390
5,22,8,10000,2800,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1410,420
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,43,7,6500,2400,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,45,435
97,33,8,8500,2600,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1395,375
98,46,4,3000,2000,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,90,420
99,25,9,9500,2700,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1335,405


Everything is quantified now! We can start creating the model. Also, use this spot as a checkpoint if anything goes wrong after this line. 

In [350]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [351]:
sleep_scores = encoded_sleep_data['Sleep Quality']

In [352]:
data_no_scores = encoded_sleep_data.drop(['Sleep Quality'], axis=1)
data_no_scores

Unnamed: 0_level_0,Age,Daily Steps,Calories Burned,Gender_f,Gender_m,Physical Activity Level_high,Physical Activity Level_low,Physical Activity Level_medium,Dietary Habits_healthy,Dietary Habits_medium,Dietary Habits_unhealthy,Sleep Disorders_no,Sleep Disorders_yes,Medication Usage_no,Medication Usage_yes,Bedtime - Min,Wake-up Time - Min
User ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,25,8000,2500,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1380,390
2,34,5000,2200,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,30,420
3,29,9000,2700,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1365,405
4,41,4000,2100,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,60,390
5,22,10000,2800,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1410,420
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,43,6500,2400,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,45,435
97,33,8500,2600,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1395,375
98,46,3000,2000,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,90,420
99,25,9500,2700,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1335,405


In [365]:
# Split into training data and testing data
train_sleep_data, test_sleep_data, train_sleep_score, test_sleep_score = train_test_split(data_no_scores, sleep_scores, train_size=90, random_state=random.randint(1, 42))

In [366]:
model = LinearRegression().fit(train_sleep_data, train_sleep_score)
print('The model fits the data about ' + str(model.score(train_sleep_data, train_sleep_score) * 100) + '% well!') # fitting, not accuracy

The model fits the data about 96.98034062421816% well!


In [367]:
print('The model fits the data ' + str(model.score(test_sleep_data, test_sleep_score) * 100) + '% well!') # fitting, not accuracy

The model fits the data 98.39410155131372% well!


In [368]:
test_sleep_data

Unnamed: 0_level_0,Age,Daily Steps,Calories Burned,Gender_f,Gender_m,Physical Activity Level_high,Physical Activity Level_low,Physical Activity Level_medium,Dietary Habits_healthy,Dietary Habits_medium,Dietary Habits_unhealthy,Sleep Disorders_no,Sleep Disorders_yes,Medication Usage_no,Medication Usage_yes,Bedtime - Min,Wake-up Time - Min
User ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
21,29,9000,2600,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1395,390
92,36,5000,2200,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,30,420
35,28,10000,2750,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1365,420
53,27,8500,2600,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1380,420
9,27,9500,2750,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1380,450
75,26,9500,2700,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1335,405
22,35,4000,2100,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,60,390
89,30,8500,2600,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1395,375
81,31,8500,2600,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1395,375
90,47,3000,2000,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,90,420


In [369]:
model.coef_

array([ 1.11399226e-03,  3.48396315e-04,  1.64009789e-03,  2.15295268e-01,
       -2.15295268e-01,  4.12548667e-02, -1.29784995e-01,  8.85301285e-02,
        2.63569273e-01, -4.02792920e-02, -2.23289981e-01,  1.29784995e-01,
       -1.29784995e-01,  4.47274492e-02, -4.47274492e-02, -2.29097649e-04,
        4.02874489e-03])

In [370]:
input_matrix = test_sleep_data.to_numpy()
input_matrix

array([[2.900e+01, 9.000e+03, 2.600e+03, 1.000e+00, 0.000e+00, 1.000e+00,
        0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 0.000e+00, 1.000e+00,
        0.000e+00, 1.000e+00, 0.000e+00, 1.395e+03, 3.900e+02],
       [3.600e+01, 5.000e+03, 2.200e+03, 0.000e+00, 1.000e+00, 0.000e+00,
        0.000e+00, 1.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 1.000e+00,
        0.000e+00, 1.000e+00, 0.000e+00, 3.000e+01, 4.200e+02],
       [2.800e+01, 1.000e+04, 2.750e+03, 1.000e+00, 0.000e+00, 0.000e+00,
        0.000e+00, 1.000e+00, 1.000e+00, 0.000e+00, 0.000e+00, 1.000e+00,
        0.000e+00, 1.000e+00, 0.000e+00, 1.365e+03, 4.200e+02],
       [2.700e+01, 8.500e+03, 2.600e+03, 1.000e+00, 0.000e+00, 0.000e+00,
        0.000e+00, 1.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 1.000e+00,
        0.000e+00, 1.000e+00, 0.000e+00, 1.380e+03, 4.200e+02],
       [2.700e+01, 9.500e+03, 2.750e+03, 1.000e+00, 0.000e+00, 0.000e+00,
        0.000e+00, 1.000e+00, 1.000e+00, 0.000e+00, 0.000e+00, 1.000e+00,
    

In [371]:
predictions = np.dot(input_matrix, np.transpose(model.coef_)).astype(int)
predictions

array([ 9,  6, 10,  9, 10,  9,  5,  8,  8,  5])

In [372]:
test_sleep_score

User ID
21    8
92    6
35    9
53    8
9     9
75    9
22    5
89    8
81    8
90    4
Name: Sleep Quality, dtype: int64

In [373]:
accuracy = np.mean(predictions == test_sleep_score)
print('The model has a ' + str(accuracy * 100) + '% accuracy!')

The model has a 50.0% accuracy!
