# Reading and Setting Up Data

In [None]:
import sys
from collections import Counter
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
import random
import warnings
warnings.filterwarnings("ignore")


from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix


In [None]:
df = pd.read_csv('Sleep_health_and_lifestyle_dataset.csv')
df['Sleep Disorder'] = df['Sleep Disorder'].fillna("No Disorder")
df = df.sample(frac = 1, random_state=23)

In [None]:
df.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
150,151,Female,39,Accountant,8.0,9,80,3,Normal Weight,115/78,67,7500,No Disorder
355,356,Female,58,Nurse,8.0,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
54,55,Male,32,Doctor,6.0,6,30,8,Normal,125/80,72,5000,No Disorder
102,103,Female,36,Teacher,7.2,8,60,4,Normal,115/75,68,7000,No Disorder
181,182,Male,42,Lawyer,7.8,8,90,5,Normal,130/85,70,8000,No Disorder


In [None]:
df["Sleep Disorder"].value_counts()

Sleep Disorder
No Disorder    219
Sleep Apnea     78
Insomnia        77
Name: count, dtype: int64

# Functions

In [None]:
class KNN:
    def __init__(self, X, y, k = 3):
        """ param X: pd.DataFrame
            param y: pd.Series
            param k: k nearest neighbors (int)

            Using Euclidean distance
        """
        self.X = pd.get_dummies(X).astype(int)
        self.y = y
        self.k = k

    def predict(self, X_test):
        X_test = pd.get_dummies(X_test).astype(int)
        predictions = []
        for index, row in X_test.iterrows():
            distance = np.sqrt(np.sum((row - self.X)**2, axis=1)) # euclidean distance metric
            knn = pd.Series(distance).sort_values().iloc[:self.k]
            label = self.y[knn.index].value_counts().idxmax()
            predictions.append(label)
        return pd.Series(predictions, index=X_test.index)

In [None]:
def accuracy(y_true, y_pred):
    return np.mean(y_true == y_pred)

# Model 1: Sleep

**Predictors Used:** Sleep Duration and Quality of Sleep

In [None]:
data_cv = pd.DataFrame()
n_folds = 5
import math
for i in range(1,n_folds+1):
            n_index = math.floor(len(df.index) / n_folds)
            if i != n_folds+1:
                data_test = df.iloc[(i-1)*n_index:(i*n_index)]
                index_list = data_test.index.values.tolist()
                D_exclude = df.index.isin(index_list)
                data_train = df[~D_exclude]
            else:
                data_test = df.iloc[(i-1)*n_index:]
                data_train = df.iloc[0:(i-1)*n_index-1]
            X_train = data_train.drop(columns = ["Sleep Disorder", "Person ID", "Gender", "Age", "Occupation", "Physical Activity Level", "Stress Level", "BMI Category", "Blood Pressure", "Heart Rate", "Daily Steps"], axis=1)
            X_test = data_test.drop(columns = ["Sleep Disorder", "Person ID", "Gender", "Age", "Occupation", "Physical Activity Level", "Stress Level", "BMI Category", "Blood Pressure", "Heart Rate", "Daily Steps"], axis=1)
            y_train = data_train['Sleep Disorder']
            knn_classifier_sleep = KNN(X_train, y_train, k=10)
            data_test['predict'] = knn_classifier_sleep.predict(X_test)
            data_cv = pd.concat([data_cv, data_test])
data_cv.dropna(subset=['predict'], inplace=True)
data_cv
predictions_sleep = data_cv["predict"]
y_test = data_cv['Sleep Disorder']
data_cv

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder,predict
101,102,Female,36,Teacher,7.2,8,60,4,Normal,115/75,68,7000,No Disorder,No Disorder
140,141,Female,38,Accountant,7.1,8,60,4,Normal,115/75,68,7000,No Disorder,No Disorder
74,75,Male,33,Doctor,6.0,6,30,8,Normal,125/80,72,5000,No Disorder,Sleep Apnea
333,334,Female,54,Engineer,8.4,9,30,3,Normal,125/80,65,5000,No Disorder,Sleep Apnea
251,252,Female,45,Teacher,6.8,7,30,6,Overweight,135/90,65,6000,Insomnia,Insomnia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,154,Male,39,Lawyer,7.2,8,60,5,Normal,130/85,68,8000,No Disorder,No Disorder
90,91,Male,35,Engineer,7.3,8,60,4,Normal,125/80,65,5000,No Disorder,No Disorder
39,40,Male,31,Doctor,7.6,7,75,6,Normal,120/80,70,8000,No Disorder,No Disorder
347,348,Female,57,Nurse,8.2,9,75,3,Overweight,140/95,68,7000,Sleep Apnea,No Disorder


#Evaluation Sleep

In [None]:
accuracy(y_test,predictions_sleep)

0.6297297297297297

In [None]:
y = data_cv['Sleep Disorder']
predictions_sleep.index = predictions_sleep.index
cmCols = y.unique()
cmCols.sort()
cm = pd.DataFrame(0, index=cmCols, columns=cmCols)
for true, pred in zip(y, predictions_sleep):
    cm.loc[true, pred] += 1


print(cm, "\n")

             Insomnia  No Disorder  Sleep Apnea
Insomnia           42           27            6
No Disorder        29          175           13
Sleep Apnea        13           49           16 



# Model 2: Physical Activity

**Predictors Used:** Physical Activity Level, BMI Category, Blood Pressure, Heart Rate, Daily Steps

In [None]:
data_cv = pd.DataFrame()
n_folds = 5
import math
for i in range(1,n_folds+1):
            n_index = math.floor(len(df.index) / n_folds)
            if i != n_folds+1:
                data_test = df.iloc[(i-1)*n_index:(i*n_index)]
                index_list = data_test.index.values.tolist()
                D_exclude = df.index.isin(index_list)
                data_train = df[~D_exclude]
            else:
                data_test = df.iloc[(i-1)*n_index:]
                data_train = df.iloc[0:(i-1)*n_index-1]
            X_train = data_train.drop(columns = ["Sleep Disorder", "Person ID", "Gender", "Age", "Occupation", "Stress Level", "Sleep Duration", "Quality of Sleep"], axis=1)
            X_test = data_test.drop(columns = ["Sleep Disorder", "Person ID", "Gender", "Age", "Occupation", "Stress Level", "Sleep Duration", "Quality of Sleep"], axis=1)
            y_train = data_train['Sleep Disorder']
            knn_classifier_sleep = KNN(X_train, y_train, k=10)
            data_test['predict'] = knn_classifier_sleep.predict(X_test)
            data_cv = pd.concat([data_cv, data_test])
data_cv.dropna(subset=['predict'], inplace=True)
data_cv
predictions_physical_activity = data_cv["predict"]
y_test = data_cv['Sleep Disorder']
data_cv

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder,predict
101,102,Female,36,Teacher,7.2,8,60,4,Normal,115/75,68,7000,No Disorder,No Disorder
140,141,Female,38,Accountant,7.1,8,60,4,Normal,115/75,68,7000,No Disorder,No Disorder
74,75,Male,33,Doctor,6.0,6,30,8,Normal,125/80,72,5000,No Disorder,No Disorder
333,334,Female,54,Engineer,8.4,9,30,3,Normal,125/80,65,5000,No Disorder,No Disorder
251,252,Female,45,Teacher,6.8,7,30,6,Overweight,135/90,65,6000,Insomnia,Insomnia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,154,Male,39,Lawyer,7.2,8,60,5,Normal,130/85,68,8000,No Disorder,No Disorder
90,91,Male,35,Engineer,7.3,8,60,4,Normal,125/80,65,5000,No Disorder,No Disorder
39,40,Male,31,Doctor,7.6,7,75,6,Normal,120/80,70,8000,No Disorder,No Disorder
347,348,Female,57,Nurse,8.2,9,75,3,Overweight,140/95,68,7000,Sleep Apnea,Sleep Apnea


# Evauation Physical Activity

In [None]:
accuracy(y_test, predictions_physical_activity)

0.8513513513513513

In [None]:
X_test = data_cv['Sleep Disorder']
predictions_physical_activity.index = X_test.index
cmCols = y.unique()
cmCols.sort()
cm = pd.DataFrame(0, index=cmCols, columns=cmCols)
for true, pred in zip(y, predictions_physical_activity):
    cm.loc[true, pred] += 1


print(cm, "\n")

             Insomnia  No Disorder  Sleep Apnea
Insomnia           64            9            2
No Disorder        11          191           15
Sleep Apnea         6           12           60 



# Model 3: Stress Level

**Predictors Used:** Stress Level

In [None]:
X = df.drop(columns = ["Sleep Disorder", "Person ID", "Gender", "Age", "Occupation", "Physical Activity Level", "Quality of Sleep", "BMI Category", "Blood Pressure", "Heart Rate", "Daily Steps", "Sleep Duration"], axis=1)
y = df["Sleep Disorder"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
data_cv = pd.DataFrame()
n_folds = 5
import math
for i in range(1,n_folds+1):
            n_index = math.floor(len(df.index) / n_folds)
            if i != n_folds+1:
                data_test = df.iloc[(i-1)*n_index:(i*n_index)]
                index_list = data_test.index.values.tolist()
                D_exclude = df.index.isin(index_list)
                data_train = df[~D_exclude]
            else:
                data_test = df.iloc[(i-1)*n_index:]
                data_train = df.iloc[0:(i-1)*n_index-1]
            X_train = data_train.drop(columns = ["Sleep Disorder", "Person ID", "Gender", "Age", "Occupation", "Physical Activity Level", "Quality of Sleep", "BMI Category", "Blood Pressure", "Heart Rate", "Daily Steps", "Sleep Duration"], axis=1)
            X_test = data_test.drop(columns = ["Sleep Disorder", "Person ID", "Gender", "Age", "Occupation", "Physical Activity Level", "Quality of Sleep", "BMI Category", "Blood Pressure", "Heart Rate", "Daily Steps", "Sleep Duration"], axis=1)
            y_train = data_train['Sleep Disorder']
            knn_classifier_stress = KNN(X_train, y_train, k=10)
            data_test['predict'] = knn_classifier_stress.predict(X_test)
            data_cv = pd.concat([data_cv, data_test])
data_cv.dropna(subset=['predict'], inplace=True)
data_cv
predictions_stress_level = data_cv["predict"]
y_test = data_cv['Sleep Disorder']
data_cv

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder,predict
101,102,Female,36,Teacher,7.2,8,60,4,Normal,115/75,68,7000,No Disorder,No Disorder
140,141,Female,38,Accountant,7.1,8,60,4,Normal,115/75,68,7000,No Disorder,No Disorder
74,75,Male,33,Doctor,6.0,6,30,8,Normal,125/80,72,5000,No Disorder,No Disorder
333,334,Female,54,Engineer,8.4,9,30,3,Normal,125/80,65,5000,No Disorder,Sleep Apnea
251,252,Female,45,Teacher,6.8,7,30,6,Overweight,135/90,65,6000,Insomnia,No Disorder
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,154,Male,39,Lawyer,7.2,8,60,5,Normal,130/85,68,8000,No Disorder,No Disorder
90,91,Male,35,Engineer,7.3,8,60,4,Normal,125/80,65,5000,No Disorder,No Disorder
39,40,Male,31,Doctor,7.6,7,75,6,Normal,120/80,70,8000,No Disorder,No Disorder
347,348,Female,57,Nurse,8.2,9,75,3,Overweight,140/95,68,7000,Sleep Apnea,No Disorder


# Evaluation Stress Level

In [None]:
accuracy(y_test, predictions_stress_level)

0.6675675675675675

In [None]:
X_test = data_cv['Sleep Disorder']
predictions_stress_level.index = X_test.index
cmCols = y.unique()
cmCols.sort()
cm = pd.DataFrame(0, index=cmCols, columns=cmCols)
for true, pred in zip(y, predictions_stress_level):
    cm.loc[true, pred] += 1


print(cm, "\n")

             Insomnia  No Disorder  Sleep Apnea
Insomnia           40           32            3
No Disorder         3          190           24
Sleep Apnea         6           55           17 

