In [420]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [421]:
df = pd.read_csv('Sleep_health_and_lifestyle_dataset.csv')
df.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


In [422]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                374 non-null    int64  
 1   Gender                   374 non-null    object 
 2   Age                      374 non-null    int64  
 3   Occupation               374 non-null    object 
 4   Sleep Duration           374 non-null    float64
 5   Quality of Sleep         374 non-null    int64  
 6   Physical Activity Level  374 non-null    int64  
 7   Stress Level             374 non-null    int64  
 8   BMI Category             374 non-null    object 
 9   Blood Pressure           374 non-null    object 
 10  Heart Rate               374 non-null    int64  
 11  Daily Steps              374 non-null    int64  
 12  Sleep Disorder           155 non-null    object 
dtypes: float64(1), int64(7), object(5)
memory usage: 38.1+ KB


In [423]:
df.describe()

Unnamed: 0,Person ID,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,Heart Rate,Daily Steps
count,374.0,374.0,374.0,374.0,374.0,374.0,374.0,374.0
mean,187.5,42.184492,7.132086,7.312834,59.171123,5.385027,70.165775,6816.84492
std,108.108742,8.673133,0.795657,1.196956,20.830804,1.774526,4.135676,1617.915679
min,1.0,27.0,5.8,4.0,30.0,3.0,65.0,3000.0
25%,94.25,35.25,6.4,6.0,45.0,4.0,68.0,5600.0
50%,187.5,43.0,7.2,7.0,60.0,5.0,70.0,7000.0
75%,280.75,50.0,7.8,8.0,75.0,7.0,72.0,8000.0
max,374.0,59.0,8.5,9.0,90.0,8.0,86.0,10000.0


In [424]:
df.drop('Person ID', axis=1, inplace=True)

In [425]:

df['Sleep Disorder'].fillna('Normal', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Sleep Disorder'].fillna('Normal', inplace=True)


In [426]:
df.isnull().sum()

Unnamed: 0,0
Gender,0
Age,0
Occupation,0
Sleep Duration,0
Quality of Sleep,0
Physical Activity Level,0
Stress Level,0
BMI Category,0
Blood Pressure,0
Heart Rate,0


In [427]:
df['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [428]:
df['Occupation'].unique()

array(['Software Engineer', 'Doctor', 'Sales Representative', 'Teacher',
       'Nurse', 'Engineer', 'Accountant', 'Scientist', 'Lawyer',
       'Salesperson', 'Manager'], dtype=object)

In [429]:
df['BMI Category'].unique()

array(['Overweight', 'Normal', 'Obese', 'Normal Weight'], dtype=object)

In [430]:
df['Quality of Sleep'].unique()

array([6, 4, 7, 5, 8, 9])

In [431]:
df['Blood Pressure'].unique()

array(['126/83', '125/80', '140/90', '120/80', '132/87', '130/86',
       '117/76', '118/76', '128/85', '131/86', '128/84', '115/75',
       '135/88', '129/84', '130/85', '115/78', '119/77', '121/79',
       '125/82', '135/90', '122/80', '142/92', '140/95', '139/91',
       '118/75'], dtype=object)

In [432]:
df['Sleep Disorder'].unique()

array(['Normal', 'Sleep Apnea', 'Insomnia'], dtype=object)

In [433]:
df['Sleep Disorder'].value_counts()

Unnamed: 0_level_0,count
Sleep Disorder,Unnamed: 1_level_1
Normal,219
Sleep Apnea,78
Insomnia,77


In [434]:
# prompt: saya ingin membuat kolom baru bernama sistolik dan diastolik valuenya diambil dari memisahakan value kolom Blood Pressure

# Assuming 'Blood Pressure' column has values like '120/80'
def extract_blood_pressure(bp_str):
    try:
        sistolik, diastolik = map(int, bp_str.split('/'))
        return sistolik, diastolik
    except (ValueError, AttributeError):
        return np.nan, np.nan

df[['sistolik', 'diastolik']] = df['Blood Pressure'].apply(lambda x: pd.Series(extract_blood_pressure(x)))

In [435]:
# Mengidentifikasi jenis kolom (variabel) dalam sebuah dataframe berdasarkan tipe data dan karakteristiknya

def grab_col_names(dataframe, cat_th = 10, car_th = 20): #  memisahkan kolom dalam dataset berdasarkan tipe dan karakteristiknya.

    cat_cols = [col for col in dataframe.columns if str(dataframe[col].dtypes) in ["category", "object", "bool"]] # kolom kategorikal (tipe "category", "object", "bool", atau numerik dengan unique values < cat_th).
    num_but_cat = [col for col in dataframe.columns if str(dataframe[col].dtypes) in ["int64", "float64"] and dataframe[col].nunique() < cat_th] # kolom numerik yang bertindak seperti kategorikal (berdasarkan jumlah unique values).
    cat_but_car = [col for col in dataframe.columns if str(dataframe[col].dtypes) in ["category", "object"] and dataframe[col].nunique() > car_th] # kolom kategorikal dengan unique values > car_th (high cardinality).

    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    # kolom numerik murni (int64, float64) yang bukan kategorikal.
    num_cols = [col for col in dataframe.columns if str(dataframe[col].dtypes) in ["int64", "float64"]]
    num_cols = [col for col in num_cols if col not in cat_cols]

    print(f"Jumlah observasi: {dataframe.shape[0]}")
    print(f"Jumlah variabel: {dataframe.shape[1]}")
    print(f"Kolom kategorikal: {len(cat_cols)}")
    print(f"Kolom Numerik: {len(num_cols)}")
    print(f"Kategori tapi kardinal: {len(cat_but_car)}")
    print(f"Numerik tapi kategorikal: {len(num_but_cat)}")

    # mengembalikan daftar kolom yang dikelompokkan berdasarkan tipe (cat_cols, num_cols, cat_but_car).
    return cat_cols, num_cols, cat_but_car

In [436]:
cat_cols, num_cols, cat_but_car = grab_col_names(df)

Jumlah observasi: 374
Jumlah variabel: 14
Kolom kategorikal: 6
Kolom Numerik: 7
Kategori tapi kardinal: 1
Numerik tapi kategorikal: 2


In [437]:
from sklearn.preprocessing import LabelEncoder
import joblib


# Buat encoder untuk setiap kolom kategorikal dan simpan dengan nama kolom
for col in cat_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    joblib.dump(encoder, f'{col}_encoder.joblib')  # Simpan dengan nama berdasarkan kolom

data_final = pd.concat([df[num_cols], df[cat_cols]], axis=1)

In [438]:
data_final.head()

Unnamed: 0,Age,Sleep Duration,Physical Activity Level,Heart Rate,Daily Steps,sistolik,diastolik,Gender,Occupation,BMI Category,Sleep Disorder,Quality of Sleep,Stress Level
0,27,6.1,42,77,4200,126,83,1,9,3,1,2,3
1,28,6.2,60,75,10000,125,80,1,1,0,1,2,5
2,28,6.2,60,75,10000,125,80,1,1,0,1,2,5
3,28,5.9,30,85,3000,140,90,1,6,2,2,0,5
4,28,5.9,30,85,3000,140,90,1,6,2,2,0,5


In [439]:
cat_cols, num_cols, cat_but_car = grab_col_names(data_final)

Jumlah observasi: 374
Jumlah variabel: 13
Kolom kategorikal: 5
Kolom Numerik: 8
Kategori tapi kardinal: 0
Numerik tapi kategorikal: 5


In [440]:
# data_final = pd.concat([data_final[num_cols], data_final[cat_cols]], axis=1)
data_final.head()

Unnamed: 0,Age,Sleep Duration,Physical Activity Level,Heart Rate,Daily Steps,sistolik,diastolik,Gender,Occupation,BMI Category,Sleep Disorder,Quality of Sleep,Stress Level
0,27,6.1,42,77,4200,126,83,1,9,3,1,2,3
1,28,6.2,60,75,10000,125,80,1,1,0,1,2,5
2,28,6.2,60,75,10000,125,80,1,1,0,1,2,5
3,28,5.9,30,85,3000,140,90,1,6,2,2,0,5
4,28,5.9,30,85,3000,140,90,1,6,2,2,0,5


In [441]:
df['Sleep Disorder'].unique()

array([1, 2, 0])

In [442]:
data_final['Sleep Disorder'].unique()

array([1, 2, 0])

In [443]:
data_final['Sleep Disorder'].value_counts()

Unnamed: 0_level_0,count
Sleep Disorder,Unnamed: 1_level_1
1,219
2,78
0,77


In [444]:
from sklearn.model_selection import train_test_split

X = data_final.drop(columns=['Sleep Disorder'])
y = data_final['Sleep Disorder']

In [445]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42) # Inisialisasi SMOTE
X_smote_resampled, y_smote_resampled = smote.fit_resample(X, y)



In [446]:
y_smote_resampled.value_counts()

Unnamed: 0_level_0,count
Sleep Disorder,Unnamed: 1_level_1
1,219
2,219
0,219


In [447]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Fit dan transform data numerik
X_smote_resampled[X.columns] = scaler.fit_transform(X_smote_resampled[X.columns])

# Sekarang data_final sudah dinormalisasi
print(data_final.head())

   Age  Sleep Duration  Physical Activity Level  Heart Rate  Daily Steps  \
0   27             6.1                       42          77         4200   
1   28             6.2                       60          75        10000   
2   28             6.2                       60          75        10000   
3   28             5.9                       30          85         3000   
4   28             5.9                       30          85         3000   

   sistolik  diastolik  Gender  Occupation  BMI Category  Sleep Disorder  \
0       126         83       1           9             3               1   
1       125         80       1           1             0               1   
2       125         80       1           1             0               1   
3       140         90       1           6             2               2   
4       140         90       1           6             2               2   

   Quality of Sleep  Stress Level  
0                 2             3  
1             

In [456]:
joblib.dump(scaler, 'scaler.joblib')

['scaler.joblib']

In [448]:
X_smote_resampled.head()

Unnamed: 0,Age,Sleep Duration,Physical Activity Level,Heart Rate,Daily Steps,sistolik,diastolik,Gender,Occupation,BMI Category,Quality of Sleep,Stress Level
0,0.0,0.111111,0.2,0.571429,0.171429,0.407407,0.4,1.0,0.9,1.0,0.4,0.6
1,0.03125,0.148148,0.5,0.47619,1.0,0.37037,0.25,1.0,0.1,0.0,0.4,1.0
2,0.03125,0.148148,0.5,0.47619,1.0,0.37037,0.25,1.0,0.1,0.0,0.4,1.0
3,0.03125,0.037037,0.0,0.952381,0.0,0.925926,0.75,1.0,0.6,0.666667,0.0,1.0
4,0.03125,0.037037,0.0,0.952381,0.0,0.925926,0.75,1.0,0.6,0.666667,0.0,1.0


In [449]:
X_train, X_test, y_train, y_test = train_test_split(X_smote_resampled, y_smote_resampled, test_size=0.2, random_state=42,stratify=y_smote_resampled)

In [450]:
X_train

Unnamed: 0,Age,Sleep Duration,Physical Activity Level,Heart Rate,Daily Steps,sistolik,diastolik,Gender,Occupation,BMI Category,Quality of Sleep,Stress Level
414,0.53125,0.261663,0.250000,0.000000,0.428571,0.740741,0.75,0.0,1.0,1.000000,0.6,0.2
253,0.56250,0.259259,0.250000,0.000000,0.428571,0.740741,0.75,0.0,1.0,1.000000,0.6,0.2
475,0.03125,0.053664,0.016667,0.904762,0.008000,0.925926,0.75,1.0,0.9,0.666667,0.0,0.8
312,0.78125,0.962963,0.000000,0.000000,0.285714,0.370370,0.25,0.0,0.2,0.000000,1.0,0.0
143,0.34375,0.481481,0.500000,0.142857,0.571429,0.000000,0.00,0.0,0.0,0.000000,0.8,0.2
...,...,...,...,...,...,...,...,...,...,...,...,...
646,0.71875,0.074074,1.000000,0.476190,1.000000,0.925926,1.00,0.0,0.5,1.000000,0.4,1.0
119,0.31250,0.518519,0.500000,0.142857,0.571429,0.000000,0.00,0.0,0.0,0.000000,0.8,0.2
596,0.93750,0.885327,0.750000,0.142857,0.571429,0.925926,1.00,0.0,0.5,1.000000,1.0,0.0
369,1.00000,0.851852,0.750000,0.142857,0.571429,0.925926,1.00,0.0,0.5,1.000000,1.0,0.0


In [451]:
X_test.head()

Unnamed: 0,Age,Sleep Duration,Physical Activity Level,Heart Rate,Daily Steps,sistolik,diastolik,Gender,Occupation,BMI Category,Quality of Sleep,Stress Level
404,0.40625,0.642273,0.916667,0.190476,0.690714,0.444444,0.4,0.0,0.2,0.0,0.8,0.2
515,0.09375,0.222424,0.083333,0.619048,0.157,0.555556,0.55,0.0,0.4,0.333333,0.2,0.8
512,0.53125,0.185185,0.25,0.333333,0.428571,0.555556,0.5,1.0,0.7,1.0,0.4,0.8
240,0.53125,0.259259,0.25,0.0,0.428571,0.740741,0.75,0.0,1.0,1.0,0.6,0.2
538,1.0,0.851852,0.75,0.142857,0.571429,0.925926,1.0,0.0,0.5,1.0,1.0,0.0


In [452]:
#bikin model decision tree
from sklearn .metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
dtc = DecisionTreeClassifier(
    ccp_alpha=0.0, class_weight=None, criterion='entropy', max_features=None, max_leaf_nodes=None,
    min_impurity_decrease=0.0, min_samples_leaf=3,
    min_samples_split=3, min_weight_fraction_leaf=0.0,
    random_state=42, splitter='best'
)

model = dtc.fit(X_train, y_train)
dtc_acc = accuracy_score(y_test, dtc.predict(X_test))
print(f"akurasi data training = {accuracy_score(y_train, dtc.predict(X_train))}")
print(f"akurasi data testing = {dtc_acc} \n")

print(f"confusion matrix : \n{confusion_matrix(y_test, dtc.predict(X_test))}\n")
confusion = confusion_matrix(y_test, dtc.predict(X_test))

y_pred_dtc = dtc.predict(X_test)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dtc))

# Menampilkan laporan klasifikasi
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_dtc))

akurasi data training = 0.939047619047619
akurasi data testing = 0.9166666666666666 

confusion matrix : 
[[42  2  0]
 [ 3 40  1]
 [ 3  2 39]]

Decision Tree Accuracy: 0.9166666666666666

Classification Report:

              precision    recall  f1-score   support

           0       0.88      0.95      0.91        44
           1       0.91      0.91      0.91        44
           2       0.97      0.89      0.93        44

    accuracy                           0.92       132
   macro avg       0.92      0.92      0.92       132
weighted avg       0.92      0.92      0.92       132



In [453]:
joblib.dump(dtc, 'model.joblib')

['model.joblib']

In [454]:
from sklearn .metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

#Menggunakan algoritme Decision Tree Classifier untuk melakukan klasifikasi.
dt = DecisionTreeClassifier(random_state=42)
#Melatih Model | X_train: Fitur pada data pelatihan. | y_train: Label target pada data pelatihan.
dt.fit(X_train, y_train)
#Membuat Prediksi | X_test: Fitur pada data uji. | Hasil prediksi disimpan dalam variabel y_pred_dt.
y_pred_dt = dt.predict(X_test)

print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))

# Menampilkan laporan klasifikasi
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_dt))

Decision Tree Accuracy: 0.8636363636363636

Classification Report:

              precision    recall  f1-score   support

           0       0.79      0.95      0.87        44
           1       0.90      0.82      0.86        44
           2       0.92      0.82      0.87        44

    accuracy                           0.86       132
   macro avg       0.87      0.86      0.86       132
weighted avg       0.87      0.86      0.86       132



In [455]:

import joblib # Import library untuk menyimpan model

joblib.dump(dt, 'decision_tree_model.joblib')


['decision_tree_model.joblib']