In [None]:
# Download the "diabetes-health-indicators-dataset" archive from Kaggle with the
# kaggle CLI; the zip is saved into the current working directory.
# NOTE(review): presumably requires Kaggle API credentials to be configured in
# the runtime beforehand — confirm.
!kaggle datasets download -d alexteboul/diabetes-health-indicators-dataset

Dataset URL: https://www.kaggle.com/datasets/alexteboul/diabetes-health-indicators-dataset
License(s): CC0-1.0
Downloading diabetes-health-indicators-dataset.zip to /content
  0% 0.00/6.03M [00:00<?, ?B/s]
100% 6.03M/6.03M [00:00<00:00, 187MB/s]


In [None]:
!unzip /content/diabetes-health-indicators-dataset.zip -d /content/diabetes-health-indicators-dataset

Archive:  /content/diabetes-health-indicators-dataset.zip
  inflating: /content/diabetes-health-indicators-dataset/diabetes_012_health_indicators_BRFSS2015.csv  
  inflating: /content/diabetes-health-indicators-dataset/diabetes_binary_5050split_health_indicators_BRFSS2015.csv  
  inflating: /content/diabetes-health-indicators-dataset/diabetes_binary_health_indicators_BRFSS2015.csv  


In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

In [None]:
# Read the "binary_5050split" CSV (one of the three files in the extracted
# archive) into the working DataFrame used by every later cell.
csv_path = '/content/diabetes-health-indicators-dataset/diabetes_binary_5050split_health_indicators_BRFSS2015.csv'
features = pd.read_csv(csv_path)

In [None]:
# Show the column labels of the loaded frame (target + feature names).
features.columns

# PreProcessing


In [None]:
# Merge: collapse every Education value below 3 into level 3, i.e. raise the
# column's floor to 3 while leaving values >= 3 untouched.
# NOTE(review): presumably done because the lowest levels are rare — confirm.
features['Education'] = features['Education'].clip(lower=3)


In [None]:
# Remove exact duplicate rows, then renumber the rows 0..n-1 so the index has
# no gaps left by the dropped rows.
features = features.drop_duplicates().reset_index(drop=True)

In [None]:
# Rescale selected columns of `features`, using a different scaler per column
# group (standard / robust / min-max).
# NOTE(review): every scaler is fitted on the whole frame, i.e. before the
# train/test split performed in the later cells, so statistics of the held-out
# rows leak into the scaling parameters. Consider fitting on training rows only.
Sscaler = StandardScaler()
Rscaler = RobustScaler()
Mscaler = MinMaxScaler()

# fit_transform returns a bare ndarray. Re-attach the column names AND the row
# index of `features`: the column assignments below align on the index, so
# without `index=` they would silently write NaNs whenever `features` does not
# carry a default 0..n-1 index.
features_standardized = pd.DataFrame(Sscaler.fit_transform(features), columns=features.columns, index=features.index)
features_robust = pd.DataFrame(Rscaler.fit_transform(features), columns=features.columns, index=features.index)
features_MinMax = pd.DataFrame(Mscaler.fit_transform(features), columns=features.columns, index=features.index)

# Only the columns listed here are overwritten in `features`; all other columns
# (including the Diabetes_binary target) keep their original values.
features[['BMI', 'Age', 'GenHlth']] = features_standardized[['BMI', 'Age', 'GenHlth']]
features[['MentHlth', 'PhysHlth']] = features_robust[['MentHlth', 'PhysHlth']]
features[['Education', 'Income']] = features_MinMax[['Education', 'Income']]


In [None]:
# Display the preprocessed frame (pandas abbreviates the middle rows/columns
# with "..." in the rendered output).
features

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,0.0,1.0,-0.553426,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.123058,1.666667,5.000000,0.0,1.0,-1.610757,1.000000,1.000000
1,0.0,1.0,1.0,1.0,-0.553426,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.123058,0.000000,0.000000,0.0,1.0,1.188163,1.000000,1.000000
2,0.0,0.0,0.0,1.0,-0.553426,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,-1.682092,0.000000,1.666667,0.0,1.0,1.538027,1.000000,1.000000
3,0.0,1.0,1.0,1.0,-0.273626,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.123058,0.000000,0.500000,0.0,1.0,0.838298,1.000000,1.000000
4,0.0,0.0,0.0,1.0,-0.133727,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,-0.779517,0.000000,0.000000,0.0,0.0,-0.211297,0.666667,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69050,1.0,0.0,1.0,1.0,0.985471,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.025633,0.000000,0.000000,0.0,0.0,-0.911027,0.333333,0.000000
69051,1.0,0.0,1.0,1.0,-0.133727,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,-0.779517,0.000000,0.000000,1.0,1.0,0.488433,0.000000,0.714286
69052,1.0,1.0,1.0,1.0,-0.693325,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.928207,5.000000,0.000000,1.0,0.0,1.538027,1.000000,0.428571
69053,1.0,1.0,1.0,1.0,-1.672623,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.025633,0.000000,0.000000,1.0,0.0,0.838298,0.000000,0.428571


# chia tập thành train | test

In [None]:
# Per-class 80/20 train/test split: each class is split separately with the
# same test fraction, so both partitions keep (approximately) the class ratio
# of `features`.
class_0 = features[features['Diabetes_binary'] == 0]
class_1 = features[features['Diabetes_binary'] == 1]

# Separate the feature columns (x) from the target column (y) for each class.
x_class_0, y_class_0 = class_0.drop(columns=['Diabetes_binary']), class_0['Diabetes_binary']
x_class_1, y_class_1 = class_1.drop(columns=['Diabetes_binary']), class_1['Diabetes_binary']

x_0_train, x_0_test, y_0_train, y_0_test = train_test_split(x_class_0, y_class_0, test_size=0.2, random_state=1)
x_1_train, x_1_test, y_1_train, y_1_test = train_test_split(x_class_1, y_class_1, test_size=0.2, random_state=1)

# Seeded generator: the concatenation below puts all class-0 rows before all
# class-1 rows, so the rows are shuffled — with a fixed seed so the resulting
# order is identical on every run (the global np.random state is not touched).
rng = np.random.default_rng(1)

x_train = pd.concat(objs=[x_0_train, x_1_train], ignore_index=True).to_numpy()
y_train = pd.concat(objs=[y_0_train, y_1_train], ignore_index=True).to_numpy()
index = rng.permutation(x_train.shape[0])  # same permutation for x and y keeps rows paired
x_train = x_train[index]
y_train = y_train[index]

x_test = pd.concat(objs=[x_0_test, x_1_test], ignore_index=True).to_numpy()
y_test = pd.concat(objs=[y_0_test, y_1_test], ignore_index=True).to_numpy()
index = rng.permutation(x_test.shape[0])
x_test = x_test[index]
y_test = y_test[index]

# chia tập thành train | val | test

In [None]:
# Per-class train/validation/test split. Each class is split separately so all
# three partitions keep (approximately) the class ratio of `features`.
# NOTE: this cell reassigns x_train / y_train / x_test / y_test, replacing the
# values produced by the train | test cell if that cell was run before.
class_0 = features[features['Diabetes_binary'] == 0]
class_1 = features[features['Diabetes_binary'] == 1]

# Separate the feature columns (x) from the target column (y) for each class.
x_class_0, y_class_0 = class_0.drop(columns=['Diabetes_binary']), class_0['Diabetes_binary']
x_class_1, y_class_1 = class_1.drop(columns=['Diabetes_binary']), class_1['Diabetes_binary']

# Step 1: hold out 20% of each class as the test set.
x_0_train, x_0_test, y_0_train, y_0_test = train_test_split(x_class_0, y_class_0, test_size=0.2, random_state=1)
x_1_train, x_1_test, y_1_train, y_1_test = train_test_split(x_class_1, y_class_1, test_size=0.2, random_state=1)

# Step 2: take 25% of the remaining 80% as validation -> 60/20/20 overall.
x_0_train, x_0_val, y_0_train, y_0_val = train_test_split(x_0_train, y_0_train, test_size=0.25, random_state=1)
x_1_train, x_1_val, y_1_train, y_1_val = train_test_split(x_1_train, y_1_train, test_size=0.25, random_state=1)

# Seeded generator: each concatenation below puts all class-0 rows before all
# class-1 rows, so the rows are shuffled — with a fixed seed so the resulting
# order is identical on every run (the global np.random state is not touched).
rng = np.random.default_rng(1)

x_train = pd.concat(objs=[x_0_train, x_1_train], ignore_index=True).to_numpy()
y_train = pd.concat(objs=[y_0_train, y_1_train], ignore_index=True).to_numpy()
index = rng.permutation(x_train.shape[0])  # same permutation for x and y keeps rows paired
x_train = x_train[index]
y_train = y_train[index]

x_val = pd.concat(objs=[x_0_val, x_1_val], ignore_index=True).to_numpy()
y_val = pd.concat(objs=[y_0_val, y_1_val], ignore_index=True).to_numpy()
index = rng.permutation(x_val.shape[0])
x_val = x_val[index]
y_val = y_val[index]

x_test = pd.concat(objs=[x_0_test, x_1_test], ignore_index=True).to_numpy()
y_test = pd.concat(objs=[y_0_test, y_1_test], ignore_index=True).to_numpy()
index = rng.permutation(x_test.shape[0])
x_test = x_test[index]
y_test = y_test[index]