In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

In [28]:
# Load the dataset from 'diabetes.csv' (path is relative to the working directory) into `features`.
features = pd.read_csv('diabetes.csv')

In [3]:
# Show the column index and the (rows, columns) shape, one per line.
print(features.columns, features.shape, sep='\n')

Index(['Diabetes_binary', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')
(70692, 22)


# Preprocessing

In [25]:
# Display the loaded frame (the notebook renders it truncated) for a visual check.
features 

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,5.0,30.0,0.0,1.0,4.0,6.0,8.0
1,0.0,1.0,1.0,1.0,26.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,12.0,6.0,8.0
2,0.0,0.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,10.0,0.0,1.0,13.0,6.0,8.0
3,0.0,1.0,1.0,1.0,28.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,3.0,0.0,1.0,11.0,6.0,8.0
4,0.0,0.0,0.0,1.0,29.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70687,1.0,0.0,1.0,1.0,37.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,4.0,0.0,0.0,0.0,0.0,6.0,4.0,1.0
70688,1.0,0.0,1.0,1.0,29.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,2.0,0.0,0.0,1.0,1.0,10.0,3.0,6.0
70689,1.0,1.0,1.0,1.0,25.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,5.0,15.0,0.0,1.0,0.0,13.0,6.0,4.0
70690,1.0,1.0,1.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,4.0,0.0,0.0,1.0,0.0,11.0,2.0,4.0


In [23]:
# Merge the lowest Education levels: every value below 3 is raised to 3.
features['Education'] = features['Education'].clip(lower=3)

In [5]:
# Remove exact duplicate rows, then renumber the index from 0.
features = features.drop_duplicates().reset_index(drop=True)

In [26]:
# Remove duplicate rows and renumber the index (a no-op when the frame is
# already deduplicated).
features = features.drop_duplicates().reset_index(drop=True)

Sscaler = StandardScaler()
Rscaler = RobustScaler()
Mscaler = MinMaxScaler()

# Each scaler is fitted only on the columns it is applied to. All three
# scalers work column by column, so the scaled values match what fitting on
# the whole frame would give, without also transforming the binary target and
# the other unused columns three times over. It also leaves each fitted scaler
# directly reusable on its own column group.
standard_cols = ['BMI', 'Age', 'GenHlth']    # zero mean / unit variance
robust_cols = ['MentHlth', 'PhysHlth']       # centred on median, scaled by IQR
minmax_cols = ['Education', 'Income']        # rescaled to [0, 1]

features[standard_cols] = Sscaler.fit_transform(features[standard_cols])
features[robust_cols] = Rscaler.fit_transform(features[robust_cols])
features[minmax_cols] = Mscaler.fit_transform(features[minmax_cols])

# NOTE(review): the scalers are fitted on the full dataset, before the
# train/test split performed further down, so statistics of the test rows
# influence the preprocessing. Consider fitting on the training rows only.

In [29]:
# Summarise mean and standard deviation of the three standardised columns.
# NOTE(review): after StandardScaler these should be close to 0 and 1; the
# stored output below shows unscaled values, which suggests the cell was last
# run on a freshly reloaded frame -- confirm with Restart & Run All.
standardised_cols = ['BMI', 'Age', 'GenHlth']
summary = features[standardised_cols].agg(['mean', 'std'])
print(summary)

            BMI       Age   GenHlth
mean  29.856985  8.584055  2.837082
std    7.113954  2.852153  1.113565


# chia tập thành train | test

In [7]:
# Stratified 80/20 train/test split: each class is split separately with the
# same ratio and the per-class parts are then concatenated, so both splits
# keep the original class proportions.
class_0 = features[features['Diabetes_binary'] == 0]
class_1 = features[features['Diabetes_binary'] == 1]

x_class_0, y_class_0 = class_0.drop(columns=['Diabetes_binary']), class_0['Diabetes_binary']
x_class_1, y_class_1 = class_1.drop(columns=['Diabetes_binary']), class_1['Diabetes_binary']

x_0_train, x_0_test, y_0_train, y_0_test = train_test_split(x_class_0, y_class_0, test_size=0.2, random_state=1)
x_1_train, x_1_test, y_1_train, y_1_test = train_test_split(x_class_1, y_class_1, test_size=0.2, random_state=1)

# Seeded generator so the row order is reproducible across kernel restarts
# (the split itself is already pinned by random_state=1).
shuffle_rng = np.random.default_rng(1)

# After concatenation all class-0 rows precede all class-1 rows; apply one
# permutation to both x and y so the rows are mixed but stay aligned.
x_train = pd.concat(objs=[x_0_train, x_1_train], ignore_index=True).to_numpy()
y_train = pd.concat(objs=[y_0_train, y_1_train], ignore_index=True).to_numpy()
index = shuffle_rng.permutation(x_train.shape[0])
x_train = x_train[index]
y_train = y_train[index]

x_test = pd.concat(objs=[x_0_test, x_1_test], ignore_index=True).to_numpy()
y_test = pd.concat(objs=[y_0_test, y_1_test], ignore_index=True).to_numpy()
index = shuffle_rng.permutation(x_test.shape[0])
x_test = x_test[index]
y_test = y_test[index]

# chia tập thành train | val | test

In [8]:
def _merge_and_shuffle(x_parts, y_parts, rng):
    """Concatenate per-class splits and shuffle the rows, keeping x and y aligned.

    Parameters
    ----------
    x_parts : list of pandas.DataFrame
        Feature frames, one per class.
    y_parts : list of pandas.Series
        Target series matching ``x_parts`` row for row.
    rng : numpy.random.Generator
        Source of the row permutation.

    Returns
    -------
    tuple of numpy.ndarray
        ``(x, y)`` with the same permutation applied to both.
    """
    x = pd.concat(objs=x_parts, ignore_index=True).to_numpy()
    y = pd.concat(objs=y_parts, ignore_index=True).to_numpy()
    order = rng.permutation(x.shape[0])
    return x[order], y[order]


# Stratified split into train / validation / test. Each class is split
# separately: 20% goes to test, then 25% of the remaining 80% (i.e. 20% of the
# class) goes to validation, giving 60/20/20 per class.
class_0 = features[features['Diabetes_binary'] == 0]
class_1 = features[features['Diabetes_binary'] == 1]

x_class_0, y_class_0 = class_0.drop(columns=['Diabetes_binary']), class_0['Diabetes_binary']
x_class_1, y_class_1 = class_1.drop(columns=['Diabetes_binary']), class_1['Diabetes_binary']

x_0_train, x_0_test, y_0_train, y_0_test = train_test_split(x_class_0, y_class_0, test_size=0.2, random_state=1)
x_1_train, x_1_test, y_1_train, y_1_test = train_test_split(x_class_1, y_class_1, test_size=0.2, random_state=1)

x_0_train, x_0_val, y_0_train, y_0_val = train_test_split(x_0_train, y_0_train, test_size=0.25, random_state=1)
x_1_train, x_1_val, y_1_train, y_1_val = train_test_split(x_1_train, y_1_train, test_size=0.25, random_state=1)

# Seeded generator so the shuffled row order is reproducible across kernel
# restarts (the splits themselves are already pinned by random_state=1).
shuffle_rng = np.random.default_rng(1)

x_train, y_train = _merge_and_shuffle([x_0_train, x_1_train], [y_0_train, y_1_train], shuffle_rng)
x_val, y_val = _merge_and_shuffle([x_0_val, x_1_val], [y_0_val, y_1_val], shuffle_rng)
x_test, y_test = _merge_and_shuffle([x_0_test, x_1_test], [y_0_test, y_1_test], shuffle_rng)

# KNN

In [9]:
from sklearn.neighbors import KNeighborsClassifier

In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import  f1_score
# Default-configured KNN classifier; `model_knn` is assigned again with
# explicit hyperparameters further down before any fitting happens.
model_knn=KNeighborsClassifier()





In [12]:
# KNN with k=92 and Manhattan distance, fitted on the training split with
# every feature column still present.
knn_settings = {'n_neighbors': 92, 'metric': 'manhattan'}
model_knn = KNeighborsClassifier(**knn_settings)
model_knn.fit(x_train, y_train)

In [13]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Score the fitted classifier on the test split: accuracy first, then macro-averaged F1.
y_pred = model_knn.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_macro_f1 = f1_score(y_test, y_pred, average='macro')
print(test_accuracy)
print(test_macro_f1)

0.7467960321482876
0.7455769809779851


In [15]:
# Feature columns excluded from the reduced-feature model trained below.
columns_drop = [
    'MentHlth',
    'PhysHlth',
    'DiffWalk',
    'Sex',
    'Age',
    'Education',
    'Income',
]

In [16]:
# Fold the validation split back into the training data (rows appended below
# the existing training rows).
# NOTE(review): this overwrites x_train / y_train, so running the cell a
# second time appends the validation rows again.
x_train = np.vstack((x_train, x_val))
y_train = np.concatenate((y_train, y_val))

print(x_train.shape, y_train.shape, sep='\n')

(55244, 21)
(55244,)


In [17]:
# Wrap the NumPy splits in labelled DataFrames so columns can be dropped by name.
# The feature names are taken from the in-memory `features` frame (all columns
# except the target, in their original order). Re-reading 'diabetes.csv' just
# for its header would overwrite the preprocessed `features` with raw data and
# break any earlier cell that is re-run afterwards.
feature_columns = features.columns.drop("Diabetes_binary")
x_train = pd.DataFrame(x_train, columns=feature_columns)
y_train = pd.DataFrame(y_train, columns=["Diabetes_binary"])
x_test = pd.DataFrame(x_test, columns=feature_columns)
y_test = pd.DataFrame(y_test, columns=["Diabetes_binary"])

In [18]:
# Remove the columns listed in `columns_drop` from both feature frames.
x_train = x_train.drop(labels=columns_drop, axis='columns')
x_test = x_test.drop(labels=columns_drop, axis='columns')


In [19]:
# Display the training features after the `columns_drop` columns were removed.
x_train 

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth
0,1.0,1.0,1.0,-0.413526,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.928207
1,0.0,1.0,1.0,0.146073,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.123058
2,1.0,1.0,1.0,-0.413526,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.123058
3,1.0,1.0,1.0,-0.133727,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.123058
4,0.0,1.0,1.0,-0.833225,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.025633
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55239,1.0,0.0,1.0,2.244568,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,-1.682092
55240,0.0,0.0,1.0,-0.273626,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,-0.779517
55241,1.0,1.0,1.0,0.845571,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.123058
55242,1.0,1.0,1.0,0.425872,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.123058


In [20]:
# Collapse the single-column target frames via DataFrame.squeeze().
y_train, y_test = (target_frame.squeeze() for target_frame in (y_train, y_test))

In [21]:
from sklearn.neighbors import KNeighborsClassifier
# KNN (k=92, Manhattan distance, uniform neighbour weights) fitted on the
# reduced feature set.
# NOTE(review): a commented-out variant took these values from
# `grid_search.best_params_`; no `grid_search` object is defined in the cells
# visible here, so the hyperparameters are hard-coded.
model_knn = KNeighborsClassifier(
    n_neighbors=92,
    metric='manhattan',
    weights='uniform',
)
model_knn.fit(x_train, y_train)

In [22]:

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Evaluate the reduced-feature model on the test split: accuracy, then macro F1.
y_pred = model_knn.predict(x_test)
for score in (accuracy_score(y_test, y_pred), f1_score(y_test, y_pred, average='macro')):
    print(score)

0.739048584461661
0.7382071290858592


In [30]:
import joblib

# Persist the fitted KNN classifier to 'knn_model.pkl' in the working directory.
# NOTE(review): only the classifier is written here; the scalers fitted during
# preprocessing are not saved by this cell, so inference code must obtain the
# same scaling from elsewhere.
joblib.dump(value=model_knn, filename='knn_model.pkl')
print("Model saved as 'knn_model.pkl'")

Model saved as 'knn_model.pkl'
