In [2]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [3]:
diabetes_data=pd.read_csv("diabetes_prediction_dataset.csv")
diabetes_data

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
27523,Female,28.0,0,0,No Info,40.10,6.0,90,0
27524,Male,31.0,0,0,current,19.81,6.2,155,0
27525,Male,14.0,0,0,never,27.32,6.5,130,0
27526,Male,53.0,0,0,never,29.40,6.6,155,0


In [4]:
diabetes_data['smoking_history']=diabetes_data['smoking_history'].map({'never':0,'current':1,
                                                                      'former':2,'ever':3,'not current':4})
diabetes_data['gender']=diabetes_data['gender'].map({'Female':1,'Male':0})
diabetes_data['smoking_history']=diabetes_data['smoking_history'].fillna(1)
diabetes_data

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,1.0,80.0,0,1,0.0,25.19,6.6,140,0
1,1.0,54.0,0,0,1.0,27.32,6.6,80,0
2,0.0,28.0,0,0,0.0,27.32,5.7,158,0
3,1.0,36.0,0,0,1.0,23.45,5.0,155,0
4,0.0,76.0,1,1,1.0,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
27523,1.0,28.0,0,0,1.0,40.10,6.0,90,0
27524,0.0,31.0,0,0,1.0,19.81,6.2,155,0
27525,0.0,14.0,0,0,0.0,27.32,6.5,130,0
27526,0.0,53.0,0,0,0.0,29.40,6.6,155,0


In [5]:
X=diabetes_data.drop('diabetes',axis=1)
y=diabetes_data['diabetes']
X['gender']=X['gender'].fillna(X['gender'].mean())
scale=MinMaxScaler()
scale.fit(X)

In [6]:
MinMaxScaler
MinMaxScaler()
X_transform=pd.DataFrame(scale.transform(X), columns=X.columns)
X_transform['gender']=X_transform['gender'].fillna(X_transform['gender'].mean())
X_transform.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27528 entries, 0 to 27527
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   gender               27528 non-null  float64
 1   age                  27528 non-null  float64
 2   hypertension         27528 non-null  float64
 3   heart_disease        27528 non-null  float64
 4   smoking_history      27528 non-null  float64
 5   bmi                  27528 non-null  float64
 6   HbA1c_level          27528 non-null  float64
 7   blood_glucose_level  27528 non-null  float64
dtypes: float64(8)
memory usage: 1.7 MB


In [7]:
X_train, X_test, y_train, y_test=train_test_split(X_transform,y,test_size=0.25, random_state=0)
clf_knn=KNeighborsClassifier()
parametrs_knn={'n_neighbors':[1,3,5,7,9,11], 'metric':['euclidean','manhattan','chebyshev']}
grid_clf_knn=GridSearchCV(clf_knn, parametrs_knn, cv=6, n_jobs=-1)
grid_clf_knn.fit(X_train, y_train)
GridSearchCV
estimator: KNeighborsClassifier

In [8]:
KNeighborsClassifier
best_model_knn=grid_clf_knn.best_estimator_
y_pred_knn=best_model_knn.predict(X_test)
print(f'Accuracy_score: {accuracy_score(y_test,y_pred_knn)}')
print(f'Precision_score: {precision_score(y_test,y_pred_knn)}')
print(f'Recall_score: {recall_score(y_test,y_pred_knn)}')
print(f'f1_score: {f1_score(y_test,y_pred_knn)}')

Accuracy_score: 0.9597500726532985
Precision_score: 0.9203539823008849
Recall_score: 0.5551601423487544
f1_score: 0.6925638179800221


In [11]:
clf_rnf=RandomForestClassifier()
parametrs_rnf={'n_estimators':[3,5,7,10], 'max_depth':[2,3,4,5,6]}
grid_forest=GridSearchCV(clf_rnf, parametrs_rnf, cv=6, n_jobs=-1)
grid_forest.fit(X_train,y_train)
GridSearchCV
estimator: RandomForestClassifier

RandomForestClassifier
best_model_rnf=grid_forest.best_estimator_
y_pred_rnf=best_model_rnf.predict(X_test)
accuracy=accuracy_score(y_test,y_pred_rnf)
print("Accuracy_score",accuracy*100)
print(f'Precision_score: {precision_score(y_test,y_pred_rnf)}')
print(f'Recall_score: {recall_score(y_test,y_pred_rnf)}')
print(f'f1_score: {f1_score(y_test,y_pred_rnf)}')

Accuracy_score 97.2972972972973
Precision_score: 1.0
Recall_score: 0.6690391459074733
f1_score: 0.8017057569296374


In [13]:
X_train_nn, X_test_nn, y_train_nn,y_test_nn=train_test_split(X,y, test_size=0.25,random_state=0)
clf_rnf_with_no_norm=RandomForestClassifier()
parametrs_rnf={'n_estimators':[3,5,7,10], 'max_depth':[2,3,4,5,6]}
grid_forest_no_norm=GridSearchCV(clf_rnf_with_no_norm, parametrs_rnf, cv=6, n_jobs=-1)
grid_forest_no_norm.fit(X_train_nn,y_train_nn)
GridSearchCV
estimator: RandomForestClassifier

RandomForestClassifier
best_model_rnf_nn=grid_forest_no_norm.best_estimator_
y_pred_rnf_nn=best_model_rnf_nn.predict(X_test_nn)
accuracy=accuracy_score(y_test_nn,y_pred_rnf_nn)
print('Accuracy_score',accuracy*100)
print(f'Precision_score: {precision_score(y_test_nn,y_pred_rnf_nn)}')
print(f'Recall_score: {recall_score(y_test_nn,y_pred_rnf_nn)}')
print(f'f1_score: {f1_score(y_test_nn,y_pred_rnf_nn)}')

Accuracy_score 97.2972972972973
Precision_score: 1.0
Recall_score: 0.6690391459074733
f1_score: 0.8017057569296374
