In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("/content/drive/MyDrive/diabates/diabetes.csv")

In [3]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
df.describe()
# there are 0 values which is impossible. (Glucose, BloodPressure, Skinthickness, Unsulin)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [8]:
X = df.drop("Outcome", axis=1)
y = df["Outcome"]

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
columns_check = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

In [10]:
for col in columns_check:
  zero_count = (df[col] == 0 ).sum()
  zero_percentage = (zero_count * 100) / len(df)
  print(f"{col}: {zero_count} %{zero_percentage:.2f}")

Glucose: 5 %0.65
BloodPressure: 35 %4.56
SkinThickness: 227 %29.56
Insulin: 374 %48.70
BMI: 11 %1.43


In [12]:
medians = {}
for col in columns_check:
    median_value = X_train[X_train[col] != 0][col].median()
    medians[col] = median_value
    X_train[col] = X_train[col].replace(0, median_value)

for col in columns_check:
    X_test[col] = X_test[col].replace(0, medians[col])

In [13]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [14]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [21]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

**AdaBoost Classifier**

In [20]:
adaboost_param = {
        "n_estimators" : [50, 70, 100, 120, 150, 200],
        "learning_rate" : [0.01, 0.01, 0.1, 1, 10]
}

grid = GridSearchCV(estimator=AdaBoostClassifier(), param_grid=adaboost_param, cv=5, n_jobs= -1)

grid.fit(X_train, y_train)
y_train_pred = grid.predict(X_train)
y_pred = grid.predict(X_test)

print("Best Params : \n", grid.best_params_,"\n")
print("Train Accuracy Score : \n", accuracy_score(y_train, y_pred=y_train_pred))
print("----------------------")
print("Test Accuraccy Score : \n", accuracy_score(y_test, y_pred=y_pred), "\n")
print("Classification Report : \n", classification_report(y_test, y_pred=y_pred), "\n")
print("Confusion Matrix : \n", confusion_matrix(y_test, y_pred=y_pred))

Best Params : 
 {'learning_rate': 1, 'n_estimators': 100} 

Train Accuracy Score : 
 0.8013029315960912
----------------------
Test Accuraccy Score : 
 0.7727272727272727 

Classification Report : 
               precision    recall  f1-score   support

           0       0.83      0.81      0.82        99
           1       0.67      0.71      0.69        55

    accuracy                           0.77       154
   macro avg       0.75      0.76      0.76       154
weighted avg       0.78      0.77      0.77       154
 

Confusion Matrix : 
 [[80 19]
 [16 39]]


**Random Forest Classifier**

In [24]:
from sklearn.ensemble import RandomForestClassifier
import warnings

warnings.filterwarnings("ignore")

randomf_param = {
    "n_estimators" : [50, 100, 300, 500],
    "max_depth" : [3, 5, 9, 15, 27, None],
    "criterion" : ["gini", "entropy", "log_los"],
}

randomcv = RandomizedSearchCV(estimator=RandomForestClassifier(), cv=5, param_distributions=randomf_param, n_jobs=-1)
randomcv.fit(X_train, y_train)
y_train_pred = randomcv.predict(X_train)
y_pred = randomcv.predict(X_test)

print("Best Params : \n", randomcv.best_params_,"\n")
print("Train Accuracy Score : \n", accuracy_score(y_train, y_pred=y_train_pred))
print("----------------------")
print("Test Accuraccy Score : \n", accuracy_score(y_test, y_pred=y_pred), "\n")
print("Classification Report : \n", classification_report(y_test, y_pred=y_pred), "\n")
print("Confusion Matrix : \n", confusion_matrix(y_test, y_pred=y_pred))

Best Params : 
 {'n_estimators': 300, 'max_depth': None, 'criterion': 'entropy'} 

Train Accuracy Score : 
 1.0
----------------------
Test Accuraccy Score : 
 0.7532467532467533 

Classification Report : 
               precision    recall  f1-score   support

           0       0.81      0.80      0.81        99
           1       0.65      0.67      0.66        55

    accuracy                           0.75       154
   macro avg       0.73      0.74      0.73       154
weighted avg       0.76      0.75      0.75       154
 

Confusion Matrix : 
 [[79 20]
 [18 37]]


**KNeighbors Classifier**

In [25]:
from sklearn.neighbors import KNeighborsClassifier

knn_param = {
    "n_neighbors" : [3, 5, 7, 9, 13, 21],
    "algorithm" : ["auto", "ball_tree", "kd_tree"],
    "weights" : ["uniform", "distance"]
}

random_knn = RandomizedSearchCV(estimator=KNeighborsClassifier(), cv= 5, param_distributions=knn_param, n_jobs=-1)

random_knn.fit(X_train, y_train)
y_train_pred = random_knn.predict(X_train)
y_pred = random_knn.predict(X_test)

print("Best Params : \n", random_knn.best_params_,"\n")
print("Train Accuracy Score : \n", accuracy_score(y_train, y_pred=y_train_pred))
print("----------------------")
print("Test Accuraccy Score : \n", accuracy_score(y_test, y_pred=y_pred), "\n")
print("Classification Report : \n", classification_report(y_test, y_pred=y_pred), "\n")
print("Confusion Matrix : \n", confusion_matrix(y_test, y_pred=y_pred))

Best Params : 
 {'weights': 'uniform', 'n_neighbors': 21, 'algorithm': 'kd_tree'} 

Train Accuracy Score : 
 0.8013029315960912
----------------------
Test Accuraccy Score : 
 0.7662337662337663 

Classification Report : 
               precision    recall  f1-score   support

           0       0.79      0.86      0.83        99
           1       0.70      0.60      0.65        55

    accuracy                           0.77       154
   macro avg       0.75      0.73      0.74       154
weighted avg       0.76      0.77      0.76       154
 

Confusion Matrix : 
 [[85 14]
 [22 33]]


**DecisionTree Classifier**

In [26]:
from sklearn.tree import DecisionTreeClassifier

tree_param = {
    "criterion" : ["gini", "entropy", "log_los"],
    "splitter" : ["best", "random"],
    "max_features" : ["sqrt", "log2"],
    "max_depth" : [3, 5, 7, 13, 17, 21 ,None]
}

random_dt = RandomizedSearchCV(estimator=DecisionTreeClassifier(), cv= 5, param_distributions=tree_param, n_jobs=-1)

random_dt.fit(X_train, y_train)
y_train_pred = random_dt.predict(X_train)
y_pred = random_dt.predict(X_test)

print("Best Params : \n", random_dt.best_params_,"\n")
print("Train Accuracy Score : \n", accuracy_score(y_train, y_pred=y_train_pred))
print("----------------------")
print("Test Accuraccy Score : \n", accuracy_score(y_test, y_pred=y_pred), "\n")
print("Classification Report : \n", classification_report(y_test, y_pred=y_pred), "\n")
print("Confusion Matrix : \n", confusion_matrix(y_test, y_pred=y_pred))

Best Params : 
 {'splitter': 'best', 'max_features': 'log2', 'max_depth': None, 'criterion': 'entropy'} 

Train Accuracy Score : 
 1.0
----------------------
Test Accuraccy Score : 
 0.7207792207792207 

Classification Report : 
               precision    recall  f1-score   support

           0       0.84      0.70      0.76        99
           1       0.58      0.76      0.66        55

    accuracy                           0.72       154
   macro avg       0.71      0.73      0.71       154
weighted avg       0.75      0.72      0.73       154
 

Confusion Matrix : 
 [[69 30]
 [13 42]]


**Support Vector Classifier**

In [28]:
from sklearn.svm import SVC

svc_param = {
    "C" : [0.1, 1, 10, 100],
    "kernel" : ["linear", "rbf", "sigmoid", "poly"]
}

random_svc = RandomizedSearchCV(estimator=SVC(), cv= 5, param_distributions=svc_param, n_jobs=-1)

random_svc.fit(X_train, y_train)
y_train_pred = random_svc.predict(X_train)
y_pred = random_svc.predict(X_test)

print("Best Params : \n", random_svc.best_params_,"\n")
print("Train Accuracy Score : \n", accuracy_score(y_train, y_pred=y_train_pred))
print("----------------------")
print("Test Accuraccy Score : \n", accuracy_score(y_test, y_pred=y_pred), "\n")
print("Classification Report : \n", classification_report(y_test, y_pred=y_pred), "\n")
print("Confusion Matrix : \n", confusion_matrix(y_test, y_pred=y_pred))

Best Params : 
 {'kernel': 'linear', 'C': 1} 

Train Accuracy Score : 
 0.7703583061889251
----------------------
Test Accuraccy Score : 
 0.7532467532467533 

Classification Report : 
               precision    recall  f1-score   support

           0       0.80      0.83      0.81        99
           1       0.67      0.62      0.64        55

    accuracy                           0.75       154
   macro avg       0.73      0.72      0.73       154
weighted avg       0.75      0.75      0.75       154
 

Confusion Matrix : 
 [[82 17]
 [21 34]]


**Logistic Regression**

In [29]:
from sklearn.linear_model import LogisticRegression

logistic_param = {
    "C" : [0.1, 1, 10, 100],
    "penalty" : ["l1", "l2", "elasticnet"],
    "solver" : ["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"]
}

random_logistic = RandomizedSearchCV(estimator=LogisticRegression(), cv= 5, param_distributions=logistic_param, n_jobs=-1)

random_logistic.fit(X_train, y_train)
y_train_pred = random_logistic.predict(X_train)
y_pred = random_logistic.predict(X_test)

print("Best Params : \n", random_logistic.best_params_,"\n")
print("Train Accuracy Score : \n", accuracy_score(y_train, y_pred=y_train_pred))
print("----------------------")
print("Test Accuraccy Score : \n", accuracy_score(y_test, y_pred=y_pred), "\n")
print("Classification Report : \n", classification_report(y_test, y_pred=y_pred), "\n")
print("Confusion Matrix : \n", confusion_matrix(y_test, y_pred=y_pred))

Best Params : 
 {'solver': 'newton-cholesky', 'penalty': 'l2', 'C': 1} 

Train Accuracy Score : 
 0.7719869706840391
----------------------
Test Accuraccy Score : 
 0.7532467532467533 

Classification Report : 
               precision    recall  f1-score   support

           0       0.80      0.83      0.81        99
           1       0.67      0.62      0.64        55

    accuracy                           0.75       154
   macro avg       0.73      0.72      0.73       154
weighted avg       0.75      0.75      0.75       154
 

Confusion Matrix : 
 [[82 17]
 [21 34]]
