**Python libraries:**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score

In [None]:
# Read the diabetes records from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="diabetes.csv")
# Bare last expression: the notebook renders the frame as this cell's output.
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [None]:
# Peek at the first five rows to confirm the columns were parsed as expected.
data.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# Summary statistics per column (count, mean, std, min, quartiles, max).
# The quartiles are spelled out explicitly; they are the same ones describe() uses by default.
data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


**Load Data:**

In [None]:
# Reload the CSV so the steps below start from the file contents as stored on disk.
data = pd.read_csv("diabetes.csv")

# Separate features (X) and target variable (y).
# X holds every column except "Outcome"; y is the "Outcome" column itself.
X = data.drop("Outcome", axis=1)
y = data["Outcome"]

**Preprocessing:**

In [None]:
# Count missing (NaN) entries in each column.
data.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [None]:
# Show the first five rows (five is also the default row count for head()).
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# Some features contain 0, which doesn't make sense in a few columns.
# Clearly, 0 indicates a missing value, so we replace 0 by NaN in those columns only.
# Use the canonical lowercase `np.nan`: the `np.NaN` alias was removed in NumPy 2.0
# and raises AttributeError there.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data[zero_as_missing_cols] = data[zero_as_missing_cols].replace(0, np.nan)

In [None]:
# Class balance of the target: share of rows per 'Outcome' value, in percent.
outcome_counts = data['Outcome'].value_counts()
outcome_counts * 100 / len(data)

Outcome
0    65.104167
1    34.895833
Name: count, dtype: float64

In [None]:
# Some features contain 0, which doesn't make sense in a few columns.
# Clearly, 0 indicates a missing value, so we replace 0 by NaN in those columns only.
# Replacing 0 with NaN is idempotent, so running this step more than once is harmless.
# Use the canonical lowercase `np.nan`: the `np.NaN` alias was removed in NumPy 2.0
# and raises AttributeError there.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data[zero_as_missing_cols] = data[zero_as_missing_cols].replace(0, np.nan)

In [None]:
# Fill every gap with the median of its own column.
# The median is computed from the column's non-missing values before the fill is applied.
for column in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']:
    data[column] = data[column].fillna(data[column].median())

In [None]:
# Verify the imputation: count the missing entries that remain in each column.
data.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [None]:
# According to BMI, some ranges can be determined and categorical variables can then be assigned.
# NewBMI only supplies the label values and the category set for the new column.
NewBMI = pd.Series(["Underweight", "Normal", "Overweight", "Obesity-I", "Obesity-II", "Obesity-III"], dtype = "category")

# Seed the column as an all-missing categorical with NewBMI's categories.
# Assigning the 6-element NewBMI Series directly would align on the index and
# pre-label rows 0-5 with arbitrary categories.
data["NewBMI"] = pd.Categorical([np.nan] * len(data), categories=NewBMI.cat.categories)

# The ranges are contiguous: a BMI of exactly 18.5 belongs to "Normal"
# (using `> 18.5` here would leave that value in no range at all).
data.loc[data["BMI"] < 18.5, "NewBMI"] = NewBMI[0]
data.loc[(data["BMI"] >= 18.5) & (data["BMI"] <= 24.9), "NewBMI"] = NewBMI[1]
data.loc[(data["BMI"] > 24.9) & (data["BMI"] <= 29.9), "NewBMI"] = NewBMI[2]
data.loc[(data["BMI"] > 29.9) & (data["BMI"] <= 34.9), "NewBMI"] = NewBMI[3]
data.loc[(data["BMI"] > 34.9) & (data["BMI"] <= 39.9), "NewBMI"] = NewBMI[4]
data.loc[data["BMI"] > 39.9 , "NewBMI"] = NewBMI[5]

In [None]:
# Bucket the Glucose reading into three labelled ranges stored in a new categorical column.
NewGlucose = pd.Series(["Normal", "Prediabetes", "High"], dtype = "category")
data["NewGlucose"] = NewGlucose

glucose = data["Glucose"]
# (row mask, label) pairs: below 140, 140 to 199 inclusive, above 199.
glucose_rules = (
    (glucose < 140, NewGlucose[0]),
    (glucose.between(140, 199), NewGlucose[1]),
    (glucose > 199, NewGlucose[2]),
)
for row_mask, label in glucose_rules:
    data.loc[row_mask, "NewGlucose"] = label

In [None]:
# According to Insulin value, some ranges can be determined and categorical variables can then be assigned
def set_insulin(row):
    """Label one record's insulin reading.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the key "Insulin" (e.g. a DataFrame row).

    Returns
    -------
    str
        "Normal" when the value lies in the inclusive range 16..166,
        otherwise "Abnormal" (this includes NaN, which fails both comparisons).
    """
    return "Normal" if 16 <= row["Insulin"] <= 166 else "Abnormal"

# Evaluate set_insulin once per row (axis=1) and attach the labels as the NewInsulin column.
insulin_labels = data.apply(set_insulin, axis=1)
data = data.assign(NewInsulin=insulin_labels)

In [None]:
# One-hot encode the three derived categorical columns: each category becomes its own indicator column.
# drop_first=True leaves out the first category of every encoded column, which avoids the
# dummy-variable trap (one indicator per column would otherwise be redundant).
categorical_columns = ["NewBMI", "NewInsulin", "NewGlucose"]
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

In [None]:
# Arrange the columns in a fixed order: original measurements first, then the
# one-hot indicator columns, and the 'Outcome' target last.
measurement_columns = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age',
]
indicator_columns = [
    'NewInsulin_Normal',
    'NewGlucose_Normal', 'NewGlucose_Prediabetes',
    'NewBMI_Obesity-I', 'NewBMI_Obesity-II', 'NewBMI_Obesity-III',
    'NewBMI_Overweight', 'NewBMI_Underweight',
]
data = data.reindex(columns=measurement_columns + indicator_columns + ['Outcome'])
data.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,NewInsulin_Normal,NewGlucose_Normal,NewGlucose_Prediabetes,NewBMI_Obesity-I,NewBMI_Obesity-II,NewBMI_Obesity-III,NewBMI_Overweight,NewBMI_Underweight,Outcome
0,6,148.0,72.0,35.0,125.0,33.6,0.627,50,True,False,True,True,False,False,False,False,1
1,1,85.0,66.0,29.0,125.0,26.6,0.351,31,True,True,False,False,False,False,True,False,0
2,8,183.0,64.0,29.0,125.0,23.3,0.672,32,True,False,True,False,False,False,False,False,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,True,True,False,False,False,False,True,False,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,False,True,False,False,False,True,False,False,1


In [None]:
# Standardize the feature columns (every column except the trailing 'Outcome').
# The scaler is fitted exactly once, on the unscaled feature values, so `scaler`
# holds statistics that can be applied to unscaled rows later on. Fitting a second
# scaler on the already-standardized matrix would only learn mean ~0 / std ~1.
# NOTE: this fit sees all rows, including those that end up in the test split.
y = data['Outcome']  # Target variable
scaler = StandardScaler()
X_scaled = scaler.fit_transform(data.iloc[:, :-1])

# Every name this cell defines points at the same standardized feature matrix.
data_scaled = X_scaled
X = X_scaled

**Split data into training and testing sets:**

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Same 70/30 split with the same random_state=1, so re-running it reproduces the same partition.
split_parts = train_test_split(X_scaled, y, test_size=0.3, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Standardize using statistics learned from the training rows only, then apply
# those same statistics to the test rows so they do not influence the fit.
# StandardScaler is already imported in the notebook's top import cell, so it is
# not imported again here.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

**Logistic Regression (LR)**

In [None]:
# Fit a logistic-regression classifier on the training split
# (max_iter=1000 raises the solver's iteration limit).
model_lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predicted labels for the held-out test rows
y_pred_lr = model_lr.predict(X_test)

# Compare the predictions with the true test labels
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr = precision_score(y_test, y_pred_lr)
recall_lr = recall_score(y_test, y_pred_lr)
f1_lr = f1_score(y_test, y_pred_lr)

# Report: the first three metrics are rounded to 4 decimals, F1 is printed unrounded
report_lines_lr = [
    "Logistic Regression:",
    f"Accuracy: {accuracy_lr:.4f}",
    f"Precision: {precision_lr:.4f}",
    f"Recall: {recall_lr:.4f}",
]
print("\n".join(report_lines_lr))
print("F1-score:", f1_lr)

Logistic Regression:
Accuracy: 0.7792
Precision: 0.7500
Recall: 0.6000
F1-score: 0.6666666666666665


**Random Forest (RF)**

In [None]:
# Define and train the model: a 100-tree random forest with a fixed seed for repeatability
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)

# Make predictions on test set
y_pred_rf = model_rf.predict(X_test)

# Evaluate model performance against the true test labels
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

print("\nRandom Forest:")
print(f"Accuracy: {accuracy_rf:.4f}")
print(f"Precision: {precision_rf:.4f}")
# Report this model's own recall (recall_rf), not the logistic-regression value.
print(f"Recall: {recall_rf:.4f}")
print("F1-score:", f1_rf)


Random Forest:
Accuracy: 0.8139
Precision: 0.8000
Recall: 0.5765
F1-score: 0.7225806451612903


**Support Vector Machine (SVM)**

In [None]:
# Fit a support-vector classifier with a linear kernel on the training split
model_svm = SVC(kernel="linear").fit(X_train, y_train)

# Predicted labels for the held-out test rows
y_pred_svm = model_svm.predict(X_test)

# Compare the predictions with the true test labels
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm)
recall_svm = recall_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm)

# Report: the first three metrics are rounded to 4 decimals, F1 is printed unrounded
report_lines_svm = [
    "\nSupport Vector Machine:",
    f"Accuracy: {accuracy_svm:.4f}",
    f"Precision: {precision_svm:.4f}",
    f"Recall: {recall_svm:.4f}",
]
print("\n".join(report_lines_svm))
print("F1-score:", f1_svm)


Support Vector Machine:
Accuracy: 0.7922
Precision: 0.7937
Recall: 0.5882
F1-score: 0.6756756756756757


**K-Nearest Neighbors (KNN)**

In [None]:
# Fit a k-nearest-neighbours classifier that votes among the 5 closest training rows
model_knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predicted labels for the held-out test rows
y_pred_knn = model_knn.predict(X_test)

# Compare the predictions with the true test labels
accuracy_knn = accuracy_score(y_test, y_pred_knn)
precision_knn = precision_score(y_test, y_pred_knn)
recall_knn = recall_score(y_test, y_pred_knn)
f1_knn = f1_score(y_test, y_pred_knn)

# Report: the first three metrics are rounded to 4 decimals, F1 is printed unrounded
report_lines_knn = [
    "\nK Nearest Neighbors:",
    f"Accuracy: {accuracy_knn:.4f}",
    f"Precision: {precision_knn:.4f}",
    f"Recall: {recall_knn:.4f}",
]
print("\n".join(report_lines_knn))
print("F1-score:", f1_knn)


K Nearest Neighbors:
Accuracy: 0.7922
Precision: 0.7534
Recall: 0.6471
F1-score: 0.6962025316455697


**Gradient Boosting (GB)**

In [None]:
# Fit a gradient-boosting classifier; the fixed seed keeps the fit repeatable
model_gb = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Predicted labels for the held-out test rows
y_pred_gb = model_gb.predict(X_test)

# Compare the predictions with the true test labels
accuracy_gb = accuracy_score(y_test, y_pred_gb)
precision_gb = precision_score(y_test, y_pred_gb)
recall_gb = recall_score(y_test, y_pred_gb)
f1_gb = f1_score(y_test, y_pred_gb)

# Report: the first three metrics are rounded to 4 decimals, F1 is printed unrounded
report_lines_gb = [
    "\nGradient Boosting Classifier:",
    f"Accuracy: {accuracy_gb:.4f}",
    f"Precision: {precision_gb:.4f}",
    f"Recall: {recall_gb:.4f}",
]
print("\n".join(report_lines_gb))
print("F1-score:", f1_gb)


Gradient Boosting Classifier:
Accuracy: 0.7965
Precision: 0.7639
Recall: 0.6471
F1-score: 0.7006369426751593
