<a href="https://colab.research.google.com/github/leon3108/Applied/blob/main/Predict_Diabetes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

url = "https://raw.githubusercontent.com/leon3108/Applied/main/diabetes_prediction_dataset.csv"
df = pd.read_csv(url)

df.head()

# bmi(Body Mass Index) is a person's weight in kilograms (or pounds) divided by the square of height in meters (or feet)

# explain what is HbA1c_level is your average blood glucose (sugar) levels for the last two to three months

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [None]:
df["diabetes"].value_counts()
# data is imbalanced, adding weights to the class 1 could be an improvement

0    91500
1     8500
Name: diabetes, dtype: int64

In [2]:
from sklearn.preprocessing import LabelEncoder

print(df['smoking_history'].unique())
print(df['gender'].unique())

# as smoking_history and gender are categorical I need to modify the value to be numerical
label_encoder = LabelEncoder()
df['gender'] = label_encoder.fit_transform(df['gender'])
df['smoking_history'] = label_encoder.fit_transform(df['smoking_history'])

['never' 'No Info' 'current' 'former' 'ever' 'not current']
['Female' 'Male' 'Other']


In [3]:
from sklearn.model_selection import train_test_split

# choice of usefull feature
numerical_features = ["gender", "age", "hypertension", "heart_disease", "smoking_history", "bmi", "HbA1c_level", "blood_glucose_level"]
model_features = numerical_features

# choice of the target
model_target = 'diabetes'

# split the data for trainning and test/validation
x_train, x_val, y_train, y_val = train_test_split(df[model_features],
                                                  df[model_target],
                                                  test_size=0.10,
                                                  shuffle=True,
                                                  random_state=324
                                                 )

# Logistic Regression

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn import set_config

numerical_processor = Pipeline([('num_scaler', MinMaxScaler())])

data_preprocessor = ColumnTransformer(transformers=[
        ('numerical_pre', numerical_processor, model_features)
    ])

pipeline = Pipeline([
    ('data_preprocessing', data_preprocessor),
    ('logistic_regression', LogisticRegression())
])

set_config(display='diagram')
pipeline

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Définir la grille des hyperparamètres à rechercher
param_grid = {
    'logistic_regression__penalty': ['l2', 'none'],
    'logistic_regression__C': [0.1, 1, 10, 100, 200, 300],
    'logistic_regression__class_weight': [None, 'balanced', {0: 1, 1: 2}, {0: 1, 1: 3}, {0: 1, 1: 5}, {0: 1, 1: 10}],
    'logistic_regression__random_state': [42]
}

model = pipeline

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1', n_jobs=-1)

grid_search.fit(x_train, y_train)

print("Best params : ", grid_search.best_params_)

# Utiliser le modèle avec les meilleurs paramètres pour faire des prédictions
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_val)

val_predictions = best_model.predict(x_val)

print(confusion_matrix(y_val.values, val_predictions))
print(classification_report(y_val.values, val_predictions))
print("Accuracy (validation):", accuracy_score(y_val.values, val_predictions))



Best params :  {'logistic_regression__C': 0.1, 'logistic_regression__class_weight': None, 'logistic_regression__penalty': 'none', 'logistic_regression__random_state': 42}
[[9005   78]
 [ 325  592]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      9083
           1       0.88      0.65      0.75       917

    accuracy                           0.96     10000
   macro avg       0.92      0.82      0.86     10000
weighted avg       0.96      0.96      0.96     10000

Accuracy (validation): 0.9597


# SVM

In [None]:
# It takes too long to train the model,
# I have already try during many hours and it never end

from sklearn.svm import SVC
from sklearn import set_config
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

parameters = {
    'kernel': ['poly', 'linear', 'sigmoid'],
    'C': [0.1, 0.5, 1.0, 2.0],
    'gamma': ['auto', 'scale', 0.1, 0.01]
}

svc_model = SVC()

grid_search = GridSearchCV(estimator=svc_model, param_grid=parameters, cv=5, scoring='accuracy')

grid_search.fit(x_train, y_train)

best_params = grid_search.best_params_
print("Meilleurs paramètres :", best_params)

best_model = grid_search.best_estimator_
val_predictions = best_model.predict(x_val)

print(confusion_matrix(y_val.values, val_predictions))
print(classification_report(y_val.values, val_predictions))
print("Accuracy (validation):", accuracy_score(y_val.values, val_predictions))

# Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

parameters = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'class_weight': [None, 'balanced']
}

tree_model = DecisionTreeClassifier()

grid_search = GridSearchCV(estimator=tree_model, param_grid=parameters, cv=5, scoring='accuracy')

grid_search.fit(x_train, y_train)

best_params = grid_search.best_params_
print("Meilleurs paramètres :", best_params)


best_model = grid_search.best_estimator_
val_predictions = best_model.predict(x_val)

print(confusion_matrix(y_val.values, val_predictions))
print(classification_report(y_val.values, val_predictions))
print("Accuracy (validation):", accuracy_score(y_val.values, val_predictions))

Meilleurs paramètres : {'class_weight': None, 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 5}
[[9076    7]
 [ 280  637]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      9083
           1       0.99      0.69      0.82       917

    accuracy                           0.97     10000
   macro avg       0.98      0.85      0.90     10000
weighted avg       0.97      0.97      0.97     10000

Accuracy (validation): 0.9713


# Random Forest Classifier

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

parameters = {
    'n_estimators': [50, 100, 200],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20],
    'class_weight': [None, 'balanced']
}

model = RandomForestClassifier()

grid_search = GridSearchCV(estimator=model, param_grid=parameters, cv=5, scoring='accuracy')

grid_search.fit(x_train, y_train)

best_params = grid_search.best_params_
print("Meilleurs paramètres :", best_params)


best_model = grid_search.best_estimator_
val_predictions = best_model.predict(x_val)

print(classification_report(y_val.values, val_predictions))
print("Accuracy (validation):", accuracy_score(y_val.values, val_predictions))

Meilleurs paramètres : {'class_weight': None, 'criterion': 'gini', 'max_depth': 10, 'n_estimators': 50}
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      9083
           1       1.00      0.69      0.81       917

    accuracy                           0.97     10000
   macro avg       0.98      0.84      0.90     10000
weighted avg       0.97      0.97      0.97     10000

Accuracy (validation): 0.9711
