In [3]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm
from sklearn.metrics import accuracy_score
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Load the diabetes dataset from CSV into a pandas DataFrame.
diabetes_dataset = pd.read_csv('/content/diabetes.csv')

# Quick look at the data, printed in this order: the first 5 rows, the
# (rows, columns) shape, per-column summary statistics, and the number of
# samples in each 'Outcome' class.
for overview in (
    diabetes_dataset.head(),
    diabetes_dataset.shape,
    diabetes_dataset.describe(),
    diabetes_dataset['Outcome'].value_counts(),
):
    print(overview)

# Separate the labels (the 'Outcome' column) from the features (every other
# column).
Y = diabetes_dataset['Outcome']
X = diabetes_dataset.drop(columns=['Outcome'])

print(X)
print(Y)

# Hold out 20% of the rows for testing. Stratifying on Y keeps the class
# proportions comparable in both subsets, and the fixed seed makes the split
# repeatable from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

print(X.shape, X_train.shape, X_test.shape)

# Preprocess the data with a ColumnTransformer.
# Only the int64/float64 columns of X are treated as numeric features.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns

# NOTE(review): by default SimpleImputer only replaces NaN. The dataset
# preview shows 0 values in columns such as Insulin and SkinThickness; if
# those zeros encode missing measurements they are NOT imputed here --
# confirm whether they should be converted to NaN first.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # replace missing values with the column mean
    ('scaler', StandardScaler()),  # standardize the numeric features
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
    ])

# Candidate classifiers, as (display name, estimator) pairs.
models = [
    ('SVM', svm.SVC(kernel='linear')),
    # Seed the forest so its scores and the saved model are reproducible
    # between runs; 2 matches the seed already used for train_test_split.
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=2)),
    ('KNN', KNeighborsClassifier()),
    ('Logistic Regression', LogisticRegression()),
]

# Build one complete pipeline (preprocessing followed by the classifier) per
# model, keeping the display name alongside it.
pipelines = [
    (
        model_name,
        Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', model)]),
    )
    for model_name, model in models
]

# Build the single example to classify once, before the loop: it does not
# depend on which model is being trained.
input_data = (5, 166, 72, 19, 175, 25.8, 0.587, 51)

# Change the input data to a numpy array.
input_data_as_numpy_array = np.asarray(input_data)

# Reshape the array as we are predicting for one instance.
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Convert to a pandas DataFrame carrying the training column names, so the
# pipeline's ColumnTransformer can select the features by name.
input_df = pd.DataFrame(input_data_reshaped, columns=X.columns)

# Train, evaluate, save and smoke-test each model.
for model_name, pipeline in pipelines:
    print("Training", model_name)
    pipeline.fit(X_train, Y_train)

    # Accuracy score on the training data.
    X_train_prediction = pipeline.predict(X_train)
    training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
    print('Accuracy score of the training data : ', training_data_accuracy)

    # Accuracy score on the held-out test data.
    X_test_prediction = pipeline.predict(X_test)
    test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
    print('Accuracy score of the test data : ', test_data_accuracy)
    print("---")

    # Save the fitted pipeline. The context manager guarantees the file is
    # flushed and closed before it is read back below, instead of relying on
    # garbage collection of an anonymous file handle.
    filename = f'diabetes_model_{model_name}.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(pipeline, model_file)

    # Load the model back from disk.
    # NOTE: unpickling can execute arbitrary code; only load model files that
    # come from a trusted source (here, the file written just above).
    with open(filename, 'rb') as model_file:
        loaded_model = pickle.load(model_file)

    prediction = loaded_model.predict(input_df)
    print(prediction)

    if prediction[0] == 0:
        print('The person is not diabetic')
    else:
        print('The person is diabetic')

    # List the feature names, in the order expected for the input.
    for column in X.columns:
        print(column)

def _cross_validate_and_report(name, estimator, features, labels, folds=5):
    """Print a cross-validation summary for one model and return its scores.

    The header line is printed before the scores are computed; the mean and
    the standard deviation of the per-fold scores follow, then a separator.
    """
    print("Cross-validation scores for", name)
    fold_scores = cross_val_score(estimator, features, labels, cv=folds)
    mean_accuracy, accuracy_spread = fold_scores.mean(), fold_scores.std()
    print("Mean accuracy:", mean_accuracy)
    print("Standard deviation of accuracy:", accuracy_spread)
    print("---")
    return fold_scores


# Use 5-fold cross-validation over the full dataset to get a more robust
# estimate of each model's performance than the single train/test split.
for model_name, pipeline in pipelines:
    scores = _cross_validate_and_report(model_name, pipeline, X, Y)


   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
(768, 9)
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.