In [56]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

from lazypredict.Supervised import LazyClassifier

data = pd.read_csv("diabetes_data.csv")

# 1) Create target variable: HealthStatus
conditions = [
    (data['Obesity'] == 'Yes') & (data['DiabeticClass'] == 'Positive'),
    (data['Obesity'] == 'Yes') & (data['DiabeticClass'] == 'Negative'),
    (data['Obesity'] == 'No') & (data['DiabeticClass'] == 'Positive'),
    (data['Obesity'] == 'No') & (data['DiabeticClass'] == 'Negative')
]

choices = ['Both', 'Obese', 'Diabetic', 'Healthy']
data['HealthStatus'] = np.select(conditions, choices)

# 2) Drop the original Obesity and DiabeticClass columns
data = data.drop(columns=['Obesity', 'DiabeticClass'])

# 3) Separate features and target variables
x = data.drop(columns=['HealthStatus'])
y = data['HealthStatus']

# 4) Split the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# 5) Data preprocessing

# Numerical feature: Age
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Ordinal features
gender_order = data['Gender'].unique()
yesno_order = ['No', 'Yes']
ord_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', OrdinalEncoder(categories=[gender_order, yesno_order, yesno_order, yesno_order, yesno_order,
                                     yesno_order, yesno_order, yesno_order, yesno_order, yesno_order, 
                                     yesno_order, yesno_order, yesno_order, yesno_order]))
])

boolean_features = [col for col in x_train.columns if col not in ['Age']]
pre_processor = ColumnTransformer(transformers=[
    ('num_features', num_transformer, ['Age']),
    ('ord_features', ord_transformer, boolean_features)
])

# x_train = pre_processor.fit_transform(x_train)
# x_test = pre_processor.transform(x_test)



In [None]:
# 6) Initialize models

model = Pipeline(steps=[
    ('pre_processor', pre_processor),
    ('classifier', RandomForestClassifier(random_state=100))
])

param_grid = {
    "classifier__n_estimators": [50, 100, 200],
    # "classifier__criterion": ["gini", "entropy", "log_loss"],
    # "classifier__max_depth": [None, 2, 5],
    # "pre_processor__num_features__imputer__strategy": ["mean", "median"]
}
model_gr = GridSearchCV(estimator=model, param_grid=param_grid, scoring="recall", cv=6, verbose=2, n_jobs=-1)
model_gr.fit(x_train, y_train)

print("Best score: {}".format(model_gr.best_score_))
print("Best params: {}".format(model_gr.best_params_))

# 7) Train model
# model.fit(x_train, y_train)
# model_gr.fit(x_train, y_train)

# 8) Test model
# y_predict = model.predict(x_test)
y_predict = model_gr.predict(x_test)

# 9) Evaluate model:
for i, j in zip(y_predict, y_test.values):
    print('Predicted value: {}. Actual value: {}'.format(i, j))

# print('Accuracy: {}'.format(accuracy_score(y_test, y_predict)))
# print('F1 score: {}'.format(f1_score(y_test, y_predict, average='micro')))
print(classification_report(y_test, y_predict))

# reg = LazyClassifier(verbose=0, ignore_warnings=False, custom_metric=None)
# models, predictions = reg.fit(x_train, x_test, y_train, y_test)

# print(models)

Fitting 6 folds for each of 3 candidates, totalling 18 fits
Best score: nan
Best params: {'classifier__n_estimators': 50}
Fitting 6 folds for each of 3 candidates, totalling 18 fits
Predicted value: Healthy. Actual value: Healthy
Predicted value: Diabetic. Actual value: Diabetic
Predicted value: Diabetic. Actual value: Diabetic
Predicted value: Both. Actual value: Both
Predicted value: Diabetic. Actual value: Diabetic
Predicted value: Both. Actual value: Both
Predicted value: Diabetic. Actual value: Diabetic
Predicted value: Healthy. Actual value: Healthy
Predicted value: Diabetic. Actual value: Diabetic
Predicted value: Healthy. Actual value: Healthy
Predicted value: Healthy. Actual value: Healthy
Predicted value: Diabetic. Actual value: Diabetic
Predicted value: Healthy. Actual value: Healthy
Predicted value: Healthy. Actual value: Healthy
Predicted value: Diabetic. Actual value: Diabetic
Predicted value: Diabetic. Actual value: Diabetic
Predicted value: Healthy. Actual value: Health

In [62]:
x_train
model_gr.best_params_
model_gr.best_score_
model_gr.best_estimator_