In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from pycaret.classification import *

Reading in the data and using one hot encoding

In [11]:
train_data = X_train.copy()
train_data["Diabetes"] = y_train

s = ClassificationExperiment()
# Setting feature selection to true
s.setup(train_data, target = "Diabetes", session_id = 123, preprocess=True, feature_selection=True)

top_models = s.compare_models(n_select=5)

In [8]:
# Read the data into a pandas dataframe
df = pd.read_csv("diabetes_data.csv")
print(df.size)

# do some basic exploration
print("(Observations, Attributes (including target): ", df.shape)
print("DF contains null / nan values: ", df.isna().any().any())
for i in df.columns:
    print(f"Unique values for {i}: ", sorted(df[i].unique()))
    # plt.figure(figsize=(3, 1.5))
    # plt.title(f'{i}')
    # plt.hist(df[i])
print(df.describe())

# (for our report, we should describe how the categories were assessed (e.g. age buckets, what the 5 GenHlth categories correspond to etc.))

# As can be seen, many categories are actually binary, the rest is somewhat categorical although interpreting them continously might bear some benefit
# One hot encoding, dropfirst to remove the redundant column, since for sex they will have perfect negative corrolation (e.g. is_female and is_male)
# df = pd.get_dummies(df, columns=["Sex"], drop_first=True)

# Identifying binary columns with unique values [0.0, 1.0]
binary_columns = ["Sex", "HighChol", "CholCheck", "Smoker", "HeartDiseaseorAttack", "PhysActivity", 
                  "Fruits", "Veggies", "HvyAlcoholConsump", "DiffWalk", "Stroke", "HighBP"]

# One-hot encoding the binary columns with drop_first=True
# df = pd.get_dummies(df, columns=binary_columns, drop_first=True)  # this did not make a difference for the results

# One hot encoding for GeneralHealth since its a catagorical value from 1-5
df = pd.get_dummies(df, columns=["GenHlth"])

In [9]:
## Plot: Distribution of age groups

# Define the age ranges for each group (according to AGEG5YR)
age_groups = ['18-24', '25-29', '30-34', '35-39', '40-44', '45-49', 
              '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80+']

# Count the occurrences of each age group (1-13)
age_group_counts = df['Age'].value_counts().reindex(range(1, 14), fill_value=0)

# Create the bar plot
plt.figure(figsize=(10, 6))
bars = plt.bar(age_group_counts.index, age_group_counts.values, color='skyblue')

# Annotate each bar with the corresponding age range
for bar, age_group in zip(bars, age_groups):
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, yval, age_group, 
             ha='center', va='bottom', fontsize=10, color='black')

# Add labels and title
plt.xlabel('Age Group Code')
plt.ylabel('Count')
plt.title('Distribution of Individuals by Age Group')
plt.xticks(age_group_counts.index)
plt.tight_layout()

# Show the plot
plt.show()

Checking for cyclical values and if we need to scale some features (maybe its better to leave some features unscaled for some models ???)

In [4]:

# Check if we have cyclical values | We dont :) often in form of months, days or any time series.

# Check scaling and what columns its need to do scaling (essentially if range is above 0<x<1 => we need to scale)
# our exploration above yields: Age, BMI, (GenHlth, now one-hot encoded,) MentHlth, PhysHlth

# Init of scaler
scaler = StandardScaler()

# Apply scaling for BMI, Mental health, Physical Health, and maybee age and GenHlth (depending on encoded or not) ??? 
# to benefit models like logistic regression, SVM or KNN
# We tested with this uncommented and it yielded a worse result for the best models
# to_be_scaled = ["BMI", "MentHlth", "PhysHlth", "Age", "GenHlth"]
# df[to_be_scaled] = scaler.fit_transform(df[to_be_scaled])

# print(df.describe())

# Splitting the data into data and target
data = df.drop("Diabetes", axis=1)
target = df["Diabetes"]

Now we split the data into triaining and test data

In [5]:
# Make a train and test split
# Train 80%
# Test 20%
# X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2)
X_train, X_test, y_train, y_test = train_test_split(data, target, stratify=target, test_size=0.2, random_state=42)

Checking for correlation of features in our dataset

In [6]:
# Check features that are correlated
plt.figure(figsize=(15, 15))
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", linewidths=0.5)
plt.plot()
# there seems to be a slight linear correlation between GenHtlh and PhysHlth, we should address this / state what the threshold value would be and why

Using pycaret to choose top 5 models to train develope further with hyperparameters optimization 

In [7]:
train_data = X_train.copy()
train_data["Diabetes"] = y_train

# Pycaret for choosing the best models, pick top 5.
s = ClassificationExperiment()
s.setup(train_data, target = "Diabetes", session_id = 123, preprocess=True)  #preprocessing probably also takes care of scaling e.g.
top_models = s.compare_models(n_select=5)

Hyper parameter tuning

In [8]:
# built-in random-search by pycaret (took 2m 2.1s for me)
tuned_models = [s.tune_model(model) for model in top_models]

Lets try to use ensamble methods to get better results

In [9]:
# Use blend models to try to get better accuracy 
clf = setup(data=train_data, target="Diabetes")

ensemble_model = blend_models(estimator_list=tuned_models, method="hard")
print(evaluate_model(ensemble_model))

In [10]:

from sklearn.linear_model import LogisticRegression
# try to use stacking to see if thats any better
stacked_model_custom = s.stack_models(estimator_list=tuned_models, meta_model=LogisticRegression())


Lets try and see if feature selection does help ? 

After trying stacking, blending and feature selection they all performed worse than the single model after hyperparameter tuning.

So lets just stick with the following

In [12]:
# Making a copy of the training data
train_data = X_train.copy()
# Adding the target column to the df
train_data["Diabetes"] = y_train

# Pycaret for choosing the best models, pick top 5.
s = ClassificationExperiment()
s.setup(train_data, target = "Diabetes", session_id = 123, preprocess=True)  #preprocessing probably also takes care of scaling e.g.
# Selecting the top 5 models and display their scoring
top_models = s.compare_models(n_select=5)


Unnamed: 0,Description,Value
0,Session id,123
1,Target,Diabetes
2,Target type,Binary
3,Original data shape,"(56553, 22)"
4,Transformed data shape,"(56553, 22)"
5,Transformed train set shape,"(39587, 22)"
6,Transformed test set shape,"(16966, 22)"
7,Numeric features,4
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.7508,0.8272,0.7897,0.7329,0.7601,0.5016,0.5032,0.468
lightgbm,Light Gradient Boosting Machine,0.7495,0.8258,0.7926,0.7298,0.7598,0.499,0.5009,0.398
ada,Ada Boost Classifier,0.7482,0.825,0.7731,0.7366,0.7543,0.4964,0.4971,0.228
lr,Logistic Regression,0.7467,0.8226,0.7737,0.7342,0.7534,0.4934,0.4942,0.102
ridge,Ridge Classifier,0.7467,0.8217,0.7799,0.7314,0.7549,0.4934,0.4946,0.015
lda,Linear Discriminant Analysis,0.7467,0.8217,0.7798,0.7314,0.7548,0.4933,0.4945,0.017
rf,Random Forest Classifier,0.7273,0.7934,0.7627,0.7124,0.7366,0.4545,0.4558,0.407
nb,Naive Bayes,0.7246,0.7874,0.7251,0.7244,0.7247,0.4492,0.4492,0.014
et,Extra Trees Classifier,0.7108,0.7629,0.733,0.7019,0.7171,0.4215,0.422,0.498
knn,K Neighbors Classifier,0.7019,0.7596,0.7196,0.6951,0.7071,0.4038,0.4041,0.243


Lets try blending with more diverse models that should have a diverse classifications.

In [13]:
# Use blend models to try to get better accuracy 
clf = setup(data=train_data, target="Diabetes")

ensemble_model = blend_models(estimator_list=[top_models[0],top_models[2],top_models[4]], method="hard")
print(evaluate_model(ensemble_model))


from sklearn.linear_model import LogisticRegression
# try to use stacking to see if thats any better
stacked_model_custom = s.stack_models(estimator_list=[top_models[0],top_models[2],top_models[4]])
print(evaluate_model(stacked_model_custom))

Unnamed: 0,Description,Value
0,Session id,3112
1,Target,Diabetes
2,Target type,Binary
3,Original data shape,"(56553, 22)"
4,Transformed data shape,"(56553, 22)"
5,Transformed train set shape,"(39587, 22)"
6,Transformed test set shape,"(16966, 22)"
7,Numeric features,4
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7494,0.0,0.7863,0.7322,0.7583,0.4989,0.5002
1,0.7477,0.0,0.7741,0.7351,0.7541,0.4953,0.496
2,0.7378,0.0,0.7671,0.7246,0.7452,0.4756,0.4765
3,0.7595,0.0,0.7803,0.7493,0.7645,0.5191,0.5195
4,0.7454,0.0,0.7828,0.7284,0.7546,0.4908,0.4921
5,0.7492,0.0,0.7773,0.736,0.7561,0.4984,0.4991
6,0.7527,0.0,0.798,0.7318,0.7635,0.5054,0.5075
7,0.7486,0.0,0.7792,0.7343,0.7561,0.4972,0.4982
8,0.7519,0.0,0.759,0.7484,0.7536,0.5038,0.5038
9,0.7438,0.0,0.7751,0.7294,0.7516,0.4876,0.4886


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

None


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7482,0.8289,0.7717,0.7371,0.754,0.4963,0.4969
1,0.7605,0.8372,0.7894,0.7464,0.7673,0.5211,0.522
2,0.7386,0.815,0.7652,0.7266,0.7454,0.4771,0.4778
3,0.7472,0.8235,0.798,0.7244,0.7594,0.4943,0.4969
4,0.7454,0.8267,0.762,0.7374,0.7495,0.4908,0.4911
5,0.7439,0.8226,0.7852,0.7252,0.754,0.4878,0.4894
6,0.7408,0.8193,0.7716,0.7268,0.7485,0.4817,0.4826
7,0.7658,0.8367,0.7933,0.7519,0.7721,0.5316,0.5324
8,0.7496,0.8312,0.7655,0.7419,0.7535,0.4992,0.4995
9,0.761,0.828,0.7807,0.7511,0.7656,0.522,0.5224


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

None
