In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from pycaret.classification import *

Reading in the data and applying one-hot encoding

In [None]:
# Load the diabetes dataset from CSV into a DataFrame.
df = pd.read_csv("diabetes_data.csv")
print(df.size)  # total number of cells (rows * columns)

# --- Basic exploration: shape, missing values, per-column value sets ---
print("(Observations, Attributes (including target): ", df.shape)
has_missing = df.isna().any().any()
print("DF contains null / nan values: ", has_missing)
for column_name in df.columns:
    distinct_values = sorted(df[column_name].unique())
    print(f"Unique values for {column_name}: ", distinct_values)
    # Optional per-column histogram, currently disabled:
    # plt.figure(figsize=(3, 1.5))
    # plt.title(f'{column_name}')
    # plt.hist(df[column_name])
print(df.describe())

# TODO (report): describe how the categories were assessed, e.g. the age
# buckets and what the 5 GenHlth categories correspond to.

# Authors' observation from the listing above: many attributes are binary and
# the rest are roughly categorical, although treating them as continuous may
# still be beneficial.
# One-hot encode "Sex". drop_first=True keeps a single indicator column, since
# the two indicators (e.g. is_female / is_male) would be perfectly negatively
# correlated and therefore redundant.
df = pd.get_dummies(df, columns=["Sex"], drop_first=True)
# One-hot encoding of GenHlth (a categorical value from 1-5) is left disabled:
# df = pd.get_dummies(df, columns=["GenHlth"])

Checking for cyclical values and whether some features need scaling (it may be better to leave some features unscaled for certain models?)

In [4]:

# Cyclical features (months, days, other time-series-like values) were checked
# for during exploration; none were found, so no cyclical encoding is needed.

# Columns whose values fall outside the 0..1 range and therefore get
# standardised, to benefit scale-sensitive models such as logistic regression,
# SVM or KNN. GenHlth is listed because its one-hot encoding is currently
# disabled; whether Age and GenHlth should be scaled at all is still an open
# question.
to_be_scaled = ["BMI", "MentHlth", "PhysHlth", "Age", "GenHlth"]

# Standardise the selected columns in place (zero mean, unit variance).
# NOTE(review): the scaler is fit on every row of df, so rows that later end up
# in the test split contribute to the mean/std -- consider fitting on the
# training split only.
scaler = StandardScaler()
scaler.fit(df[to_be_scaled])
df[to_be_scaled] = scaler.transform(df[to_be_scaled])
# print(df.describe())

# Separate the feature matrix from the "Diabetes" target column.
data = df.drop(columns="Diabetes")
target = df["Diabetes"]

Now we split the data into training and test data

In [5]:
# Train / test split: 80% train, 20% test.
# stratify=target keeps the class proportions of "Diabetes" the same in both
# parts (an unstratified split was used previously). random_state pins the
# shuffle so that Restart & Run All reproduces the same partition; 123 matches
# the session_id passed to PyCaret further down.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, stratify=target, random_state=123
)

# Remove test-set leakage from the feature scaling: df was standardised
# upstream using statistics computed over ALL rows. Standardisation is an
# affine map, so re-fitting on the training rows and applying that fit to both
# parts yields exactly the values a train-only scaler on the raw data would.
# Work on explicit copies so the column assignment never writes through to df.
X_train, X_test = X_train.copy(), X_test.copy()
train_scaler = StandardScaler().fit(X_train[to_be_scaled])
X_train[to_be_scaled] = train_scaler.transform(X_train[to_be_scaled])
X_test[to_be_scaled] = train_scaler.transform(X_test[to_be_scaled])

Checking for correlation of features in our dataset

In [None]:
# Pairwise correlations between all columns of df (features and target,
# computed on the full frame, i.e. before the train/test split).
correlation_matrix = df.corr()

# Annotated heatmap of the correlation matrix.
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", linewidths=0.5, ax=ax)
ax.set_title("Feature correlation matrix")
# plt.show() renders the figure; the previous bare plt.plot() drew nothing and
# only left an empty list as the cell output.
plt.show()
# There seems to be a slight linear correlation between GenHlth and PhysHlth.
# TODO: address this / state what the threshold value would be and why.

Using PyCaret to choose the top 5 models to develop further with hyperparameter optimization

In [None]:
# PyCaret expects features and target in a single frame, so attach the
# training labels to a copy of the training features.
train_data = X_train.assign(Diabetes=y_train)

# Let PyCaret benchmark its candidate classifiers and keep the 5 best ones.
s = ClassificationExperiment()
# NOTE(review): preprocess=True switches on PyCaret's preprocessing pipeline;
# per the PyCaret docs, feature scaling inside it is governed by the separate
# `normalize` argument (off by default) -- confirm before relying on PyCaret
# to scale anything here.
s.setup(data=train_data, target="Diabetes", session_id=123, preprocess=True)
top_models = s.compare_models(n_select=5)

Hyperparameter tuning

In [None]:
# Hyperparameter tuning. Open question: random search, Bayesian optimisation,
# or grid search (exhaustive and precise, but slow)?
# Candidate libraries:
#   - Optuna
#   - Hyperopt

# Tune each of the selected models with PyCaret's built-in tuner, using its
# default settings. Per the PyCaret docs the default is a randomised search
# over a predefined grid (scikit-learn backend), not an exhaustive grid
# search. One recorded run took about 2m 2.1s.
tuned_models = list(map(s.tune_model, top_models))