# Prediction Model

## Setup

In [None]:
import os

import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.tree import DecisionTreeClassifier

from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline

import matplotlib.pyplot as plt
import seaborn as sns

from ipynb_utils import CFG

In [None]:
# Data directory, read from the CFG mapping imported from ipynb_utils.
DATA_DIR = CFG["DATA_DIR"]

# Path of the pickle file from which the data frame will be loaded.
DF_PKL_PATH_SRC = os.path.join(DATA_DIR, "df_processed.pkl")

As the main evaluation metric, we choose recall: the number of detected diabetes cases should be as large as possible. We consider it more serious to label people with the disease as healthy than to declare people without the disease as ill. Of course, this choice is not apodictic: one has to weigh the consequences of superfluous treatment against those of omitted treatment.

In [None]:
# Load the data frame from the pickle file at DF_PKL_PATH_SRC.
# NOTE: unpickling executes code from the file, so it must be a trusted one.
df = pd.read_pickle(DF_PKL_PATH_SRC)

# TODO: Remove all missing values (e.g. via df.fillna(0)).

# Number of missing values per column, shown as the cell output.
df.isna().sum()

Split the data by the pseudo-feature `is_test` that we already introduced in the [previous notebook](./1--analysis.ipynb).

In [None]:
# Name of the target column.
y_col = "has_diabetes"

# Columns that must not be used as model features: the identifier, the target
# itself and the split marker.
X_cols_blacklist = ["id", "has_diabetes", "is_test"]
X_cols = [name for name in df.columns if name not in X_cols_blacklist]

# Rows with is_test == 0 form frame 0; all remaining rows form frame 1.
mask_0 = df["is_test"] == 0
mask_1 = df["is_test"] != 0
df_0 = df.loc[mask_0]
df_1 = df.loc[mask_1]

# Feature matrices and target vectors of both parts.
X_0, y_0 = df_0[X_cols], df_0[y_col]
X_1, y_1 = df_1[X_cols], df_1[y_col]

## Simple Logistic Regression Model

As a baseline model for benchmarking, we employ a logistic regression that takes only the age and the BMI of a participant into account.

Furthermore, we oversample the data and scale the features.

In [None]:
# The baseline model only sees these two features.
X_cols_red = ["age", "bmi"]

# Column selection: pass the chosen columns through unchanged; every other
# column is not listed and therefore not forwarded by this transformer.
feature_selector = ColumnTransformer(
    transformers=[("selector", "passthrough", X_cols_red)]
)

# Baseline pipeline: select features -> oversample -> scale -> classify.
# Only the features are scaled; the target does not need scaling.
pipeline = Pipeline(
    steps=[
        ("feature_selector", feature_selector),
        ("oversampler", RandomOverSampler(random_state=CFG["RSEED"])),
        ("scaler", MinMaxScaler()),
        ("model", LogisticRegression(random_state=CFG["RSEED"])),
    ]
)

Fit the model with cross-validation. As the parameters of a logistic regression also have a visually pleasing interpretation, let us calculate them as well.

In [None]:
# Cross-validate the baseline pipeline. The parameter grid is empty, so a
# single candidate is evaluated; the search is used for its cross-validation
# scores and for the estimator it exposes as best_estimator_.
grid_search = GridSearchCV(
    pipeline,
    param_grid={},
    cv=10,
    # Recall is the evaluation metric chosen for this notebook, so the
    # baseline is scored with it as well (it was scored with accuracy before).
    scoring="recall",
    return_train_score=True,
)

grid_search.fit(X_0, y_0)

estimator = grid_search.best_estimator_

steps = estimator.named_steps

# Coefficients and intercept of the logistic regression. They refer to the
# min-max scaled features, because the model sits behind the scaler.
model = steps["model"]
coef_scaled = model.coef_[0]
intercept_scaled = model.intercept_[0]

# Back-transform to the original feature scale. The fitted MinMaxScaler maps
# each feature as  x_scaled = x * scale_ + min_ , hence
#   coef_scaled . x_scaled + intercept_scaled
#     = (coef_scaled * scale_) . x + intercept_scaled + sum(coef_scaled * min_)
# Using the scaler's own fitted parameters keeps the back-transform consistent
# with the transformation that was actually applied.
scaler = steps["scaler"]
coef_orig = coef_scaled * scaler.scale_
intercept_orig = intercept_scaled + np.sum(coef_scaled * scaler.min_)

print("Best parameters :")
print(f"  Coefficients : {coef_orig}")
print(f"  Intercept    : {intercept_orig}")

The coefficients are positive, as expected: the older a participant is, the more likely diabetes is to occur, and analogously for the BMI.

In [None]:
# Predict on the test data. The full X_1 frame is passed on purpose: the
# pipeline selects the columns it needs internally.
z_1 = estimator.predict(X_1)

# Per-class precision / recall / F1 summary.
report = classification_report(y_true=y_1, y_pred=z_1)
print("Classification Report:", report, sep="\n")

# Recall of the positive class (label 1), the main metric of this notebook.
tpr = recall_score(y_true=y_1, y_pred=z_1, pos_label=1)
print(f"True Positive Rate (Recall): {tpr:.2f}")

First, some preparation.

In [None]:
# Colour used for each classification outcome in the scatter plots.
outcome_palette = {"TP": "green", "TN": "blue", "FP": "red", "FN": "orange"}


def classify_outcome(row):
    """Return the confusion-matrix cell a single row belongs to.

    Parameters
    ----------
    row : mapping
        Must provide the keys "diagnosis" (true label) and "prognosis"
        (predicted label). A value equal to 1 counts as positive, every
        other value as negative.

    Returns
    -------
    str
        One of "TP", "FN", "FP" or "TN".
    """
    is_positive = row["diagnosis"] == 1
    predicted_positive = row["prognosis"] == 1

    # Truly positive rows are either hits or misses ...
    if is_positive:
        return "TP" if predicted_positive else "FN"
    # ... truly negative rows are either false alarms or correct rejections.
    return "FP" if predicted_positive else "TN"


For error analysis, we look at the confusion matrix and at a scatter plot of the classification outcomes.

In [None]:
# Confusion matrix of the true labels (y_1) against the predictions (z_1),
# drawn as an annotated heatmap whose cells show the counts without decimals.
cm = confusion_matrix(y_true=y_1, y_pred=z_1)
sns.heatmap(data=cm, annot=True, fmt=".0f", cmap="YlGnBu_r")

In [None]:
# --- Scatter plot of the outcomes --------------------------------------------

# Plotting frame: the two features, the true label ("diagnosis"), the
# predicted label ("prognosis") and the outcome class derived from both.
df_plot = X_1[["age", "bmi"]].assign(diagnosis=y_1, prognosis=z_1)
df_plot["outcome"] = df_plot.apply(classify_outcome, axis=1)

ax = sns.scatterplot(
    data=df_plot,
    x="age",
    y="bmi",
    hue="outcome",
    palette=outcome_palette,
    s=20,
    edgecolor="k",
    alpha=0.7,
)

# --- Decision boundary -------------------------------------------------------

# On the boundary the linear predictor vanishes:
#   age_coef * age + bmi_coef * bmi + intercept = 0
# Solving for bmi gives the line drawn below.
age_coef, bmi_coef = coef_orig

# Two end points suffice for a straight line; pad the age range by one year.
xx = np.array([df_plot["age"].min() - 1, df_plot["age"].max() + 1])
yy = -(intercept_orig + age_coef * xx) / bmi_coef

ax.plot(xx, yy, "k--", linewidth=2, label="Decision Boundary")

# --- Finalisation ------------------------------------------------------------

ax.set_xlabel("Age (years)")
ax.set_ylabel("BMI (kg/sqm)")
ax.set_title("Classification Results with Decision Boundary")
ax.legend()
ax.grid(True)
plt.tight_layout()
plt.show()


## Decision Tree Model

As a machine learning model that aims to optimise our target evaluation metric, we choose a decision tree classifier.

In [None]:
# Features that the decision tree does not get to see.
X_cols_blacklist = ["dpf", "date"]
X_cols_red = [name for name in X_cols if name not in X_cols_blacklist]

# Column selection: pass the remaining feature columns through unchanged.
feature_selector = ColumnTransformer(
    transformers=[("selector", "passthrough", X_cols_red)]
)

# Decision tree pipeline: select features -> oversample -> classify.
# There is no scaling step; it is not strictly necessary for decision trees.
pipeline = Pipeline(
    steps=[
        ("feature_selector", feature_selector),
        ("oversampler", RandomOverSampler(random_state=CFG["RSEED"])),
        ("model", DecisionTreeClassifier(random_state=CFG["RSEED"])),
    ]
)


In [None]:
# Hyperparameters of the tree to search over; the "model__" prefix addresses
# the pipeline step named "model".
param_grid = {
    "model__max_depth": [2, 4, 8, 16, None],
    "model__min_samples_split": [2, 4, 8],
    "model__min_samples_leaf": [1, 2, 4, 8],
}

# Exhaustive search with 5-fold cross-validation, scored by recall and run on
# all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="recall",
    cv=5,
    n_jobs=-1,
    verbose=1,
)

grid_search.fit(X_0, y_0)

# Report the winning combination with the parameter names padded to equal width.
print("Best parameters :")
pad = max(map(len, param_grid)) + 1
for k, v in grid_search.best_params_.items():
    print(f"  {k:{pad}}: {v}")
print("Best CV score   :", grid_search.best_score_)

In [None]:
# Predict on the test data with the fitted grid search.
z_1 = grid_search.predict(X_1)

# Per-class precision / recall / F1 summary.
report = classification_report(y_true=y_1, y_pred=z_1)
print("Classification Report:", report, sep="\n")

# Recall of the positive class (label 1), the main metric of this notebook.
tpr = recall_score(y_true=y_1, y_pred=z_1, pos_label=1)
print(f"True Positive Rate (Recall): {tpr:.2f}")

In [None]:
# Confusion matrix of the true labels (y_1) against the decision tree
# predictions (z_1), drawn as an annotated heatmap without decimals.
cm = confusion_matrix(y_true=y_1, y_pred=z_1)
sns.heatmap(data=cm, annot=True, fmt=".0f", cmap="YlGnBu_r")

There is no clear decision boundary anymore!

In [None]:
# Frame holding the data for plotting: the two plotted features, the true
# label ("diagnosis"), the predicted label ("prognosis") and the outcome class
# (TP / TN / FP / FN) derived from both.
df_plot = X_1[["age", "bmi"]].copy()
df_plot["diagnosis"] = y_1
df_plot["prognosis"] = z_1
df_plot["outcome"] = df_plot.apply(classify_outcome, axis=1)

# Scatterplot of the outcomes in the age/BMI plane.
sns.scatterplot(
    data=df_plot,
    x="age",
    y="bmi",
    hue="outcome",
    palette=outcome_palette,
    s=20,
    edgecolor="k",
    alpha=0.7,
)

# Finalisation of the plot

plt.xlabel("Age (years)")
plt.ylabel("BMI (kg/sqm)")
# No boundary line is drawn in this figure, so the title must not announce
# one (the previous title was carried over from the logistic regression plot).
plt.title("Classification Results of the Decision Tree")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()