# Prediction Model

In [None]:
import os
import subprocess

import pandas as pd
import numpy as np

from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    cross_validate,
    cross_val_predict,
)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import (
    accuracy_score,
    fbeta_score,
    precision_score,
    recall_score,
)

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Random seed used to make the train/test split reproducible.
RSEED = 42

# Root directory of the repo, obtained by asking git for the top-level
# directory of the current working tree.
root_dir_ = subprocess.run(
    ["git", "rev-parse", "--show-toplevel"],
    stdout=subprocess.PIPE,
    text=True,
    check=True,
).stdout
ROOT_DIR = root_dir_.strip()

# Data directory inside the repo.
DATA_DIR = os.path.join(ROOT_DIR, "data")

# Pickle file from which the data frame will be loaded.
DF_PKL_PATH_SRC = os.path.join(DATA_DIR, "df_processed.pkl")

In [None]:
# Load the pickled data frame and replace every missing value with 0.
# NOTE(review): 0 is used as the fill value for all columns alike - confirm
# that this is a sensible placeholder for each feature.
df = pd.read_pickle(DF_PKL_PATH_SRC).fillna(0)

In [None]:
# Target column; every other column of df is treated as a feature.
y_col = "has_diabetes"
X_cols = df.columns[df.columns != y_col].tolist()

X = df.loc[:, X_cols]
y = df[y_col]

# Hold out 25% of the rows as split "1" (the remaining 75% are split "0"),
# stratified on the target column.
X_0, X_1, y_0, y_1 = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=RSEED,
)

## Simple Logistic Regression Model

In [None]:
# Min-max scale the features. The scaler is fitted on split 0 only and then
# applied unchanged to split 1, so split 1 does not influence the fit.
scaler = MinMaxScaler()
X_0_scaled = scaler.fit_transform(X_0)

# Rebuild a DataFrame around the scaled array. Keep X_1's index (not just its
# columns) so the frame stays row-aligned with y_1 in any index-based
# operation such as assigning y_1 as a new column.
X_1_scaled = pd.DataFrame(
    scaler.transform(X_1),
    columns=X_1.columns,
    index=X_1.index,
)


In [None]:
# Randomly over-sample split 0 (the training part only) to counter class
# imbalance. The seed comes from the shared RSEED constant so that all
# randomised steps of the notebook are controlled from one place.
ros = RandomOverSampler(random_state=RSEED)
X_0_scaled_ros, y_0_ros = ros.fit_resample(X_0_scaled, y_0)

# fit_resample received a plain array, so restore the feature names.
X_0_scaled_ros = pd.DataFrame(X_0_scaled_ros, columns=X_0.columns)


In [None]:
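# Dictionary of classification scorers (F2, accuracy, precision, recall), kept for reference.
# Enabling it requires make_scorer and fbeta_score from sklearn.metrics to be imported.
# scorer_dict = {"ftwo_scorer": make_scorer(fbeta_score, beta=2), "accuracy": "accuracy", "precision": "precision", "recall": "recall"}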

In [None]:
# Fit a logistic regression with default settings on two features only
# ("age" and "bmi") of the scaled, over-sampled split 0.
model = LogisticRegression().fit(X_0_scaled_ros[["age", "bmi"]], y_0_ros)

# Report the fitted coefficients and intercept.
for report_line in (
    "Fitted parameters:",
    f"  Coefficients : {model.coef_}",
    f"  Intercept    : {model.intercept_}",
):
    print(report_line)

In [None]:
# Predict class labels for split 1 from the same two features the model was
# fitted on.
z_1 = model.predict(X_1_scaled[["age", "bmi"]])

# Regression-style scores. They are kept so the existing output and any later
# use of these names stay intact, but they are hard to interpret for a
# classifier: MAPE in particular divides by the true label, which is 0 for
# every negative case.
mse = mean_squared_error(y_1, z_1)
rmse = np.sqrt(mse)
r2 = r2_score(y_1, z_1)
mape = mean_absolute_percentage_error(y_1, z_1)

# Classification metrics for the positive class.
# NOTE(review): assumes the target is encoded as 0/1 with 1 = has diabetes.
# zero_division=0 reports 0 instead of warning when a ratio is undefined
# (e.g. the model never predicts the positive class).
accuracy = accuracy_score(y_1, z_1)
precision = precision_score(y_1, z_1, zero_division=0)
recall = recall_score(y_1, z_1, zero_division=0)
f2 = fbeta_score(y_1, z_1, beta=2, zero_division=0)

print("Model Evaluation:")
print(f"  RMSE : {rmse}")
print(f"  R_2  : {r2}")
print(f"  MAPE : {mape}")
print(f"  Accuracy  : {accuracy}")
print(f"  Precision : {precision}")
print(f"  Recall    : {recall}")
print(f"  F2        : {f2}")

In [None]:
# Combine features, true labels and predictions into one DataFrame for
# plotting.
df_plot = X_1_scaled[["age", "bmi"]].copy()

# Assign the true labels by position, not by index: y_1 carries the row labels
# it received from the train/test split, which need not match the index of
# X_1_scaled. An index-aligned assignment of the Series could therefore attach
# labels to the wrong rows or fill them with NaN. Both objects come from the
# same split, so their row order is identical.
df_plot["factual_diabetes"] = np.asarray(y_1)

# z_1 is the array of predictions for the same rows, in the same order.
df_plot["predicted_diabetes"] = z_1

def classify_outcome(row):
    """Return the confusion-matrix cell of a single row.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            "factual_diabetes" and "predicted_diabetes". A value equal to 1
            counts as positive; anything else counts as negative.

    Returns:
        "TP", "FN", "FP" or "TN".
    """
    is_positive = bool(row["factual_diabetes"] == 1)
    flagged_positive = bool(row["predicted_diabetes"] == 1)

    # Keys are (actual positive?, predicted positive?).
    labels = {
        (True, True): "TP",
        (True, False): "FN",
        (False, True): "FP",
        (False, False): "TN",
    }
    return labels[(is_positive, flagged_positive)]


# Tag every row with its confusion-matrix cell (TP / FN / FP / TN).
df_plot["outcome"] = df_plot.apply(classify_outcome, axis=1)

# Copy of the plot data with "age" and "bmi" on their original, unscaled
# scale. The scaler was fitted on all feature columns, so calling
# inverse_transform on just these two columns raises a shape error whenever
# there are more than two features. The unscaled values are instead taken
# directly from X_1, whose rows are in the same order as those of X_1_scaled;
# they are assigned by position so the result does not depend on the index.
df_plot_orig = df_plot.copy()
df_plot_orig[["age", "bmi"]] = X_1[["age", "bmi"]].to_numpy()

# Scatter plot of the split-1 rows on the unscaled age/BMI axes, coloured by
# confusion-matrix cell.
outcome_colors = {"TP": "green", "TN": "blue", "FP": "red", "FN": "orange"}

fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df_plot_orig,
    x="age",
    y="bmi",
    hue="outcome",
    palette=outcome_colors,
    s=70,
    ax=ax,
)
ax.set_xlabel("Age (years)")
ax.set_ylabel("BMI (kg/m²)")
ax.set_title("Classification Results: TP, TN, FP, FN")
ax.legend(title="Outcome", loc="best")
ax.grid(True)
fig.tight_layout()
plt.show()

## Decision Tree Model