# Prediction Model

In [None]:
import os
import subprocess

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Random seed; passed to the train/test split so the split is reproducible.
RSEED = 42

# Ask git for the top-level directory of the repository. The command output
# ends with a newline, which is stripped before the path is used.
root_dir_ = subprocess.check_output(["git", "rev-parse", "--show-toplevel"], text=True)
ROOT_DIR = root_dir_.strip()

# Directory that holds the data files.
DATA_DIR = os.path.join(ROOT_DIR, "data")

# Pickle file from which the processed data frame is loaded.
DF_PKL_PATH_SRC = os.path.join(ROOT_DIR, "data", "df_processed.pkl")

In [None]:
# Load the processed data frame and replace every missing value with 0.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
df = pd.read_pickle(DF_PKL_PATH_SRC).fillna(0)


In [None]:
# Target column; every other column of the data frame is used as a feature.
y_col = "has_diabetes"
X_cols = [name for name in df.columns if name != y_col]

y = df[y_col]
X = df[X_cols]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=RSEED,
    test_size=0.20,
)

# Logistic regression with scikit-learn's default settings. `fit` returns the
# estimator itself, so construction and fitting are chained.
model = LogisticRegression().fit(X_train, y_train)

print("Fitted parameters:")
print("  Coefficients:", model.coef_)
print("  Intercept:   ", model.intercept_)

In [None]:
# Evaluate the fitted classifier on the held-out test split.
#
# `model` is a LogisticRegression, so `predict` returns hard class labels.
# Regression metrics (RMSE, R^2, MAPE) are not meaningful for such labels --
# MAPE in particular divides by the true value, which is 0 for every negative
# sample -- so classification metrics are reported instead.
z_test = model.predict(X_test)

# NOTE(review): precision/recall/F1 use scikit-learn's default binary averaging
# with positive label 1; this assumes `has_diabetes` is encoded as 0/1 --
# confirm against the preprocessing step.
# `zero_division=0` reports 0 (instead of emitting a warning) when the model
# never predicts the positive class.
accuracy = accuracy_score(y_test, z_test)
precision = precision_score(y_test, z_test, zero_division=0)
recall = recall_score(y_test, z_test, zero_division=0)
f1 = f1_score(y_test, z_test, zero_division=0)

print("Model Evaluation:")
print(f"  Accuracy  : {accuracy}")
print(f"  Precision : {precision}")
print(f"  Recall    : {recall}")
print(f"  F1        : {f1}")

In [None]:
# Confusion-matrix heatmap of actual vs. predicted classes on the test split.
#
# Targets and predictions are class labels, so a scatter plot on
# regression-scale axes collapses onto a few overlapping points. Counting each
# (actual, predicted) pair instead shows where the classifier is right or wrong.
actual = np.asarray(y_test)
predicted = np.asarray(z_test)

# Use every label seen in either array so that a class the model never
# predicts still gets a zero-filled column instead of being dropped.
labels = np.unique(np.concatenate([actual, predicted]))
counts = pd.crosstab(actual, predicted).reindex(
    index=labels, columns=labels, fill_value=0
)

plt.figure(figsize=(8, 6))
sns.heatmap(counts, annot=True, fmt="d", cmap="Blues", cbar=False)
plt.xlabel("Predicted Values")
plt.ylabel("Actual Values")
plt.title("Confusion Matrix")
plt.show()