In [None]:
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [None]:
# Load the saved test features and the matching labels from disk.
X_test = np.load("X_test.npy")
# .values.ravel() flattens the whole CSV into a 1-D array, so any extra column
# (e.g. an index column written alongside the labels) would inflate its length.
y_test = pd.read_csv("y_test.csv").values.ravel()

# Fail fast with a clear message if features and labels are out of sync,
# instead of surfacing later as an opaque DataFrame construction error.
if len(X_test) != len(y_test):
    raise ValueError(
        f"X_test has {len(X_test)} rows but y_test has {len(y_test)} values; "
        "check that y_test.csv contains a single label column."
    )

# NOTE(security): joblib.load unpickles the file and can execute arbitrary
# code; only load model artifacts that come from a trusted source.
stack_model = joblib.load("stacking_model_final.joblib")
# Keep column 1 of predict_proba as the score used downstream.
# NOTE(review): presumably column 1 is the positive ("with disease") class of
# a binary classifier — confirm against stack_model.classes_.
y_proba = stack_model.predict_proba(X_test)[:, 1]

print("Predicted probabilities:")
print(y_proba[:10])

In [None]:
# Cut-offs for the three risk tiers: the 33rd and 66th percentiles of the
# predicted probabilities, computed in a single call.
p33, p66 = np.percentile(y_proba, [33, 66])

# NOTE(review): the lone "e" in the Average line looks like an untranslated
# "and"; it is kept as-is so the printed output stays unchanged.
print(
    "Risk categorization limits:",
    f"Low: ≤ {p33:.4f}",
    f"Average: > {p33:.4f} e ≤ {p66:.4f}",
    f"High: > {p66:.4f}",
    sep="\n",
)

In [None]:
# One row per test sample: the true label next to the model's predicted probability.
df_risk = pd.DataFrame(
    data={"true_label": y_test, "predicted_proba": y_proba},
    columns=["true_label", "predicted_proba"],
)

def categorizar_risco(p, low_cut=None, high_cut=None):
    """Map a predicted probability to a risk tier label.

    Parameters
    ----------
    p : float
        Predicted probability to classify.
    low_cut : float, optional
        Upper bound (inclusive) of the "Low" tier. When omitted, the
        notebook-level ``p33`` threshold is used.
    high_cut : float, optional
        Upper bound (inclusive) of the "Average" tier. When omitted, the
        notebook-level ``p66`` threshold is used.

    Returns
    -------
    str
        "Low" if ``p <= low_cut``, "Average" if ``low_cut < p <= high_cut``,
        otherwise "High". A NaN ``p`` fails both comparisons and therefore
        falls through to "High".
    """
    # Fall back to the notebook globals only when no explicit cut-off is
    # given, so existing calls such as ``.apply(categorizar_risco)`` still work
    # while the function can also be used and tested on its own.
    if low_cut is None:
        low_cut = p33
    if high_cut is None:
        high_cut = p66

    if p <= low_cut:
        return "Low"
    if p <= high_cut:
        return "Average"
    return "High"

# Label every row with its risk tier, in row order, from its predicted probability.
df_risk["risk_category"] = [
    categorizar_risco(proba) for proba in df_risk["predicted_proba"]
]
# Preview the first ten rows.
df_risk.head(10)

In [None]:
# Size of each risk tier, as raw counts and as a share of all rows.
contagem_abs = df_risk["risk_category"].value_counts(normalize=False)
contagem_prop = df_risk["risk_category"].value_counts(normalize=True)

print("Absolute frequency of risk categories:", contagem_abs, sep="\n")
print("Proportional frequency of risk categories:", contagem_prop, sep="\n")

In [None]:
# Contingency table: risk tiers as rows, true labels as columns, counts in the cells.
cross_tab = pd.crosstab(
    index=df_risk["risk_category"],
    columns=df_risk["true_label"],
    rownames=["Risk Category"],
    colnames=["Real Diagnosis (target)"],
)

print("Actual distribution of disease by risk category:", cross_tab, sep="\n")

In [None]:
# Grouped bar chart: number of rows per risk tier, split by true label.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(
    data=df_risk,
    x="risk_category",
    hue="true_label",
    palette="Set2",
    # The tiers are ordinal; without an explicit order the x positions follow
    # whatever order seaborn infers from the data.
    order=["Low", "Average", "High"],
    ax=ax,
)
ax.set_title("Distribution of Actual Diagnosis by Risk Category")
ax.set_xlabel("Risk Category")
ax.set_ylabel("Number of Patients")

# Relabel the hue legend using the handles seaborn registered on the axes, so
# each label is tied to a specific handle rather than to implicit artist order.
# NOTE(review): assumes true_label has two levels ordered (no disease, disease),
# e.g. 0/1 — confirm against the label encoding.
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=["No Disease", "With Disease"], title="Real Diagnosis")

fig.tight_layout()
plt.show()

In [None]:
# Save the table (true label, predicted probability, risk tier) without the row index.
df_risk.to_csv(path_or_buf="risk_categorized_results.csv", index=False)