## Business Case Description

Toronto is one of the most expensive cities in Canada, and its residents face rising rent, tuition, and basic living expenses. As a result, many Canadians are concerned about their financial well-being.

## Data Cleaning

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split

# Load Data

In [None]:
# Read the raw survey sheet, discard the columns that are not used in this
# analysis, and remove exact duplicate rows in a single chained expression.
file_path = "personal_finance_dataset.xlsx"
unused_columns = ["PATTSITC", "PATTSKP", "PEFATINC", "PFTENUR", "PLFFPTME", "PNBEARG"]

df = (
    pd.read_excel(file_path, sheet_name="datathon_finance", engine="openpyxl")
    .drop(columns=unused_columns)
    .drop_duplicates()
)

# Quick look at the first rows of the cleaned frame.
print(df.head())

# Source Code

In [None]:
pip install pandas scikit-learn xgboost shap openpyxl

In [None]:
import numpy as np
import pandas as pd

import shap
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

In [None]:
# -----------------------------
# 1. Load Excel Data
# -----------------------------
# NOTE(review): "your_file.xlsx" looks like a placeholder path, and this read
# rebinds `df` — confirm which workbook the model is meant to be trained on.
df = pd.read_excel("your_file.xlsx")

# Predictor columns fed to every model below.
feature_columns = [
    "Net Debt",
    "Net Worth",
    "TFSA Balance",
    "Bank Deposit",
    "After-Tax Income",
]
X = df[feature_columns]

# Binary outcome (1 = stress, 0 = stable)
y = df["stress"]

In [None]:
# -----------------------------
# 2. Train/Test Split
# -----------------------------
# Hold out 20% of the rows for evaluation. The split is stratified on the
# binary target so that the train and test sets keep the same class
# proportions, and seeded so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# -----------------------------
# 3. Define Base Models
# -----------------------------

# Standardized copies of the features: the scaler is fit on the training
# split only and then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression (Elastic Net)
# `saga` is the solver used here for the elastic-net penalty. It is a
# stochastic solver, so the seed is fixed to make the fitted coefficients
# reproducible between runs.
log_model = LogisticRegression(
    penalty="elasticnet",
    solver="saga",
    l1_ratio=0.5,
    max_iter=5000,
    random_state=42
)

# Gradient Boosting (XGBoost)
xgb_model = XGBClassifier(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    random_state=42
)

In [None]:
# -----------------------------
# 4. Stacking Model
# -----------------------------

# The stack is fit on the raw (unscaled) X_train. The elastic-net learner and
# the final logistic regression (which also receives the raw features because
# passthrough=True) are therefore each wrapped in a pipeline that standardizes
# its inputs first. StackingClassifier fits clones of the estimators it is
# given, so `log_model` and `xgb_model` themselves remain unfitted here.
stack_model = StackingClassifier(
    estimators=[
        ("logistic", make_pipeline(StandardScaler(), log_model)),
        ("xgb", xgb_model)
    ],
    final_estimator=make_pipeline(StandardScaler(), LogisticRegression()),
    passthrough=True
)

stack_model.fit(X_train, y_train)

In [None]:
# -----------------------------
# 5. Evaluate
# -----------------------------

# Score the held-out rows: take column 1 of the predicted-probability matrix
# and compute the ROC AUC against the true test labels.
test_probabilities = stack_model.predict_proba(X_test)
probs_stack = test_probabilities[:, 1]
auc_stack = roc_auc_score(y_test, probs_stack)

print("Stacked Model AUC:", auc_stack)

# Logistic Regression Interpretation

In [None]:
# Fit the stand-alone logistic model on the standardized training features.
log_model.fit(X_train_scaled, y_train)

# First row of the fitted coefficient matrix; exponentiating each coefficient
# gives the corresponding odds ratio.
fitted_coefs = log_model.coef_[0]

coefficients = pd.DataFrame(
    {
        "Variable": X.columns,
        "Coefficient": fitted_coefs,
        "Odds_Ratio": np.exp(fitted_coefs),
    }
)
# Largest odds ratio first.
coefficients = coefficients.sort_values(by="Odds_Ratio", ascending=False)

print(coefficients)

# XGBoost + SHAP (Best Variable Ranking) Interpretation

In [None]:
# Fit the stand-alone XGBoost model on the (unscaled) training features.
xgb_model.fit(X_train, y_train)

# Compute SHAP values for the training rows with the tree explainer.
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X_train)

# Summary plot of the SHAP values against the training features.
shap.summary_plot(
    shap_values,
    X_train,
)

# Priority 3 Weights