## Business Case Description

Toronto is one of the most expensive cities in Canada, and its residents face rising rent, tuition, and basic living expenses. As a result, many Canadians are concerned about their financial well-being.

# Source Code

In [None]:
pip install pandas scikit-learn xgboost shap openpyxl



In [None]:
import numpy as np
import pandas as pd
import shap
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# -----------------------------
# 1. Load Excel Data
# -----------------------------

# Read the "datathon_finance" sheet of the workbook into a DataFrame.
file_path = "personal_finance_dataset.xlsx"
df = pd.read_excel(file_path, sheet_name="datathon_finance", engine="openpyxl")

# Discard raw columns that are not used below, then remove exact duplicate rows.
df = df.drop(columns=["PATTSITC", "PATTSKP", "PFTENUR", "PLFFPTME", "PNBEARG"])
df = df.drop_duplicates()

# Replace the coded column names with readable labels.
df = df.rename(columns={
    "PAGEMIEG": "Age Group",
    "PATTCRU": "Credit Card Payment",
    "PWAPRVAL": "Home Value",
    "PWASTDEP": "Bank Deposits",
    "PWATFS": "TFSA Balance",
    "PWDPRMOR": "Mortgage Debt",
    "PWDSLOAN": "Student Loan Debt",
    "PWDSTCRD": "Credit Card Debt",
    "PWDSTLOC": "Line of Credit Debt",
    "PWNETWPG": "Net Worth",
    "PPVRES": "Province",
    "PFMTYPG": "Family Type",
    "PEDUCMIE": "Education Level",
    "PEFATINC": "After-Tax Income"
})

# Sum the four debt components into a single figure per row.
debt_cols = ["Mortgage Debt", "Line of Credit Debt", "Credit Card Debt", "Student Loan Debt"]
df["Total Debt"] = df[debt_cols].sum(axis=1)

# Debt relative to net worth.
# NOTE(review): the tiny constant only prevents an exact division by zero; a row
# with zero net worth still gets a ratio of Total Debt * 1e8, and a negative net
# worth produces a negative ratio. Confirm this is the intended definition.
df["ratio"] = df["Total Debt"]/(df["Net Worth"] + 0.00000001)

# Keep only the columns used downstream: drop the categorical descriptors, the
# home value, and the individual debt components now folded into "Total Debt".
# Each label is listed exactly once.
df = df.drop(columns=[
    "Education Level", "Family Type", "Province",
    "Home Value", "Age Group", "Credit Card Payment",
] + debt_cols)

# Numeric columns used for outlier screening, summary statistics and the log transform.
features = [
    "After-Tax Income",
    "Bank Deposits",
    "TFSA Balance",
    "Net Worth",
    "Total Debt",
]

# Per-feature fences at 1.5 * IQR beyond the first and third quartiles.
Q1 = df[features].quantile(0.25)
Q3 = df[features].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# A row is kept only when none of its features falls outside the fences.
mask = ~((df[features] < lower_bound) | (df[features] > upper_bound)).any(axis=1)

# .copy() detaches the filtered frame from the unfiltered one, so the column
# assignments performed in later steps do not trigger SettingWithCopyWarning.
df = df.loc[mask].copy()

# Central-tendency table for the screened features: one row per feature, one
# column per statistic. When a feature has several modes, the first row of
# DataFrame.mode() is the one reported.
feature_frame = df[features]
summary = pd.concat(
    [
        feature_frame.mean(),
        feature_frame.median(),
        feature_frame.mode().iloc[0],
    ],
    axis=1,
    keys=["Mean", "Median", "Mode"],
)
print(summary)

# Compress the long tails of the monetary features with a log transform.
# A plain log1p(x) is undefined for x < -1 (and -inf at x == -1), so a negative
# value such as a negative net worth would turn into NaN and later break model
# fitting. The transform is therefore applied to the magnitude and the sign is
# restored afterwards; for non-negative values the result equals log1p(x).
for col in features:
    df[col] = np.sign(df[col]) * np.log1p(np.abs(df[col]))

# Show every column when previewing the transformed frame.
pd.set_option("display.max_columns", None)
print(df.head())


# Histogram (with KDE overlay) of the log-transformed TFSA balance.
plt.figure(figsize=(8, 5))
sns.histplot(df["TFSA Balance"], bins=50, kde=True)
# The x-axis shows the plotted column; it was previously labelled "Bank Deposits".
plt.xlabel("TFSA Balance (log1p-transformed)")
plt.ylabel("Frequency")
plt.title("TFSA Balance Distribution")
plt.show()


# Model inputs. The labels must match columns that actually exist in df:
# "Total Debt" is the engineered debt sum and "Bank Deposits" is the renamed
# deposit column. The former "Net Debt" / "Bank Deposit" labels are not present
# in df and made this selection raise KeyError.
X = df[[
    "Total Debt",
    "Net Worth",
    "TFSA Balance",
    "Bank Deposits",
    "After-Tax Income"
]]

# NOTE(review): no step in this notebook creates a "stress" column, so it must
# already be present in the workbook — confirm, or derive the label before this
# point. Fail with an explicit message instead of a bare KeyError.
if "stress" not in df.columns:
    raise KeyError(
        'Target column "stress" not found in df; add or derive it before modelling.'
    )

y = df["stress"]  # Binary outcome (1 = stress, 0 = stable)

In [None]:
# -----------------------------
# 2. Train/Test Split
# -----------------------------
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# -----------------------------
# 3. Define Base Models
# -----------------------------

# Logistic Regression (Elastic Net)
# Standardised copies of the split for fitting the logistic model on its own.
# The scaler is fitted on the training rows only and then applied to the test
# rows, so no test-set statistics leak into the transform.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

log_model = LogisticRegression(
    penalty="elasticnet",
    solver="saga",       # the solver that supports the elastic-net penalty
    l1_ratio=0.5,        # equal weighting of the L1 and L2 terms
    max_iter=5000,
    random_state=42      # saga shuffles the data; seed it like the other models
)

# Gradient Boosting (XGBoost)
xgb_model = XGBClassifier(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    random_state=42
)

In [None]:
# -----------------------------
# 4. Stacking Model
# -----------------------------

# The stack is fitted on the raw (unscaled) split. The elastic-net learner is
# sensitive to feature scale, so it is wrapped in a pipeline that standardises
# its inputs inside every cross-validation fold; previously it was trained on
# unscaled features. The tree-based XGBoost learner is left on the raw features.
stack_model = StackingClassifier(
    estimators=[
        ("logistic", make_pipeline(StandardScaler(), log_model)),
        ("xgb", xgb_model)
    ],
    final_estimator=LogisticRegression(),
    # passthrough=True feeds the original features to the final estimator in
    # addition to the base learners' predictions.
    passthrough=True
)

stack_model.fit(X_train, y_train)

In [None]:
# -----------------------------
# 5. Evaluate
# -----------------------------

# Score the held-out rows. Column 1 of predict_proba is the probability of the
# second class (label 1 for a 0/1 target), which is what ROC AUC is computed on.
test_proba = stack_model.predict_proba(X_test)
probs_stack = test_proba[:, 1]
auc_stack = roc_auc_score(y_test, probs_stack)

print("Stacked Model AUC:", auc_stack)

# Logistic Regression Interpretation

In [None]:
# Fit the elastic-net model on the standardised training features so that the
# coefficients are comparable across variables.
log_model.fit(X_train_scaled, y_train)

# One row per input variable: the fitted coefficient and its exponential (the
# odds ratio), listed from the largest odds ratio to the smallest.
betas = log_model.coef_[0]
coefficients = pd.DataFrame(
    {
        "Variable": X.columns,
        "Coefficient": betas,
        "Odds_Ratio": np.exp(betas),
    }
)
coefficients = coefficients.sort_values(by="Odds_Ratio", ascending=False)

print(coefficients)

# XGBoost + SHAP (Best Variable Ranking) Interpretation

In [None]:
# Fit the booster on its own (outside the stack) on the raw training features.
xgb_model.fit(X_train, y_train)

# Compute SHAP values for the training rows with the tree explainer, then draw
# the summary plot of per-feature contributions.
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X_train)
shap.summary_plot(shap_values, features=X_train)