In [1]:
from google.colab import files
import os

# Create the directory if it doesn't exist
kaggle_dir = '/root/.kaggle'
os.makedirs(kaggle_dir, exist_ok=True)

# Upload the kaggle.json file
print("Please upload your kaggle.json file:")
uploaded = files.upload()

# Move the uploaded file to the correct directory and set permissions
for fn in uploaded.keys():
    !mv "$fn" "{kaggle_dir}/kaggle.json"
    !chmod 600 "{kaggle_dir}/kaggle.json"
    print(f"Uploaded and configured {fn}")

Please upload your kaggle.json file:


Saving kaggle.json to kaggle.json
Uploaded and configured kaggle.json


In [2]:
import os
import glob
import random
import warnings
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

# ---------------- CONFIG ----------------
DATA_DIR = "/content/data"
KAGGLE_DATASET = "shriyashjagtap/indian-personal-finance-and-spending-habits"
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# Domain hyperparameters (tuneable)
MIN_MONTHLY_INCOME = 15_000
MAX_MONTHLY_INCOME = 500_000
EXPENSE_TO_INCOME_RATIO = 0.65
SAVINGS_RATE_RANGE = (0.05, 0.25)
MAX_SAVINGS_FRACTION = 0.5         # fraction of disposable income users can realistically save
MIN_PRACTICAL_SAVE = 500           # smallest practical monthly save shown to users
SYNTHETIC_SAVINGS_CAP_MULTIPLE = 24  # cap synthetic savings at income * multiple

os.makedirs(DATA_DIR, exist_ok=True)

# ---------------- HELPERS ----------------
def download_kaggle_dataset(dataset_ref=KAGGLE_DATASET, out_dir=DATA_DIR):
    try:
        from kaggle.api.kaggle_api_extended import KaggleApi
    except Exception as e:
        print("[kaggle] Kaggle API not available:", e)
        return False
    try:
        api = KaggleApi()
        api.authenticate()
        print("[kaggle] downloading dataset:", dataset_ref)
        api.dataset_download_files(dataset_ref, path=out_dir, unzip=True, quiet=False)
        print("[kaggle] download + unzip done")
        return True
    except Exception as e:
        print("[kaggle] download failed:", e)
        return False

def find_first_csv(data_dir=DATA_DIR):
    files = glob.glob(os.path.join(data_dir, "**", "*.csv"), recursive=True)
    if not files:
        return None
    for f in files:
        lf = f.lower()
        if any(k in lf for k in ("finance", "spending", "data", "personal")):
            return f
    return sorted(files, key=lambda p: os.path.getsize(p), reverse=True)[0]

def find_cols_by_tokens(cols, tokens):
    return [c for c in cols if any(tok in c.lower() for tok in tokens)]

def is_pct_col(name, series):
    lname = name.lower()
    if "percent" in lname or "percentage" in lname or lname.endswith("_pct"):
        return True
    s = pd.to_numeric(series, errors="coerce").dropna()
    if s.empty:
        return False
    if s.max() <= 100 and s.median() <= 100:
        return True
    return False

def smart_scale(series, name):
    s = pd.to_numeric(series, errors="coerce")
    non_na = s.dropna()
    if non_na.empty:
        return s.fillna(0)
    if is_pct_col(name, s):
        return s.fillna(0)
    med = non_na.median()
    mx = non_na.max()
    if med < 100 and "age" not in name.lower() and "count" not in name.lower() and "depend" not in name.lower():
        if mx < 100:
            return s.fillna(0)
        scaled = s * 1000
        print(f"[scale] scaled '{name}' median {med:.2f} -> {scaled.dropna().median():.2f}")
        return scaled.fillna(0)
    if 100 <= med <= 1000:
        print(f"[info] ambiguous units for '{name}' median {med:.1f} — left as-is.")
        return s.fillna(0)
    return s.fillna(0)

# ---------------- MAIN ----------------
def main():
    print("=== pipeline start ===")
    csv_path = find_first_csv()
    if csv_path is None:
        print("[info] no CSV found locally -> trying Kaggle download")
        ok = download_kaggle_dataset()
        if ok:
            csv_path = find_first_csv()
    if csv_path is None:
        raise FileNotFoundError("No CSV found in DATA_DIR and Kaggle failed. Put dataset CSV in /content/data")
    print("[info] using dataset:", csv_path)

    raw = pd.read_csv(csv_path, low_memory=False)
    raw.columns = [c.strip().lower().replace(" ", "_") for c in raw.columns]
    print("[info] raw shape:", raw.shape)

    # conservative detection
    potential_prefix = "potential_savings"
    potential_cols = [c for c in raw.columns if c.startswith(potential_prefix)]
    expense_tokens = ["rent","loan","insurance","groceries","transport","eating_out","entertainment","utilities","healthcare","education","misc","miscellaneous"]
    income_tokens = ["income","salary","monthly_income","earn"]
    savings_tokens = ["current_savings","current_saving","savings","saving"]

    expense_candidates = find_cols_by_tokens(raw.columns, expense_tokens)
    # exclude potential/desired/percentage columns from expenses
    expense_cols = [c for c in expense_candidates if not (c in potential_cols or "potential" in c.lower() or "desired" in c.lower() or "percentage" in c.lower())]
    income_cols = find_cols_by_tokens(raw.columns, income_tokens)
    desired_cols = [c for c in raw.columns if "desired_saving" in c or "desired_savings" in c]
    savings_candidates = [c for c in raw.columns if any(tok in c for tok in savings_tokens) and not any(x in c for x in ("desired","potential","percentage"))]
    demographic_cols = find_cols_by_tokens(raw.columns, ["age","dependents","occupation","city_tier","education","gender"])

    print("[info] expense_cols:", expense_cols)
    print("[info] income_cols:", income_cols)
    print("[info] potential_savings:", potential_cols)
    print("[info] savings_candidates:", savings_candidates)
    print("[info] demographic_cols:", demographic_cols)

    # scaling candidates (but skip potential/desired/percentage)
    scale_candidates = list(set(income_cols + expense_cols + potential_cols + desired_cols + savings_candidates))
    skip_prefixes = ['potential_savings', 'desired_savings', 'desired_saving']
    def should_autoscale(col):
        lname = col.lower()
        if any(lname.startswith(p) for p in skip_prefixes):
            return False
        if 'percentage' in lname or 'percent' in lname:
            return False
        return True

    df = raw.copy()
    for col in sorted(scale_candidates):
        if col in df.columns:
            if should_autoscale(col):
                df[col] = smart_scale(df[col], col)
            else:
                df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)
                print(f"[skip-scale] kept '{col}' raw")

    # income consolidation
    if "income" in df.columns:
        df["income"] = pd.to_numeric(df["income"], errors="coerce")
    elif income_cols:
        df["income"] = pd.to_numeric(df[income_cols[0]], errors="coerce")
    else:
        print("[gen] generating synthetic income")
        df["income"] = np.exp(np.random.uniform(np.log(MIN_MONTHLY_INCOME), np.log(MAX_MONTHLY_INCOME), size=len(df))).round(0)
    df["income"] = df["income"].fillna(df["income"].median()).clip(lower=MIN_MONTHLY_INCOME, upper=MAX_MONTHLY_INCOME)

    # total_expenses
    if expense_cols:
        for c in expense_cols:
            df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0)
        df["total_expenses"] = df[expense_cols].sum(axis=1, skipna=True)
    else:
        ratio = np.clip(np.random.normal(EXPENSE_TO_INCOME_RATIO, 0.1, len(df)), 0.35, 0.95)
        df["total_expenses"] = (df["income"] * ratio).round(0)

    # recompute disposable_income deterministically (drop raw if present)
    if "disposable_income" in df.columns:
        try:
            df.drop(columns=["disposable_income"], inplace=True)
        except Exception:
            pass
    df["disposable_income"] = (df["income"] - df["total_expenses"]).fillna(0).clip(lower=0)

    # current_savings: prefer real column if sensible
    chosen_sav = None
    for c in savings_candidates:
        if c in df.columns:
            med = pd.to_numeric(df[c], errors="coerce").dropna().median()
            if pd.notna(med) and med > 100:
                chosen_sav = c
                break

    if chosen_sav:
        df["current_savings"] = pd.to_numeric(df[chosen_sav], errors="coerce").fillna(0)
        df["savings_is_synthetic"] = False
        print(f"[use] using '{chosen_sav}' for current_savings")
    else:
        # generate synthetic savings and flag them
        df["savings_is_synthetic"] = True
        sav_ratio = np.random.uniform(SAVINGS_RATE_RANGE[0], SAVINGS_RATE_RANGE[1], size=len(df))
        months_saved = np.random.uniform(3, 36, size=len(df))
        df["current_savings"] = (df["income"] * sav_ratio * months_saved).round(0)
        print("[gen] generated synthetic current_savings for all rows (flagged)")

    # CAP synthetic savings to reasonable multiple
    cap_mult = SYNTHETIC_SAVINGS_CAP_MULTIPLE
    mask_synth = df["savings_is_synthetic"] == True
    if mask_synth.any():
        df.loc[mask_synth, "current_savings"] = df.loc[mask_synth, "current_savings"].clip(upper=(df.loc[mask_synth, "income"] * cap_mult))
        print(f"[cap] capped synthetic current_savings at income * {cap_mult}")

    df["current_savings"] = df["current_savings"].clip(lower=0)

    # total potential savings aggregation (kept raw as monthly-scale by default)
    if potential_cols:
        for c in potential_cols:
            df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0)
        df["total_potential_savings"] = df[potential_cols].sum(axis=1, skipna=True)
    else:
        df["total_potential_savings"] = 0

    # goals
    df["months_remaining"] = np.random.choice([6,12,18,24,36,48,60], size=len(df), p=[0.10,0.20,0.25,0.20,0.15,0.07,0.03])
    multipliers = {6:1.5,12:3,18:4.5,24:6,36:9,48:12,60:15}
    df["goal_target_amount"] = df.apply(lambda r: r["income"] * multipliers.get(int(r["months_remaining"]),4) * np.random.uniform(0.8,1.2), axis=1)
    if "desired_savings" in df.columns:
        ds = pd.to_numeric(df["desired_savings"], errors="coerce").fillna(0)
        if ds.median() > 1000:
            use_ds = (ds > df["current_savings"] * 1.2) & (ds < df["income"] * 60)
            df.loc[use_ds, "goal_target_amount"] = ds.loc[use_ds]
            print(f"[use] applied 'desired_savings' as goal for {use_ds.sum()} rows")

    df["goal_target_amount"] = df["goal_target_amount"].clip(upper=df["income"] * 60)

    # monthly requirements + feasibility
    df["monthly_saving_required"] = ((df["goal_target_amount"] - df["current_savings"]) / df["months_remaining"]).clip(lower=0)
    df["max_feasible_monthly_saving"] = (df["disposable_income"] * MAX_SAVINGS_FRACTION).fillna(0)
    df["monthly_saving_label"] = np.minimum(df["monthly_saving_required"], df["max_feasible_monthly_saving"])
    # keep raw copy
    df["monthly_saving_label_raw"] = df["monthly_saving_label"].copy()

    # derived metrics
    df["expense_ratio"] = (df["total_expenses"] / df["income"]).replace([np.inf, -np.inf], np.nan).fillna(EXPENSE_TO_INCOME_RATIO)
    df["savings_rate"] = (df["disposable_income"] / df["income"]).clip(0,1).fillna(0)
    df["financial_health_score"] = (
        (1 - df["expense_ratio"]) * 0.4 +
        df["savings_rate"] * 0.3 +
        (df["current_savings"] / (df["income"] * 3 + 1e-9)) * 0.3
    ).clip(0,1)

    # canonical recompute & audit
    recomputed_disp = (df["income"] - df["total_expenses"]).fillna(0).clip(lower=0)
    mismatch_mask = (df["disposable_income"] != recomputed_disp) & (recomputed_disp > 0)
    if mismatch_mask.sum() > 0:
        print(f"[audit] disposable mismatch rows: {mismatch_mask.sum()} -> saving disposable_mismatch_debug.csv")
        df.loc[mismatch_mask, ["income","total_expenses","disposable_income"]].to_csv(os.path.join(DATA_DIR, "disposable_mismatch_debug.csv"), index=False)
    df["disposable_income"] = recomputed_disp

    # achievable_by_cutting & suggested plan
    df["achievable_by_cutting"] = (df["total_potential_savings"] >= df["monthly_saving_required"]).astype(int)

    def suggest_plan(r):
        if r["current_savings"] >= r["goal_target_amount"]:
            return 0.0
        if r["achievable_by_cutting"] == 1:
            return float(r["monthly_saving_required"])
        if r["max_feasible_monthly_saving"] >= MIN_PRACTICAL_SAVE:
            return float(r["max_feasible_monthly_saving"])
        return np.nan

    df["suggested_monthly_plan"] = df.apply(suggest_plan, axis=1)

    # goal_achieved: current_savings >= target OR (label==0 and tiny requirement)
    df["goal_achieved"] = ((df["current_savings"] >= df["goal_target_amount"]) | (df["monthly_saving_label"] == 0)).astype(int)

    # difficulty bucket
    def bucket(r):
        if r["goal_achieved"] == 1:
            return "achieved"
        if r["achievable_by_cutting"] == 1:
            return "achievable_by_cutting"
        rf = r["monthly_saving_required"]/(r["disposable_income"]+1e-9) if r["disposable_income"]>0 else np.nan
        if np.isnan(rf):
            return "unknown"
        if rf <= 0.2: return "easy"
        if rf <= 0.5: return "moderate"
        if r["max_feasible_monthly_saving"] >= MIN_PRACTICAL_SAVE: return "hard"
        return "structural_change_needed"

    df["difficulty_bucket"] = df.apply(bucket, axis=1)

    # Fix label semantics for rows that are achievable_by_cutting but label==0 (make training label numeric)
    mask_fix = (df["monthly_saving_label"] == 0) & (df["achievable_by_cutting"] == 1)
    fix_count = int(mask_fix.sum())
    if fix_count > 0:
        print(f"[fix] {fix_count} label==0 & achievable_by_cutting -> setting monthly_saving_label = monthly_saving_required")
        df.loc[mask_fix, "monthly_saving_label"] = df.loc[mask_fix, "monthly_saving_required"]

    # small labels keep zero if not achievable_by_cutting
    small_mask = (df["monthly_saving_label"] < MIN_PRACTICAL_SAVE) & (df["achievable_by_cutting"] == 0)
    df.loc[small_mask, "monthly_saving_label"] = 0.0

    # Final caps / fills
    df = df.fillna(0)
    df["income"] = df["income"].clip(lower=MIN_MONTHLY_INCOME, upper=MAX_MONTHLY_INCOME)
    df["total_expenses"] = df["total_expenses"].clip(lower=0, upper=df["income"] * 0.95)
    df["disposable_income"] = (df["income"] - df["total_expenses"]).clip(lower=0)

    # Save processed & features
    processed_cols = [
        "income","total_expenses","disposable_income","current_savings",
        "goal_target_amount","months_remaining","monthly_saving_label",
        "monthly_saving_required","max_feasible_monthly_saving","monthly_saving_label_raw",
        "expense_ratio","savings_rate","financial_health_score","goal_achieved",
        "total_potential_savings","achievable_by_cutting","suggested_monthly_plan","difficulty_bucket",
        "savings_is_synthetic"
    ]
    for c in demographic_cols:
        if c in df.columns and c not in processed_cols:
            processed_cols.append(c)

    processed_df = df[processed_cols].copy()
    processed_out = os.path.join(DATA_DIR, "processed_finance_dataset.csv")
    features_out = os.path.join(DATA_DIR, "processed_finance_dataset_features.csv")
    processed_df.to_csv(processed_out, index=False)
    df.to_csv(features_out, index=False)
    print("[save] wrote processed and features")

    # Save audits
    prob_zero_mask = (processed_df["monthly_saving_label"] == 0) & (processed_df["monthly_saving_required"] >= MIN_PRACTICAL_SAVE)
    if prob_zero_mask.any():
        processed_df.loc[prob_zero_mask].to_csv(os.path.join(DATA_DIR, "problem_rows_zero_label.csv"), index=False)
        print("[audit] wrote problem_rows_zero_label.csv")

    if "total_potential_savings" in df.columns:
        susp_mask = df["total_potential_savings"] > df["income"] * 12
        if susp_mask.any():
            df.loc[susp_mask].to_csv(os.path.join(DATA_DIR, "potential_savings_suspicious.csv"), index=False)
            print("[audit] wrote potential_savings_suspicious.csv")

    # train/test split (stratify on difficulty_bucket if meaningful)
    strat = df["difficulty_bucket"] if df["difficulty_bucket"].nunique() > 1 else df["goal_achieved"] if df["goal_achieved"].nunique() > 1 else None
    if strat is not None:
        tr, te = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED, stratify=strat)
    else:
        tr, te = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)
    tr.to_csv(os.path.join(DATA_DIR, "train.csv"), index=False)
    te.to_csv(os.path.join(DATA_DIR, "test.csv"), index=False)
    print("[save] wrote train/test")

    # Summary
    print("\n=== pipeline summary ===")
    print("rows:", len(df))
    print("income range: {:.0f} - {:.0f}".format(df["income"].min(), df["income"].max()))
    print("avg monthly_saving_label: {:.0f}".format(df["monthly_saving_label"].mean()))
    print("goals achieved:", int(df["goal_achieved"].sum()), "({:.1f}%)".format(df["goal_achieved"].mean()*100))
    print("fraction label==0: {:.2%}".format((df["monthly_saving_label"]==0).mean()))
    print("difficulty counts:\n", df["difficulty_bucket"].value_counts())

    print("\nAudit files: disposable_mismatch_debug.csv, problem_rows_zero_label.csv, potential_savings_suspicious.csv (if created)")
    print("=== pipeline end ===")

if __name__ == "__main__":
    main()


=== pipeline start ===
[info] using dataset: /content/data/data.csv
[info] raw shape: (20000, 27)
[info] expense_cols: ['rent', 'loan_repayment', 'insurance', 'groceries', 'transport', 'eating_out', 'entertainment', 'utilities', 'healthcare', 'education', 'miscellaneous']
[info] income_cols: ['income', 'disposable_income']
[info] potential_savings: ['potential_savings_groceries', 'potential_savings_transport', 'potential_savings_eating_out', 'potential_savings_entertainment', 'potential_savings_utilities', 'potential_savings_healthcare', 'potential_savings_education', 'potential_savings_miscellaneous']
[info] savings_candidates: []
[info] demographic_cols: ['age', 'dependents', 'occupation', 'city_tier', 'education', 'desired_savings_percentage', 'potential_savings_education']
[skip-scale] kept 'desired_savings' raw
[skip-scale] kept 'desired_savings_percentage' raw
[scale] scaled 'loan_repayment' median 0.00 -> 0.00
[info] ambiguous units for 'miscellaneous' median 579.1 — left as-is.

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

DATA_DIR = "/content/data"
FULL_FEAT = DATA_DIR + "/processed_finance_dataset_features.csv"  # full file created by pipeline
df = pd.read_csv(FULL_FEAT)

print("Rows total:", len(df))
print("Difficulty counts:\n", df['difficulty_bucket'].value_counts())
print("achievable_by_cutting fraction:", df['achievable_by_cutting'].mean())
print("goal_achieved fraction:", df['goal_achieved'].mean())
print("monthly_saving_label stats:\n", df['monthly_saving_label'].describe())

# CLASSIFIER dataset (achievable_by_cutting)
clf_df = df.copy()
# choose features
feature_cols = ['income','disposable_income','current_savings','total_potential_savings',
                'expense_ratio','savings_rate','age','dependents','city_tier','occupation']
# simple drop NA
clf_df = clf_df.dropna(subset=['achievable_by_cutting'] + feature_cols)
X = clf_df[feature_cols]
y = clf_df['achievable_by_cutting']
# train/test split (stratify)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("Classifier shapes:", X_train.shape, X_test.shape)

# REGRESSION dataset (predict monthly_saving_label) — exclude already achieved rows
reg_df = df[df['goal_achieved'] == 0].copy()
reg_df = reg_df[reg_df['monthly_saving_label'] > 0]  # only meaningful numeric targets
print("Regression dataset size (non-achieved & label>0):", len(reg_df))
if len(reg_df) >= 200:  # minimal size sanity
    Xr = reg_df[feature_cols]
    yr = reg_df['monthly_saving_label']
    Xr_train, Xr_test, yr_train, yr_test = train_test_split(Xr, yr, test_size=0.2, random_state=42)
    print("Regressor shapes:", Xr_train.shape, Xr_test.shape)
else:
    print("Warning: too few examples for regression; consider expanding dataset or relaxing filters.")

# Save splits
X_train.assign(target=y_train).to_csv(DATA_DIR+"/clf_train.csv", index=False)
X_test.assign(target=y_test).to_csv(DATA_DIR+"/clf_test.csv", index=False)
if len(reg_df) >= 200:
    Xr_train.assign(target=yr_train).to_csv(DATA_DIR+"/reg_train.csv", index=False)
    Xr_test.assign(target=yr_test).to_csv(DATA_DIR+"/reg_test.csv", index=False)

print("Saved clf/reg splits to", DATA_DIR)


Rows total: 20000
Difficulty counts:
 difficulty_bucket
achieved                 10648
hard                      3995
moderate                  3631
achievable_by_cutting     1231
easy                       495
Name: count, dtype: int64
achievable_by_cutting fraction: 0.32655
goal_achieved fraction: 0.5324
monthly_saving_label stats:
 count    20000.000000
mean      2440.178019
std       4326.414275
min          0.000000
25%          0.000000
50%        136.657356
75%       3309.913526
max      84640.655113
Name: monthly_saving_label, dtype: float64
Classifier shapes: (16000, 10) (4000, 10)
Regression dataset size (non-achieved & label>0): 9344
Regressor shapes: (7475, 10) (1869, 10)
Saved clf/reg splits to /content/data


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

# -------------------------
# 1. Load dataset
# -------------------------
df = pd.read_csv("/content/data/processed_finance_dataset.csv")
print("Full dataset shape:", df.shape)
print("Columns:", df.columns.tolist())

# -------------------------
# 2. Define target
# -------------------------
target_col = "achievable_by_cutting"

# Define safe features dynamically
drop_leakage = [
    target_col, "goal_achieved", "monthly_saving_label", "monthly_saving_required",
    "max_feasible_monthly_saving", "current_savings", "disposable_income",
    "difficulty_bucket", "label_zero_reason"
]

safe_features = [c for c in df.columns if c not in drop_leakage]
print("Using features:", safe_features)

X = df[safe_features]
y = df[target_col]

# -------------------------
# 3. Split train/test
# -------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

# -------------------------
# 4. Preprocessing
# -------------------------
categorical = [c for c in X.columns if df[c].dtype == "object"]
numeric = [c for c in X.columns if c not in categorical]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical)
    ]
)

# -------------------------
# 5. Model
# -------------------------
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("clf", XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        use_label_encoder=False,
        eval_metric="logloss"
    ))
])

# -------------------------
# 6. Train
# -------------------------
pipeline.fit(X_train, y_train)

# -------------------------
# 7. Evaluate
# -------------------------
y_pred = pipeline.predict(X_test)

print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

# -------------------------
# 8. Save model
# -------------------------
joblib.dump(pipeline, "/content/savings_goal_clf.pkl")
print("Saved model to /content/savings_goal_clf.pkl")


Full dataset shape: (20000, 26)
Columns: ['income', 'total_expenses', 'disposable_income', 'current_savings', 'goal_target_amount', 'months_remaining', 'monthly_saving_label', 'monthly_saving_required', 'max_feasible_monthly_saving', 'monthly_saving_label_raw', 'expense_ratio', 'savings_rate', 'financial_health_score', 'goal_achieved', 'total_potential_savings', 'achievable_by_cutting', 'suggested_monthly_plan', 'difficulty_bucket', 'savings_is_synthetic', 'age', 'dependents', 'occupation', 'city_tier', 'education', 'desired_savings_percentage', 'potential_savings_education']
Using features: ['income', 'total_expenses', 'goal_target_amount', 'months_remaining', 'monthly_saving_label_raw', 'expense_ratio', 'savings_rate', 'financial_health_score', 'total_potential_savings', 'suggested_monthly_plan', 'savings_is_synthetic', 'age', 'dependents', 'occupation', 'city_tier', 'education', 'desired_savings_percentage', 'potential_savings_education']
Train shape: (16000, 18) Test shape: (4000, 

In [1]:
import pickle
import pandas as pd
import google.generativeai as genai
import joblib
from google.colab import userdata # Import userdata

# ----------------------------
# 1. Load the trained model
# ----------------------------
# Use joblib.load instead of pickle.load directly
with open("/content/savings_goal_clf.pkl", "rb") as f:
    model = joblib.load(f)

# Define the same feature order used in training
FEATURES = [
    'income', 'total_expenses', 'goal_target_amount', 'months_remaining',
    'monthly_saving_label_raw', 'expense_ratio', 'savings_rate',
    'financial_health_score', 'total_potential_savings',
    'suggested_monthly_plan', 'savings_is_synthetic', 'age', 'dependents',
    'occupation', 'city_tier', 'education', 'desired_savings_percentage',
    'potential_savings_education'
]

# ----------------------------
# 2. Example user input
# ----------------------------
user_input = {
    "income": 40000,
    "total_expenses": 32000,
    "goal_target_amount": 150000,
    "months_remaining": 24,
    "monthly_saving_label_raw": 2000,
    "expense_ratio": 0.8,
    "savings_rate": 0.05,
    "financial_health_score": 2, # Assuming this is a placeholder or needs calculation
    "total_potential_savings": 5000,
    "suggested_monthly_plan": 2500,
    "savings_is_synthetic": 0,
    "age": 30,
    "dependents": 2,
    "occupation": "Professional",
    "city_tier": "Tier_2",
    "education": 16, # Assuming this is years of education or a categorical encoding
    "desired_savings_percentage": 15,
    "potential_savings_education": 500 # Assuming this is a monetary value
}

# Wrap input in DataFrame (1-row, same order of features)
input_df = pd.DataFrame([user_input], columns=FEATURES)

# ----------------------------
# 3. Run prediction
# ----------------------------
prediction = model.predict(input_df)[0]
proba = model.predict_proba(input_df)[0][1]

print(f"Prediction: {prediction} (probability of achievable_by_cutting={proba:.2f})")

# ----------------------------
# 4. Setup Gemini API
# ----------------------------
# Use userdata.get to retrieve the API key from Colab Secrets
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)


model_gemini = genai.GenerativeModel("gemini-1.5-flash")

# Construct a structured prompt
prompt = f"""
You are a financial advisor AI. Interpret the following user financial profile and
the ML model's prediction about whether they can achieve their savings goal by cutting expenses.

User profile:
{user_input}

ML Prediction:
- Achievable by cutting: {bool(prediction)}
- Confidence: {proba:.2f}

Please provide:
1. A short plain-language explanation of the prediction.
2. Practical savings or expense adjustment advice tailored to the profile.
"""

# ----------------------------
# 5. Get insights from Gemini
# ----------------------------
response = model_gemini.generate_content(prompt)

print("\n=== Gemini Insights ===")
print(response.text)

Prediction: 1 (probability of achievable_by_cutting=0.99)

=== Gemini Insights ===
1. **Plain-Language Explanation:**

The AI model predicts with very high confidence (99%) that you can reach your savings goal of $150,000 within 24 months by cutting expenses.  Your current savings rate is low, but the model believes that by making reasonable adjustments to your spending, you can significantly increase your savings and meet your target.

2. **Practical Savings & Expense Adjustment Advice:**

The model suggests a monthly savings plan of $2500.  This is achievable, given your current $8000 surplus ($40,000 income - $32,000 expenses), even though a significant portion of that is currently consumed by your monthly expenses. To reach your goal, here's a step-by-step approach focusing on realistic and practical adjustments:

* **Analyze Spending:**  Carefully track your expenses for at least a month, categorizing everything.  This will highlight areas for potential reductions. Look for subscr