In [1]:
# ============== Task 4 — Config =================
# Data sources (prefer the clean engineered CSV you created in Task 1)
USE_CLEAN_FIRST = True
CLEAN_PATH = "loan_clean_subset.csv"             # put the file next to this notebook
RAW_PATH   = "accepted_2007_to_2018Q4.csv"    # if clean not available, we'll preprocess from raw
NROWS_FROM_RAW = 200_000                         # set None for full data if you have RAM

# Re-train inside this notebook (True) vs only evaluate saved artifacts (False)
RETRAIN_DL = True
RETRAIN_RL = True

# Saved artifact locations (if you want to load pre-trained)
DL_MODEL_DIR = "model_mlp_default_risk"          # from your Task-2 Cell 9
DL_MODEL_PATH = f"{DL_MODEL_DIR}/.keras"         # you customized this path in Task 2
RL_ART_DIR    = "offline_rl_cql"                 # from Task-3 Cell 9 if you saved there
RL_MODEL_PATH = f"{RL_ART_DIR}/cql_discrete_model.d3"
PREPROC_PATH  = f"{RL_ART_DIR}/preprocess.joblib"  # sklearn preprocessor saved in Task-3

RANDOM_STATE = 42


In [2]:
# ============== Installs (Jupyter / Colab) =================
# In a fresh environment you may need these:
# (On Windows Jupyter, ensure you're on Python 3.11 environment, not 3.13.)

!pip -q install -U numpy<2.0 pandas scikit-learn matplotlib joblib

# Offline RL stack (d3rlpy + gymnasium + torch). If already installed, this cell will no-op.
# If you see errors, ensure you're in Python 3.11 and re-run.
!pip -q install -U d3rlpy==2.4.0 gymnasium[classic-control]==0.29.1 torch==2.4.1


The system cannot find the file specified.

[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:
import sys, torch
print("Python:", sys.version)
print("Torch :", torch.__version__)
print("Kernel exe:", sys.executable)


Python: 3.11.14 | packaged by Anaconda, Inc. | (main, Oct 21 2025, 18:30:03) [MSC v.1929 64 bit (AMD64)]
Torch : 2.4.1+cpu
Kernel exe: C:\Users\kamya\anaconda3\envs\rl-fintech\python.exe


In [11]:
!pip install tensorflow==2.15.0





[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
# === Minimal setup before PyTorch MLP ===
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load your clean dataset (or point to your CSV)
df = pd.read_csv("loan_clean_subset.csv")   # path to your clean data from Task 1
assert "default" in df.columns, "Dataset must include a 'default' column."

X = df.drop(columns=["default"])
y = df["default"].astype(int).values

# Split train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Identify numeric and categorical columns
num_features = [c for c in X_train.columns if np.issubdtype(X_train[c].dtype, np.number)]
cat_features = [c for c in X_train.columns if not np.issubdtype(X_train[c].dtype, np.number)]

# Define preprocessing pipeline
num_pipe = Pipeline([("imputer", SimpleImputer(strategy="median")),
                     ("scaler", StandardScaler())])
cat_pipe = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")),
                     ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))])

preprocess = ColumnTransformer([
    ("num", num_pipe, num_features),
    ("cat", cat_pipe, cat_features)
])

# Transform to numpy arrays
X_train_np = preprocess.fit_transform(X_train)
X_test_np  = preprocess.transform(X_test)

print("X_train_np shape:", X_train_np.shape)
print("X_test_np shape :", X_test_np.shape)


X_train_np shape: (141612, 88)
X_test_np shape : (35404, 88)


In [17]:
# ==== DL model (PyTorch MLP) — Train & Evaluate AUC/F1 ====
import numpy as np, torch, torch.nn as nn, torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, random_split
from sklearn.metrics import roc_auc_score, f1_score, classification_report, confusion_matrix
import os

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
INPUT_DIM = X_train_np.shape[1]
BATCH_SIZE = 1024
EPOCHS = 30
LR = 1e-3
VAL_SPLIT = 0.15
MODEL_DIR = "model_mlp_default_risk"
MODEL_PATH = os.path.join(MODEL_DIR, "pytorch_mlp.pt")

class MLP(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, 256), nn.ReLU(), nn.Dropout(0.25),
            nn.Linear(256, 128),    nn.ReLU(), nn.Dropout(0.25),
            nn.Linear(128, 64),     nn.ReLU(), nn.Dropout(0.25),
            nn.Linear(64, 1)
        )
    def forward(self, x):
        return self.net(x)  # logits

# Build dataset & split
X_t = torch.tensor(X_train_np, dtype=torch.float32)
y_t = torch.tensor(y_train.reshape(-1,1), dtype=torch.float32)
dataset = TensorDataset(X_t, y_t)
val_size = int(len(dataset)*VAL_SPLIT)
train_size = len(dataset) - val_size
train_ds, val_ds = random_split(dataset, [train_size, val_size], generator=torch.Generator().manual_seed(42))
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_dl   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

model = MLP(INPUT_DIM).to(DEVICE)
opt = torch.optim.Adam(model.parameters(), lr=LR)
bce = nn.BCEWithLogitsLoss()

best_val_auc = -1.0
patience, patience_cnt = 5, 0
os.makedirs(MODEL_DIR, exist_ok=True)

def eval_auc_loss(dl):
    model.eval()
    all_logits, all_y, total_loss = [], [], 0.0
    with torch.no_grad():
        for xb, yb in dl:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            logits = model(xb)
            loss = bce(logits, yb)
            total_loss += loss.item()*len(xb)
            all_logits.append(logits.cpu())
            all_y.append(yb.cpu())
    logits = torch.cat(all_logits).numpy().ravel()
    y_true = torch.cat(all_y).numpy().ravel().astype(int)
    y_proba = 1/(1+np.exp(-logits))
    auc = roc_auc_score(y_true, y_proba)
    return auc, total_loss/len(dl.dataset)

for epoch in range(1, EPOCHS+1):
    model.train()
    for xb, yb in train_dl:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        opt.zero_grad()
        logits = model(xb)
        loss = bce(logits, yb)
        loss.backward()
        opt.step()
    val_auc, val_loss = eval_auc_loss(val_dl)
    print(f"Epoch {epoch:02d} | val_auc={val_auc:.4f} | val_loss={val_loss:.4f}")
    # Early stopping on val AUC
    if val_auc > best_val_auc + 1e-4:
        best_val_auc = val_auc
        torch.save(model.state_dict(), MODEL_PATH)
        patience_cnt = 0
    else:
        patience_cnt += 1
        if patience_cnt >= patience:
            print("Early stopping.")
            break

# Load best and evaluate on TEST
best_model = MLP(INPUT_DIM).to(DEVICE)
best_model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))
best_model.eval()

with torch.no_grad():
    logits_test = best_model(torch.tensor(X_test_np, dtype=torch.float32).to(DEVICE)).cpu().numpy().ravel()
y_proba_dl = 1/(1+np.exp(-logits_test))
y_pred_dl  = (y_proba_dl >= 0.5).astype(int)

auc_dl = roc_auc_score(y_test, y_proba_dl)
f1_dl  = f1_score(y_test, y_pred_dl)

print(f"DL (PyTorch) — Test ROC-AUC: {auc_dl:.4f}")
print(f"DL (PyTorch) — Test F1     : {f1_dl:.4f}")
print("\nDL Classification report (thr=0.5):")
print(classification_report(y_test, y_pred_dl, digits=4))
print("DL Confusion matrix:")
print(confusion_matrix(y_test, y_pred_dl))


Epoch 01 | val_auc=0.7434 | val_loss=0.4426
Epoch 02 | val_auc=0.7456 | val_loss=0.4402
Epoch 03 | val_auc=0.7456 | val_loss=0.4401
Epoch 04 | val_auc=0.7462 | val_loss=0.4403
Epoch 05 | val_auc=0.7467 | val_loss=0.4391
Epoch 06 | val_auc=0.7464 | val_loss=0.4395
Epoch 07 | val_auc=0.7464 | val_loss=0.4397
Epoch 08 | val_auc=0.7457 | val_loss=0.4398
Epoch 09 | val_auc=0.7455 | val_loss=0.4397
Epoch 10 | val_auc=0.7460 | val_loss=0.4403
Early stopping.
DL (PyTorch) — Test ROC-AUC: 0.7464
DL (PyTorch) — Test F1     : 0.2182

DL Classification report (thr=0.5):
              precision    recall  f1-score   support

           0     0.8151    0.9743    0.8876     28199
           1     0.5729    0.1348    0.2182      7205

    accuracy                         0.8035     35404
   macro avg     0.6940    0.5545    0.5529     35404
weighted avg     0.7658    0.8035    0.7514     35404

DL Confusion matrix:
[[27475   724]
 [ 6234   971]]


  best_model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))


In [19]:
# ==== Cell 4: PyTorch MLP — Train & Evaluate (AUC / F1) ====
import os, numpy as np, torch, torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, random_split
from sklearn.metrics import roc_auc_score, f1_score, classification_report, confusion_matrix

DEVICE     = "cuda" if torch.cuda.is_available() else "cpu"
INPUT_DIM  = X_train_np.shape[1]
BATCH_SIZE = 1024
EPOCHS     = 30
LR         = 1e-3
VAL_SPLIT  = 0.15
MODEL_DIR  = "model_mlp_default_risk"
MODEL_PATH = os.path.join(MODEL_DIR, "pytorch_mlp.pt")

class MLP(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, 256), nn.ReLU(), nn.Dropout(0.25),
            nn.Linear(256, 128),    nn.ReLU(), nn.Dropout(0.25),
            nn.Linear(128, 64),     nn.ReLU(), nn.Dropout(0.25),
            nn.Linear(64, 1)
        )
    def forward(self, x):  # returns logits
        return self.net(x)

# Build dataset & split
X_t = torch.tensor(X_train_np, dtype=torch.float32)
y_t = torch.tensor(y_train.reshape(-1,1), dtype=torch.float32)
ds  = TensorDataset(X_t, y_t)
val_sz   = int(len(ds)*VAL_SPLIT)
train_sz = len(ds)-val_sz
train_ds, val_ds = random_split(ds, [train_sz, val_sz], generator=torch.Generator().manual_seed(42))
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_dl   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

model = MLP(INPUT_DIM).to(DEVICE)
opt   = torch.optim.Adam(model.parameters(), lr=LR)
bce   = nn.BCEWithLogitsLoss()

best_auc, patience, wait = -1.0, 5, 0
os.makedirs(MODEL_DIR, exist_ok=True)

def eval_auc(dl):
    model.eval()
    all_logits, all_y, total = [], [], 0
    with torch.no_grad():
        for xb, yb in dl:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            logits = model(xb)
            loss   = bce(logits, yb)
            total += loss.item()*len(xb)
            all_logits.append(logits.cpu())
            all_y.append(yb.cpu())
    logits = torch.cat(all_logits).numpy().ravel()
    ytrue  = torch.cat(all_y).numpy().ravel().astype(int)
    yprob  = 1/(1+np.exp(-logits))
    return roc_auc_score(ytrue, yprob), total/len(dl.dataset)

for ep in range(1, EPOCHS+1):
    model.train()
    for xb, yb in train_dl:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        opt.zero_grad()
        loss = bce(model(xb), yb)
        loss.backward()
        opt.step()
    val_auc, val_loss = eval_auc(val_dl)
    print(f"Epoch {ep:02d} | val_auc={val_auc:.4f} | val_loss={val_loss:.4f}")
    if val_auc > best_auc + 1e-4:
        best_auc, wait = val_auc, 0
        torch.save(model.state_dict(), MODEL_PATH)
    else:
        wait += 1
        if wait >= patience:
            print("Early stopping.")
            break

# Load best and evaluate on TEST
best = MLP(INPUT_DIM).to(DEVICE)
best.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))
best.eval()
with torch.no_grad():
    logits_test = best(torch.tensor(X_test_np, dtype=torch.float32).to(DEVICE)).cpu().numpy().ravel()
y_proba_dl = 1/(1+np.exp(-logits_test))
y_pred_dl  = (y_proba_dl >= 0.5).astype(int)

auc_dl = roc_auc_score(y_test, y_proba_dl)
f1_dl  = f1_score(y_test, y_pred_dl)
print(f"\nDL (PyTorch) — Test ROC-AUC: {auc_dl:.4f}")
print(f"DL (PyTorch) — Test F1     : {f1_dl:.4f}")
print("\nDL Classification report (thr=0.5):")
print(classification_report(y_test, y_pred_dl, digits=4))
print("DL Confusion matrix:")
print(confusion_matrix(y_test, y_pred_dl))


Epoch 01 | val_auc=0.7434 | val_loss=0.4429
Epoch 02 | val_auc=0.7462 | val_loss=0.4409
Epoch 03 | val_auc=0.7470 | val_loss=0.4388
Epoch 04 | val_auc=0.7473 | val_loss=0.4389
Epoch 05 | val_auc=0.7476 | val_loss=0.4383
Epoch 06 | val_auc=0.7472 | val_loss=0.4383
Epoch 07 | val_auc=0.7473 | val_loss=0.4382
Epoch 08 | val_auc=0.7470 | val_loss=0.4392
Epoch 09 | val_auc=0.7478 | val_loss=0.4389
Epoch 10 | val_auc=0.7471 | val_loss=0.4385
Epoch 11 | val_auc=0.7471 | val_loss=0.4384
Epoch 12 | val_auc=0.7468 | val_loss=0.4385
Epoch 13 | val_auc=0.7458 | val_loss=0.4415
Epoch 14 | val_auc=0.7454 | val_loss=0.4396
Early stopping.

DL (PyTorch) — Test ROC-AUC: 0.7453
DL (PyTorch) — Test F1     : 0.2960

DL Classification report (thr=0.5):
              precision    recall  f1-score   support

           0     0.8245    0.9573    0.8860     28199
           1     0.5480    0.2028    0.2960      7205

    accuracy                         0.8037     35404
   macro avg     0.6863    0.5800    0.5

  best.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))


In [21]:
# ==== Cell 5: Offline RL — Build rewards, train CQL (robust), compute EPV ====
import numpy as np, pandas as pd, d3rlpy
from d3rlpy.dataset import MDPDataset

# Business reward function
def compute_reward(approve: int, default_flag: int, loan_amnt: float, int_rate: float) -> float:
    if approve == 0:
        return 0.0
    return (loan_amnt * (int_rate/100.0)) if default_flag == 0 else -float(loan_amnt)

# Check required reward columns exist in your clean file
assert "loan_amnt" in X.columns and "int_rate" in X.columns, \
    "Your clean dataset must include 'loan_amnt' and 'int_rate' columns for reward calculation."

# Build one-step transitions for BOTH actions per train sample
def build_bandit_mdp(X_df, X_np, y):
    obs, act, rew, ter = [], [], [], []
    for i in range(len(y)):
        s  = X_np[i]
        la = float(X_df.iloc[i]["loan_amnt"])
        ir = float(X_df.iloc[i]["int_rate"])
        d  = int(y[i])
        # deny (a=0)
        obs.append(s); act.append(0); rew.append(compute_reward(0, d, la, ir)); ter.append(1.0)
        # approve (a=1)
        obs.append(s); act.append(1); rew.append(compute_reward(1, d, la, ir)); ter.append(1.0)
    return (np.asarray(obs, np.float32),
            np.asarray(act, np.int64),
            np.asarray(rew, np.float32),
            np.asarray(ter, np.float32))

obs_tr, act_tr, rew_tr, ter_tr = build_bandit_mdp(X_train, X_train_np, y_train)
train_dataset = MDPDataset(observations=obs_tr, actions=act_tr, rewards=rew_tr, terminals=ter_tr)

# Construct a CQL agent in a version-robust way
algo = None
try:
    from d3rlpy.algos import DiscreteCQL
    try:
        algo = DiscreteCQL(learning_rate=3e-4, batch_size=2048, n_steps=1, use_gpu=False)
    except TypeError:
        algo = None
except Exception:
    pass

if algo is None:
    try:
        from d3rlpy.algos import DiscreteCQLConfig
        cfg = DiscreteCQLConfig()
        if hasattr(cfg, "create"): algo = cfg.create(device="cpu")
        else:
            from d3rlpy.algos import DiscreteCQL
            algo = DiscreteCQL(cfg, device="cpu")
    except Exception:
        from d3rlpy.algos import CQL, CQLConfig
        cfg = CQLConfig()
        algo = cfg.create(device="cpu") if hasattr(cfg, "create") else CQL(cfg, device="cpu")

# Fit with a robust wrapper that tries multiple signatures
# ---- Minimal trainer for older d3rlpy (positional n_steps only) ----
# Many 1.x builds require: fit(dataset, n_steps) with no keywords.
N_STEPS = 100_000  # you can increase later (e.g., 200k–500k)

print(f"[trainer] Starting CQL training for {N_STEPS:,} steps (positional API).")
algo.fit(train_dataset, N_STEPS)  # <-- positional n_steps only
print("[trainer] Training complete.")


# Greedy action dispatcher across versions
def greedy_action(algo, states: np.ndarray) -> np.ndarray:
    for fn_name in ["predict", "predict_best_action", "predict_action"]:
        if hasattr(algo, fn_name):
            out = getattr(algo, fn_name)(states)
            if isinstance(out, tuple): out = out[0]
            return np.asarray(out, dtype=int)
    raise RuntimeError("No compatible predict method found for greedy action.")

# Evaluate RL on test: empirical Estimated Policy Value (EPV)
a_hat_rl = greedy_action(algo, X_test_np).astype(int)

def row_reward(i, a):
    la = float(X_test.iloc[i]["loan_amnt"])
    ir = float(X_test.iloc[i]["int_rate"])
    d  = int(y_test[i])
    return compute_reward(a, d, la, ir)

rewards_rl    = np.array([row_reward(i, a_hat_rl[i]) for i in range(len(y_test))], dtype=float)
epv_empirical = rewards_rl.mean()
approval_rate = float((a_hat_rl == 1).mean())
approved_paid = int(((a_hat_rl == 1) & (y_test == 0)).sum())
approved_def  = int(((a_hat_rl == 1) & (y_test == 1)).sum())

print(f"\nRL — Estimated Policy Value (empirical test): {epv_empirical:,.2f} per application")
print(f"RL — Approval rate (test): {approval_rate*100:.2f}%")
print(f"RL — Approved & Fully Paid: {approved_paid}")
print(f"RL — Approved & Defaulted : {approved_def}")

# Optional: FQE if available
try:
    from d3rlpy.ope import FQE
    fqe = FQE(algo)
    fqe.fit(train_dataset, n_epochs=5, verbose=False)
    v_est = float(fqe.predict_value(X_test_np).mean())
    print(f"RL — FQE Estimated Policy Value (approx): {v_est:,.2f} per application")
except Exception as e:
    v_est = None
    print("[Info] FQE unavailable/failed; skipping. Reason:", e)


2025-10-29 17:34.00 [info     ] Signatures have been automatically determined. action_signature=Signature(dtype=[dtype('int64')], shape=[(1,)]) observation_signature=Signature(dtype=[dtype('float32')], shape=[(88,)]) reward_signature=Signature(dtype=[dtype('float32')], shape=[(1,)])
2025-10-29 17:34.00 [info     ] Action-space has been automatically determined. action_space=<ActionSpace.DISCRETE: 2>
2025-10-29 17:34.02 [info     ] Action size has been automatically determined. action_size=2
[trainer] Starting CQL training for 100,000 steps (positional API).
2025-10-29 17:34.02 [info     ] dataset info                   dataset_info=DatasetInfo(observation_signature=Signature(dtype=[dtype('float32')], shape=[(88,)]), action_signature=Signature(dtype=[dtype('int64')], shape=[(1,)]), reward_signature=Signature(dtype=[dtype('float32')], shape=[(1,)]), action_space=<ActionSpace.DISCRETE: 2>, action_size=2)
2025-10-29 17:34.02 [info     ] Directory is created at d3rlpy_logs\DiscreteCQL_20251

Epoch 1/10: 100%|████████| 10000/10000 [01:35<00:00, 104.30it/s, loss=2.28e+3, td_loss=2.28e+3, conservative_loss=1.03]


2025-10-29 17:35.38 [info     ] DiscreteCQL_20251029173402: epoch=1 step=10000 epoch=1 metrics={'time_sample_batch': 0.001950192952156067, 'time_algorithm_update': 0.007246652841567993, 'loss': 2284.6669262908936, 'td_loss': 2283.6405488830565, 'conservative_loss': 1.0263784131526947, 'time_step': 0.009455177330970763} step=10000
2025-10-29 17:35.38 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251029173402\model_10000.d3


Epoch 2/10: 100%|█████████| 10000/10000 [01:42<00:00, 97.23it/s, loss=2.27e+3, td_loss=2.27e+3, conservative_loss=1.06]


2025-10-29 17:37.21 [info     ] DiscreteCQL_20251029173402: epoch=2 step=20000 epoch=2 metrics={'time_sample_batch': 0.0017347086429595948, 'time_algorithm_update': 0.008112270545959473, 'loss': 2275.357523045349, 'td_loss': 2274.293320652771, 'conservative_loss': 1.0642012583494187, 'time_step': 0.01013002429008484} step=20000
2025-10-29 17:37.21 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251029173402\model_20000.d3


Epoch 3/10: 100%|█████████| 10000/10000 [01:44<00:00, 95.34it/s, loss=2.29e+3, td_loss=2.29e+3, conservative_loss=1.05]


2025-10-29 17:39.06 [info     ] DiscreteCQL_20251029173402: epoch=3 step=30000 epoch=3 metrics={'time_sample_batch': 0.001788645362854004, 'time_algorithm_update': 0.00826271939277649, 'loss': 2286.6103536483765, 'td_loss': 2285.5583837646486, 'conservative_loss': 1.0519686249405145, 'time_step': 0.010330669593811036} step=30000
2025-10-29 17:39.06 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251029173402\model_30000.d3


Epoch 4/10: 100%|█████████| 10000/10000 [01:41<00:00, 98.77it/s, loss=2.27e+3, td_loss=2.27e+3, conservative_loss=1.06]


2025-10-29 17:40.47 [info     ] DiscreteCQL_20251029173402: epoch=4 step=40000 epoch=4 metrics={'time_sample_batch': 0.0017792885065078736, 'time_algorithm_update': 0.007895888662338257, 'loss': 2273.507225967407, 'td_loss': 2272.443395089722, 'conservative_loss': 1.0638314249038696, 'time_step': 0.009961041808128356} step=40000
2025-10-29 17:40.47 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251029173402\model_40000.d3


Epoch 5/10: 100%|█████████| 10000/10000 [01:47<00:00, 92.59it/s, loss=2.28e+3, td_loss=2.28e+3, conservative_loss=1.07]


2025-10-29 17:42.35 [info     ] DiscreteCQL_20251029173402: epoch=5 step=50000 epoch=5 metrics={'time_sample_batch': 0.001859654927253723, 'time_algorithm_update': 0.008246270608901977, 'loss': 2278.993591516113, 'td_loss': 2277.921621609497, 'conservative_loss': 1.0719685282200575, 'time_step': 0.010522920179367066} step=50000
2025-10-29 17:42.35 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251029173402\model_50000.d3


Epoch 6/10: 100%|█████████| 10000/10000 [01:51<00:00, 89.99it/s, loss=2.27e+3, td_loss=2.27e+3, conservative_loss=1.07]


2025-10-29 17:44.26 [info     ] DiscreteCQL_20251029173402: epoch=6 step=60000 epoch=6 metrics={'time_sample_batch': 0.001890283966064453, 'time_algorithm_update': 0.008564613676071166, 'loss': 2273.804710974121, 'td_loss': 2272.7331754089355, 'conservative_loss': 1.0715356320679188, 'time_step': 0.010842738986015319} step=60000
2025-10-29 17:44.26 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251029173402\model_60000.d3


Epoch 7/10: 100%|█████████| 10000/10000 [01:49<00:00, 91.08it/s, loss=2.27e+3, td_loss=2.27e+3, conservative_loss=1.08]


2025-10-29 17:46.16 [info     ] DiscreteCQL_20251029173402: epoch=7 step=70000 epoch=7 metrics={'time_sample_batch': 0.001917043948173523, 'time_algorithm_update': 0.008452614426612853, 'loss': 2269.1253551116943, 'td_loss': 2268.0493310821535, 'conservative_loss': 1.0760244467437268, 'time_step': 0.010736567091941833} step=70000
2025-10-29 17:46.16 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251029173402\model_70000.d3


Epoch 8/10: 100%|█████████| 10000/10000 [01:44<00:00, 95.55it/s, loss=2.28e+3, td_loss=2.28e+3, conservative_loss=1.08]


2025-10-29 17:48.01 [info     ] DiscreteCQL_20251029173402: epoch=8 step=80000 epoch=8 metrics={'time_sample_batch': 0.0018487441778182983, 'time_algorithm_update': 0.008175725674629212, 'loss': 2282.2005809188845, 'td_loss': 2281.119483039856, 'conservative_loss': 1.081098737603426, 'time_step': 0.010303324151039123} step=80000
2025-10-29 17:48.01 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251029173402\model_80000.d3


Epoch 9/10: 100%|█████████| 10000/10000 [01:43<00:00, 96.60it/s, loss=2.27e+3, td_loss=2.27e+3, conservative_loss=1.07]


2025-10-29 17:49.45 [info     ] DiscreteCQL_20251029173402: epoch=9 step=90000 epoch=9 metrics={'time_sample_batch': 0.0018027489900588989, 'time_algorithm_update': 0.008112362003326415, 'loss': 2266.9785000457764, 'td_loss': 2265.9045291030884, 'conservative_loss': 1.0739714259356261, 'time_step': 0.01019034698009491} step=90000
2025-10-29 17:49.45 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251029173402\model_90000.d3


Epoch 10/10: 100%|██████████| 10000/10000 [01:42<00:00, 97.57it/s, loss=2.3e+3, td_loss=2.3e+3, conservative_loss=1.08]


2025-10-29 17:51.27 [info     ] DiscreteCQL_20251029173402: epoch=10 step=100000 epoch=10 metrics={'time_sample_batch': 0.0018259360551834106, 'time_algorithm_update': 0.007992677760124207, 'loss': 2296.7082074401856, 'td_loss': 2295.631825073242, 'conservative_loss': 1.0763824317485093, 'time_step': 0.01009698393344879} step=100000
2025-10-29 17:51.27 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251029173402\model_100000.d3
[trainer] Training complete.

RL — Estimated Policy Value (empirical test): -1,248.34 per application
RL — Approval rate (test): 92.79%
RL — Approved & Fully Paid: 27030
RL — Approved & Defaulted : 5821
[Info] FQE unavailable/failed; skipping. Reason: _FQEBase.__init__() missing 1 required positional argument: 'config'


In [22]:
# ==== Cell 6: Policy comparison — find disagreement examples ====
import pandas as pd
# DL policy: approve if predicted default prob < 0.5
dl_approve = (y_proba_dl < 0.5).astype(int)
rl_approve = a_hat_rl

disagree_idx = np.where(dl_approve != rl_approve)[0]
print(f"Total disagreements: {len(disagree_idx)}")

show_cols = ["loan_amnt","int_rate","annual_inc","dti","emp_length","revol_util","purpose"]
cols_present = [c for c in show_cols if c in X_test.columns]

def rl_row_reward(i):
    return row_reward(i, rl_approve[i])

rows = []
for i in disagree_idx[:20]:  # display up to 20 examples
    r = {
        "idx": i,
        "DL_policy": "Approve" if dl_approve[i]==1 else "Deny",
        "RL_policy": "Approve" if rl_approve[i]==1 else "Deny",
        "True_Default": int(y_test[i]),
        "DL_PD": float(y_proba_dl[i]),
        "RL_reward_if_taken": rl_row_reward(i),
    }
    for c in cols_present:
        r[c] = X_test.iloc[i][c]
    rows.append(r)

ex_df = pd.DataFrame(rows)
pd.set_option('display.max_colwidth', None)
display(ex_df.head(10))


Total disagreements: 697


Unnamed: 0,idx,DL_policy,RL_policy,True_Default,DL_PD,RL_reward_if_taken,loan_amnt,int_rate,annual_inc,dti,emp_length,revol_util,purpose
0,93,Approve,Deny,1,0.472188,0.0,19075.0,17.57,53000.0,22.12,9.0,28.6,debt_consolidation
1,94,Deny,Approve,1,0.501262,-20000.0,20000.0,18.25,62500.0,19.49,10.0,77.1,other
2,112,Approve,Deny,0,0.480644,0.0,20500.0,19.19,57000.0,32.19,0.5,29.7,debt_consolidation
3,195,Approve,Deny,0,0.477234,0.0,21000.0,18.25,80000.0,18.24,3.0,48.7,other
4,224,Approve,Deny,1,0.47588,0.0,15000.0,15.61,100300.0,21.09,4.0,67.1,credit_card
5,330,Deny,Approve,1,0.548874,-20000.0,20000.0,13.67,72000.0,14.7,10.0,72.0,debt_consolidation
6,390,Deny,Approve,1,0.52558,-15300.0,15300.0,18.55,38000.0,17.72,10.0,50.0,medical
7,428,Deny,Approve,0,0.504016,4797.6,24000.0,19.99,70000.0,21.45,10.0,51.8,debt_consolidation
8,519,Deny,Approve,1,0.520685,-33425.0,33425.0,14.65,200000.0,14.3,6.0,8.1,debt_consolidation
9,584,Deny,Approve,1,0.538829,-33100.0,33100.0,24.24,72000.0,23.77,10.0,64.3,debt_consolidation


In [23]:
# ==== Cell 7: Final report — auto-filled text you can paste ====
import textwrap

lines = []
lines.append("FINAL ANALYSIS — Default Risk Classifier (DL, PyTorch) vs Profit-Seeking Policy (Offline RL, CQL)")
lines.append("")
lines.append("1) Key Results on Held-Out Test Set")
lines.append(f"   • Deep Learning (DL): ROC-AUC = {auc_dl:.4f}, F1@0.5 = {f1_dl:.4f}")
lines.append(f"   • Offline RL (CQL): Estimated Policy Value (empirical) = {epv_empirical:,.2f} per application")
if 'v_est' in locals() and v_est is not None:
    lines.append(f"                   FQE Estimated Policy Value (approx) = {v_est:,.2f} per application")
lines.append(f"   • RL Approval rate: {approval_rate*100:.2f}%; Approved Paid: {approved_paid}, Approved Defaulted: {approved_def}")
lines.append("")
lines.append("2) Metric Rationale")
lines.append("   • DL metrics — AUC captures ranking quality across thresholds; F1 summarizes precision/recall at the chosen operating point on imbalanced data.")
lines.append("   • RL metric — Estimated Policy Value (EPV) is the average per-application reward under the learned policy (profit minus loss), aligning directly to business ROI.")
lines.append("")
lines.append("3) Policy Comparison & Insights")
lines.append(f"   • Disagreements: {len(disagree_idx)} cases where DL and RL differ.")
lines.append("   • RL can approve some DL-flagged ‘risky’ applicants when expected interest (loan_amnt × int_rate) outweighs learned default risk → positive expected profit.")
lines.append("   • RL can deny some DL-approvals when upside is weak (low interest) or features signal higher loss risk → negative expected profit avoided.")
lines.append("")
lines.append("4) Future Steps (Practical Plan)")
lines.append("   • Economic thresholding for DL: tune threshold to maximize expected value, not just F1; calibrate probabilities (Platt/Isotonic).")
lines.append("   • Reward realism: include fees, LGD<1, recoveries, prepayments, funding/servicing costs, and time value (NPV).")
lines.append("   • Portfolio constraints & fairness: min/max approval by segment, risk caps, bias audits; deploy with guardrails.")
lines.append("   • OPE hardening: log behavior propensities; consider IPS/DR estimators; FQE with uncertainty bands.")
lines.append("   • Deployment: ship calibrated DL as champion; run RL as challenger (shadow/A/B) with tight drift monitoring.")
lines.append("   • Data wishlist: bureau aggregates, trend features, fraud/device signals, application channel, macro-at-issue, recovery details.")
lines.append("   • Algorithms: XGBoost/LightGBM/CatBoost, TabNet, Wide&Deep; RL: IQL/BCQ, conservative model-based RL, constrained optimization for portfolio targets.")
lines.append("")
lines.append("5) Limitations")
lines.append("   • One-step bandit framing ignores payment timing & partial recoveries; reward assumptions drive outcomes.")
lines.append("   • Offline RL value depends on logged data coverage; distribution shift and rare outcomes can bias estimates.")
lines.append("   • Empirical EPV assumes the test set well-represents deployment distribution; monitor and recalibrate regularly.")
print(textwrap.dedent("\n".join(lines)))


FINAL ANALYSIS — Default Risk Classifier (DL, PyTorch) vs Profit-Seeking Policy (Offline RL, CQL)

1) Key Results on Held-Out Test Set
   • Deep Learning (DL): ROC-AUC = 0.7453, F1@0.5 = 0.2960
   • Offline RL (CQL): Estimated Policy Value (empirical) = -1,248.34 per application
   • RL Approval rate: 92.79%; Approved Paid: 27030, Approved Defaulted: 5821

2) Metric Rationale
   • DL metrics — AUC captures ranking quality across thresholds; F1 summarizes precision/recall at the chosen operating point on imbalanced data.
   • RL metric — Estimated Policy Value (EPV) is the average per-application reward under the learned policy (profit minus loss), aligning directly to business ROI.

3) Policy Comparison & Insights
   • Disagreements: 697 cases where DL and RL differ.
   • RL can approve some DL-flagged ‘risky’ applicants when expected interest (loan_amnt × int_rate) outweighs learned default risk → positive expected profit.
   • RL can deny some DL-approvals when upside is weak (low in

In [24]:
# ==== Cell 8 (optional): Save artifacts ====
import joblib, d3rlpy, os

# Save DL model (PyTorch)
os.makedirs("model_mlp_default_risk", exist_ok=True)
torch.save(best.state_dict(), "model_mlp_default_risk/pytorch_mlp.pt")

# Save preprocessor for serving (so you can transform raw inputs the same way)
os.makedirs("offline_rl_cql", exist_ok=True)
joblib.dump(preprocess, "offline_rl_cql/preprocess.joblib")

# Save RL policy
try:
    algo.save("offline_rl_cql/cql_discrete_model.d3")
    print("Saved RL policy to offline_rl_cql/cql_discrete_model.d3")
except Exception as e:
    print("Could not save RL policy (ok to ignore for now):", e)
print("Saved preprocessor to offline_rl_cql/preprocess.joblib")
print("Saved DL (PyTorch) to model_mlp_default_risk/pytorch_mlp.pt")


Saved RL policy to offline_rl_cql/cql_discrete_model.d3
Saved preprocessor to offline_rl_cql/preprocess.joblib
Saved DL (PyTorch) to model_mlp_default_risk/pytorch_mlp.pt
