In [1]:
# %pip install kagglehub
# %pip install pandas
# %pip install pennylane
# %pip install torch
# %pip install scikit-learn
# %pip install matplotlib

In [2]:
import numpy as np
import pandas as pd
import os
import re
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, WeightedRandomSampler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (precision_recall_curve, average_precision_score, average_precision_score, f1_score, balanced_accuracy_score,
                             roc_curve, auc, brier_score_loss,
                             classification_report, confusion_matrix, accuracy_score)
from sklearn.calibration import calibration_curve
from sklearn.model_selection import train_test_split
import pennylane as qml
from tqdm import tqdm
import pennylane as qml
from tqdm import tqdm
from pathlib import Path
import kagglehub

import warnings
warnings.filterwarnings("ignore")

DATA PREPROCESS

In [3]:
# Robust cleaners
def clean_dataframe(df: pd.DataFrame, *, drop_high_nan=True, high_nan_ratio=0.95) -> pd.DataFrame:
    """General cleaner: empty strings -> NaN, drop all-empty cols, optional high-NaN drop,
    then median/mode impute. Also removes +/-inf."""
    d = df.copy()

    for c in d.columns:
        if d[c].dtype == "O":
            d[c] = d[c].astype(str).str.strip()
    d = d.replace(r'^\s*$', np.nan, regex=True)
    d = d.dropna(axis=1, how="all")
    d = d.replace([np.inf, -np.inf], np.nan)
    if drop_high_nan and len(d):
        nan_ratio = d.isna().mean()
        keep_cols = nan_ratio[nan_ratio <= high_nan_ratio].index
        d = d[keep_cols]

    for c in d.select_dtypes(include=[np.number]).columns:
        if d[c].isna().any():
            med = d[c].median()
            d[c] = d[c].fillna(med if pd.notna(med) else 0.0)

    for c in d.select_dtypes(exclude=[np.number]).columns:
        if d[c].isna().any():
            mode_vals = d[c].mode(dropna=True)
            fillv = mode_vals.iloc[0] if not mode_vals.empty else "missing"
            d[c] = d[c].fillna(fillv)

    return d


def _to_num01(s: pd.Series) -> pd.Series:
    s = s.astype(str).str.strip().str.lower()
    if s.isin(["yes","no"]).any():
        return (s == "yes").astype(float)
    if s.isin(["true","false"]).any():
        return (s == "true").astype(float)
    with np.errstate(all="ignore"):
        v = pd.to_numeric(s, errors="coerce")
    return v


def build_provider_agg(df, provider_col="Provider"):
    """Aggregate claim-level rows to provider-level features with robust cleaning."""
    if provider_col not in df.columns:
        return None
    d = clean_dataframe(df)
    for c in d.columns:
        if d[c].dtype == "O":
            lc = str(c).lower()
            if any(k in lc for k in ["chronic", "fraud", "yes", "no", "diabetes", "alzheimer", "heart", "cancer"]):
                d[c] = _to_num01(d[c])

    for c in d.columns:
        cl = c.lower()
        if any(k in cl for k in ["admit", "adm", "disch", "dsch", "date"]):
            try:
                d[c] = pd.to_datetime(d[c], errors="coerce")
            except Exception:
                pass

    adm = next((c for c in d.columns if re.search("admit|adm", c, re.I)), None)
    dis = next((c for c in d.columns if re.search("disch|dsch|dis", c, re.I)), None)
    if adm and dis and np.issubdtype(d[adm].dtype, np.datetime64) and np.issubdtype(d[dis].dtype, np.datetime64):
        d["_LOS_"] = (d[dis] - d[adm]).dt.days.astype("float")
    else:
        d["_LOS_"] = np.nan

    id_like = ("claimid","claimnumber","patientid","beneid","memberid","claim_id","claim_number","patient_id","bene_id")
    num_cols_local = []
    for c in d.columns:
        if c == provider_col: 
            continue
        cl = c.lower()
        if any(tok in cl for tok in id_like):
            continue
        if pd.api.types.is_numeric_dtype(d[c]):
            num_cols_local.append(c)

    grp = d.groupby(provider_col, dropna=False)
    feats = pd.DataFrame(index=grp.size().index)
    feats.index.name = provider_col

    feats["n_rows"] = grp.size().astype(float)
    if "BeneID" in d.columns:
        feats["n_unique_bene"] = grp["BeneID"].nunique().astype(float)

    feats["los_mean"] = grp["_LOS_"].mean()
    feats["los_std"]  = grp["_LOS_"].std()

    money_cols = [c for c in d.columns if pd.api.types.is_numeric_dtype(d[c]) and re.search("reimb|paid|pay|amount|amt|charge", c, re.I)]
    pick_cols = money_cols if money_cols else num_cols_local[:10]

    for c in pick_cols:
        g = grp[c]
        feats[f"{c}_sum"]  = g.sum()
        feats[f"{c}_mean"] = g.mean()
        feats[f"{c}_std"]  = g.std()
        feats[f"{c}_min"]  = g.min()
        feats[f"{c}_max"]  = g.max()

    for c in ["DiagnosisGroupCode","ClaimType","AttendingPhysician","OperatingPhysician","Gender","Race"]:
        if c in d.columns:
            feats[f"{c}_nunique"] = grp[c].nunique()

    feats = feats.replace([np.inf, -np.inf], np.nan)
    feats = clean_dataframe(feats, drop_high_nan=False)  # small cleanup post-agg
    return feats



FRAUD_KAGGLE_SLUGS = [
    "rohitrox/healthcare-provider-fraud-detection-analysis",
    "shivamb/healthcare-provider-fraud-detection",
    "luisfredgs/healthcare-provider-fraud-detection-analysis",
    "govindkrishnareddy/healthcare-provider-fraud-detection-analysis",
]


fraud_dir = None
for slug in FRAUD_KAGGLE_SLUGS:
    try:
        fraud_dir = kagglehub.dataset_download(slug)
        print("Kaggle dataset found:", slug)
        break
    except Exception as e:
        print("Tried:", slug, "->", e)
assert fraud_dir is not None, "Could not download provider fraud dataset from Kaggle with known slugs."



data_path = Path(fraud_dir)
csv_files = list(data_path.rglob("*.csv"))
assert csv_files, f"No CSVs found under {data_path}"

TARGET_CANDIDATES = ["PotentialFraud", "Fraud", "fraud", "IsFraud", "FraudFound_P", "target"]
df_fraud = None
target_col = None
SRC_CSV = None

for csvp in csv_files:
    try:
        raw = pd.read_csv(csvp)
        dft = clean_dataframe(raw)
        for tcol in TARGET_CANDIDATES:
            if tcol in dft.columns:
                df_fraud  = dft
                target_col= tcol
                SRC_CSV   = csvp
                break
        if df_fraud is not None:
            break
    except Exception:
        continue

assert df_fraud is not None, f"None of the CSVs had a known fraud target column {TARGET_CANDIDATES}"

display(df_fraud.head())


def normalize_target(col):
    if col.dtype == "O":
        s = col.astype(str).str.strip().str.lower()
        if s.isin(["yes","no"]).any():     return (s == "yes").astype(int)
        if s.isin(["true","false"]).any(): return (s == "true").astype(int)
        if s.isin(["fraud","nonfraud","not_fraud","legit"]).any():
            return s.isin(["fraud"]).astype(int)
        try:
            return pd.to_numeric(s, errors="coerce").fillna(0).astype(int)
        except Exception:
            return (s == s.unique()[-1]).astype(int)
    v = pd.to_numeric(col, errors="coerce").fillna(0)
    if set(np.unique(v)) - {0,1}:
        mv = v.mode().iloc[0]
        v = (v != mv).astype(int)
    return v


y_bin = normalize_target(df_fraud[target_col]).astype(int)
y_labels = y_bin.map({0: "nonfraud", 1: "fraud"}).astype("category")

X_fraud = df_fraud.drop(columns=[target_col]).copy()


X_train, X_temp, y_train, y_temp = train_test_split(
    X_fraud, y_labels, test_size=0.30, stratify=y_labels, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42
)

num_cols = [c for c in X_fraud.columns if pd.api.types.is_numeric_dtype(X_fraud[c])]
cat_cols = [c for c in X_fraud.columns if c not in num_cols]


csv_paths = list(data_path.rglob("*.csv"))
provider_features = None

for p in csv_paths:
    try:
        dfp_raw = pd.read_csv(p)
        if "Provider" not in dfp_raw.columns:
            continue
        dfp = clean_dataframe(dfp_raw)
        agg = build_provider_agg(dfp, provider_col="Provider")
        if agg is None or agg.empty:
            continue
        agg.columns = [f"{p.stem}__{c}" for c in agg.columns]
        agg = agg.reset_index()
        provider_features = agg if provider_features is None else provider_features.merge(agg, on="Provider", how="outer")
    except Exception:
        continue

assert provider_features is not None and not provider_features.empty, "No provider-level features could be built."


labels_df = df_fraud[[target_col]].copy()
labels_df = labels_df.rename(columns={target_col: "FraudLabel"})
labels_df["FraudLabel"] = (normalize_target(labels_df["FraudLabel"]) > 0).astype(int)
labels_df["Provider"] = df_fraud["Provider"] if "Provider" in df_fraud.columns else df_fraud.iloc[:, 0]

data_all = labels_df.merge(provider_features, on="Provider", how="left")
data_all = data_all.drop(columns=["Provider"])

data_all = clean_dataframe(data_all, drop_high_nan=True, high_nan_ratio=0.98)

y_bin = data_all["FraudLabel"].astype(int).values
X_all = data_all.drop(columns=["FraudLabel"]).copy()

print("Provider feature matrix (post-clean):", X_all.shape)


Xtr, Xte, ytr, yte = train_test_split(X_all, y_bin, test_size=0.20, stratify=y_bin, random_state=42)
Xtr, Xva, ytr, yva = train_test_split(Xtr, ytr, test_size=0.25, stratify=ytr, random_state=42)

scaler = StandardScaler()
Xtr = scaler.fit_transform(Xtr)
Xva = scaler.transform(Xva)
Xte = scaler.transform(Xte)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Xt = torch.tensor(Xtr, dtype=torch.float32)
Xv = torch.tensor(Xva, dtype=torch.float32)
Xs = torch.tensor(Xte, dtype=torch.float32)
yt_t = torch.tensor(ytr, dtype=torch.long)
yv_t = torch.tensor(yva, dtype=torch.long)
ys_t = torch.tensor(yte, dtype=torch.long)


counts = np.bincount(ytr, minlength=2).astype(float); counts[counts==0] = 1
class_weights = torch.tensor((counts.sum()/(2*counts)).astype(np.float32))
sample_weights = class_weights[yt_t].double()


train_loader = DataLoader(
    TensorDataset(Xt, yt_t), batch_size=64,
    sampler=WeightedRandomSampler(sample_weights, len(sample_weights), replacement=True)
)


val_loader   = DataLoader(TensorDataset(Xv, yv_t), batch_size=64, shuffle=False)
test_loader  = DataLoader(TensorDataset(Xs, ys_t), batch_size=64, shuffle=False)

Kaggle dataset found: rohitrox/healthcare-provider-fraud-detection-analysis


Unnamed: 0,Provider,PotentialFraud
0,PRV51001,No
1,PRV51003,Yes
2,PRV51004,No
3,PRV51005,Yes
4,PRV51007,No


Provider feature matrix (post-clean): (5410, 37)


Quantum-Hybrid QNN

In [4]:
qdev = qml.device("default.qubit", wires=6)

@qml.qnode(qdev, interface="torch")
def q_circuit(inputs, weights):
    # angle embedding into 6 wires
    for i in range(6):
        qml.RY(inputs[i % inputs.shape[0]], wires=i)
    qml.templates.BasicEntanglerLayers(weights, wires=range(6))
    return [qml.expval(qml.PauliZ(i)) for i in range(2)]

q_layer1 = qml.qnn.TorchLayer(q_circuit, {"weights": (4,6)}).to(device)
q_layer2 = qml.qnn.TorchLayer(q_circuit, {"weights": (4,6)}).to(device)


class HybridQuantumFraudQNNModel(nn.Module):
    def __init__(self, in_dim, hid=64, p=0.2):
        super().__init__()
        self.fc1 = nn.Linear(in_dim, hid)
        self.bn1 = nn.BatchNorm1d(hid)
        self.drop = nn.Dropout(p)
        self.fc2 = nn.Linear(hid, 6)

        self.qnn1 = q_layer1
        self.qnn2 = q_layer2

        self.out = nn.Linear(2, 2)

    def forward(self, x):
        x = self.drop(torch.relu(self.bn1(self.fc1(x))))
        x = torch.relu(self.fc2(x))
        x = torch.stack([self.qnn1(xi) for xi in x], dim=0)
        x = torch.stack([self.qnn2(xi) for xi in x], dim=0)

        return self.out(x)

Train

In [5]:
def train_fraud_model(model, train_loader, val_loader,
                      epochs=10, patience=4, lr=3e-4, wd=1e-5,
                      class_weights=None):
    model.to(device)
    optim = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    crit  = nn.CrossEntropyLoss(
        weight=class_weights.to(device) if class_weights is not None else None,
        label_smoothing=0.03
    )

    best_ap, best_state, best_thr = -np.inf, None, 0.5
    patience_ctr = 0

    for ep in range(1, epochs+1):
        # -------- Train --------
        model.train()
        loop = tqdm(train_loader, desc=f"Epoch [{ep}/{epochs}]", leave=False)
        for xb, yb in loop:
            xb, yb = xb.to(device), yb.to(device)
            optim.zero_grad()
            logits = model(xb)
            loss = crit(logits, yb)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optim.step()
            loop.set_postfix(loss=float(loss))

        # -------- Validate --------
        model.eval()
        probs, true = [], []
        with torch.no_grad():
            for xv, yv in val_loader:
                xv = xv.to(device)
                p = torch.softmax(model(xv), dim=1)[:, 1]  # P(fraud)
                probs.append(p.cpu().numpy()); true.append(yv.numpy())
        probs = np.concatenate(probs)
        true  = np.concatenate(true).astype(int)

        # monitor AP(fraud)
        ap = average_precision_score(true, probs)

        # threshold tuning for macro-F1 (finer grid)
        thr_grid = np.linspace(0.01, 0.99, 1000)
        f1s   = [f1_score(true, (probs >= t).astype(int), average="macro") for t in thr_grid]
        baccs = [balanced_accuracy_score(true, (probs >= t).astype(int)) for t in thr_grid]

        i_f1       = int(np.nanargmax(f1s))
        t_best     = float(thr_grid[i_f1])
        f1_best    = float(f1s[i_f1])
        bacc_best  = float(baccs[i_f1])

        print(
            f"Epoch {ep} | Val AP(fraud)={ap:.3f} | "
            f"Best macro-F1={f1_best:.3f} @ Thr={t_best:.3f} | "
            f"Balanced Acc={bacc_best:.3f}"
        )

        # early stopping on AP(fraud)
        if ap > best_ap + 1e-4:
            best_ap    = ap
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            best_thr   = t_best
            patience_ctr = 0
        else:
            patience_ctr += 1
            if patience_ctr >= patience:
                print("Early stopping.")
                break

    if best_state:
        model.load_state_dict(best_state)
    print(f"Best Val AP(fraud)={best_ap:.3f} with threshold={best_thr:.3f}")
    return best_thr

model = HybridQuantumFraudQNNModel(in_dim=Xtr.shape[1], hid=64, p=0.2)

best_thr = train_fraud_model(
    model, train_loader, val_loader,
    epochs=5, patience=4, class_weights=class_weights
)

                                                                       

Epoch 1 | Val AP(fraud)=0.507 | Best macro-F1=0.514 @ Thr=0.324 | Balanced Acc=0.519


                                                                       

Epoch 2 | Val AP(fraud)=0.509 | Best macro-F1=0.557 @ Thr=0.337 | Balanced Acc=0.544


                                                                        

Epoch 3 | Val AP(fraud)=0.822 | Best macro-F1=0.926 @ Thr=0.349 | Balanced Acc=0.941


                                                                        

Epoch 4 | Val AP(fraud)=0.983 | Best macro-F1=0.981 @ Thr=0.364 | Balanced Acc=0.979


                                                                        

Epoch 5 | Val AP(fraud)=0.999 | Best macro-F1=0.992 @ Thr=0.378 | Balanced Acc=0.990
Best Val AP(fraud)=0.999 with threshold=0.378


Evaluate on test

In [6]:
def eval_with_threshold(model, loader, thr=0.5, title="Model (Test)"):
    thr = float(np.atleast_1d(thr)[0])

    model.eval()
    probs, true = [], []
    with torch.no_grad():
        for xb, yb in loader:
            xb = xb.to(device)
            logits = model(xb)
            p = torch.softmax(logits, dim=1)[:, 1]
            probs.append(p.cpu().numpy()); true.append(yb.numpy())
    probs = np.concatenate(probs).ravel()
    true  = np.concatenate(true).astype(int).ravel()

    fpr, tpr, _ = roc_curve(true, probs, pos_label=1)
    rocA = auc(fpr, tpr)
    pr, rc, _ = precision_recall_curve(true, probs, pos_label=1)
    ap = average_precision_score(true, probs)
    pt, pp = calibration_curve(true, probs, n_bins=10, strategy="uniform")
    brier = brier_score_loss(true, probs)

    print(f"ROC-AUC={rocA:.3f} | PR-AP={ap:.3f} | Brier={brier:.3f}")

    pred = (probs >= thr).astype(int)
    print("Threshold:", thr)
    print("Accuracy:", f"{accuracy_score(true, pred)*100:.2f}%")
    print("\nClassification report:\n",
          classification_report(true, pred, target_names=["nonfraud","fraud"]))
    print("Confusion matrix:\n", confusion_matrix(true, pred))

    return {"roc_auc": float(rocA), "pr_ap": float(ap), "brier": float(brier)}, probs, true, pred

eval_with_threshold(model, test_loader, thr=best_thr)

ROC-AUC=1.000 | PR-AP=0.998 | Brier=0.164
Threshold: 0.37786786786786786
Accuracy: 99.72%

Classification report:
               precision    recall  f1-score   support

    nonfraud       1.00      1.00      1.00       981
       fraud       0.98      0.99      0.99       101

    accuracy                           1.00      1082
   macro avg       0.99      0.99      0.99      1082
weighted avg       1.00      1.00      1.00      1082

Confusion matrix:
 [[979   2]
 [  1 100]]


({'roc_auc': 0.9998082377045043,
  'pr_ap': 0.9980634810025565,
  'brier': 0.16402428698441854},
 array([0.3757431 , 0.37572235, 0.3757747 , ..., 0.37571874, 0.37571317,
        0.37579125], dtype=float32),
 array([0, 0, 0, ..., 0, 0, 0]),
 array([0, 0, 0, ..., 0, 0, 0]))

Saving

In [7]:
os.makedirs("./insurance_artifacts/models", exist_ok=True)
torch.save(model.state_dict(), "./insurance_artifacts/models/insurance_fraud_hybrid_quantum.pt")
print("Saved hybrid QNN to ./insurance_artifacts/models/insurance_fraud_hybrid_quantum.pt")

Saved hybrid QNN to ./insurance_artifacts/models/insurance_fraud_hybrid_quantum.pt


Upload model

In [8]:
# uploaded_fraud = torch.load("./models/insurance_fraud_hybrid_quantum.pt", map_location="cpu")