In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import NearestNeighbors
from scipy.stats import ks_2samp, entropy
from sklearn.metrics import normalized_mutual_info_score
from scipy.stats import spearmanr

# ---------- Q scorer (uses your DACV) ----------
def score_quality_Q(real_df: pd.DataFrame, syn_df: pd.DataFrame, target_col="Target"):
    """Score how closely ``syn_df`` reproduces ``real_df`` (higher is better).

    Every column of ``real_df`` except ``target_col`` is scored. Columns with
    a numeric dtype are treated as numeric; everything else (object, string,
    category, ...) is treated as categorical and compared as strings.

    Up to five components are computed and the defined ones are averaged
    with equal weight:

    * ``D_score`` - per numeric column, the mean of ``1 - KS statistic``,
      ``1 - JSD`` (histograms on a shared range, base-2 so it lies in
      [0, 1]) and the fraction of synthetic values inside the real
      min/max range.
    * ``D_cat``   - per categorical column, ``1 - total variation distance``
      between the category frequencies.
    * ``C_num``   - ``1 - mean |Spearman_real - Spearman_syn|`` (needs at
      least two numeric columns).
    * ``C_cat``   - ``1 - mean |NMI_real - NMI_syn|`` (needs at least two
      categorical columns).
    * ``V_score`` - per categorical column, the share of real categories
      that also occur in the synthetic data.

    Args:
        real_df: Reference data.
        syn_df: Synthetic data; must contain the scored columns of
            ``real_df``.
        target_col: Column name excluded from scoring.

    Returns:
        The mean of the components that could be computed, or ``0.0`` when
        none could.
    """
    cols = [c for c in real_df.columns if c != target_col]
    # dtype == "object" misses pandas string/category columns, so classify by
    # "is numeric" instead and treat everything else as categorical.
    num_cols = [c for c in cols if pd.api.types.is_numeric_dtype(real_df[c])]
    numeric = set(num_cols)
    cat_cols = [c for c in cols if c not in numeric]

    # ---- D: marginal distribution similarity ----
    def js_divergence(p, q, base=2.0):
        """Jensen-Shannon divergence of two histograms, clipped to [0, 1]."""
        p = np.asarray(p, dtype=float)
        q = np.asarray(q, dtype=float)
        p = p / (p.sum() + 1e-12)
        q = q / (q.sum() + 1e-12)
        m = 0.5 * (p + q)
        jsd = 0.5 * entropy(p, m, base=base) + 0.5 * entropy(q, m, base=base)
        # Base 2 bounds JSD by 1; the clip only absorbs floating-point noise.
        return float(min(1.0, max(0.0, jsd)))

    def hist_jsd(xr, xs, bins=30, rng=None, smooth=1e-9):
        """JSD between smoothed histograms of ``xr`` and ``xs``."""
        if rng is None:
            pooled = np.concatenate([xr, xs])
            lo, hi = float(np.nanmin(pooled)), float(np.nanmax(pooled))
            if lo == hi:
                hi = lo + 1.0  # avoid a zero-width range
            # Both histograms MUST share one range; otherwise each sample is
            # binned relative to its own min/max and a pure shift or rescale
            # of the synthetic column would look like a perfect match.
            rng = (lo, hi)
        hr, _ = np.histogram(xr, bins=bins, range=rng)
        hs, _ = np.histogram(xs, bins=bins, range=rng)
        pr = (hr + smooth) / (hr.sum() + smooth * bins)
        ps = (hs + smooth) / (hs.sum() + smooth * bins)
        return js_divergence(pr, ps)

    def tvd_from_counts(p, q):
        """Total variation distance between two {category: frequency} dicts."""
        keys = sorted(set(p).union(q), key=str)
        pv = np.array([p.get(k, 0.0) for k in keys], dtype=float)
        qv = np.array([q.get(k, 0.0) for k in keys], dtype=float)
        return 0.5 * np.abs(pv - qv).sum()

    D_parts = []
    for c in num_cols:
        xr = real_df[c].dropna().to_numpy(dtype=float)
        xs = syn_df[c].dropna().to_numpy(dtype=float)
        if xr.size and xs.size:
            ks = ks_2samp(xr, xs).statistic
            jsd = hist_jsd(xr, xs)
            lo, hi = xr.min(), xr.max()
            # Range coverage: share of synthetic values inside the real range.
            rc = float(np.mean((xs >= lo) & (xs <= hi)))
            D_parts.append((1 - ks, 1 - jsd, rc))
    D_score = float(np.mean([np.mean(p) for p in D_parts])) if D_parts else np.nan

    # Categorical values are compared as strings (NaN becomes the text "nan").
    real_cat = real_df[cat_cols].copy().astype(str)
    syn_cat = syn_df[cat_cols].copy().astype(str)
    tvds = []
    for c in cat_cols:
        pr = real_cat[c].value_counts(normalize=True).to_dict()
        ps = syn_cat[c].value_counts(normalize=True).to_dict()
        tvds.append(1.0 - tvd_from_counts(pr, ps))
    D_cat = float(np.mean(tvds)) if tvds else np.nan

    # ---- C: pairwise association similarity ----
    def pairwise_matrix(names, pair_score):
        """Symmetric matrix of ``pair_score(a, b)`` with a unit diagonal."""
        m = len(names)
        out = np.full((m, m), np.nan)
        for i in range(m):
            out[i, i] = 1.0
            for j in range(i + 1, m):
                out[i, j] = out[j, i] = pair_score(names[i], names[j])
        return out

    def assoc_similarity(A, B):
        """1 - mean absolute difference over entries finite in both matrices."""
        # NOTE: the mean runs over the full matrices, so the unit diagonal
        # (difference 0) and both mirrored halves are included.
        mask = np.isfinite(A) & np.isfinite(B)
        return float(1.0 - np.mean(np.abs(A[mask] - B[mask])))

    def spearman_matrix(df):
        return pairwise_matrix(
            num_cols,
            lambda a, b: spearmanr(df[a], df[b], nan_policy="omit")[0],
        )

    def nmi_matrix(df):
        return pairwise_matrix(
            cat_cols,
            lambda a, b: normalized_mutual_info_score(df[a], df[b]),
        )

    C_num = np.nan
    if len(num_cols) >= 2:
        C_num = assoc_similarity(spearman_matrix(real_df), spearman_matrix(syn_df))

    C_cat = np.nan
    if len(cat_cols) >= 2:
        C_cat = assoc_similarity(nmi_matrix(real_cat), nmi_matrix(syn_cat))

    # ---- V: category coverage ----
    coverage = []
    for c in cat_cols:
        real_levels = set(real_cat[c].unique())
        syn_levels = set(syn_cat[c].unique())
        coverage.append(float(np.mean([k in syn_levels for k in real_levels])))
    V_score = float(np.mean(coverage)) if coverage else np.nan

    # ---- aggregate: equal-weight mean of the components that are defined ----
    parts = [x for x in (D_score, D_cat, C_num, C_cat, V_score) if not np.isnan(x)]
    return float(np.mean(parts)) if parts else 0.0

# ---------- P scorer (your DCR, Qδ, I) ----------
def score_privacy_P(real_df: pd.DataFrame, syn_df: pd.DataFrame, target_col="Target"):
    """Score how far ``syn_df`` stays from copying ``real_df`` (higher is better).

    Every column of ``real_df`` except ``target_col`` is used. Columns with a
    numeric dtype are treated as numeric; everything else is one-hot encoded.

    Three component scores in [0, 1] are combined:

    * ``DCR_score`` (weight 0.5) - mean distance from each synthetic row to
      its closest real row, divided by the 95th percentile of those
      distances and clipped to [0, 1].
    * ``Qdelta_score`` (weight 0.3) - ``1 - Q_delta / cap`` where
      ``Q_delta`` is the mean absolute gap between real and synthetic
      quantiles (5%..95%) over numeric columns and ``cap`` is four times the
      median per-column gap.
    * ``I_score`` (weight 0.2) - ``1 - I / 0.10`` where ``I`` averages the
      duplicate rate inside the synthetic data and the share of synthetic
      rows that exactly match a real row (numbers rounded to 6 decimals,
      other values compared as stripped strings).

    A component that cannot be computed (for example ``Qdelta_score`` when
    there is no usable numeric column) is left out and the remaining weights
    are renormalised, so the result is never NaN for that reason.

    Args:
        real_df: Reference data.
        syn_df: Synthetic data; must contain the scored columns of
            ``real_df``.
        target_col: Column name excluded from scoring.

    Returns:
        Weighted mean of the available component scores.
    """
    cols = [c for c in real_df.columns if c != target_col]
    # dtype == "object" misses pandas string/category columns, so classify by
    # "is numeric" instead and treat everything else as categorical.
    num_cols = [c for c in cols if pd.api.types.is_numeric_dtype(real_df[c])]
    numeric = set(num_cols)
    cat_cols = [c for c in cols if c not in numeric]

    def make_ohe():
        # ``sparse_output`` replaced ``sparse`` in newer scikit-learn releases.
        try:
            return OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        except TypeError:
            return OneHotEncoder(handle_unknown="ignore", sparse=False)

    # NOTE(review): numeric columns are passed through unscaled and
    # un-imputed, so distances are dominated by large-magnitude columns.
    enc = ColumnTransformer([
        ("num", "passthrough", num_cols),
        ("cat", make_ohe(), cat_cols),
    ])
    X_real = enc.fit_transform(real_df[cols])
    X_fake = enc.transform(syn_df[cols])

    # ---- DCR: distance from each synthetic row to its closest real row ----
    nn = NearestNeighbors(n_neighbors=1).fit(X_real)
    dists, _ = nn.kneighbors(X_fake)
    dists = dists.ravel()
    DCR_mean = float(np.mean(dists))

    # ---- Q-delta: mean absolute quantile gap per numeric column ----
    quantiles = np.linspace(0.05, 0.95, 19)
    qd_vals = []
    for c in num_cols:
        xr = np.asarray(real_df[c].dropna().values)
        xs = np.asarray(syn_df[c].dropna().values)
        if xr.size and xs.size:
            qr = np.quantile(xr, quantiles)
            qs = np.quantile(xs, quantiles)
            qd_vals.append(float(np.mean(np.abs(qr - qs))))
    Q_delta = float(np.mean(qd_vals)) if qd_vals else float("nan")

    # ---- I: exact-row duplication and real/synthetic overlap ----
    def norm_df(df, cols):
        """Round numbers and stringify everything else for exact matching."""
        out = df[cols].copy()
        for c in cols:
            # pandas dtype predicates also accept extension dtypes, which
            # np.issubdtype rejects with a TypeError. Bool is excluded to
            # keep it on the string path, as np.number does not cover it.
            is_number = (
                pd.api.types.is_numeric_dtype(out[c])
                and not pd.api.types.is_bool_dtype(out[c])
            )
            if is_number:
                out[c] = out[c].round(6)
            else:
                out[c] = out[c].astype(str).str.strip()
        return out

    real_norm = norm_df(real_df, cols)
    fake_norm = norm_df(syn_df, cols)
    n_fake = max(1, len(fake_norm))
    dup_within_synth = float(1.0 - len(fake_norm.drop_duplicates()) / n_fake)
    real_rows = set(map(tuple, real_norm.to_numpy()))
    # Count every synthetic row that matches a real row. Intersecting two
    # sets would count a real row copied many times only once while still
    # dividing by the full number of synthetic rows.
    n_copied = sum(row in real_rows for row in map(tuple, fake_norm.to_numpy()))
    overlap_between = float(n_copied / n_fake)

    # ---- map raw measures to [0, 1] scores ----
    def normalize01(x, lo=0.0, hi=1.0):
        if np.isnan(x):
            return np.nan
        if hi == lo:
            return 0.0
        return float(max(0.0, min(1.0, (x - lo) / (hi - lo))))

    def inv01(x, cap=1.0):
        if np.isnan(x):
            return np.nan
        return float(max(0.0, min(1.0, 1.0 - x / cap)))

    d95 = float(np.percentile(dists, 95)) if len(dists) else 1.0
    DCR_score = normalize01(DCR_mean, lo=0.0, hi=d95 if d95 > 0 else 1.0)

    if qd_vals:
        Qd_cap = float(np.nanmedian(qd_vals)) * 4
        Qdelta_score = inv01(Q_delta, cap=Qd_cap if Qd_cap > 0 else 1.0)
    else:
        Qdelta_score = np.nan  # no numeric column to compare

    I_combined = 0.5 * (dup_within_synth + overlap_between)
    I_score = inv01(I_combined, cap=0.10)

    # Weighted mean over the components that are defined. With all three
    # present the weights sum to 1 and this equals the plain weighted sum.
    weighted = [(0.5, DCR_score), (0.3, Qdelta_score), (0.2, I_score)]
    usable = [(w, s) for w, s in weighted if not np.isnan(s)]
    total_weight = sum(w for w, _ in usable)
    P = sum(w * s for w, s in usable) / total_weight
    return float(P)