# Evaluation Functions

### Data Fidelity

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
from scipy.stats import wasserstein_distance, entropy, pearsonr, gaussian_kde
from scipy.spatial.distance import jensenshannon

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Wasserstein Distance
def compute_wasserstein_distance(real_data, synthetic_data):
    """Compute Wasserstein Distance for all numerical features."""
    WS_distance_list = []
    for col in real_data.columns:
        if np.issubdtype(real_data[col].dtype, np.number):  # Check if the column is numerical
            wsd = wasserstein_distance(real_data[col].dropna().values, synthetic_data[col].dropna().values) # compute WSD
            WS_distance_list.append(wsd)
    return np.mean(WS_distance_list)

In [3]:
# # Jensen-Shannon Divergence (JSD) - for categorical data
# def compute_jsd_categorical(real_data, synthetic_data):
#     """Compute Jensen-Shannon Divergence (JSD) for all categorical features."""
#     jsd_list = []
#     for col in real_data.columns:
#         if not np.issubdtype(real_data[col].dtype, np.number): # categorical only
#             real_freq = real_data[col].value_counts(normalize=True).sort_index()
#             synth_freq = synthetic_data[col].value_counts(normalize=True).sort_index()
            
#             # Align the distributions to the same set of categories
#             all_categories = real_freq.index.union(synth_freq.index)
#             real_prob = real_freq.reindex(all_categories, fill_value=0).values
#             synth_prob = synth_freq.reindex(all_categories, fill_value=0).values
            
#             # Compute Jensen-Shannon Divergence (distance)
#             jsd = jensenshannon(real_prob, synth_prob, base=2)
#             jsd_list[col] = jsd
#     return np.mean(jsd_list)

In [4]:
# Jensen-Shannon Divergence (JSD) - for numerical data
def compute_jsd_numerical(real_data, synthetic_data, bins=50):
    """Compute Jensen-Shannon Divergence (JSD) for all numerical features."""
    jsd_list = []
    for col in real_data.columns:
        if np.issubdtype(real_data[col].dtype, np.number): # numerical only
            real_hist, bin_edges = np.histogram(real_data[col], bins=bins, density=True)
            synthetic_hist, _ = np.histogram(synthetic_data[col], bins=bin_edges, density=True)

            # Normalize to probability distributions
            real_prob = real_hist / np.sum(real_hist)
            synthetic_prob = synthetic_hist / np.sum(synthetic_hist)

            # Compute JSD
            jsd = jensenshannon(real_prob, synthetic_prob, base=2)
            jsd_list.append(jsd)
    return np.mean(jsd_list)

In [5]:
# L2 distance between Pearson correlation matrices (Numerical)
def compute_l2dist_pearson(real_data, synthetic_data):
    """Compute L2 distance between Pearson correlation matrices."""
    numerical_cols = real_data.select_dtypes(include=[np.number]).columns.tolist()

    real_corr = real_data[numerical_cols].corr()
    synthetic_corr = synthetic_data[numerical_cols].corr()

    # # flatten the correlation matrices
    # real_corr = real_corr.values.flatten()
    # synthetic_corr = synthetic_corr.values.flatten()

    l2_distance = np.linalg.norm(real_corr.values.flatten() - synthetic_corr.values.flatten())
    return l2_distance

### Data Utility

In [6]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_score, recall_score, confusion_matrix

In [16]:
def downstream_classification(training_data, holdout_data, target):
    """Train an XGBoost classifier on the provided data and evaluate on the holdout set."""
    X_holdout = holdout_data.drop(columns=[target])
    y_holdout = holdout_data[target]

    X_train = training_data.drop(columns=[target])
    y_train = training_data[target]

    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_holdout) 
    y_proba = model.predict_proba(X_holdout)[:, 1]

    metrics = {
        "accuracy": accuracy_score(y_holdout, y_pred),
        "auc": roc_auc_score(y_holdout, y_proba),
        "f1": f1_score(y_holdout, y_pred),
        "precision": precision_score(y_holdout, y_pred),
        "recall": recall_score(y_holdout, y_pred),
        "confusion_matrix": confusion_matrix(y_holdout, y_pred)
    }
    return metrics

# Load the models and prep synthetic data for evaluation

In [17]:
real_data = pd.read_csv('Datasets/creditcard_train.csv')
target_col = 'Class'
discrete_columns = ["Class"]
holdout_data = pd.read_csv('Datasets/creditcard_test.csv')

In [None]:
# Libraries 
import torch
from ctgan import CTGAN, TVAE
from Scripts.contrastive_ctgan import ContrastiveCTGAN, Embedder

torch.manual_seed(42)

num_samples = real_data.shape[0]

# CTGAN Initialization and Data Generation
ctgan_path = "Models/BaselineCTGAN.pth"
ctgan_model = torch.load(ctgan_path, weights_only=False)
# Generate synthetic data with CTGAN (Baseline)
synth_data = ctgan_model.sample(num_samples)
synthetic_data_path = 'SyntheticDatasets/Eval/synthetic_data_BASECTGAN_EVAL.csv'
pd.DataFrame(synth_data).to_csv(synthetic_data_path, index=False)
print(f"Synthetic data saved at {synthetic_data_path}")


# TVAE Initialization and Data Generation
tvae_model = TVAE(epochs=300, batch_size=500, cuda=True)
tvae_model.fit(real_data)
# Generate synthetic data with TVAE (Baseline).
synth_data_tvae = tvae_model.sample(num_samples)
synthetic_data_tvae_path = 'SyntheticDatasets/Eval/synthetic_data_TVAE_EVAL.csv'
pd.DataFrame(synth_data_tvae).to_csv(synthetic_data_tvae_path, index=False)
print(f"Synthetic data saved at {synthetic_data_tvae_path}")

Synthetic data saved at SyntheticDatasets/Eval/synthetic_data_BASECTGAN_EVAL.csv


---

# Real Data Evaluation

In [None]:
# Data Utility:
print("\nData Utility Metrics:")
metrics = downstream_classification(real_data, holdout_data, target_col)
for metric, value in metrics.items():
    rounded_value = np.round(value, 4) if isinstance(value, np.ndarray) else round(value, 4)
    print(f"{metric.capitalize()}: {rounded_value}")

print("\nConfusion Matrix:")
print(metrics["confusion_matrix"])


Data Utility Metrics:
Accuracy: 0.9996
Auc: 0.9777
F1: 0.899
Precision: 0.957
Recall: 0.8476
Confusion_matrix: [[56852     4]
 [   16    89]]

Confusion Matrix:
[[56852     4]
 [   16    89]]


# Baseline CTGAN Evaluation

In [None]:
# Load the synthetic data generated by CTGAN (BASELINE)
synth_data = pd.read_csv('SyntheticDatasets/Eval/synthetic_data_BASECTGAN_EVAL.csv')

print("--------- Baseline CTGAN ---------")

# Data Fidelity
print("Data Fidelity Metrics:")
print(f"WSD (numerical): {compute_wasserstein_distance(real_data, synth_data)}")
# print(f"JSD (categorical): {compute_jsd_categorical(real_data, synth_data)}")
print(f"JSD (numerical): {compute_jsd_numerical(real_data, synth_data)}")
print(f"L2 Distance Pearson Correlation: {compute_l2dist_pearson(real_data, synth_data)}")

# Data Utility:
print("\nData Utility Metrics:")
metrics = downstream_classification(synth_data, holdout_data, target_col)
for metric, value in metrics.items():
    rounded_value = np.round(value, 4) if isinstance(value, np.ndarray) else round(value, 4)
    print(f"{metric.capitalize()}: {rounded_value}")

--------- Baseline CTGAN ---------
Data Fidelity Metrics:
WSD (numerical): 167.65606352345813
JSD (numerical): 0.198252741102858
L2 Distance Pearson Correlation: 8.194095499106425

Data Utility Metrics:
Accuracy: 0.9792
Auc: 0.9762
F1: 0.1392
Precision: 0.0754
Recall: 0.9143
Confusion_matrix: [[55678  1178]
 [    9    96]]


# Baseline TVAE Evaluation

In [None]:
# Load the synthetic data generated by TVAE
synth_data_tvae = pd.read_csv('SyntheticDatasets/Eval/synthetic_data_TVAE_EVAL.csv')

print("--------- TVAE ---------")

# Data Fidelity Metrics:
print("Data Fidelity Metrics:")
print(f"WSD (numerical): {compute_wasserstein_distance(real_data, synth_data_tvae)}")
print(f"JSD (numerical): {compute_jsd_numerical(real_data, synth_data_tvae)}")
print(f"L2 Distance Pearson Correlation: {compute_l2dist_pearson(real_data, synth_data_tvae)}")

# Data Utility Metrics:
print("\nData Utility Metrics:")
metrics_tvae = downstream_classification(synth_data_tvae, holdout_data, target_col)
for metric, value in metrics_tvae.items():
    rounded_value = np.round(value, 4) if isinstance(value, np.ndarray) else round(value, 4)
    print(f"{metric.capitalize()}: {rounded_value}")

--------- TVAE ---------
Data Fidelity Metrics:
WSD (numerical): 240.7881261661354
JSD (numerical): 0.163311066577089
L2 Distance Pearson Correlation: nan

Data Utility Metrics:
Accuracy: 0.9982
Auc: 0.5
F1: 0.0
Precision: 0.0
Recall: 0.0
Confusion_matrix: [[56856     0]
 [  105     0]]


# ContraCTGAN (Lambda 0.5, Temp 0.5) Evaluation [Best Model So Far]

In [20]:
synth_data = pd.read_csv('SyntheticDatasets/synthetic_data_ContrastiveCTGAN_lambda0_5_temp0_5.csv') 

print("--------- ContraCTGAN (Lambda 0.5, Temp 0.5) ---------")

# Data Fidelity
print("Data Fidelity Metrics:")
print(f"WSD (numerical): {compute_wasserstein_distance(real_data, synth_data)}")
# print(f"JSD (categorical): {compute_jsd_categorical(real_data, synth_data)}")
print(f"JSD (numerical): {compute_jsd_numerical(real_data, synth_data)}")
print(f"L2 Distance Pearson Correlation: {compute_l2dist_pearson(real_data, synth_data)}")

# Data Utility:
print("\nData Utility Metrics:")
metrics = downstream_classification(synth_data, holdout_data, target_col)
for metric, value in metrics.items():
    rounded_value = np.round(value, 4) if isinstance(value, np.ndarray) else round(value, 4)
    print(f"{metric.capitalize()}: {rounded_value}")

--------- ContraCTGAN (Lambda 0.5, Temp 0.5) ---------
Data Fidelity Metrics:
WSD (numerical): 109.73358401517287
JSD (numerical): 0.19841696736554457
L2 Distance Pearson Correlation: 7.736916446568145

Data Utility Metrics:
Accuracy: 0.9919
Auc: 0.9673
F1: 0.2883
Precision: 0.1718
Recall: 0.8952
Confusion_matrix: [[56403   453]
 [   11    94]]


# ContraCTGAN (Lambda 0.5, Temp 0.1) Evaluation

In [22]:
# Load the synthetic data generated by ContraCTGAN (Lambda 0.5, Temp 0.1)
synth_data = pd.read_csv('SyntheticDatasets/synthetic_data_contrastive_ctgan_full_hpt_corrected.csv') 

print("--------- ContraCTGAN (Lambda 0.5, Temp 0.1) ---------")

# Data Fidelity
print("Data Fidelity Metrics:")
print(f"WSD (numerical): {compute_wasserstein_distance(real_data, synth_data)}")
# print(f"JSD (categorical): {compute_jsd_categorical(real_data, synth_data)}")
print(f"JSD (numerical): {compute_jsd_numerical(real_data, synth_data)}")
print(f"L2 Distance Pearson Correlation: {compute_l2dist_pearson(real_data, synth_data)}")

# Data Utility:
print("\nData Utility Metrics:")
metrics = downstream_classification(synth_data, holdout_data, target_col)
for metric, value in metrics.items():
    rounded_value = np.round(value, 4) if isinstance(value, np.ndarray) else round(value, 4)
    print(f"{metric.capitalize()}: {rounded_value}")

--------- ContraCTGAN (Lambda 0.5, Temp 0.1) ---------
Data Fidelity Metrics:
WSD (numerical): 122.52302629002686
JSD (numerical): 0.20565993878865202
L2 Distance Pearson Correlation: 8.183968862790755

Data Utility Metrics:
Accuracy: 0.9894
Auc: 0.9774
F1: 0.2418
Precision: 0.1393
Recall: 0.9143
Confusion_matrix: [[56263   593]
 [    9    96]]


# ContraCTGAN (Lambda 0.2, Temp 1.0) Evaluation

In [23]:
synth_data = pd.read_csv('SyntheticDatasets/synthetic_data_ContrastiveCTGAN_lambda0_2_temp1_0.csv') 

print("--------- ContraCTGAN (Lambda 0.2, Temp 1.0) ---------")

# Data Fidelity
print("Data Fidelity Metrics:")
print(f"WSD (numerical): {compute_wasserstein_distance(real_data, synth_data)}")
# print(f"JSD (categorical): {compute_jsd_categorical(real_data, synth_data)}")
print(f"JSD (numerical): {compute_jsd_numerical(real_data, synth_data)}")
print(f"L2 Distance Pearson Correlation: {compute_l2dist_pearson(real_data, synth_data)}")

# Data Utility:
print("\nData Utility Metrics:")
metrics = downstream_classification(synth_data, holdout_data, target_col)
for metric, value in metrics.items():
    rounded_value = np.round(value, 4) if isinstance(value, np.ndarray) else round(value, 4)
    print(f"{metric.capitalize()}: {rounded_value}")

--------- ContraCTGAN (Lambda 0.2, Temp 1.0) ---------
Data Fidelity Metrics:
WSD (numerical): 187.76774679887382
JSD (numerical): 0.20332505692723915
L2 Distance Pearson Correlation: 7.921025555365883

Data Utility Metrics:
Accuracy: 0.9931
Auc: 0.9808
F1: 0.316
Precision: 0.1932
Recall: 0.8667
Confusion_matrix: [[56476   380]
 [   14    91]]


# ContraCTGAN (Lambda 0.8, Temp 0.8) Evaluation

In [24]:
synth_data = pd.read_csv('SyntheticDatasets/synthetic_data_ContrastiveCTGAN_lambda0_8_temp0_8.csv') 

print("--------- ContraCTGAN (Lambda 0.8, Temp 0.8) ---------")

# Data Fidelity
print("Data Fidelity Metrics:")
print(f"WSD (numerical): {compute_wasserstein_distance(real_data, synth_data)}")
# print(f"JSD (categorical): {compute_jsd_categorical(real_data, synth_data)}")
print(f"JSD (numerical): {compute_jsd_numerical(real_data, synth_data)}")
print(f"L2 Distance Pearson Correlation: {compute_l2dist_pearson(real_data, synth_data)}")

# Data Utility:
print("\nData Utility Metrics:")
metrics = downstream_classification(synth_data, holdout_data, target_col)
for metric, value in metrics.items():
    rounded_value = np.round(value, 4) if isinstance(value, np.ndarray) else round(value, 4)
    print(f"{metric.capitalize()}: {rounded_value}")

--------- ContraCTGAN (Lambda 0.8, Temp 0.8) ---------
Data Fidelity Metrics:
WSD (numerical): 81.33844724624835
JSD (numerical): 0.1995632958651905
L2 Distance Pearson Correlation: 8.56982301290151

Data Utility Metrics:
Accuracy: 0.9884
Auc: 0.969
F1: 0.2259
Precision: 0.1289
Recall: 0.9143
Confusion_matrix: [[56207   649]
 [    9    96]]


# ContraCTGAN (Lambda 1.0, Temp 0.07) Evaluation

In [25]:
synth_data = pd.read_csv('SyntheticDatasets/synthetic_data_ContrastiveCTGAN_lambda1_0_temp0_07.csv') 

print("--------- ContraCTGAN (Lambda 1.0, Temp 0.07) ---------")

# Data Fidelity
print("Data Fidelity Metrics:")
print(f"WSD (numerical): {compute_wasserstein_distance(real_data, synth_data)}")
# print(f"JSD (categorical): {compute_jsd_categorical(real_data, synth_data)}")
print(f"JSD (numerical): {compute_jsd_numerical(real_data, synth_data)}")
print(f"L2 Distance Pearson Correlation: {compute_l2dist_pearson(real_data, synth_data)}")

# Data Utility:
print("\nData Utility Metrics:")
metrics = downstream_classification(synth_data, holdout_data, target_col)
for metric, value in metrics.items():
    rounded_value = np.round(value, 4) if isinstance(value, np.ndarray) else round(value, 4)
    print(f"{metric.capitalize()}: {rounded_value}")

--------- ContraCTGAN (Lambda 1.0, Temp 0.07) ---------
Data Fidelity Metrics:
WSD (numerical): 96.34658901639892
JSD (numerical): 0.19800927092971657
L2 Distance Pearson Correlation: 7.9728675952997206

Data Utility Metrics:
Accuracy: 0.987
Auc: 0.9773
F1: 0.2019
Precision: 0.1138
Recall: 0.8952
Confusion_matrix: [[56124   732]
 [   11    94]]
