# Checking the Synthetic Data Vault (SDV) library

In [37]:
import pandas as pd
import numpy as np
from sdv.metadata import Metadata
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.sampling import Condition
from pathlib import Path
import json

## Load data

In [7]:
# Read the original credit-card transactions from a path relative to the
# notebook's working directory.
fraud_data = pd.read_csv(
    filepath_or_buffer='../../data_synthesizer/original_data/creditcard.csv',
)
# Bare last expression so the notebook renders a preview of the frame.
fraud_data

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


### Prep data

In [8]:
# Modelling frame: same rows as `fraud_data` without the 'Time' column and
# with 'Class' cast to plain integers. Both `drop` and `assign` return new
# frames, so `fraud_data` itself is left unmodified.
df_model = (
    fraud_data
    .drop(columns=['Time'])
    .assign(Class=lambda frame: frame['Class'].astype(int))
)

### Build metadata

In [19]:
# Infer the column types from the prepared frame and register them under the
# table name 'creditcard'.
metadata = Metadata.detect_from_dataframe(
    data=df_model,
    table_name='creditcard'
)
# Ensure the target is categorical (so 0/1 isn't treated as numeric)
metadata.update_column(column_name='Class', sdtype='categorical')
metadata.validate()
# mode='overwrite' keeps this cell re-runnable: the default write mode refuses
# to replace a metadata.json left behind by a previous run.
metadata.save_to_json('metadata.json', mode='overwrite')

### Train synthesizer
GaussianCopula is fast, stable, and supports efficient conditional sampling for rare classes.

In [22]:
# Fit a Gaussian-copula synthesizer on the prepared table, using the metadata
# reloaded from the 'metadata.json' file written earlier in the notebook.
synth = GaussianCopulaSynthesizer(
    Metadata.load_from_json(filepath='metadata.json'),
    enforce_rounding=True,
    enforce_min_max_values=True,
)
synth.fit(df_model)

### Save artifacts (generator + metadata + base stats)

In [30]:
# Create the output folder if needed; a no-op when it already exists.
Path("artifacts").mkdir(parents=True, exist_ok=True)

# Persist the fitted synthesizer.
synth_path = "artifacts/creditcard_fraud_gc.pkl"
synth.save(synth_path)

# Persist the metadata next to it, replacing any copy from a previous run.
metadata_path = "artifacts/creditcard_fraud_metadata.json"
metadata.save_to_json(metadata_path, mode="overwrite")

# Persist the base statistics: share of rows with Class == 1 and the row count.
base_rate = float(df_model['Class'].mean())
Path("artifacts/creditcard_fraud_stats.json").write_text(
    json.dumps({"class_positive_rate": base_rate, "n_rows": int(len(df_model))}, indent=2)
)

print(f"Saved synthesizer: {synth_path}")
print(f"Saved metadata:    {metadata_path}")
print(f"Observed fraud rate in real data: {base_rate:.6f}")

Saved synthesizer: artifacts/creditcard_fraud_gc.pkl
Saved metadata:    artifacts/creditcard_fraud_metadata.json
Observed fraud rate in real data: 0.001727


### Helper: sample with the SAME Class proportion as the real data

In [39]:
def _empty_sample_frame(synthesizer):
    """Return a zero-row DataFrame whose columns come from the synthesizer's metadata."""
    meta = synthesizer.get_metadata().to_dict()
    # Multi-table style metadata nests each table under "tables"; a flat
    # single-table dict keeps "columns" at the top level. Support both so no
    # table name has to be hard-coded.
    tables = meta.get("tables")
    if tables is not None:
        if len(tables) != 1:
            raise ValueError("Expected metadata describing exactly one table.")
        table_meta = next(iter(tables.values()))
    else:
        table_meta = meta
    return pd.DataFrame(columns=list(table_meta.get("columns", {})))


def sample_with_base_rate(synthesizer, n_rows, fraud_rate=0.001727, rng=None, shuffle=True, output_path=None):
    """
    Sample n_rows where the number of Class == 1 rows ~ Binomial(n_rows, fraud_rate).
    Features are sampled conditionally on Class to preserve structure.

    Parameters
    ----------
    synthesizer : fitted SDV synthesizer
    n_rows      : int >= 0
    fraud_rate  : float in [0, 1]  (P(Class=1)). Defaults to 0.001727, the
                  rate observed in the original data set.
    rng         : None | int | np.random.Generator. Drives the class counts
                  and the final shuffle only; it is not passed to the
                  synthesizer, so the sampled feature values are not
                  controlled by it.
    shuffle     : bool  (shuffle final rows and reset the index)
    output_path : optional CSV path; written for every n_rows, including 0

    Returns
    -------
    pandas.DataFrame
        n_rows synthetic rows; for n_rows == 0 an empty frame with the
        columns listed in the synthesizer's metadata.

    Raises
    ------
    ValueError
        If fraud_rate is outside [0, 1] or n_rows is negative.
    TypeError
        If n_rows is not an integer.
    """
    fraud_rate = float(fraud_rate)
    if not (0.0 <= fraud_rate <= 1.0):
        raise ValueError("fraud_rate must be in [0, 1].")

    if not isinstance(n_rows, (int, np.integer)):
        raise TypeError("n_rows must be an integer.")
    if n_rows < 0:
        raise ValueError("n_rows must be a non-negative integer.")
    n_rows = int(n_rows)

    # RNG setup: None or an int seed becomes a Generator; a Generator is used as-is.
    if isinstance(rng, (int, np.integer)) or rng is None:
        rng = np.random.default_rng(rng)

    # Always draw the class counts first so the call order on `rng` is the
    # same regardless of n_rows.
    n1 = int(rng.binomial(n_rows, fraud_rate))  # number of frauds
    n0 = n_rows - n1

    if n_rows == 0:
        # Nothing to sample: return the schema only.
        out = _empty_sample_frame(synthesizer)
    else:
        # One condition per class; zero-sized conditions are skipped.
        conditions = [
            Condition(num_rows=count, column_values={"Class": label})
            for label, count in ((0, n0), (1, n1))
            if count
        ]
        out = synthesizer.sample_from_conditions(conditions=conditions)

        # Ensure 0/1 dtype and optional shuffle
        out["Class"] = out["Class"].astype(int)
        if shuffle and len(out) > 1:
            # Derive the shuffle seed from `rng` so a seeded call yields a repeatable order.
            seed = int(rng.integers(0, 2**32 - 1))
            out = out.sample(frac=1, random_state=seed).reset_index(drop=True)

    if output_path:
        out.to_csv(output_path, index=False)

    return out


### Example usage

In [40]:
# Compute once from your training data (or set manually)
base_rate = float(df_model["Class"].mean())  # e.g., ~0.001727

batch_7  = sample_with_base_rate(synth, fraud_rate=base_rate, n_rows=7,  rng=42)
batch_30 = sample_with_base_rate(synth, fraud_rate=base_rate, n_rows=30, rng=42)

Sampling conditions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 148.33it/s]
Sampling conditions: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 674.17it/s]


In [45]:
# One million synthetic rows at the observed fraud rate, seeded with 42.
batch_1M = sample_with_base_rate(synth, n_rows=1_000_000, fraud_rate=base_rate, rng=42)

Sampling conditions: 100%|█████████████████████████████████████████████████████████████████████████████████| 1000000/1000000 [00:21<00:00, 47068.49it/s]


In [47]:
# Show only the synthetic rows labelled as fraud (Class == 1).
batch_1M.loc[batch_1M["Class"] == 1]

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
403,1.761146,-8.138719,-1.057656,-1.168410,-2.093164,0.110452,-1.236977,-1.781072,-0.336239,0.397166,...,-0.903585,-0.374324,-0.863331,0.255347,0.286201,0.503460,0.302237,-0.346026,49.91,1
2444,1.353925,-0.591574,-0.148755,-1.261753,1.091031,0.405972,0.427274,1.125221,-0.330725,-0.732673,...,1.930367,0.970734,0.584548,0.342830,1.198284,0.384057,-0.464434,-0.264429,5.50,1
2450,-4.421341,6.134222,-0.469009,-2.330659,1.904492,-0.288051,2.538307,-0.974461,0.727062,-1.080972,...,-0.590960,-0.591625,-0.621803,0.245799,-0.383383,-0.326867,0.243363,0.222467,8.07,1
2489,0.839838,4.799381,2.739623,-0.039962,0.428892,1.219348,1.943755,-0.202500,-0.980017,-0.301948,...,0.257958,-0.499642,-0.888852,0.298534,0.910300,-0.127269,0.482149,-0.139002,106.61,1
2591,0.438413,6.011559,-1.371798,1.360588,1.496617,1.910484,1.875583,0.601232,-0.052782,0.054242,...,-0.184722,-0.487772,0.100800,0.339255,0.835997,-0.461546,-0.133199,0.004870,154.64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995192,1.397376,-5.555822,-0.723445,-0.317898,-1.415777,1.625585,0.120784,0.572715,-1.122518,0.529325,...,0.660654,-0.170618,-0.929666,0.497338,0.230520,-0.003563,0.266749,-0.345907,282.74,1
995918,-0.749280,1.863531,0.825995,1.426633,1.358307,1.721060,0.437566,1.449039,0.378597,0.729913,...,-1.696791,0.423078,0.726608,0.082440,0.078376,0.554169,-0.650448,0.233488,1.88,1
996306,1.533956,-1.452190,-0.568301,-0.201456,1.165855,-0.064513,-2.269968,0.282302,0.230158,-0.992814,...,-2.295988,0.449921,1.077980,0.031887,0.274178,-0.697216,0.082276,-0.534255,1.65,1
997226,-0.322831,-1.524471,-0.585723,0.004045,-2.362019,1.117774,0.483870,-1.148491,-2.691219,0.655082,...,0.373553,1.854642,0.083956,-0.140077,-0.381198,0.787688,-0.448135,-0.270931,13.34,1
