In [2]:
from pathlib import Path

import pandas as pd
from sdv.evaluation.single_table import evaluate_quality, get_column_plot
from sdv.metadata import SingleTableMetadata
from sdv.single_table import TVAESynthesizer, CopulaGANSynthesizer

In [3]:
# Read the heart dataset that the synthesizers below are trained on,
# then preview its first five rows.
data = pd.read_csv(filepath_or_buffer='datasets/heart.csv')
data.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,2
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0


In [4]:
# Infer the column sdtypes from the DataFrame that is already loaded instead
# of re-reading the CSV from disk: this removes the duplicated hard-coded path
# and guarantees the metadata describes exactly the table that gets fitted.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data)

# Display the detected schema so the automatically chosen sdtypes
# can be reviewed before any synthesizer is trained on them.
metadata

{
    "columns": {
        "age": {
            "sdtype": "numerical"
        },
        "sex": {
            "sdtype": "categorical"
        },
        "cp": {
            "sdtype": "categorical"
        },
        "trestbps": {
            "sdtype": "numerical"
        },
        "chol": {
            "sdtype": "numerical"
        },
        "fbs": {
            "sdtype": "categorical"
        },
        "restecg": {
            "sdtype": "categorical"
        },
        "thalach": {
            "sdtype": "numerical"
        },
        "exang": {
            "sdtype": "categorical"
        },
        "oldpeak": {
            "sdtype": "numerical"
        },
        "slope": {
            "sdtype": "categorical"
        },
        "ca": {
            "sdtype": "categorical"
        },
        "thal": {
            "sdtype": "categorical"
        },
        "num": {
            "sdtype": "categorical"
        }
    },
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1"
}

In [6]:
# Train a TVAE synthesizer on the heart data for 500 epochs.
synthesizer = TVAESynthesizer(
    metadata,
    epochs=500,
)
synthesizer.fit(data)

# Make sure the output folder exists before saving, so a fresh checkout does
# not fail on a missing directory after training has already finished.
Path('synthesizers').mkdir(parents=True, exist_ok=True)
synthesizer.save(filepath='synthesizers/TVAEShd.pkl')



In [7]:
# Show the loss recorded for each training epoch/batch of the TVAE fit.
synthesizer.get_loss_values()

Unnamed: 0,Epoch,Batch,Loss
0,0,0,45.770950
1,1,0,41.205048
2,2,0,39.832375
3,3,0,37.767738
4,4,0,37.228668
...,...,...,...
495,495,0,10.994474
496,496,0,11.104820
497,497,0,10.700350
498,498,0,10.953691


In [8]:
# Alternative to retraining: restore the TVAE synthesizer from its saved file.
# NOTE(review): this is a .pkl file - only load files you produced yourself.
synthesizer = TVAESynthesizer.load(filepath='synthesizers/TVAEShd.pkl')

In [9]:
# Make sure the output folder exists before sampling, so the run does not
# fail on a missing directory when the results are written out.
Path('synthetic').mkdir(parents=True, exist_ok=True)

# Draw 100,000 synthetic rows in batches of 100 and write them to a CSV file.
# NOTE(review): depending on the SDV version, sample() may refuse to reuse an
# existing output file - confirm before re-running this cell.
synthetic_data = synthesizer.sample(
    num_rows=100_000,
    batch_size=100,
    output_file_path='synthetic/TVAEShd.csv',
)

Sampling rows: 100%|██████████| 100000/100000 [01:08<00:00, 1464.08it/s]


In [10]:
# Train a CopulaGAN synthesizer on the heart data for 500 epochs;
# verbose=True prints a progress bar with generator/discriminator losses.
synthesizer = CopulaGANSynthesizer(
    metadata,
    epochs=500,
    verbose=True,
)
synthesizer.fit(data)

# Make sure the output folder exists before saving, so a fresh checkout does
# not fail on a missing directory after training has already finished.
Path('synthesizers').mkdir(parents=True, exist_ok=True)
synthesizer.save(filepath='synthesizers/CGANhd.pkl')

Gen. (-0.80) | Discrim. (-0.07): 100%|██████████| 500/500 [00:15<00:00, 32.39it/s]


In [11]:
# Show the generator and discriminator loss recorded for each training epoch.
synthesizer.get_loss_values()

Unnamed: 0,Epoch,Generator Loss,Discriminator Loss
0,0,1.243888,-0.005376
1,1,1.259459,-0.031961
2,2,1.298920,-0.060987
3,3,1.284342,-0.058610
4,4,1.265114,-0.052619
...,...,...,...
495,495,-0.956729,0.005447
496,496,-1.042024,0.077818
497,497,-0.927343,-0.080001
498,498,-0.901953,-0.121351


In [None]:
# Alternative to retraining: restore the CopulaGAN synthesizer from its saved file.
# NOTE(review): this is a .pkl file - only load files you produced yourself.
synthesizer = CopulaGANSynthesizer.load(filepath='synthesizers/CGANhd.pkl')

In [12]:
# Make sure the output folder exists before sampling, so the run does not
# fail on a missing directory when the results are written out.
Path('synthetic').mkdir(parents=True, exist_ok=True)

# Draw 100,000 synthetic rows in batches of 100 and write them to a CSV file.
# NOTE(review): depending on the SDV version, sample() may refuse to reuse an
# existing output file - confirm before re-running this cell.
synthetic_data = synthesizer.sample(
    num_rows=100_000,
    batch_size=100,
    output_file_path='synthetic/CGANhd.csv',
)

Sampling rows: 100%|██████████| 100000/100000 [01:18<00:00, 1272.26it/s]
