In [1]:
import json
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from sdv.evaluation.single_table import run_diagnostic
from sdv.metadata import Metadata

#### GaussianCopulaSynthesizer

In [None]:
# Diagnostic pass over the GaussianCopula synthetic data.
# For each of the ten train/synthetic CSV pairs this runs the SDV diagnostic
# and saves the 'Data Validity' figure as a PNG. The loop stops at the first
# set whose overall score is below 0.99.
metadata = Metadata.load_from_json(filepath='../2_modeling/gc_metadata.json')

# Create the image folder up front so a fresh checkout does not fail at write
# time, after the diagnostic work has already been done.
output_dir = Path('diagnostics/gc')
output_dir.mkdir(parents=True, exist_ok=True)

for i in range(0, 10):
    print(f'Running diagnostic for set {i}')
    real_data = pd.read_csv(
        f'../data/train/set_{i}.csv'
    )
    synthetic_data = pd.read_csv(
        f'../data/synthetic/gc/set_{i}.csv'
    )

    report = run_diagnostic(
        real_data=real_data,
        synthetic_data=synthetic_data,
        metadata=metadata
    )

    # Read the score once and reuse it for both the check and the message.
    # NOTE(review): the 0.99 threshold assumes get_score() is on a 0-1 scale
    # (the printed report shows percentages) - confirm against the SDV docs.
    score = report.get_score()
    if score < 0.99:
        print(f'Warning: Diagnostic score for set {i} is below 0.99')
        print(f'Score: {score}')
        break

    fig = report.get_visualization(property_name='Data Validity')
    fig.update_layout(
        title=dict(
            text=f'Diagnostic Report for Set {i}',
            x=0.5,
            y=0.95,
            font=dict(
                family='Helvetica, Arial, sans-serif',
                size=24,
                color='black'
            )
        ),
        margin=dict(l=40, r=40, t=80, b=40),
        paper_bgcolor='white',
        plot_bgcolor='white'
    )

    fig.write_image(
        f'diagnostics/gc/set_{i}.png',
        width=1200,
        height=800,
        scale=2
    )


Running diagnostic for set 0
Generating report ...

(1/2) Evaluating Data Validity: |          | 0/20 [00:00<?, ?it/s]|

(1/2) Evaluating Data Validity: |██████████| 20/20 [00:00<00:00, 1395.08it/s]|
Data Validity Score: 99.91%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 425.56it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 99.96%

Running diagnostic for set 1
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 20/20 [00:00<00:00, 1958.99it/s]|
Data Validity Score: 99.82%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 992.73it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 99.91%

Running diagnostic for set 2
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 20/20 [00:00<00:00, 1818.31it/s]|
Data Validity Score: 99.89%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 371.74it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 99.95%

Running diagnostic for set 3
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 20/20 [00:00<00:00, 1992.45it/s]|
Data Validity Score: 

#### CTGANSynthesizer

In [2]:
# Diagnostic pass over the CTGAN synthetic data.
# For each of the ten train/synthetic CSV pairs this runs the SDV diagnostic
# and saves the 'Data Validity' figure as a PNG. The loop stops at the first
# set whose overall score is below 0.99.
metadata = Metadata.load_from_json(filepath='../2_modeling/ctgan_metadata.json')

# Create the image folder up front so a fresh checkout does not fail at write
# time, after the diagnostic work has already been done.
output_dir = Path('diagnostics/ctgan')
output_dir.mkdir(parents=True, exist_ok=True)

for i in range(0, 10):
    print(f'Running diagnostic for set {i}')
    real_data = pd.read_csv(
        f'../data/train/set_{i}.csv'
    )
    synthetic_data = pd.read_csv(
        f'../data/synthetic/ctgan/set_{i}.csv'
    )

    report = run_diagnostic(
        real_data=real_data,
        synthetic_data=synthetic_data,
        metadata=metadata
    )

    # Read the score once and reuse it for both the check and the message.
    # NOTE(review): the 0.99 threshold assumes get_score() is on a 0-1 scale
    # (the printed report shows percentages) - confirm against the SDV docs.
    score = report.get_score()
    if score < 0.99:
        print(f'Warning: Diagnostic score for set {i} is below 0.99')
        print(f'Score: {score}')
        break

    fig = report.get_visualization(property_name='Data Validity')
    fig.update_layout(
        title=dict(
            text=f'Diagnostic Report for Set {i}',
            x=0.5,
            y=0.95,
            font=dict(
                family='Helvetica, Arial, sans-serif',
                size=24,
                color='black'
            )
        ),
        margin=dict(l=40, r=40, t=80, b=40),
        paper_bgcolor='white',
        plot_bgcolor='white'
    )

    fig.write_image(
        f'diagnostics/ctgan/set_{i}.png',
        width=1200,
        height=800,
        scale=2
    )


Running diagnostic for set 0
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 20/20 [00:00<00:00, 154.94it/s]|
Data Validity Score: 99.8%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 280.82it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 99.9%

Running diagnostic for set 1
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 20/20 [00:00<00:00, 1159.40it/s]|
Data Validity Score: 99.88%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 315.05it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 99.94%

Running diagnostic for set 2
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 20/20 [00:00<00:00, 2011.99it/s]|
Data Validity Score: 99.95%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 396.59it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 99.98%

Running diagnostic for set 3
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 20/20 

#### TVAESynthesizer

In [3]:
# Diagnostic pass over the TVAE synthetic data.
# For each of the ten train/synthetic CSV pairs this runs the SDV diagnostic
# and saves the 'Data Validity' figure as a PNG. The loop stops at the first
# set whose overall score is below 0.99.
metadata = Metadata.load_from_json(filepath='../2_modeling/tvaes_metadata.json')

# Create the image folder up front so a fresh checkout does not fail at write
# time, after the diagnostic work has already been done.
output_dir = Path('diagnostics/tvaes')
output_dir.mkdir(parents=True, exist_ok=True)

for i in range(0, 10):
    print(f'Running diagnostic for set {i}')
    real_data = pd.read_csv(
        f'../data/train/set_{i}.csv'
    )
    synthetic_data = pd.read_csv(
        f'../data/synthetic/tvaes/set_{i}.csv'
    )

    report = run_diagnostic(
        real_data=real_data,
        synthetic_data=synthetic_data,
        metadata=metadata
    )

    # Read the score once and reuse it for both the check and the message.
    # NOTE(review): the 0.99 threshold assumes get_score() is on a 0-1 scale
    # (the printed report shows percentages) - confirm against the SDV docs.
    score = report.get_score()
    if score < 0.99:
        print(f'Warning: Diagnostic score for set {i} is below 0.99')
        print(f'Score: {score}')
        break

    fig = report.get_visualization(property_name='Data Validity')
    fig.update_layout(
        title=dict(
            text=f'Diagnostic Report for Set {i}',
            x=0.5,
            y=0.95,
            font=dict(
                family='Helvetica, Arial, sans-serif',
                size=24,
                color='black'
            )
        ),
        margin=dict(l=40, r=40, t=80, b=40),
        paper_bgcolor='white',
        plot_bgcolor='white'
    )

    fig.write_image(
        f'diagnostics/tvaes/set_{i}.png',
        width=1200,
        height=800,
        scale=2
    )


Running diagnostic for set 0
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 20/20 [00:00<00:00, 5693.37it/s]|
Data Validity Score: 99.98%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 290.04it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 99.99%

Running diagnostic for set 1
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 20/20 [00:00<00:00, 5597.63it/s]|
Data Validity Score: 99.98%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 425.86it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 99.99%

Running diagnostic for set 2
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 20/20 [00:00<00:00, 3116.12it/s]|
Data Validity Score: 99.98%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 117.03it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 99.99%

Running diagnostic for set 3
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 20/