In [1]:
import json
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from sdv.evaluation.single_table import run_diagnostic
from sdv.metadata import Metadata

#### GaussianCopulaSynthesizer

In [2]:
# Run the SDV diagnostic on each of the 10 GaussianCopula synthetic sets and
# export the 'Data Validity' chart of every passing set as a PNG.
metadata = Metadata.load_from_json(filepath='../2_modeling/gc_metadata.json')

# Minimum acceptable overall diagnostic score; the run stops at the first set
# that falls below it.
min_score = 0.99

# write_image does not appear to create missing parent folders (TODO confirm
# for the installed plotly version), so make sure the export directory exists
# before the first figure is written.
output_dir = Path('diagnostics/gc')
output_dir.mkdir(parents=True, exist_ok=True)

for i in range(10):
    print(f'Running diagnostic for set {i}')
    real_data = pd.read_csv(f'../data/train/set_{i}.csv')
    synthetic_data = pd.read_csv(f'../data/synthetic/gc/set_{i}.csv')

    report = run_diagnostic(
        real_data=real_data,
        synthetic_data=synthetic_data,
        metadata=metadata
    )

    # Read the score once and reuse it for both the check and the message.
    score = report.get_score()
    if score < min_score:
        print(f'Warning: Diagnostic score for set {i} is below {min_score}')
        print(f'Score: {score}')
        # Stop here: the remaining sets are not diagnosed or exported.
        break

    fig = report.get_visualization(property_name='Data Validity')
    fig.update_layout(
        title=dict(
            text=f'Diagnostic Report for Set {i}',
            x=0.5,
            y=0.95,
            font=dict(
                family='Helvetica, Arial, sans-serif',
                size=24,
                color='black'
            )
        ),
        margin=dict(l=40, r=40, t=80, b=40),
        paper_bgcolor='white',
        plot_bgcolor='white'
    )

    fig.write_image(
        str(output_dir / f'set_{i}.png'),
        width=1200,
        height=800,
        scale=2
    )


Running diagnostic for set 0
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 20/20 [00:00<00:00, 3338.62it/s]|
Data Validity Score: 99.93%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 103.76it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 99.96%

Running diagnostic for set 1
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 20/20 [00:00<00:00, 1818.59it/s]|
Data Validity Score: 99.75%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 308.47it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 99.88%

Running diagnostic for set 2
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 20/20 [00:00<00:00, 835.80it/s]|
Data Validity Score: 99.91%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<?, ?it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 99.96%

Running diagnostic for set 3
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 20/20 [00:00<

#### CTGANSynthesizer

In [3]:
# Run the SDV diagnostic on each of the 10 CTGAN synthetic sets and export the
# 'Data Validity' chart of every passing set as a PNG.
metadata = Metadata.load_from_json(filepath='../2_modeling/ctgan_metadata.json')

# Minimum acceptable overall diagnostic score; the run stops at the first set
# that falls below it.
min_score = 0.99

# write_image does not appear to create missing parent folders (TODO confirm
# for the installed plotly version), so make sure the export directory exists
# before the first figure is written.
output_dir = Path('diagnostics/ctgan')
output_dir.mkdir(parents=True, exist_ok=True)

for i in range(10):
    print(f'Running diagnostic for set {i}')
    real_data = pd.read_csv(f'../data/train/set_{i}.csv')
    synthetic_data = pd.read_csv(f'../data/synthetic/ctgan/set_{i}.csv')

    report = run_diagnostic(
        real_data=real_data,
        synthetic_data=synthetic_data,
        metadata=metadata
    )

    # Read the score once and reuse it for both the check and the message.
    score = report.get_score()
    if score < min_score:
        print(f'Warning: Diagnostic score for set {i} is below {min_score}')
        print(f'Score: {score}')
        # Stop here: the remaining sets are not diagnosed or exported.
        break

    fig = report.get_visualization(property_name='Data Validity')
    fig.update_layout(
        title=dict(
            text=f'Diagnostic Report for Set {i}',
            x=0.5,
            y=0.95,
            font=dict(
                family='Helvetica, Arial, sans-serif',
                size=24,
                color='black'
            )
        ),
        margin=dict(l=40, r=40, t=80, b=40),
        paper_bgcolor='white',
        plot_bgcolor='white'
    )

    fig.write_image(
        str(output_dir / f'set_{i}.png'),
        width=1200,
        height=800,
        scale=2
    )


Running diagnostic for set 0
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 20/20 [00:00<00:00, 1877.74it/s]|
Data Validity Score: 99.7%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<?, ?it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 99.85%

Running diagnostic for set 1
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 20/20 [00:00<?, ?it/s]|
Data Validity Score: 99.65%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 69.26it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 99.82%

Running diagnostic for set 2
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 20/20 [00:00<00:00, 5204.17it/s]|
Data Validity Score: 99.24%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 85.45it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 99.62%

Running diagnostic for set 3
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 20/20 [00:00<00:00, 1241.

#### CopulaGANSynthesizer

In [4]:
# Run the SDV diagnostic on each of the 10 CopulaGAN synthetic sets and export
# the 'Data Validity' chart of every passing set as a PNG.
metadata = Metadata.load_from_json(filepath='../2_modeling/cgan_metadata.json')

# Minimum acceptable overall diagnostic score; the run stops at the first set
# that falls below it.
min_score = 0.99

# write_image does not appear to create missing parent folders (TODO confirm
# for the installed plotly version), so make sure the export directory exists
# before the first figure is written.
output_dir = Path('diagnostics/cgan')
output_dir.mkdir(parents=True, exist_ok=True)

for i in range(10):
    print(f'Running diagnostic for set {i}')
    real_data = pd.read_csv(f'../data/train/set_{i}.csv')
    synthetic_data = pd.read_csv(f'../data/synthetic/cgan/set_{i}.csv')

    report = run_diagnostic(
        real_data=real_data,
        synthetic_data=synthetic_data,
        metadata=metadata
    )

    # Read the score once and reuse it for both the check and the message.
    score = report.get_score()
    if score < min_score:
        print(f'Warning: Diagnostic score for set {i} is below {min_score}')
        print(f'Score: {score}')
        # Stop here: the remaining sets are not diagnosed or exported.
        break

    fig = report.get_visualization(property_name='Data Validity')
    fig.update_layout(
        title=dict(
            text=f'Diagnostic Report for Set {i}',
            x=0.5,
            y=0.95,
            font=dict(
                family='Helvetica, Arial, sans-serif',
                size=24,
                color='black'
            )
        ),
        margin=dict(l=40, r=40, t=80, b=40),
        paper_bgcolor='white',
        plot_bgcolor='white'
    )

    fig.write_image(
        str(output_dir / f'set_{i}.png'),
        width=1200,
        height=800,
        scale=2
    )


Running diagnostic for set 0
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 20/20 [00:00<00:00, 1276.63it/s]|
Data Validity Score: 99.27%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<?, ?it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 99.64%

Running diagnostic for set 1
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 20/20 [00:00<00:00, 1206.56it/s]|
Data Validity Score: 98.92%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<?, ?it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 99.46%

Running diagnostic for set 2
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 20/20 [00:00<00:00, 3094.63it/s]|
Data Validity Score: 98.39%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<?, ?it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 99.19%

Running diagnostic for set 3
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 20/20 [00:00<?, ?it/s]|
Data V