In [1]:
import os
import pandas as pd
import json
from sdv.metadata import Metadata
from sdv.evaluation.single_table import evaluate_quality
from sdv.evaluation.single_table import get_column_plot

#### GaussianCopulaSynthesizer

In [2]:
# Quality diagnostics for the GaussianCopulaSynthesizer output.
#
# For each of the 10 train/synthetic CSV pairs this cell:
#   1. builds an SDV quality report from the real and synthetic data,
#   2. stops the loop at the first set whose overall score is below 0.85,
#   3. otherwise prints the per-property scores, saves the 'Column Shapes'
#      figure as a PNG, and saves one real-vs-synthetic plot per column.
metadata = Metadata.load_from_json(filepath='../2_modeling/gc_metadata.json')
for i in range(0, 10):
    print(f'Running diagnostic for set {i}')
    real_data = pd.read_csv(
        f'../data/train/set_{i}.csv'
    )
    synthetic_data = pd.read_csv(
        f'../data/synthetic/gc/set_{i}.csv'
    )
    report = evaluate_quality(
        real_data=real_data,
        synthetic_data=synthetic_data,
        metadata=metadata
    )

    # Read the overall score once and reuse it for the check and the message.
    score = report.get_score()
    if score < 0.85:
        print(f'Warning: Quality score for set {i} is below 0.85')
        print(f'Score: {score}')
        # NOTE(review): `break` skips every remaining set, not only this one —
        # confirm this is intended rather than `continue`.
        break
    print(report.get_properties())

    # Create the output directories BEFORE the first image is written.
    # Making the per-set directory also creates its parent
    # 'quality_reports/gc', which is where the summary PNG below is saved.
    os.makedirs(f'quality_reports/gc/set_{i}', exist_ok=True)

    # Summary figure for the 'Column Shapes' property of this set.
    fig = report.get_visualization(property_name='Column Shapes')
    fig.update_layout(
        title=dict(
            text=f'Quality Report for Set {i}',
            x=0.5,
            y=0.95,
            font=dict(
                family='Helvetica, Arial, sans-serif',
                size=24,
                color='black'
            )
        ),
        margin=dict(l=40, r=40, t=80, b=40),
        paper_bgcolor='white',
        plot_bgcolor='white'
    )
    fig.write_image(
        f'quality_reports/gc/set_{i}.png',
        width=1200,
        height=800,
        scale=2
    )

    # One real-vs-synthetic comparison plot per column named in the metadata.
    for column_name in metadata.get_column_names():
        fig = get_column_plot(
            real_data=real_data,
            synthetic_data=synthetic_data,
            metadata=metadata,
            column_name=column_name
        )
        fig.update_layout(
            title=f'Quality Report for column {column_name} in set {i}'
        )
        fig.write_image(
            f'quality_reports/gc/set_{i}/column_{column_name}.png',
            width=1200,
            height=800,
            scale=2
        )

Running diagnostic for set 0
Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 20/20 [00:00<00:00, 1035.41it/s]|
Column Shapes Score: 85.21%

(2/2) Evaluating Column Pair Trends: |██████████| 190/190 [00:00<00:00, 217.67it/s]|
Column Pair Trends Score: 91.84%

Overall Score (Average): 88.53%

             Property     Score
0       Column Shapes  0.852128
1  Column Pair Trends  0.918433
Running diagnostic for set 1
Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 20/20 [00:00<00:00, 810.33it/s]|
Column Shapes Score: 84.75%

(2/2) Evaluating Column Pair Trends: |██████████| 190/190 [00:00<00:00, 224.99it/s]|
Column Pair Trends Score: 91.86%

Overall Score (Average): 88.3%

             Property     Score
0       Column Shapes  0.847518
1  Column Pair Trends  0.918567
Running diagnostic for set 2
Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 20/20 [00:00<00:00, 952.37it/s]|
Column Shapes Score: 85.46%

(2/2) Evaluating Column Pai


ks_2samp: Exact calculation unsuccessful. Switching to method=asymp.



(2/2) Evaluating Column Pair Trends: |██████████| 190/190 [00:00<00:00, 216.26it/s]|
Column Pair Trends Score: 92.1%

Overall Score (Average): 87.69%

             Property     Score
0       Column Shapes  0.832862
1  Column Pair Trends  0.921025
Running diagnostic for set 6
Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 20/20 [00:00<00:00, 952.35it/s]|
Column Shapes Score: 84.56%

(2/2) Evaluating Column Pair Trends: |██████████| 190/190 [00:00<00:00, 213.86it/s]|
Column Pair Trends Score: 92.43%

Overall Score (Average): 88.5%

             Property     Score
0       Column Shapes  0.845583
1  Column Pair Trends  0.924340
Running diagnostic for set 7
Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 20/20 [00:00<00:00, 1000.08it/s]|
Column Shapes Score: 83.85%

(2/2) Evaluating Column Pair Trends: |██████████| 190/190 [00:00<00:00, 223.68it/s]|
Column Pair Trends Score: 92.37%

Overall Score (Average): 88.11%

             Property     Score
0  

#### CTGANSynthesizer

In [3]:
# Quality diagnostics for the CTGANSynthesizer output.
#
# For each of the 10 train/synthetic CSV pairs this cell:
#   1. builds an SDV quality report from the real and synthetic data,
#   2. stops the loop at the first set whose overall score is below 0.85,
#   3. otherwise prints the per-property scores, saves the 'Column Shapes'
#      figure as a PNG, and saves one real-vs-synthetic plot per column.
metadata = Metadata.load_from_json(filepath='../2_modeling/ctgan_metadata.json')
for i in range(0, 10):
    print(f'Running diagnostic for set {i}')
    real_data = pd.read_csv(
        f'../data/train/set_{i}.csv'
    )
    synthetic_data = pd.read_csv(
        f'../data/synthetic/ctgan/set_{i}.csv'
    )
    report = evaluate_quality(
        real_data=real_data,
        synthetic_data=synthetic_data,
        metadata=metadata
    )

    # Read the overall score once and reuse it for the check and the message.
    score = report.get_score()
    if score < 0.85:
        print(f'Warning: Quality score for set {i} is below 0.85')
        print(f'Score: {score}')
        # NOTE(review): `break` skips every remaining set, not only this one —
        # confirm this is intended rather than `continue`.
        break
    print(report.get_properties())

    # Create the output directories BEFORE the first image is written.
    # Making the per-set directory also creates its parent
    # 'quality_reports/ctgan', which is where the summary PNG below is saved.
    os.makedirs(f'quality_reports/ctgan/set_{i}', exist_ok=True)

    # Summary figure for the 'Column Shapes' property of this set.
    fig = report.get_visualization(property_name='Column Shapes')
    fig.update_layout(
        title=dict(
            text=f'Quality Report for Set {i}',
            x=0.5,
            y=0.95,
            font=dict(
                family='Helvetica, Arial, sans-serif',
                size=24,
                color='black'
            )
        ),
        margin=dict(l=40, r=40, t=80, b=40),
        paper_bgcolor='white',
        plot_bgcolor='white'
    )
    fig.write_image(
        f'quality_reports/ctgan/set_{i}.png',
        width=1200,
        height=800,
        scale=2
    )

    # One real-vs-synthetic comparison plot per column named in the metadata.
    for column_name in metadata.get_column_names():
        fig = get_column_plot(
            real_data=real_data,
            synthetic_data=synthetic_data,
            metadata=metadata,
            column_name=column_name
        )
        fig.update_layout(
            title=f'Quality Report for column {column_name} in set {i}'
        )
        fig.write_image(
            f'quality_reports/ctgan/set_{i}/column_{column_name}.png',
            width=1200,
            height=800,
            scale=2
        )

Running diagnostic for set 0
Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 20/20 [00:00<00:00, 1159.13it/s]|
Column Shapes Score: 68.55%

(2/2) Evaluating Column Pair Trends: |██████████| 190/190 [00:00<00:00, 215.26it/s]|
Column Pair Trends Score: 76.71%

Overall Score (Average): 72.63%

Score: 0.7262855466562547


#### CopulaGANSynthesizer

In [4]:
# Quality diagnostics for the CopulaGANSynthesizer output.
#
# For each of the 10 train/synthetic CSV pairs this cell:
#   1. builds an SDV quality report from the real and synthetic data,
#   2. stops the loop at the first set whose overall score is below 0.85,
#   3. otherwise prints the per-property scores, saves the 'Column Shapes'
#      figure as a PNG, and saves one real-vs-synthetic plot per column.
metadata = Metadata.load_from_json(filepath='../2_modeling/cgan_metadata.json')
for i in range(0, 10):
    print(f'Running diagnostic for set {i}')
    real_data = pd.read_csv(
        f'../data/train/set_{i}.csv'
    )
    synthetic_data = pd.read_csv(
        f'../data/synthetic/cgan/set_{i}.csv'
    )
    report = evaluate_quality(
        real_data=real_data,
        synthetic_data=synthetic_data,
        metadata=metadata
    )

    # Read the overall score once and reuse it for the check and the message.
    score = report.get_score()
    if score < 0.85:
        print(f'Warning: Quality score for set {i} is below 0.85')
        print(f'Score: {score}')
        # NOTE(review): `break` skips every remaining set, not only this one —
        # confirm this is intended rather than `continue`.
        break
    print(report.get_properties())

    # Create the output directories BEFORE the first image is written.
    # Making the per-set directory also creates its parent
    # 'quality_reports/cgan', which is where the summary PNG below is saved.
    os.makedirs(f'quality_reports/cgan/set_{i}', exist_ok=True)

    # Summary figure for the 'Column Shapes' property of this set.
    fig = report.get_visualization(property_name='Column Shapes')
    fig.update_layout(
        title=dict(
            text=f'Quality Report for Set {i}',
            x=0.5,
            y=0.95,
            font=dict(
                family='Helvetica, Arial, sans-serif',
                size=24,
                color='black'
            )
        ),
        margin=dict(l=40, r=40, t=80, b=40),
        paper_bgcolor='white',
        plot_bgcolor='white'
    )
    fig.write_image(
        f'quality_reports/cgan/set_{i}.png',
        width=1200,
        height=800,
        scale=2
    )

    # One real-vs-synthetic comparison plot per column named in the metadata.
    for column_name in metadata.get_column_names():
        fig = get_column_plot(
            real_data=real_data,
            synthetic_data=synthetic_data,
            metadata=metadata,
            column_name=column_name
        )
        fig.update_layout(
            title=f'Quality Report for column {column_name} in set {i}'
        )
        fig.write_image(
            f'quality_reports/cgan/set_{i}/column_{column_name}.png',
            width=1200,
            height=800,
            scale=2
        )

Running diagnostic for set 0
Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 20/20 [00:00<00:00, 1094.99it/s]|
Column Shapes Score: 67.89%

(2/2) Evaluating Column Pair Trends: |██████████| 190/190 [00:00<00:00, 228.34it/s]|
Column Pair Trends Score: 77.09%

Overall Score (Average): 72.49%

Score: 0.7248851450261624
