In [1]:
import pandas as pd
import json
from sdv.metadata import Metadata
from sdv.evaluation.single_table import evaluate_quality
from sdv.evaluation.single_table import get_column_plot

import os

In [2]:
# Evaluate each synthetic data set against its real training counterpart,
# print the quality properties, and export the diagnostic figures as PNGs.

# Minimum acceptable overall quality score; the loop stops at the first set
# that falls below it.
QUALITY_THRESHOLD = 0.85
# Root folder for every exported figure of this model.
REPORT_DIR = 'quality_reports/gc'
# Shared export settings for all PNG files written below.
IMAGE_EXPORT = dict(width=1200, height=800, scale=2)

metadata = Metadata.load_from_json(filepath='../2_modeling/gc_metadata.json')
for i in range(0, 10):
    print(f'Running diagnostic for set {i}')
    real_data = pd.read_csv(
        f'../data/train/set_{i}.csv'
    )
    synthetic_data = pd.read_csv(
        f'../data/synthetic/gc/set_{i}.csv'
    )
    report = evaluate_quality(
        real_data=real_data,
        synthetic_data=synthetic_data,
        metadata=metadata
    )

    # Read the score once and reuse it for both the check and the message.
    score = report.get_score()
    if score < QUALITY_THRESHOLD:
        print(f'Warning: Quality score for set {i} is below {QUALITY_THRESHOLD}')
        print(f'Score: {score}')
        # Abort the whole run: remaining sets are not evaluated.
        break
    print(report.get_properties())

    # Create the output folders BEFORE writing any image. makedirs also
    # creates the parent REPORT_DIR, which the summary PNG below needs;
    # previously the summary was written first and failed on a fresh checkout.
    set_dir = f'{REPORT_DIR}/set_{i}'
    os.makedirs(set_dir, exist_ok=True)

    # Summary figure for the whole set.
    fig = report.get_visualization(property_name='Column Shapes')
    fig.update_layout(
        title=dict(
            text=f'Quality Report for Set {i}',
            x=0.5,
            y=0.95,
            font=dict(
                family='Helvetica, Arial, sans-serif',
                size=24,
                color='black'
            )
        ),
        margin=dict(l=40, r=40, t=80, b=40),
        paper_bgcolor='white',
        plot_bgcolor='white'
    )
    fig.write_image(f'{REPORT_DIR}/set_{i}.png', **IMAGE_EXPORT)

    # One real-vs-synthetic figure per column, stored in the set's folder.
    for column_name in metadata.get_column_names():
        fig = get_column_plot(
            real_data=real_data,
            synthetic_data=synthetic_data,
            metadata=metadata,
            column_name=column_name
        )
        fig.update_layout(
            title=f'Quality Report for column {column_name} in set {i}'
        )
        fig.write_image(f'{set_dir}/column_{column_name}.png', **IMAGE_EXPORT)

Running diagnostic for set 0
Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 20/20 [00:00<00:00, 836.27it/s]|
Column Shapes Score: 84.57%

(2/2) Evaluating Column Pair Trends: |██████████| 190/190 [00:00<00:00, 197.67it/s]|
Column Pair Trends Score: 92.09%

Overall Score (Average): 88.33%

             Property     Score
0       Column Shapes  0.845745
1  Column Pair Trends  0.920895
Running diagnostic for set 1
Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 20/20 [00:00<00:00, 850.37it/s]|
Column Shapes Score: 83.71%

(2/2) Evaluating Column Pair Trends: |██████████| 190/190 [00:00<00:00, 225.47it/s]|
Column Pair Trends Score: 91.96%

Overall Score (Average): 87.83%

             Property     Score
0       Column Shapes  0.837057
1  Column Pair Trends  0.919601
Running diagnostic for set 2
Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 20/20 [00:00<00:00, 909.20it/s]|
Column Shapes Score: 84.15%

(2/2) Evaluating Column Pai


ks_2samp: Exact calculation unsuccessful. Switching to method=asymp.



(2/2) Evaluating Column Pair Trends: |██████████| 190/190 [00:00<00:00, 200.84it/s]|
Column Pair Trends Score: 92.32%

Overall Score (Average): 88.43%

             Property     Score
0       Column Shapes  0.845390
1  Column Pair Trends  0.923211
Running diagnostic for set 4
Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 20/20 [00:00<00:00, 907.88it/s]|
Column Shapes Score: 82.67%

(2/2) Evaluating Column Pair Trends: |██████████| 190/190 [00:00<00:00, 203.29it/s]|
Column Pair Trends Score: 92.28%

Overall Score (Average): 87.48%

             Property     Score
0       Column Shapes  0.826678
1  Column Pair Trends  0.922844
Running diagnostic for set 5
Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 20/20 [00:00<00:00, 882.42it/s]|
Column Shapes Score: 82.24%




ks_2samp: Exact calculation unsuccessful. Switching to method=asymp.



(2/2) Evaluating Column Pair Trends: |██████████| 190/190 [00:00<00:00, 210.74it/s]|
Column Pair Trends Score: 92.34%

Overall Score (Average): 87.29%

             Property     Score
0       Column Shapes  0.822438
1  Column Pair Trends  0.923434
Running diagnostic for set 6
Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 20/20 [00:00<00:00, 1003.81it/s]|
Column Shapes Score: 83.78%

(2/2) Evaluating Column Pair Trends: |██████████| 190/190 [00:00<00:00, 200.98it/s]|
Column Pair Trends Score: 92.54%

Overall Score (Average): 88.16%

             Property     Score
0       Column Shapes  0.837809
1  Column Pair Trends  0.925374
Running diagnostic for set 7
Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 20/20 [00:00<00:00, 921.25it/s]|
Column Shapes Score: 82.88%

(2/2) Evaluating Column Pair Trends: |██████████| 190/190 [00:00<00:00, 219.05it/s]|
Column Pair Trends Score: 92.46%

Overall Score (Average): 87.67%

             Property     Score
0


ks_2samp: Exact calculation unsuccessful. Switching to method=asymp.



(2/2) Evaluating Column Pair Trends: |██████████| 190/190 [00:00<00:00, 214.98it/s]|
Column Pair Trends Score: 92.35%

Overall Score (Average): 88.24%

             Property     Score
0       Column Shapes  0.841166
1  Column Pair Trends  0.923547
Running diagnostic for set 9
Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 20/20 [00:00<00:00, 1753.58it/s]|
Column Shapes Score: 83.75%

(2/2) Evaluating Column Pair Trends: |██████████| 190/190 [00:00<00:00, 193.79it/s]|
Column Pair Trends Score: 92.24%

Overall Score (Average): 87.99%

             Property     Score
0       Column Shapes  0.837456
1  Column Pair Trends  0.922396
