### Imports

In [11]:
import pandas as pd
import os
import json

In [12]:
from sdv.metadata import Metadata
from sdv.sampling import Condition

from sdv.single_table import GaussianCopulaSynthesizer
from sdv.single_table import CTGANSynthesizer
from sdv.single_table import TVAESynthesizer

In [13]:
NUM_ROWS = 50
NUM_EPOCHS = 5_000
NUM_SYNT_DATA = 100

In [14]:
def round_to_nearest_half(x):
    return round(x * 2) / 2

#### Cargar customizaciones

In [15]:
with open('distributions.json') as f:
    distributions = json.load(f)

In [16]:
with open('constraints.json') as f:
    constraints = json.load(f)

#### GaussianCopulaSynthesizer

In [24]:
for i in range(10):
    print(f'Generating synthetic data for set {i}')
    df = pd.read_csv(f'../data/train/set_{i}.csv')
    metadata = Metadata()
    metadata.detect_table_from_dataframe(data=df, table_name='TLP')
    categorical_columns = [
        'ED_2Clases'
    ]
    metadata.update_columns(
        column_names=categorical_columns,
        sdtype='categorical',
        table_name='TLP'
    )
    numerical_columns = [
        'demo-genero',
        'clin-reservaCognitiva_escolaridad'
    ]
    metadata.update_columns(
        column_names=numerical_columns,
        sdtype='numerical',
        table_name='TLP'
    )
    metadata.validate()
    if os.path.exists('gc_metadata.json'):
        os.remove('gc_metadata.json')
    metadata.save_to_json('gc_metadata.json')

    num_d, num_h = df['ED_2Clases'].value_counts()
    num_synthetic_data = num_d - num_h

    class_d = Condition(
        num_rows=NUM_SYNT_DATA,
        column_values={'ED_2Clases': 'D'}
    )

    class_h = Condition(
        num_rows=NUM_SYNT_DATA + num_synthetic_data,
        column_values={'ED_2Clases': 'H'}
    )

    gc_synthesizer = GaussianCopulaSynthesizer(
        metadata,
        enforce_min_max_values=False,
        enforce_rounding=True,
        # numerical_distributions=distributions,
        default_distribution='gaussian_kde'
    )

    gc_synthesizer.add_constraints(constraints)
    gc_synthesizer.fit(df)

    gc_synthetic_data = gc_synthesizer.sample_from_conditions(
        conditions=[class_d, class_h],
        batch_size = 50,
        max_tries_per_batch = 100
    )

    gc_synthetic_data = pd.concat([df, gc_synthetic_data], ignore_index=True)
    gc_synthetic_data.to_csv(
        f'../data/synthetic/gc/set_{i}.csv',
        index=False
    )

Generating synthetic data for set 0


Sampling conditions: : 400it [00:02, 145.80it/s]                       


Generating synthetic data for set 1


Sampling conditions: : 400it [00:02, 144.56it/s]                       


Generating synthetic data for set 2


Sampling conditions: : 400it [00:02, 144.95it/s]                       


Generating synthetic data for set 3


Sampling conditions: : 400it [00:02, 144.72it/s]                       


Generating synthetic data for set 4


Sampling conditions: : 400it [00:02, 146.11it/s]                       


Generating synthetic data for set 5


Sampling conditions: : 400it [00:02, 146.73it/s]                       


Generating synthetic data for set 6


Sampling conditions: : 400it [00:02, 142.72it/s]                       


Generating synthetic data for set 7


Sampling conditions: : 400it [00:02, 144.96it/s]                       


Generating synthetic data for set 8


Sampling conditions: : 400it [00:02, 149.90it/s]                       


Generating synthetic data for set 9


Sampling conditions: : 400it [00:02, 147.55it/s]                       


#### CTGANSynthesizer

In [None]:
for i in range(10):
    print(f'Generating synthetic data for set {i}')
    df = pd.read_csv(f'../data/train/set_{i}.csv')
    metadata = Metadata()
    metadata.detect_table_from_dataframe(data=df, table_name='TLP')
    categorical_columns = [
        'ED_2Clases'
    ]
    metadata.update_columns(
        column_names=categorical_columns,
        sdtype='categorical',
        table_name='TLP'
    )
    numerical_columns = [
        'demo-genero',
        'clin-reservaCognitiva_escolaridad'
    ]
    metadata.update_columns(
        column_names=numerical_columns,
        sdtype='numerical',
        table_name='TLP'
    )
    metadata.validate()
    if os.path.exists('ctgan_metadata.json'):
        os.remove('ctgan_metadata.json')
    metadata.save_to_json('ctgan_metadata.json')

    num_d, num_h = df['ED_2Clases'].value_counts()
    num_synthetic_data = num_d - num_h

    class_d = Condition(
        num_rows=NUM_SYNT_DATA,
        column_values={'ED_2Clases': 'D'}
    )

    class_h = Condition(
        num_rows=NUM_SYNT_DATA + num_synthetic_data,
        column_values={'ED_2Clases': 'H'}
    )

    ctgan_synthesizer = CTGANSynthesizer(
        metadata,
        enforce_min_max_values=False,
        enforce_rounding=True,
        epochs=NUM_EPOCHS,
        verbose=True,
        cuda=True
    )

    ctgan_synthesizer.add_constraints(constraints)
    ctgan_synthesizer.fit(df)

    ctgan_synthetic_data = ctgan_synthesizer.sample_from_conditions(
        conditions=[class_d, class_h],
        batch_size = 50,
        max_tries_per_batch = 100
    )
    ctgan_synthetic_data = pd.concat([df, ctgan_synthetic_data], ignore_index=True)
    ctgan_synthetic_data.to_csv(
        f'../data/synthetic/ctgan/set_{i}.csv',
        index=False
    )

Generating synthetic data for set 0


Gen. (-0.55) | Discrim. (-0.14): 100%|██████████| 5000/5000 [04:51<00:00, 17.16it/s]
Sampling conditions: : 400it [00:03, 115.37it/s]                       


OSError: Cannot save file into a non-existent directory: '..\..\data\synthetic\ctgan'

#### 4.3. TVAESynthesizer

In [None]:
for i in range(10):
    print(f'Generating synthetic data for set {i}')
    df = pd.read_csv(f'../../data/train/set_{i}.csv')
    metadata = Metadata()
    metadata.detect_table_from_dataframe(data=df, table_name='TLP')
    categorical_columns = [
        'ED_2Clases', 'clin-frecUsoEmail'
    ]
    metadata.update_columns(
        column_names=categorical_columns,
        sdtype='categorical',
        table_name='TLP'
    )
    metadata.validate()

    num_d, num_h = df['ED_2Clases'].value_counts()
    num_synthetic_data = num_d - num_h

    class_d = Condition(
        num_rows=NUM_SYNT_DATA,
        column_values={'ED_2Clases': 'D'}
    )

    class_h = Condition(
        num_rows=NUM_SYNT_DATA + num_synthetic_data,
        column_values={'ED_2Clases': 'H'}
    )

    tvaes_synthesizer = TVAESynthesizer(
        metadata,
        enforce_min_max_values=True,
        enforce_rounding=True,
        epochs=NUM_EPOCHS,
        verbose=True,
        cuda=True
    )

    tvaes_synthesizer.auto_assign_transformers(df)
    processed_df = tvaes_synthesizer.preprocess(df)
    tvaes_synthesizer.fit_processed_data(processed_df)
    tvaes_synthetic_data = tvaes_synthesizer.sample_from_conditions(
        conditions=[class_d, class_h],
        batch_size = 50,
        max_tries_per_batch = 100
    )
    tvaes_synthetic_data = pd.concat([df, tvaes_synthetic_data], ignore_index=True)
    tvaes_synthetic_data.to_csv(
        f'../../data/synthetic/tvaes/set_{i}.csv',
        index=False
    )

#### 4.4. CopulaGANSynthesizer

In [None]:
for i in range(10):
    print(f'Generating synthetic data for set {i}')
    df = pd.read_csv(f'../../data/train/set_{i}.csv')
    metadata = Metadata()
    metadata.detect_table_from_dataframe(data=df, table_name='TLP')
    categorical_columns = [
        'ED_2Clases', 'clin-frecUsoEmail'
    ]
    metadata.update_columns(
        column_names=categorical_columns,
        sdtype='categorical',
        table_name='TLP'
    )
    metadata.validate()

    num_d, num_h = df['ED_2Clases'].value_counts()
    num_synthetic_data = num_d - num_h

    class_d = Condition(
        num_rows=NUM_SYNT_DATA,
        column_values={'ED_2Clases': 'D'}
    )

    class_h = Condition(
        num_rows=NUM_SYNT_DATA + num_synthetic_data,
        column_values={'ED_2Clases': 'H'}
    )

    cg_synthesizer = CopulaGANSynthesizer(
        metadata,
        enforce_min_max_values=True,
        enforce_rounding=True,
        epochs=NUM_EPOCHS,
        verbose=True,
        cuda=True
    )

    cg_synthesizer.auto_assign_transformers(df)
    processed_df = cg_synthesizer.preprocess(df)
    cg_synthesizer.fit_processed_data(processed_df)
    cg_synthetic_data = cg_synthesizer.sample_from_conditions(
        conditions=[class_d, class_h],
        batch_size = 50,
        max_tries_per_batch = 100
    )
    cg_synthetic_data = pd.concat([df, cg_synthetic_data], ignore_index=True)
    cg_synthetic_data.to_csv(
        f'../../data/synthetic/cg/set_{i}.csv',
        index=False
    )

### 5. Evaluating Real vs. Synthetic Data

In [None]:
from sdv.evaluation.single_table import get_column_plot

plot_columns = list(df.columns)

#### 5.1. GaussianCopulaSynthesizer

In [None]:
from sdv.evaluation.single_table import run_diagnostic

gc_diagnostic = run_diagnostic(
    real_data=df,
    synthetic_data=gc_synthetic_data,
    metadata=metadata
)

In [None]:
from sdv.evaluation.single_table import evaluate_quality

gc_quality_report = evaluate_quality(
    df,
    gc_synthetic_data,
    metadata
)

In [None]:
gc_details = gc_quality_report.get_details('Column Pair Trends')
gc_details[gc_details['Real Correlation'].notnull()]

In [None]:
gc_quality_report.get_details('Column Shapes')

In [None]:
for column in plot_columns:
     fig = get_column_plot(
         real_data=df,
         synthetic_data=gc_synthetic_data,
         column_name=column,
         metadata=metadata
     )
     fig.show()

#### 5.2. CTGANSynthesizer

In [None]:
ctgan_diagnostic = run_diagnostic(
    real_data=df,
    synthetic_data=ctgan_synthetic_data,
    metadata=metadata
)

In [None]:
ctgan_quality_report = evaluate_quality(
    df,
    ctgan_synthetic_data,
    metadata
)

In [None]:
ctgan_details = ctgan_quality_report.get_details('Column Pair Trends')
ctgan_details[ctgan_details['Real Correlation'].notnull()]

In [None]:
ctgan_quality_report.get_details('Column Shapes')

In [None]:
for column in plot_columns:
    fig = get_column_plot(
        real_data=df,
        synthetic_data=ctgan_synthetic_data,
        column_name=column,
        metadata=metadata
    )
    fig.show()

#### 5.3. TVAESSynthesizer

In [None]:
tvaes_diagnostic = run_diagnostic(
    real_data=df,
    synthetic_data=tvaes_synthetic_data,
    metadata=metadata
)

In [None]:
tvaes_quality_report = evaluate_quality(
    df,
    tvaes_synthetic_data,
    metadata
)

In [None]:
tvaes_details = tvaes_quality_report.get_details('Column Pair Trends')
tvaes_details[tvaes_details['Real Correlation'].notnull()]

In [None]:
tvaes_quality_report.get_details('Column Shapes')

In [None]:
for column in plot_columns:
    fig = get_column_plot(
        real_data=df,
        synthetic_data=tvaes_synthetic_data,
        column_name=column,
        metadata=metadata
    )
    fig.show()