### 1. Imports

In [8]:
import pandas as pd
import os

In [9]:
from sdv.metadata import Metadata
from sdv.sampling import Condition

from sdv.single_table import GaussianCopulaSynthesizer
from sdv.single_table import CTGANSynthesizer
from sdv.single_table import TVAESynthesizer
from sdv.single_table import CopulaGANSynthesizer

In [10]:
# Run-wide settings for the generation cells below.

# Not referenced by any cell visible in this part of the notebook; the sampling
# calls hard-code `batch_size = 50` instead.
# TODO(review): confirm whether those calls were meant to use NUM_ROWS.
NUM_ROWS = 50
# Passed as `epochs=` to the CTGAN, TVAE and CopulaGAN synthesizers.
NUM_EPOCHS = 5_000
# Base `num_rows` requested for each class Condition ('D' and 'H') per training set.
NUM_SYNT_DATA = 100

In [11]:
def round_to_nearest_half(x):
    """Round a number to the nearest multiple of 0.5.

    Args:
        x: A real number (Python or NumPy scalar).

    Returns:
        The multiple of 0.5 closest to ``x`` as a float. Exact ties follow
        Python's built-in ``round`` (round-half-to-even on the doubled value),
        so 0.25 -> 0.0 and 0.75 -> 1.0. NaN is returned unchanged.
    """
    # round() cannot convert NaN to an integer and raises ValueError, which
    # would abort a whole Series.apply() on a single missing cell.
    # NaN is the only float that is not equal to itself.
    if isinstance(x, float) and x != x:
        return x
    return round(x * 2) / 2

#### 1. Define rules (constraints)

In [None]:
# Example ScalarRange constraint: keeps `amenities_fee` within [0, 500],
# boundaries included (strict_boundaries=False).
# NOTE(review): `amenities_fee` is not referenced by any other cell visible in
# this notebook -- confirm the column exists in the TLP data before relying on it.
my_constraint = {
    'constraint_class': 'ScalarRange',
    'constraint_parameters': {
        'column_name': 'amenities_fee',
        'low_value': 0.0,
        'high_value': 500.0,
        'strict_boundaries': False
    }
}

# `my_synthesizer` is not defined anywhere above this cell, so an unconditional
# call raises NameError on "Restart & Run All". Apply the constraint only when a
# synthesizer with that name has actually been created.
_constraint_target = globals().get('my_synthesizer')
if _constraint_target is not None:
    _constraint_target.add_constraints(constraints=[
        my_constraint
    ])

#### 4.1. GaussianCopulaSynthesizer

In [None]:
# Fit a GaussianCopulaSynthesizer on each of the 10 training sets, sample extra
# rows per class, and write "real + synthetic" to data/synthetic/gc/.
gc_output_dir = '../../data/synthetic/gc'
# DataFrame.to_csv does not create missing folders; make sure the target exists
# before any fitting time is spent.
os.makedirs(gc_output_dir, exist_ok=True)

# Float columns that are post-processed to multiples of 0.5 after sampling.
gc_half_step_columns = [
    'eval-TLP-FigRey-totalCopia-PD',
    'eval-TLP-FigRey-totalMemoria-PD',
]

for i in range(10):
    print(f'Generating synthetic data for set {i}')
    df = pd.read_csv(f'../../data/train/set_{i}.csv')

    # Auto-detect the table schema, then force the class label to categorical.
    metadata = Metadata()
    metadata.detect_table_from_dataframe(data=df, table_name='TLP')
    categorical_columns = [
        'ED_2Clases'
    ]
    metadata.update_columns(
        column_names=categorical_columns,
        sdtype='categorical',
        table_name='TLP'
    )
    metadata.validate()

    # Look the class sizes up by label. value_counts() orders its result by
    # frequency, so positional unpacking would silently call the larger class
    # "D" even when 'H' is the majority.
    class_counts = df['ED_2Clases'].value_counts()
    num_d = int(class_counts.get('D', 0))
    num_h = int(class_counts.get('H', 0))
    num_synthetic_data = num_d - num_h

    # Every class gets NUM_SYNT_DATA rows; the minority class additionally gets
    # the deficit so that real + synthetic ends up balanced.
    class_d = Condition(
        num_rows=NUM_SYNT_DATA + max(-num_synthetic_data, 0),
        column_values={'ED_2Clases': 'D'}
    )

    class_h = Condition(
        num_rows=NUM_SYNT_DATA + max(num_synthetic_data, 0),
        column_values={'ED_2Clases': 'H'}
    )

    gc_synthesizer = GaussianCopulaSynthesizer(
        metadata,
        enforce_min_max_values=True,
        enforce_rounding=True,
        default_distribution='beta'
    )
    gc_synthesizer.auto_assign_transformers(df)
    processed_df = gc_synthesizer.preprocess(df)
    gc_synthesizer.fit_processed_data(processed_df)
    gc_synthetic_data = gc_synthesizer.sample_from_conditions(
        conditions=[class_d, class_h],
        batch_size=50,
        max_tries_per_batch=100
    )

    # Post-process float columns to round to nearest half
    for half_step_column in gc_half_step_columns:
        gc_synthetic_data[half_step_column] = (
            gc_synthetic_data[half_step_column].apply(round_to_nearest_half)
        )

    # Keep `gc_synthetic_data` synthetic-only so the "Real vs. Synthetic"
    # evaluation cells do not compare the real rows against themselves; the
    # augmented frame (real + synthetic) is what gets saved.
    gc_augmented_data = pd.concat([df, gc_synthetic_data], ignore_index=True)
    gc_augmented_data.to_csv(
        f'{gc_output_dir}/set_{i}.csv',
        index=False
    )

Generating synthetic data for set 0


Sampling conditions: : 350it [00:00, 600.01it/s]                       


Generating synthetic data for set 1


Sampling conditions: : 350it [00:00, 533.62it/s]                       


Generating synthetic data for set 2


Sampling conditions: : 350it [00:00, 581.78it/s]                       


Generating synthetic data for set 3


Sampling conditions: : 350it [00:00, 601.81it/s]                       


Generating synthetic data for set 4


Sampling conditions: : 350it [00:00, 610.55it/s]                       


Generating synthetic data for set 5


Sampling conditions: : 350it [00:00, 584.12it/s]                       


Generating synthetic data for set 6


Sampling conditions: : 350it [00:00, 604.58it/s]                       


Generating synthetic data for set 7


Sampling conditions: : 350it [00:00, 585.97it/s]                       


Generating synthetic data for set 8


Sampling conditions: : 350it [00:00, 570.28it/s]                       


Generating synthetic data for set 9


Sampling conditions: : 350it [00:00, 607.60it/s]                       


#### 4.2. CTGANSynthesizer

In [None]:
# Fit a CTGANSynthesizer on each of the 10 training sets, sample extra rows per
# class, and write "real + synthetic" to data/synthetic/ctgan/.
ctgan_output_dir = '../../data/synthetic/ctgan'
# DataFrame.to_csv does not create missing folders; fail early rather than
# after a full training run.
os.makedirs(ctgan_output_dir, exist_ok=True)

for i in range(10):
    print(f'Generating synthetic data for set {i}')
    df = pd.read_csv(f'../../data/train/set_{i}.csv')

    # Auto-detect the table schema, then force these columns to categorical.
    metadata = Metadata()
    metadata.detect_table_from_dataframe(data=df, table_name='TLP')
    categorical_columns = [
        'ED_2Clases', 'clin-frecUsoEmail'
    ]
    metadata.update_columns(
        column_names=categorical_columns,
        sdtype='categorical',
        table_name='TLP'
    )
    metadata.validate()

    # Look the class sizes up by label. value_counts() orders its result by
    # frequency, so positional unpacking would silently call the larger class
    # "D" even when 'H' is the majority.
    class_counts = df['ED_2Clases'].value_counts()
    num_d = int(class_counts.get('D', 0))
    num_h = int(class_counts.get('H', 0))
    num_synthetic_data = num_d - num_h

    # Every class gets NUM_SYNT_DATA rows; the minority class additionally gets
    # the deficit so that real + synthetic ends up balanced.
    class_d = Condition(
        num_rows=NUM_SYNT_DATA + max(-num_synthetic_data, 0),
        column_values={'ED_2Clases': 'D'}
    )

    class_h = Condition(
        num_rows=NUM_SYNT_DATA + max(num_synthetic_data, 0),
        column_values={'ED_2Clases': 'H'}
    )

    ctgan_synthesizer = CTGANSynthesizer(
        metadata,
        enforce_min_max_values=True,
        enforce_rounding=True,
        locales=['es_ES'],
        epochs=NUM_EPOCHS,
        verbose=True,
        cuda=True
    )

    ctgan_synthesizer.auto_assign_transformers(df)
    processed_df = ctgan_synthesizer.preprocess(df)
    ctgan_synthesizer.fit_processed_data(processed_df)
    ctgan_synthetic_data = ctgan_synthesizer.sample_from_conditions(
        conditions=[class_d, class_h],
        batch_size=50,
        max_tries_per_batch=100
    )

    # Keep `ctgan_synthetic_data` synthetic-only so the "Real vs. Synthetic"
    # evaluation cells do not compare the real rows against themselves; the
    # augmented frame (real + synthetic) is what gets saved.
    ctgan_augmented_data = pd.concat([df, ctgan_synthetic_data], ignore_index=True)
    ctgan_augmented_data.to_csv(
        f'{ctgan_output_dir}/set_{i}.csv',
        index=False
    )

Generating synthetic data for set 0


  from .autonotebook import tqdm as notebook_tqdm
Gen. (-6.77) | Discrim. (0.50): 100%|██████████| 1000/1000 [03:28<00:00,  4.80it/s]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
Sampling conditions: : 300it [00:08, 36.24it/s]


Generating synthetic data for set 1


Gen. (-6.23) | Discrim. (0.53): 100%|██████████| 1000/1000 [03:17<00:00,  5.06it/s]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
Sampling conditions: : 300it [00:07, 41.88it/s]


Generating synthetic data for set 2


Gen. (-6.93) | Discrim. (-0.36): 100%|██████████| 1000/1000 [03:32<00:00,  4.70it/s]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
Sampling conditions: : 300it [00:07, 42.03it/s]


Generating synthetic data for set 3


Gen. (-5.08) | Discrim. (-0.53): 100%|██████████| 1000/1000 [03:34<00:00,  4.65it/s]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
Sampling conditions: : 300it [00:07, 41.86it/s]


Generating synthetic data for set 4


Gen. (-8.04) | Discrim. (0.05): 100%|██████████| 1000/1000 [03:32<00:00,  4.72it/s]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
Sampling conditions: : 300it [00:06, 43.78it/s]


Generating synthetic data for set 5


Gen. (-4.64) | Discrim. (0.13): 100%|██████████| 1000/1000 [03:35<00:00,  4.65it/s]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
Sampling conditions: : 300it [00:08, 34.89it/s]


Generating synthetic data for set 6


Gen. (-7.49) | Discrim. (0.38): 100%|██████████| 1000/1000 [03:31<00:00,  4.72it/s]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
Sampling conditions: : 300it [00:06, 47.02it/s]


Generating synthetic data for set 7


Gen. (-7.71) | Discrim. (-0.59): 100%|██████████| 1000/1000 [03:35<00:00,  4.65it/s]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
Sampling conditions: : 300it [00:07, 37.82it/s]


Generating synthetic data for set 8


Gen. (-8.95) | Discrim. (0.04): 100%|██████████| 1000/1000 [03:30<00:00,  4.75it/s] 
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
Sampling conditions: : 300it [00:06, 44.68it/s]


Generating synthetic data for set 9


Gen. (-3.38) | Discrim. (0.83): 100%|██████████| 1000/1000 [03:34<00:00,  4.66it/s]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
Sampling conditions: : 300it [00:06, 44.23it/s]


#### 4.3. TVAESynthesizer

In [16]:
# Fit a TVAESynthesizer on each of the 10 training sets, sample extra rows per
# class, and write "real + synthetic" to data/synthetic/tvaes/.
tvaes_output_dir = '../../data/synthetic/tvaes'
# DataFrame.to_csv does not create missing folders; fail early rather than
# after a full training run.
os.makedirs(tvaes_output_dir, exist_ok=True)

for i in range(10):
    print(f'Generating synthetic data for set {i}')
    df = pd.read_csv(f'../../data/train/set_{i}.csv')

    # Auto-detect the table schema, then force these columns to categorical.
    metadata = Metadata()
    metadata.detect_table_from_dataframe(data=df, table_name='TLP')
    categorical_columns = [
        'ED_2Clases', 'clin-frecUsoEmail'
    ]
    metadata.update_columns(
        column_names=categorical_columns,
        sdtype='categorical',
        table_name='TLP'
    )
    metadata.validate()

    # Look the class sizes up by label. value_counts() orders its result by
    # frequency, so positional unpacking would silently call the larger class
    # "D" even when 'H' is the majority.
    class_counts = df['ED_2Clases'].value_counts()
    num_d = int(class_counts.get('D', 0))
    num_h = int(class_counts.get('H', 0))
    num_synthetic_data = num_d - num_h

    # Every class gets NUM_SYNT_DATA rows; the minority class additionally gets
    # the deficit so that real + synthetic ends up balanced.
    class_d = Condition(
        num_rows=NUM_SYNT_DATA + max(-num_synthetic_data, 0),
        column_values={'ED_2Clases': 'D'}
    )

    class_h = Condition(
        num_rows=NUM_SYNT_DATA + max(num_synthetic_data, 0),
        column_values={'ED_2Clases': 'H'}
    )

    tvaes_synthesizer = TVAESynthesizer(
        metadata,
        enforce_min_max_values=True,
        enforce_rounding=True,
        epochs=NUM_EPOCHS,
        verbose=True,
        cuda=True
    )

    tvaes_synthesizer.auto_assign_transformers(df)
    processed_df = tvaes_synthesizer.preprocess(df)
    tvaes_synthesizer.fit_processed_data(processed_df)
    tvaes_synthetic_data = tvaes_synthesizer.sample_from_conditions(
        conditions=[class_d, class_h],
        batch_size=50,
        max_tries_per_batch=100
    )

    # Keep `tvaes_synthetic_data` synthetic-only so the "Real vs. Synthetic"
    # evaluation cells do not compare the real rows against themselves; the
    # augmented frame (real + synthetic) is what gets saved.
    tvaes_augmented_data = pd.concat([df, tvaes_synthetic_data], ignore_index=True)
    tvaes_augmented_data.to_csv(
        f'{tvaes_output_dir}/set_{i}.csv',
        index=False
    )

Generating synthetic data for set 0


Loss: -274.601: 100%|██████████| 5000/5000 [04:20<00:00, 19.20it/s]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
Sampling conditions: : 300it [00:07, 41.90it/s]


Generating synthetic data for set 1


Loss: -277.875: 100%|██████████| 5000/5000 [04:18<00:00, 19.37it/s]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
Sampling conditions: : 300it [00:07, 42.41it/s]


Generating synthetic data for set 2


Loss: -290.687: 100%|██████████| 5000/5000 [04:18<00:00, 19.35it/s]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
Sampling conditions: : 300it [00:06, 47.56it/s]


Generating synthetic data for set 3


Loss: -281.785: 100%|██████████| 5000/5000 [04:17<00:00, 19.42it/s]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
Sampling conditions: : 300it [00:07, 39.93it/s]


Generating synthetic data for set 4


Loss: -276.529: 100%|██████████| 5000/5000 [04:14<00:00, 19.62it/s]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
Sampling conditions: : 300it [00:08, 37.40it/s]


Generating synthetic data for set 5


Loss: -282.755: 100%|██████████| 5000/5000 [04:18<00:00, 19.35it/s]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
Sampling conditions: : 300it [00:08, 37.10it/s]


Generating synthetic data for set 6


Loss: -285.812: 100%|██████████| 5000/5000 [04:18<00:00, 19.34it/s]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
Sampling conditions: : 300it [00:06, 45.09it/s]


Generating synthetic data for set 7


Loss: -285.543: 100%|██████████| 5000/5000 [04:30<00:00, 18.47it/s]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
Sampling conditions: : 300it [00:06, 47.17it/s]


Generating synthetic data for set 8


Loss: -281.513: 100%|██████████| 5000/5000 [04:24<00:00, 18.90it/s]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
Sampling conditions: : 300it [00:09, 30.61it/s]


Generating synthetic data for set 9


Loss: -279.625: 100%|██████████| 5000/5000 [04:26<00:00, 18.75it/s]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
Sampling conditions: : 300it [00:06, 48.64it/s]


#### 4.4. CopulaGANSynthesizer

In [4]:
# Fit a CopulaGANSynthesizer on each of the 10 training sets, sample extra rows
# per class, and write "real + synthetic" to data/synthetic/cg/.
cg_output_dir = '../../data/synthetic/cg'
# DataFrame.to_csv does not create missing folders; fail early rather than
# after a full training run.
os.makedirs(cg_output_dir, exist_ok=True)

for i in range(10):
    print(f'Generating synthetic data for set {i}')
    df = pd.read_csv(f'../../data/train/set_{i}.csv')

    # Auto-detect the table schema, then force these columns to categorical.
    metadata = Metadata()
    metadata.detect_table_from_dataframe(data=df, table_name='TLP')
    categorical_columns = [
        'ED_2Clases', 'clin-frecUsoEmail'
    ]
    metadata.update_columns(
        column_names=categorical_columns,
        sdtype='categorical',
        table_name='TLP'
    )
    metadata.validate()

    # Look the class sizes up by label. value_counts() orders its result by
    # frequency, so positional unpacking would silently call the larger class
    # "D" even when 'H' is the majority.
    class_counts = df['ED_2Clases'].value_counts()
    num_d = int(class_counts.get('D', 0))
    num_h = int(class_counts.get('H', 0))
    num_synthetic_data = num_d - num_h

    # Every class gets NUM_SYNT_DATA rows; the minority class additionally gets
    # the deficit so that real + synthetic ends up balanced.
    class_d = Condition(
        num_rows=NUM_SYNT_DATA + max(-num_synthetic_data, 0),
        column_values={'ED_2Clases': 'D'}
    )

    class_h = Condition(
        num_rows=NUM_SYNT_DATA + max(num_synthetic_data, 0),
        column_values={'ED_2Clases': 'H'}
    )

    cg_synthesizer = CopulaGANSynthesizer(
        metadata,
        enforce_min_max_values=True,
        enforce_rounding=True,
        epochs=NUM_EPOCHS,
        verbose=True,
        cuda=True
    )

    cg_synthesizer.auto_assign_transformers(df)
    processed_df = cg_synthesizer.preprocess(df)
    cg_synthesizer.fit_processed_data(processed_df)
    cg_synthetic_data = cg_synthesizer.sample_from_conditions(
        conditions=[class_d, class_h],
        batch_size=50,
        max_tries_per_batch=100
    )

    # Keep `cg_synthetic_data` synthetic-only so any real-vs-synthetic
    # evaluation does not compare the real rows against themselves; the
    # augmented frame (real + synthetic) is what gets saved.
    cg_augmented_data = pd.concat([df, cg_synthetic_data], ignore_index=True)
    cg_augmented_data.to_csv(
        f'{cg_output_dir}/set_{i}.csv',
        index=False
    )

Generating synthetic data for set 0


  from .autonotebook import tqdm as notebook_tqdm
Gen. (-9.03) | Discrim. (0.24): 100%|██████████| 5000/5000 [16:45<00:00,  4.97it/s]  
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
Sampling conditions: : 300it [00:09, 32.72it/s]


Generating synthetic data for set 1


Gen. (-14.16) | Discrim. (-0.57): 100%|██████████| 5000/5000 [15:50<00:00,  5.26it/s]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
Sampling conditions: : 300it [00:06, 46.80it/s]


Generating synthetic data for set 2


Gen. (-12.30) | Discrim. (-0.72): 100%|██████████| 5000/5000 [16:33<00:00,  5.03it/s]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
Sampling conditions: : 300it [00:07, 38.18it/s]


Generating synthetic data for set 3


Gen. (-10.89) | Discrim. (-0.36): 100%|██████████| 5000/5000 [16:47<00:00,  4.96it/s]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
Sampling conditions: : 300it [00:07, 39.92it/s]


Generating synthetic data for set 4


Gen. (-11.96) | Discrim. (0.33): 100%|██████████| 5000/5000 [16:44<00:00,  4.98it/s] 
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
Sampling conditions: : 300it [00:09, 32.68it/s]


Generating synthetic data for set 5


Gen. (-10.55) | Discrim. (-0.66): 100%|██████████| 5000/5000 [17:15<00:00,  4.83it/s]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
Sampling conditions: : 300it [00:09, 30.55it/s]


Generating synthetic data for set 6


Gen. (-11.14) | Discrim. (-0.23): 100%|██████████| 5000/5000 [17:41<00:00,  4.71it/s]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
Sampling conditions: : 300it [00:07, 38.17it/s]


Generating synthetic data for set 7


Gen. (-8.82) | Discrim. (0.41): 100%|██████████| 5000/5000 [17:36<00:00,  4.73it/s]  
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
Sampling conditions: : 300it [00:07, 38.96it/s]


Generating synthetic data for set 8


Gen. (-10.99) | Discrim. (-0.04): 100%|██████████| 5000/5000 [17:20<00:00,  4.80it/s]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
Sampling conditions: : 300it [00:09, 30.76it/s]


Generating synthetic data for set 9


Gen. (-12.58) | Discrim. (-0.07): 100%|██████████| 5000/5000 [16:55<00:00,  4.92it/s]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
  sampled_rows[COND_IDX] = dataframe[COND_IDX].to_numpy()[: len(sampled_rows)]
Sampling conditions: : 300it [00:07, 37.86it/s]


### 5. Evaluating Real vs. Synthetic Data

In [None]:
from sdv.evaluation.single_table import get_column_plot

# Every column of `df` gets its own real-vs-synthetic plot in the cells below.
# `df` is whatever the most recently executed generation cell left behind.
plot_columns = df.columns.tolist()

#### 5.1. GaussianCopulaSynthesizer

In [None]:
from sdv.evaluation.single_table import run_diagnostic

# Diagnostic report for the GaussianCopula output held in `gc_synthetic_data`,
# checked against `df` using the current `metadata`.
gc_diagnostic = run_diagnostic(df, gc_synthetic_data, metadata)

In [None]:
from sdv.evaluation.single_table import evaluate_quality

# Quality report comparing `df` with the GaussianCopula output.
gc_quality_report = evaluate_quality(
    real_data=df,
    synthetic_data=gc_synthetic_data,
    metadata=metadata,
)

In [None]:
# Column-pair details, showing only rows that have a 'Real Correlation' value.
gc_details = gc_quality_report.get_details('Column Pair Trends')
gc_details.dropna(subset=['Real Correlation'])

In [None]:
# Per-column 'Column Shapes' details from the GaussianCopula quality report.
gc_column_shapes = gc_quality_report.get_details('Column Shapes')
gc_column_shapes

In [None]:
# One real-vs-synthetic plot per column for the GaussianCopula output.
for column in plot_columns:
    gc_plot_kwargs = dict(
        real_data=df,
        synthetic_data=gc_synthetic_data,
        column_name=column,
        metadata=metadata,
    )
    fig = get_column_plot(**gc_plot_kwargs)
    fig.show()

#### 5.2. CTGANSynthesizer

In [None]:
# Diagnostic report for the CTGAN output held in `ctgan_synthetic_data`,
# checked against `df` using the current `metadata`.
ctgan_diagnostic = run_diagnostic(df, ctgan_synthetic_data, metadata)

In [None]:
# Quality report comparing `df` with the CTGAN output.
ctgan_quality_report = evaluate_quality(
    real_data=df,
    synthetic_data=ctgan_synthetic_data,
    metadata=metadata,
)

In [None]:
# Column-pair details, showing only rows that have a 'Real Correlation' value.
ctgan_details = ctgan_quality_report.get_details('Column Pair Trends')
ctgan_details.dropna(subset=['Real Correlation'])

In [None]:
# Per-column 'Column Shapes' details from the CTGAN quality report.
ctgan_column_shapes = ctgan_quality_report.get_details('Column Shapes')
ctgan_column_shapes

In [None]:
# One real-vs-synthetic plot per column for the CTGAN output.
for column in plot_columns:
    ctgan_plot_kwargs = dict(
        real_data=df,
        synthetic_data=ctgan_synthetic_data,
        column_name=column,
        metadata=metadata,
    )
    fig = get_column_plot(**ctgan_plot_kwargs)
    fig.show()

#### 5.3. TVAESynthesizer

In [None]:
# Diagnostic report for the TVAE output held in `tvaes_synthetic_data`,
# checked against `df` using the current `metadata`.
tvaes_diagnostic = run_diagnostic(df, tvaes_synthetic_data, metadata)

In [None]:
# Quality report comparing `df` with the TVAE output.
tvaes_quality_report = evaluate_quality(
    real_data=df,
    synthetic_data=tvaes_synthetic_data,
    metadata=metadata,
)

In [None]:
# Column-pair details, showing only rows that have a 'Real Correlation' value.
tvaes_details = tvaes_quality_report.get_details('Column Pair Trends')
tvaes_details.dropna(subset=['Real Correlation'])

In [None]:
# Per-column 'Column Shapes' details from the TVAE quality report.
tvaes_column_shapes = tvaes_quality_report.get_details('Column Shapes')
tvaes_column_shapes

In [None]:
# One real-vs-synthetic plot per column for the TVAE output.
for column in plot_columns:
    tvaes_plot_kwargs = dict(
        real_data=df,
        synthetic_data=tvaes_synthetic_data,
        column_name=column,
        metadata=metadata,
    )
    fig = get_column_plot(**tvaes_plot_kwargs)
    fig.show()