### Imports

In [1]:
import pandas as pd
import os
import json

In [2]:
from sdv.metadata import Metadata
from sdv.sampling import Condition

from sdv.single_table import GaussianCopulaSynthesizer
from sdv.single_table import CTGANSynthesizer
from sdv.single_table import CopulaGANSynthesizer

In [3]:
# Run-wide settings for the synthesizer cells in this notebook.
# NOTE(review): NUM_ROWS is not referenced by any cell visible here — confirm it is still needed.
NUM_ROWS = 50
# Passed as `batch_size` to every `sample_from_conditions` call.
BATCH_SIZE = 500
# Passed as `epochs` to the CTGAN and CopulaGAN synthesizers.
NUM_EPOCHS = 300
# NOTE(review): NUM_SYNT_DATA is not referenced by any cell visible here — confirm it is still needed.
NUM_SYNT_DATA = 25

#### Cargar customizaciones

In [4]:
# Load the parsed contents of distributions.json into `distributions`.
# JSON text is UTF-8 by specification; passing the encoding explicitly keeps
# the read independent of the platform's default locale encoding.
# NOTE(review): `distributions` is not used by any cell visible here —
# confirm whether it was meant to be passed to a synthesizer.
with open('distributions.json', encoding='utf-8') as f:
    distributions = json.load(f)

In [5]:
# Load the parsed contents of constraints.json into `constraints`; the
# synthesizer cells pass this object to `add_constraints`.
# JSON text is UTF-8 by specification; passing the encoding explicitly keeps
# the read independent of the platform's default locale encoding.
with open('constraints.json', encoding='utf-8') as f:
    constraints = json.load(f)

#### GaussianCopulaSynthesizer

In [6]:
# Fit one GaussianCopulaSynthesizer per training split (set_0 .. set_9) and
# write a class-conditioned synthetic sample for each split.
GC_OUTPUT_DIR = '../data/synthetic/gc'
# DataFrame.to_csv does not create missing parent directories.
os.makedirs(GC_OUTPUT_DIR, exist_ok=True)

# The sdtype overrides are identical for every split, so build them once.
categorical_columns = [
    'ED_2Clases'
]
numerical_columns = [
    'demo-genero',
    'clin-reservaCognitiva_escolaridad'
]

for i in range(10):
    print(f'Generating synthetic data for set {i}')
    df = pd.read_csv(f'../data/train/set_{i}.csv')

    # Auto-detect the schema, then override the columns listed above.
    metadata = Metadata()
    metadata.detect_table_from_dataframe(data=df, table_name='TLP')
    metadata.update_columns(
        column_names=categorical_columns,
        sdtype='categorical',
        table_name='TLP'
    )
    metadata.update_columns(
        column_names=numerical_columns,
        sdtype='numerical',
        computer_representation="Int8",
        table_name='TLP'
    )
    metadata.validate()
    # Remove any previous dump before saving (presumably because
    # save_to_json does not overwrite an existing file — confirm).
    # The file is rewritten on every iteration, so only the last split's
    # metadata remains on disk.
    if os.path.exists('gc_metadata.json'):
        os.remove('gc_metadata.json')
    metadata.save_to_json('gc_metadata.json')

    # Look the counts up by label. value_counts() orders its result by
    # frequency, so positional unpacking would bind the names to the wrong
    # class whenever 'H' is at least as frequent as 'D'.
    class_counts = df['ED_2Clases'].value_counts()
    if set(class_counts.index) != {'D', 'H'}:
        raise ValueError(
            "Expected exactly the classes 'D' and 'H' in 'ED_2Clases' "
            f"for set {i}, found {sorted(map(str, class_counts.index))}"
        )
    num_d = int(class_counts['D'])
    num_h = int(class_counts['H'])

    # The counts are crossed on purpose: 'D' is sampled num_h times and 'H'
    # num_d times.
    # NOTE(review): presumably this balances the classes once the synthetic
    # rows are combined with the real ones — confirm.
    class_d = Condition(
        num_rows=num_h,
        column_values={'ED_2Clases': 'D'}
    )

    class_h = Condition(
        num_rows=num_d,
        column_values={'ED_2Clases': 'H'}
    )

    gc_synthesizer = GaussianCopulaSynthesizer(
        metadata,
        enforce_min_max_values=False,
        enforce_rounding=True,
        default_distribution='gaussian_kde'
    )

    gc_synthesizer.add_constraints(constraints)
    gc_synthesizer.fit(df)

    gc_synthetic_data = gc_synthesizer.sample_from_conditions(
        conditions=[class_d, class_h],
        batch_size = BATCH_SIZE,
        max_tries_per_batch = 100
    )

    gc_synthetic_data.to_csv(
        f'{GC_OUTPUT_DIR}/set_{i}.csv',
        index=False
    )

Generating synthetic data for set 0


Sampling conditions: 100%|██████████| 282/282 [00:01<00:00, 231.00it/s]


Generating synthetic data for set 1


Sampling conditions: 100%|██████████| 282/282 [00:01<00:00, 232.05it/s]


Generating synthetic data for set 2


Sampling conditions: 100%|██████████| 282/282 [00:01<00:00, 234.98it/s]


Generating synthetic data for set 3


Sampling conditions: 100%|██████████| 282/282 [00:01<00:00, 231.78it/s]


Generating synthetic data for set 4


Sampling conditions: 100%|██████████| 283/283 [00:01<00:00, 233.11it/s]


Generating synthetic data for set 5


Sampling conditions: 100%|██████████| 283/283 [00:01<00:00, 232.61it/s]


Generating synthetic data for set 6


Sampling conditions: 100%|██████████| 283/283 [00:01<00:00, 224.32it/s]


Generating synthetic data for set 7


Sampling conditions: 100%|██████████| 283/283 [00:01<00:00, 223.26it/s]


Generating synthetic data for set 8


Sampling conditions: 100%|██████████| 283/283 [00:01<00:00, 237.02it/s]


Generating synthetic data for set 9


Sampling conditions: 100%|██████████| 283/283 [00:01<00:00, 232.62it/s]


#### CTGANSynthesizer

In [7]:
# Fit one CTGANSynthesizer per training split (set_0 .. set_9) and write a
# class-conditioned synthetic sample for each split.
CTGAN_OUTPUT_DIR = '../data/synthetic/ctgan'
# DataFrame.to_csv does not create missing parent directories.
os.makedirs(CTGAN_OUTPUT_DIR, exist_ok=True)

# The sdtype overrides are identical for every split, so build them once.
categorical_columns = [
    'ED_2Clases'
]
numerical_columns = [
    'demo-genero',
    'clin-reservaCognitiva_escolaridad'
]

for i in range(10):
    print(f'Generating synthetic data for set {i}')
    df = pd.read_csv(f'../data/train/set_{i}.csv')

    # Auto-detect the schema, then override the columns listed above.
    metadata = Metadata()
    metadata.detect_table_from_dataframe(data=df, table_name='TLP')
    metadata.update_columns(
        column_names=categorical_columns,
        sdtype='categorical',
        table_name='TLP'
    )
    metadata.update_columns(
        column_names=numerical_columns,
        sdtype='numerical',
        table_name='TLP'
    )
    metadata.validate()
    # Remove any previous dump before saving (presumably because
    # save_to_json does not overwrite an existing file — confirm).
    # The file is rewritten on every iteration, so only the last split's
    # metadata remains on disk.
    if os.path.exists('ctgan_metadata.json'):
        os.remove('ctgan_metadata.json')
    metadata.save_to_json('ctgan_metadata.json')

    # Look the counts up by label. value_counts() orders its result by
    # frequency, so positional unpacking would bind the names to the wrong
    # class whenever 'H' is at least as frequent as 'D'.
    class_counts = df['ED_2Clases'].value_counts()
    if set(class_counts.index) != {'D', 'H'}:
        raise ValueError(
            "Expected exactly the classes 'D' and 'H' in 'ED_2Clases' "
            f"for set {i}, found {sorted(map(str, class_counts.index))}"
        )
    num_d = int(class_counts['D'])
    num_h = int(class_counts['H'])

    # The counts are crossed on purpose: 'D' is sampled num_h times and 'H'
    # num_d times.
    # NOTE(review): presumably this balances the classes once the synthetic
    # rows are combined with the real ones — confirm.
    class_d = Condition(
        num_rows=num_h,
        column_values={'ED_2Clases': 'D'}
    )

    class_h = Condition(
        num_rows=num_d,
        column_values={'ED_2Clases': 'H'}
    )

    ctgan_synthesizer = CTGANSynthesizer(
        metadata,
        enforce_min_max_values=False,
        enforce_rounding=True,
        epochs=NUM_EPOCHS,
        verbose=True,
        cuda=True
    )

    ctgan_synthesizer.add_constraints(constraints)
    ctgan_synthesizer.fit(df)

    ctgan_synthetic_data = ctgan_synthesizer.sample_from_conditions(
        conditions=[class_d, class_h],
        batch_size = BATCH_SIZE,
        max_tries_per_batch = 100
    )
    ctgan_synthetic_data.to_csv(
        f'{CTGAN_OUTPUT_DIR}/set_{i}.csv',
        index=False
    )

Generating synthetic data for set 0


Gen. (-2.35) | Discrim. (0.06): 100%|██████████| 300/300 [00:17<00:00, 17.21it/s] 
Sampling conditions: 100%|██████████| 282/282 [00:00<00:00, 340.96it/s]


Generating synthetic data for set 1


Gen. (-2.22) | Discrim. (-0.21): 100%|██████████| 300/300 [00:17<00:00, 17.26it/s]
Sampling conditions: 100%|██████████| 282/282 [00:00<00:00, 332.75it/s]


Generating synthetic data for set 2


Gen. (-2.73) | Discrim. (0.09): 100%|██████████| 300/300 [00:17<00:00, 17.24it/s] 
Sampling conditions: 100%|██████████| 282/282 [00:00<00:00, 296.80it/s]


Generating synthetic data for set 3


Gen. (-2.18) | Discrim. (0.07): 100%|██████████| 300/300 [00:17<00:00, 17.55it/s] 
Sampling conditions: 100%|██████████| 282/282 [00:00<00:00, 345.36it/s]


Generating synthetic data for set 4


Gen. (-1.72) | Discrim. (-0.26): 100%|██████████| 300/300 [00:17<00:00, 17.29it/s]
Sampling conditions: 100%|██████████| 283/283 [00:00<00:00, 336.15it/s]


Generating synthetic data for set 5


Gen. (-2.48) | Discrim. (0.04): 100%|██████████| 300/300 [00:17<00:00, 17.05it/s] 
Sampling conditions: 100%|██████████| 283/283 [00:00<00:00, 322.81it/s]


Generating synthetic data for set 6


Gen. (-2.70) | Discrim. (-0.17): 100%|██████████| 300/300 [00:16<00:00, 17.76it/s]
Sampling conditions: 100%|██████████| 283/283 [00:00<00:00, 342.47it/s]


Generating synthetic data for set 7


Gen. (-3.11) | Discrim. (0.13): 100%|██████████| 300/300 [00:16<00:00, 18.12it/s] 
Sampling conditions: 100%|██████████| 283/283 [00:00<00:00, 356.30it/s]


Generating synthetic data for set 8


Gen. (-2.42) | Discrim. (-0.02): 100%|██████████| 300/300 [00:16<00:00, 18.22it/s]
Sampling conditions: 100%|██████████| 283/283 [00:00<00:00, 352.33it/s]


Generating synthetic data for set 9


Gen. (-2.78) | Discrim. (-0.03): 100%|██████████| 300/300 [00:16<00:00, 18.32it/s]
Sampling conditions: 100%|██████████| 283/283 [00:00<00:00, 349.82it/s]


#### CopulaGANSynthesizer

In [8]:
# Fit one CopulaGANSynthesizer per training split (set_0 .. set_9) and write
# a class-conditioned synthetic sample for each split.
CGAN_OUTPUT_DIR = '../data/synthetic/cgan'
# DataFrame.to_csv does not create missing parent directories.
os.makedirs(CGAN_OUTPUT_DIR, exist_ok=True)

# The sdtype overrides are identical for every split, so build them once.
categorical_columns = [
    'ED_2Clases'
]
numerical_columns = [
    'demo-genero',
    'clin-reservaCognitiva_escolaridad'
]

for i in range(10):
    print(f'Generating synthetic data for set {i}')
    df = pd.read_csv(f'../data/train/set_{i}.csv')

    # Auto-detect the schema, then override the columns listed above.
    metadata = Metadata()
    metadata.detect_table_from_dataframe(data=df, table_name='TLP')
    metadata.update_columns(
        column_names=categorical_columns,
        sdtype='categorical',
        table_name='TLP'
    )
    metadata.update_columns(
        column_names=numerical_columns,
        sdtype='numerical',
        table_name='TLP'
    )
    metadata.validate()
    # Remove any previous dump before saving (presumably because
    # save_to_json does not overwrite an existing file — confirm).
    # The file is rewritten on every iteration, so only the last split's
    # metadata remains on disk.
    if os.path.exists('cgan_metadata.json'):
        os.remove('cgan_metadata.json')
    metadata.save_to_json('cgan_metadata.json')

    # Look the counts up by label. value_counts() orders its result by
    # frequency, so positional unpacking would bind the names to the wrong
    # class whenever 'H' is at least as frequent as 'D'.
    class_counts = df['ED_2Clases'].value_counts()
    if set(class_counts.index) != {'D', 'H'}:
        raise ValueError(
            "Expected exactly the classes 'D' and 'H' in 'ED_2Clases' "
            f"for set {i}, found {sorted(map(str, class_counts.index))}"
        )
    num_d = int(class_counts['D'])
    num_h = int(class_counts['H'])

    # The counts are crossed on purpose: 'D' is sampled num_h times and 'H'
    # num_d times.
    # NOTE(review): presumably this balances the classes once the synthetic
    # rows are combined with the real ones — confirm.
    class_d = Condition(
        num_rows=num_h,
        column_values={'ED_2Clases': 'D'}
    )

    class_h = Condition(
        num_rows=num_d,
        column_values={'ED_2Clases': 'H'}
    )

    cgan_synthesizer = CopulaGANSynthesizer(
        metadata,
        enforce_min_max_values=False,
        enforce_rounding=True,
        epochs=NUM_EPOCHS,
        verbose=True,
        cuda=True
    )

    cgan_synthesizer.add_constraints(constraints)
    cgan_synthesizer.fit(df)
    cgan_synthetic_data = cgan_synthesizer.sample_from_conditions(
        conditions=[class_d, class_h],
        batch_size = BATCH_SIZE,
        max_tries_per_batch = 100
    )
    cgan_synthetic_data.to_csv(
        f'{CGAN_OUTPUT_DIR}/set_{i}.csv',
        index=False
    )

Generating synthetic data for set 0


Gen. (-2.30) | Discrim. (0.03): 100%|██████████| 300/300 [00:15<00:00, 18.87it/s] 
Sampling conditions: 100%|██████████| 282/282 [00:00<00:00, 343.07it/s]


Generating synthetic data for set 1


Gen. (-2.29) | Discrim. (0.07): 100%|██████████| 300/300 [00:17<00:00, 16.81it/s] 
Sampling conditions: 100%|██████████| 282/282 [00:00<00:00, 333.01it/s]


Generating synthetic data for set 2


Gen. (-2.84) | Discrim. (0.03): 100%|██████████| 300/300 [00:16<00:00, 17.97it/s] 
Sampling conditions: 100%|██████████| 282/282 [00:00<00:00, 353.40it/s]


Generating synthetic data for set 3


Gen. (-2.51) | Discrim. (-0.03): 100%|██████████| 300/300 [00:17<00:00, 17.29it/s]
Sampling conditions: 100%|██████████| 282/282 [00:00<00:00, 348.64it/s]


Generating synthetic data for set 4


Gen. (-2.33) | Discrim. (0.09): 100%|██████████| 300/300 [00:17<00:00, 17.60it/s] 
Sampling conditions: 100%|██████████| 283/283 [00:00<00:00, 323.07it/s]


Generating synthetic data for set 5


Gen. (-2.00) | Discrim. (0.02): 100%|██████████| 300/300 [00:17<00:00, 17.41it/s] 
Sampling conditions: 100%|██████████| 283/283 [00:00<00:00, 361.79it/s]


Generating synthetic data for set 6


Gen. (-2.44) | Discrim. (0.14): 100%|██████████| 300/300 [00:17<00:00, 17.61it/s] 
Sampling conditions: 100%|██████████| 283/283 [00:00<00:00, 345.89it/s]


Generating synthetic data for set 7


Gen. (-2.48) | Discrim. (0.09): 100%|██████████| 300/300 [00:18<00:00, 16.38it/s] 
Sampling conditions: 100%|██████████| 283/283 [00:00<00:00, 334.06it/s]


Generating synthetic data for set 8


Gen. (-2.11) | Discrim. (-0.33): 100%|██████████| 300/300 [00:17<00:00, 16.86it/s]
Sampling conditions: 100%|██████████| 283/283 [00:00<00:00, 310.23it/s]


Generating synthetic data for set 9


Gen. (-1.69) | Discrim. (-0.19): 100%|██████████| 300/300 [00:17<00:00, 16.83it/s]
Sampling conditions: 100%|██████████| 283/283 [00:00<00:00, 316.80it/s]
