### Imports

In [11]:
import pandas as pd
import os
import json

In [12]:
from sdv.metadata import Metadata
from sdv.sampling import Condition

from sdv.single_table import GaussianCopulaSynthesizer
from sdv.single_table import CTGANSynthesizer
from sdv.single_table import TVAESynthesizer
from sdv.single_table import CopulaGANSynthesizer

In [13]:
# Run configuration shared by the synthesizer cells in this notebook.
NUM_SYNT_DATA = 100  # baseline number of rows requested per class condition
NUM_EPOCHS = 5000    # training epochs passed to the neural synthesizers
NUM_ROWS = 50        # NOTE(review): not referenced by any visible cell - confirm it is still needed

#### Cargar customizaciones

In [14]:
# Load the contents of distributions.json into `distributions`.
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# locale default (JSON interchange files are UTF-8).
# NOTE(review): presumably per-column numerical distributions for the Gaussian
# copula model; no visible cell currently passes this object to a synthesizer.
with open('distributions.json', encoding='utf-8') as f:
    distributions = json.load(f)

In [15]:
# Load the constraint definitions that the synthesizer cells pass to
# `add_constraints(...)`.
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# locale default (JSON interchange files are UTF-8).
with open('constraints.json', encoding='utf-8') as f:
    constraints = json.load(f)

#### GaussianCopulaSynthesizer

In [17]:
# GaussianCopula: for each of the ten training splits, fit a synthesizer and
# write a class-conditioned synthetic table to ../data/synthetic/gc/.

# Loop-invariant column configuration, hoisted out of the loop.
categorical_columns = ['ED_2Clases']
numerical_columns = [
    'demo-genero',
    'clin-reservaCognitiva_escolaridad'
]

# Make sure the output directory exists before the first (slow) fit starts.
gc_output_dir = '../data/synthetic/gc'
os.makedirs(gc_output_dir, exist_ok=True)

for i in range(10):
    print(f'Generating synthetic data for set {i}')
    df = pd.read_csv(f'../data/train/set_{i}.csv')

    # Auto-detect the schema, then override the sdtypes we know better.
    metadata = Metadata()
    metadata.detect_table_from_dataframe(data=df, table_name='TLP')
    metadata.update_columns(
        column_names=categorical_columns,
        sdtype='categorical',
        table_name='TLP'
    )
    metadata.update_columns(
        column_names=numerical_columns,
        sdtype='numerical',
        computer_representation="Int8",
        table_name='TLP'
    )
    metadata.validate()

    # Any previous metadata file is removed before saving, so only the
    # metadata of the last processed split remains on disk.
    if os.path.exists('gc_metadata.json'):
        os.remove('gc_metadata.json')
    metadata.save_to_json('gc_metadata.json')

    # Look the class counts up by label. `value_counts()` orders its result by
    # frequency, so positional unpacking would silently swap the two counts
    # whenever 'H' happens to be the majority class.
    class_counts = df['ED_2Clases'].value_counts()
    num_d = int(class_counts.get('D', 0))
    num_h = int(class_counts.get('H', 0))
    if num_d == 0 or num_h == 0:
        raise ValueError(
            f"set_{i}.csv must contain both 'D' and 'H' in ED_2Clases "
            f"(found D={num_d}, H={num_h})"
        )

    # Each class is sampled with the *other* class's row count, so the
    # synthetic table has the training set's class sizes with labels swapped.
    class_d = Condition(
        num_rows=num_h,
        column_values={'ED_2Clases': 'D'}
    )
    class_h = Condition(
        num_rows=num_d,
        column_values={'ED_2Clases': 'H'}
    )

    # The `distributions` loaded earlier are intentionally not passed here;
    # every column uses the default 'gaussian_kde' distribution.
    gc_synthesizer = GaussianCopulaSynthesizer(
        metadata,
        enforce_min_max_values=False,
        enforce_rounding=True,
        default_distribution='gaussian_kde'
    )

    gc_synthesizer.add_constraints(constraints)
    gc_synthesizer.fit(df)

    gc_synthetic_data = gc_synthesizer.sample_from_conditions(
        conditions=[class_d, class_h],
        batch_size=50,
        max_tries_per_batch=100
    )

    # Only the synthetic rows are saved (the training rows are not appended).
    gc_synthetic_data.to_csv(
        f'{gc_output_dir}/set_{i}.csv',
        index=False
    )

Generating synthetic data for set 0


Sampling conditions: : 350it [00:02, 140.36it/s]                       


Generating synthetic data for set 1


Sampling conditions: : 350it [00:02, 143.89it/s]                       


Generating synthetic data for set 2


Sampling conditions: : 350it [00:02, 146.40it/s]                       


Generating synthetic data for set 3


Sampling conditions: : 350it [00:02, 147.64it/s]                       


Generating synthetic data for set 4


Sampling conditions: : 350it [00:02, 146.02it/s]                       


Generating synthetic data for set 5


Sampling conditions: : 350it [00:02, 139.93it/s]                       


Generating synthetic data for set 6


Sampling conditions: : 350it [00:02, 145.68it/s]                       


Generating synthetic data for set 7


Sampling conditions: : 350it [00:02, 146.26it/s]                       


Generating synthetic data for set 8


Sampling conditions: : 350it [00:02, 144.07it/s]                       


Generating synthetic data for set 9


Sampling conditions: : 350it [00:02, 145.78it/s]                       


#### CTGANSynthesizer

In [15]:
# CTGAN: for each of the ten training splits, fit a synthesizer, sample
# class-conditioned rows, append them to the training rows and write the
# result to ../data/synthetic/ctgan/.

# Loop-invariant column configuration, hoisted out of the loop.
categorical_columns = ['ED_2Clases']
numerical_columns = [
    'demo-genero',
    'clin-reservaCognitiva_escolaridad'
]

# Make sure the output directory exists before the first (slow) fit starts.
ctgan_output_dir = '../data/synthetic/ctgan'
os.makedirs(ctgan_output_dir, exist_ok=True)

for i in range(10):
    print(f'Generating synthetic data for set {i}')
    df = pd.read_csv(f'../data/train/set_{i}.csv')

    # Auto-detect the schema, then override the sdtypes we know better.
    metadata = Metadata()
    metadata.detect_table_from_dataframe(data=df, table_name='TLP')
    metadata.update_columns(
        column_names=categorical_columns,
        sdtype='categorical',
        table_name='TLP'
    )
    metadata.update_columns(
        column_names=numerical_columns,
        sdtype='numerical',
        table_name='TLP'
    )
    metadata.validate()

    # Any previous metadata file is removed before saving, so only the
    # metadata of the last processed split remains on disk.
    if os.path.exists('ctgan_metadata.json'):
        os.remove('ctgan_metadata.json')
    metadata.save_to_json('ctgan_metadata.json')

    # Look the class counts up by label. `value_counts()` orders its result by
    # frequency, so positional unpacking would silently swap the two counts
    # whenever 'H' happens to be the majority class.
    class_counts = df['ED_2Clases'].value_counts()
    num_d = int(class_counts.get('D', 0))
    num_h = int(class_counts.get('H', 0))
    if num_d == 0 or num_h == 0:
        raise ValueError(
            f"set_{i}.csv must contain both 'D' and 'H' in ED_2Clases "
            f"(found D={num_d}, H={num_h})"
        )

    # Positive when 'D' is the majority class. The minority class receives the
    # difference on top of the NUM_SYNT_DATA baseline, so that after the
    # synthetic rows are appended to `df` both classes have the same size.
    num_synthetic_data = num_d - num_h

    class_d = Condition(
        num_rows=NUM_SYNT_DATA + max(-num_synthetic_data, 0),
        column_values={'ED_2Clases': 'D'}
    )
    class_h = Condition(
        num_rows=NUM_SYNT_DATA + max(num_synthetic_data, 0),
        column_values={'ED_2Clases': 'H'}
    )

    ctgan_synthesizer = CTGANSynthesizer(
        metadata,
        enforce_min_max_values=False,
        enforce_rounding=True,
        epochs=NUM_EPOCHS,
        verbose=True,
        cuda=True
    )

    ctgan_synthesizer.add_constraints(constraints)
    ctgan_synthesizer.fit(df)

    ctgan_synthetic_data = ctgan_synthesizer.sample_from_conditions(
        conditions=[class_d, class_h],
        batch_size=50,
        max_tries_per_batch=100
    )

    # The saved table is the training rows followed by the synthetic rows.
    ctgan_synthetic_data = pd.concat([df, ctgan_synthetic_data], ignore_index=True)
    ctgan_synthetic_data.to_csv(
        f'{ctgan_output_dir}/set_{i}.csv',
        index=False
    )

Generating synthetic data for set 0


Gen. (-0.55) | Discrim. (-0.18): 100%|██████████| 5000/5000 [04:48<00:00, 17.36it/s]
Sampling conditions: : 400it [00:03, 117.49it/s]                       


Generating synthetic data for set 1


Gen. (-0.98) | Discrim. (0.28): 100%|██████████| 5000/5000 [04:47<00:00, 17.41it/s] 
Sampling conditions: : 400it [00:03, 114.39it/s]                       


Generating synthetic data for set 2


Gen. (-1.48) | Discrim. (-0.17): 100%|██████████| 5000/5000 [04:50<00:00, 17.20it/s]
Sampling conditions: : 400it [00:03, 115.38it/s]                       


Generating synthetic data for set 3


Gen. (-0.56) | Discrim. (-0.21): 100%|██████████| 5000/5000 [04:47<00:00, 17.41it/s]
Sampling conditions: : 400it [00:03, 116.47it/s]                       


Generating synthetic data for set 4


Gen. (-1.40) | Discrim. (0.08): 100%|██████████| 5000/5000 [04:49<00:00, 17.28it/s] 
Sampling conditions: : 400it [00:03, 116.87it/s]                       


Generating synthetic data for set 5


Gen. (-1.45) | Discrim. (-0.10): 100%|██████████| 5000/5000 [04:40<00:00, 17.82it/s]
Sampling conditions: : 400it [00:03, 116.42it/s]                       


Generating synthetic data for set 6


Gen. (-1.04) | Discrim. (-0.05): 100%|██████████| 5000/5000 [04:48<00:00, 17.32it/s]
Sampling conditions: : 400it [00:03, 116.35it/s]                       


Generating synthetic data for set 7


Gen. (-1.41) | Discrim. (-0.03): 100%|██████████| 5000/5000 [04:46<00:00, 17.46it/s]
Sampling conditions: : 400it [00:03, 115.29it/s]                       


Generating synthetic data for set 8


Gen. (-0.85) | Discrim. (-0.02): 100%|██████████| 5000/5000 [04:50<00:00, 17.18it/s]
Sampling conditions: : 400it [00:03, 116.61it/s]                       


Generating synthetic data for set 9


Gen. (-0.78) | Discrim. (-0.21): 100%|██████████| 5000/5000 [04:50<00:00, 17.21it/s]
Sampling conditions: : 400it [00:03, 116.50it/s]                       


#### TVAESynthesizer

In [17]:
# TVAE: for each of the ten training splits, fit a synthesizer, sample
# class-conditioned rows, append them to the training rows and write the
# result to ../data/synthetic/tvaes/.

# Loop-invariant column configuration, hoisted out of the loop.
categorical_columns = ['ED_2Clases']
numerical_columns = [
    'demo-genero',
    'clin-reservaCognitiva_escolaridad'
]

# Make sure the output directory exists before the first (slow) fit starts.
tvaes_output_dir = '../data/synthetic/tvaes'
os.makedirs(tvaes_output_dir, exist_ok=True)

for i in range(10):
    print(f'Generating synthetic data for set {i}')
    df = pd.read_csv(f'../data/train/set_{i}.csv')

    # Auto-detect the schema, then override the sdtypes we know better.
    metadata = Metadata()
    metadata.detect_table_from_dataframe(data=df, table_name='TLP')
    metadata.update_columns(
        column_names=categorical_columns,
        sdtype='categorical',
        table_name='TLP'
    )
    metadata.update_columns(
        column_names=numerical_columns,
        sdtype='numerical',
        table_name='TLP'
    )
    metadata.validate()

    # Any previous metadata file is removed before saving, so only the
    # metadata of the last processed split remains on disk.
    if os.path.exists('tvaes_metadata.json'):
        os.remove('tvaes_metadata.json')
    metadata.save_to_json('tvaes_metadata.json')

    # Look the class counts up by label. `value_counts()` orders its result by
    # frequency, so positional unpacking would silently swap the two counts
    # whenever 'H' happens to be the majority class.
    class_counts = df['ED_2Clases'].value_counts()
    num_d = int(class_counts.get('D', 0))
    num_h = int(class_counts.get('H', 0))
    if num_d == 0 or num_h == 0:
        raise ValueError(
            f"set_{i}.csv must contain both 'D' and 'H' in ED_2Clases "
            f"(found D={num_d}, H={num_h})"
        )

    # Positive when 'D' is the majority class. The minority class receives the
    # difference on top of the NUM_SYNT_DATA baseline, so that after the
    # synthetic rows are appended to `df` both classes have the same size.
    num_synthetic_data = num_d - num_h

    class_d = Condition(
        num_rows=NUM_SYNT_DATA + max(-num_synthetic_data, 0),
        column_values={'ED_2Clases': 'D'}
    )
    class_h = Condition(
        num_rows=NUM_SYNT_DATA + max(num_synthetic_data, 0),
        column_values={'ED_2Clases': 'H'}
    )

    tvaes_synthesizer = TVAESynthesizer(
        metadata,
        enforce_min_max_values=False,
        enforce_rounding=True,
        epochs=NUM_EPOCHS,
        verbose=True,
        cuda=True
    )

    tvaes_synthesizer.add_constraints(constraints)
    tvaes_synthesizer.fit(df)

    tvaes_synthetic_data = tvaes_synthesizer.sample_from_conditions(
        conditions=[class_d, class_h],
        batch_size=50,
        max_tries_per_batch=100
    )

    # The saved table is the training rows followed by the synthetic rows.
    tvaes_synthetic_data = pd.concat([df, tvaes_synthetic_data], ignore_index=True)
    tvaes_synthetic_data.to_csv(
        f'{tvaes_output_dir}/set_{i}.csv',
        index=False
    )

Generating synthetic data for set 0


Loss: -107.670: 100%|██████████| 5000/5000 [01:46<00:00, 47.01it/s]
Sampling conditions: : 400it [00:03, 118.84it/s]                       


Generating synthetic data for set 1


Loss: -113.139: 100%|██████████| 5000/5000 [01:42<00:00, 48.57it/s]
Sampling conditions: : 400it [00:03, 127.69it/s]                       


Generating synthetic data for set 2


Loss: -113.680: 100%|██████████| 5000/5000 [01:44<00:00, 47.62it/s]
Sampling conditions: : 400it [00:02, 142.17it/s]                       


Generating synthetic data for set 3


Loss: -116.593: 100%|██████████| 5000/5000 [01:45<00:00, 47.61it/s]
Sampling conditions: : 400it [00:03, 122.44it/s]                       


Generating synthetic data for set 4


Loss: -110.048: 100%|██████████| 5000/5000 [01:44<00:00, 48.04it/s]
Sampling conditions: : 400it [00:02, 147.56it/s]                       


Generating synthetic data for set 5


Loss: -119.541: 100%|██████████| 5000/5000 [01:44<00:00, 47.75it/s]
Sampling conditions: : 400it [00:03, 119.51it/s]                       


Generating synthetic data for set 6


Loss: -115.175: 100%|██████████| 5000/5000 [01:45<00:00, 47.53it/s]
Sampling conditions: : 400it [00:03, 107.39it/s]                       


Generating synthetic data for set 7


Loss: -113.962: 100%|██████████| 5000/5000 [01:44<00:00, 47.94it/s]
Sampling conditions: : 400it [00:03, 115.60it/s]                       


Generating synthetic data for set 8


Loss: -110.293: 100%|██████████| 5000/5000 [01:44<00:00, 47.77it/s]
Sampling conditions: : 400it [00:02, 138.61it/s]                       


Generating synthetic data for set 9


Loss: -113.074: 100%|██████████| 5000/5000 [01:46<00:00, 46.83it/s]
Sampling conditions: : 400it [00:03, 115.93it/s]                       


#### CopulaGANSynthesizer

In [None]:
# CopulaGAN: for each of the ten training splits, fit a synthesizer, sample
# class-conditioned rows, append them to the training rows and write the
# result to ../data/synthetic/cg/.
#
# Paths use the same '../data/' root as the other synthesizer cells in this
# notebook; this cell previously pointed at '../../data/', which does not
# match the location the executed cells read from.

# Loop-invariant column configuration, hoisted out of the loop.
categorical_columns = ['ED_2Clases', 'clin-frecUsoEmail']

# Make sure the output directory exists before the first (slow) fit starts.
cg_output_dir = '../data/synthetic/cg'
os.makedirs(cg_output_dir, exist_ok=True)

for i in range(10):
    print(f'Generating synthetic data for set {i}')
    df = pd.read_csv(f'../data/train/set_{i}.csv')

    # Auto-detect the schema, then mark the known categorical columns.
    metadata = Metadata()
    metadata.detect_table_from_dataframe(data=df, table_name='TLP')
    metadata.update_columns(
        column_names=categorical_columns,
        sdtype='categorical',
        table_name='TLP'
    )
    metadata.validate()

    # Look the class counts up by label. `value_counts()` orders its result by
    # frequency, so positional unpacking would silently swap the two counts
    # whenever 'H' happens to be the majority class.
    class_counts = df['ED_2Clases'].value_counts()
    num_d = int(class_counts.get('D', 0))
    num_h = int(class_counts.get('H', 0))
    if num_d == 0 or num_h == 0:
        raise ValueError(
            f"set_{i}.csv must contain both 'D' and 'H' in ED_2Clases "
            f"(found D={num_d}, H={num_h})"
        )

    # Positive when 'D' is the majority class. The minority class receives the
    # difference on top of the NUM_SYNT_DATA baseline, so that after the
    # synthetic rows are appended to `df` both classes have the same size.
    num_synthetic_data = num_d - num_h

    class_d = Condition(
        num_rows=NUM_SYNT_DATA + max(-num_synthetic_data, 0),
        column_values={'ED_2Clases': 'D'}
    )
    class_h = Condition(
        num_rows=NUM_SYNT_DATA + max(num_synthetic_data, 0),
        column_values={'ED_2Clases': 'H'}
    )

    cg_synthesizer = CopulaGANSynthesizer(
        metadata,
        enforce_min_max_values=True,
        enforce_rounding=True,
        epochs=NUM_EPOCHS,
        verbose=True,
        cuda=True
    )

    # Fit in explicit steps: assign transformers, preprocess, then train on
    # the preprocessed table.
    cg_synthesizer.auto_assign_transformers(df)
    processed_df = cg_synthesizer.preprocess(df)
    cg_synthesizer.fit_processed_data(processed_df)

    cg_synthetic_data = cg_synthesizer.sample_from_conditions(
        conditions=[class_d, class_h],
        batch_size=50,
        max_tries_per_batch=100
    )

    # The saved table is the training rows followed by the synthetic rows.
    cg_synthetic_data = pd.concat([df, cg_synthetic_data], ignore_index=True)
    cg_synthetic_data.to_csv(
        f'{cg_output_dir}/set_{i}.csv',
        index=False
    )