### Imports

In [1]:
import pandas as pd
import os
import json

In [2]:
from sdv.metadata import Metadata
from sdv.sampling import Condition

from sdv.single_table import GaussianCopulaSynthesizer
from sdv.single_table import CTGANSynthesizer
from sdv.single_table import TVAESynthesizer

In [3]:
# Not referenced by any cell visible in this part of the notebook.
# TODO(review): confirm it is still needed.
NUM_ROWS = 50
# Number of training epochs passed to CTGANSynthesizer and TVAESynthesizer.
NUM_EPOCHS = 5_000
# Base number of synthetic rows requested for each class in the CTGAN and TVAE
# cells; those cells add the class-count difference on top of it.
NUM_SYNT_DATA = 25

#### Cargar personalizaciones (distribuciones y restricciones)

In [4]:
# Load the contents of distributions.json.
# Presumably this is the `numerical_distributions` mapping for
# GaussianCopulaSynthesizer (that argument is currently commented out there).
# JSON is UTF-8 by specification, so the encoding is pinned rather than left to
# the platform default (which is not UTF-8 on every OS).
with open('distributions.json', encoding='utf-8') as f:
    distributions = json.load(f)

In [5]:
# Load the constraint definitions that every synthesizer below receives through
# `add_constraints(constraints)`.
# JSON is UTF-8 by specification, so the encoding is pinned rather than left to
# the platform default (which is not UTF-8 on every OS).
with open('constraints.json', encoding='utf-8') as f:
    constraints = json.load(f)

#### GaussianCopulaSynthesizer

In [6]:
# Fit a GaussianCopulaSynthesizer on each of the 10 training splits and write a
# class-conditioned synthetic sample for every split.
gc_output_dir = '../data/synthetic/gc'
# DataFrame.to_csv does not create missing folders, so make sure it exists.
os.makedirs(gc_output_dir, exist_ok=True)

for i in range(10):
    print(f'Generating synthetic data for set {i}')
    df = pd.read_csv(f'../data/train/set_{i}.csv')

    # Start from auto-detected metadata, then override the column types below.
    metadata = Metadata()
    metadata.detect_table_from_dataframe(data=df, table_name='TLP')
    categorical_columns = [
        'ED_2Clases'
    ]
    metadata.update_columns(
        column_names=categorical_columns,
        sdtype='categorical',
        table_name='TLP'
    )
    numerical_columns = [
        'demo-genero',
        'clin-reservaCognitiva_escolaridad'
    ]
    metadata.update_columns(
        column_names=numerical_columns,
        sdtype='numerical',
        computer_representation="Int8",
        table_name='TLP'
    )
    metadata.validate()
    # The same file name is reused on every iteration, so only the metadata of
    # the last split remains on disk. The previous copy is removed first
    # (presumably save_to_json will not overwrite an existing file).
    if os.path.exists('gc_metadata.json'):
        os.remove('gc_metadata.json')
    metadata.save_to_json('gc_metadata.json')

    # value_counts() orders its result by frequency, not by label, so positional
    # unpacking would swap the two counts whenever 'H' is the larger class.
    # Look each class up by name; a missing class raises KeyError.
    class_counts = df['ED_2Clases'].value_counts()
    num_d = int(class_counts['D'])
    num_h = int(class_counts['H'])

    # Each class is requested with the *other* class's real count.
    class_d = Condition(
        num_rows=num_h,
        column_values={'ED_2Clases': 'D'}
    )

    class_h = Condition(
        num_rows=num_d,
        column_values={'ED_2Clases': 'H'}
    )

    # NOTE: `numerical_distributions=distributions` is currently disabled, so
    # every column uses the default distribution given here.
    gc_synthesizer = GaussianCopulaSynthesizer(
        metadata,
        enforce_min_max_values=False,
        enforce_rounding=True,
        default_distribution='gaussian_kde'
    )

    gc_synthesizer.add_constraints(constraints)
    gc_synthesizer.fit(df)

    gc_synthetic_data = gc_synthesizer.sample_from_conditions(
        conditions=[class_d, class_h],
        batch_size=50,
        max_tries_per_batch=100
    )

    # Only the synthetic rows are written; the real rows in `df` are not included.
    gc_synthetic_data.to_csv(
        f'{gc_output_dir}/set_{i}.csv',
        index=False
    )

Generating synthetic data for set 0


Sampling conditions: : 350it [00:02, 143.87it/s]                       


Generating synthetic data for set 1


Sampling conditions: : 350it [00:02, 142.71it/s]                       


Generating synthetic data for set 2


Sampling conditions: : 350it [00:02, 141.84it/s]                       


Generating synthetic data for set 3


Sampling conditions: : 350it [00:02, 144.41it/s]                       


Generating synthetic data for set 4


Sampling conditions: : 350it [00:02, 144.58it/s]                       


Generating synthetic data for set 5


Sampling conditions: : 350it [00:02, 143.93it/s]                       


Generating synthetic data for set 6


Sampling conditions: : 350it [00:02, 141.09it/s]                       


Generating synthetic data for set 7


Sampling conditions: : 350it [00:02, 145.06it/s]                       


Generating synthetic data for set 8


Sampling conditions: : 350it [00:02, 140.02it/s]                       


Generating synthetic data for set 9


Sampling conditions: : 350it [00:02, 143.82it/s]                       


#### CTGANSynthesizer

In [7]:
# Train a CTGANSynthesizer on each of the 10 training splits, sample extra rows
# per class, and save the real rows plus the synthetic rows for every split.
ctgan_output_dir = '../data/synthetic/ctgan'
# DataFrame.to_csv does not create missing folders, so make sure it exists.
os.makedirs(ctgan_output_dir, exist_ok=True)

for i in range(10):
    print(f'Generating synthetic data for set {i}')
    df = pd.read_csv(f'../data/train/set_{i}.csv')

    # Start from auto-detected metadata, then override the column types below.
    metadata = Metadata()
    metadata.detect_table_from_dataframe(data=df, table_name='TLP')
    categorical_columns = [
        'ED_2Clases'
    ]
    metadata.update_columns(
        column_names=categorical_columns,
        sdtype='categorical',
        table_name='TLP'
    )
    numerical_columns = [
        'demo-genero',
        'clin-reservaCognitiva_escolaridad'
    ]
    metadata.update_columns(
        column_names=numerical_columns,
        sdtype='numerical',
        table_name='TLP'
    )
    metadata.validate()
    # The same file name is reused on every iteration, so only the metadata of
    # the last split remains on disk. The previous copy is removed first
    # (presumably save_to_json will not overwrite an existing file).
    if os.path.exists('ctgan_metadata.json'):
        os.remove('ctgan_metadata.json')
    metadata.save_to_json('ctgan_metadata.json')

    # value_counts() orders its result by frequency, not by label, so positional
    # unpacking would swap the two counts whenever 'H' is the larger class.
    # Look each class up by name; a missing class raises KeyError.
    class_counts = df['ED_2Clases'].value_counts()
    num_d = int(class_counts['D'])
    num_h = int(class_counts['H'])
    # Signed imbalance: positive when 'D' has more real rows than 'H'.
    num_synthetic_data = num_d - num_h

    # Both classes get NUM_SYNT_DATA rows; the smaller class additionally gets
    # the difference, so real + synthetic rows end up with equal class sizes.
    # max(..., 0) keeps num_rows from going negative when 'H' is the larger class.
    class_d = Condition(
        num_rows=NUM_SYNT_DATA + max(-num_synthetic_data, 0),
        column_values={'ED_2Clases': 'D'}
    )

    class_h = Condition(
        num_rows=NUM_SYNT_DATA + max(num_synthetic_data, 0),
        column_values={'ED_2Clases': 'H'}
    )

    ctgan_synthesizer = CTGANSynthesizer(
        metadata,
        enforce_min_max_values=False,
        enforce_rounding=True,
        epochs=NUM_EPOCHS,
        verbose=True,
        cuda=True
    )

    ctgan_synthesizer.add_constraints(constraints)
    ctgan_synthesizer.fit(df)

    ctgan_synthetic_data = ctgan_synthesizer.sample_from_conditions(
        conditions=[class_d, class_h],
        batch_size=50,
        max_tries_per_batch=100
    )
    # The saved file holds the real rows followed by the synthetic ones.
    ctgan_synthetic_data = pd.concat([df, ctgan_synthetic_data], ignore_index=True)
    ctgan_synthetic_data.to_csv(
        f'{ctgan_output_dir}/set_{i}.csv',
        index=False
    )

Generating synthetic data for set 0


Gen. (0.08) | Discrim. (-0.16): 100%|██████████| 5000/5000 [05:03<00:00, 16.48it/s] 
Sampling conditions: : 225it [00:02, 90.25it/s]                        


Generating synthetic data for set 1


Gen. (-0.64) | Discrim. (0.02): 100%|██████████| 5000/5000 [04:54<00:00, 16.99it/s] 
Sampling conditions: : 225it [00:02, 98.62it/s]                        


Generating synthetic data for set 2


Gen. (-1.18) | Discrim. (-0.00): 100%|██████████| 5000/5000 [04:55<00:00, 16.92it/s]
Sampling conditions: : 225it [00:02, 97.22it/s]                        


Generating synthetic data for set 3


Gen. (-0.84) | Discrim. (-0.09): 100%|██████████| 5000/5000 [04:51<00:00, 17.18it/s]
Sampling conditions: : 225it [00:02, 98.54it/s]                        


Generating synthetic data for set 4


Gen. (-1.57) | Discrim. (0.25): 100%|██████████| 5000/5000 [04:54<00:00, 16.98it/s] 
Sampling conditions: : 225it [00:02, 96.59it/s]                        


Generating synthetic data for set 5


Gen. (-2.01) | Discrim. (-0.09): 100%|██████████| 5000/5000 [04:46<00:00, 17.42it/s]
Sampling conditions: : 225it [00:02, 98.74it/s]                        


Generating synthetic data for set 6


Gen. (-0.98) | Discrim. (0.08): 100%|██████████| 5000/5000 [04:54<00:00, 17.00it/s] 
Sampling conditions: : 225it [00:02, 98.32it/s]                        


Generating synthetic data for set 7


Gen. (-1.86) | Discrim. (-0.21): 100%|██████████| 5000/5000 [04:50<00:00, 17.22it/s]
Sampling conditions: : 225it [00:02, 95.98it/s]                        


Generating synthetic data for set 8


Gen. (-0.88) | Discrim. (0.02): 100%|██████████| 5000/5000 [04:56<00:00, 16.85it/s] 
Sampling conditions: : 225it [00:02, 98.01it/s]                        


Generating synthetic data for set 9


Gen. (-1.01) | Discrim. (0.12): 100%|██████████| 5000/5000 [04:54<00:00, 17.00it/s] 
Sampling conditions: : 225it [00:02, 96.46it/s]                        


#### TVAESynthesizer

In [8]:
# Train a TVAESynthesizer on each of the 10 training splits, sample extra rows
# per class, and save the real rows plus the synthetic rows for every split.
tvaes_output_dir = '../data/synthetic/tvaes'
# DataFrame.to_csv does not create missing folders, so make sure it exists.
os.makedirs(tvaes_output_dir, exist_ok=True)

for i in range(10):
    print(f'Generating synthetic data for set {i}')
    df = pd.read_csv(f'../data/train/set_{i}.csv')

    # Start from auto-detected metadata, then override the column types below.
    metadata = Metadata()
    metadata.detect_table_from_dataframe(data=df, table_name='TLP')
    categorical_columns = [
        'ED_2Clases'
    ]
    metadata.update_columns(
        column_names=categorical_columns,
        sdtype='categorical',
        table_name='TLP'
    )
    numerical_columns = [
        'demo-genero',
        'clin-reservaCognitiva_escolaridad'
    ]
    metadata.update_columns(
        column_names=numerical_columns,
        sdtype='numerical',
        table_name='TLP'
    )
    metadata.validate()
    # The same file name is reused on every iteration, so only the metadata of
    # the last split remains on disk. The previous copy is removed first
    # (presumably save_to_json will not overwrite an existing file).
    if os.path.exists('tvaes_metadata.json'):
        os.remove('tvaes_metadata.json')
    metadata.save_to_json('tvaes_metadata.json')

    # value_counts() orders its result by frequency, not by label, so positional
    # unpacking would swap the two counts whenever 'H' is the larger class.
    # Look each class up by name; a missing class raises KeyError.
    class_counts = df['ED_2Clases'].value_counts()
    num_d = int(class_counts['D'])
    num_h = int(class_counts['H'])
    # Signed imbalance: positive when 'D' has more real rows than 'H'.
    num_synthetic_data = num_d - num_h

    # Both classes get NUM_SYNT_DATA rows; the smaller class additionally gets
    # the difference, so real + synthetic rows end up with equal class sizes.
    # max(..., 0) keeps num_rows from going negative when 'H' is the larger class.
    class_d = Condition(
        num_rows=NUM_SYNT_DATA + max(-num_synthetic_data, 0),
        column_values={'ED_2Clases': 'D'}
    )

    class_h = Condition(
        num_rows=NUM_SYNT_DATA + max(num_synthetic_data, 0),
        column_values={'ED_2Clases': 'H'}
    )

    tvaes_synthesizer = TVAESynthesizer(
        metadata,
        enforce_min_max_values=False,
        enforce_rounding=True,
        epochs=NUM_EPOCHS,
        verbose=True,
        cuda=True
    )

    tvaes_synthesizer.add_constraints(constraints)
    tvaes_synthesizer.fit(df)
    tvaes_synthetic_data = tvaes_synthesizer.sample_from_conditions(
        conditions=[class_d, class_h],
        batch_size=50,
        max_tries_per_batch=100
    )
    # The saved file holds the real rows followed by the synthetic ones.
    tvaes_synthetic_data = pd.concat([df, tvaes_synthetic_data], ignore_index=True)
    tvaes_synthetic_data.to_csv(
        f'{tvaes_output_dir}/set_{i}.csv',
        index=False
    )

Generating synthetic data for set 0


Loss: -113.824: 100%|██████████| 5000/5000 [01:40<00:00, 49.79it/s]
Sampling conditions: : 225it [00:01, 131.23it/s]                       


Generating synthetic data for set 1


Loss: -113.016: 100%|██████████| 5000/5000 [01:39<00:00, 50.40it/s]
Sampling conditions: : 225it [00:02, 103.82it/s]                       


Generating synthetic data for set 2


Loss: -114.027: 100%|██████████| 5000/5000 [01:40<00:00, 49.63it/s]
Sampling conditions: : 225it [00:02, 103.98it/s]                       


Generating synthetic data for set 3


Loss: -119.794: 100%|██████████| 5000/5000 [01:39<00:00, 50.05it/s]
Sampling conditions: : 225it [00:02, 106.21it/s]                       


Generating synthetic data for set 4


Loss: -117.437: 100%|██████████| 5000/5000 [01:41<00:00, 49.13it/s]
Sampling conditions: : 225it [00:01, 113.14it/s]                       


Generating synthetic data for set 5


Loss: -115.979: 100%|██████████| 5000/5000 [01:39<00:00, 50.06it/s]
Sampling conditions: : 225it [00:02, 98.21it/s]                        


Generating synthetic data for set 6


Loss: -115.674: 100%|██████████| 5000/5000 [01:40<00:00, 49.80it/s]
Sampling conditions: : 225it [00:01, 125.00it/s]                       


Generating synthetic data for set 7


Loss: -109.966: 100%|██████████| 5000/5000 [01:38<00:00, 50.65it/s]
Sampling conditions: : 225it [00:01, 125.00it/s]                       


Generating synthetic data for set 8


Loss: -116.295: 100%|██████████| 5000/5000 [01:39<00:00, 50.28it/s]
Sampling conditions: : 225it [00:01, 123.85it/s]                       


Generating synthetic data for set 9


Loss: -116.255: 100%|██████████| 5000/5000 [01:42<00:00, 48.56it/s]
Sampling conditions: : 225it [00:01, 127.47it/s]                       
