### 1. Imports

In [1]:
import pandas as pd

In [2]:
# Lift pandas' display limits (columns, rows, cell width) so notebook output
# is never truncated; None means "no limit" for each of these options.
for _display_option in (
    'display.max_columns',
    'display.max_rows',
    'display.max_colwidth',
):
    pd.set_option(_display_option, None)

In [3]:
# Number of rows requested from a synthesizer on each sampling call.
NUM_ROWS = 50
# Number of training epochs passed to the CTGAN and TVAE synthesizers.
NUM_EPOCHS = 10000

### 2. Load data

In [4]:
# Load the source workbook into a DataFrame; the path is relative to the
# directory the notebook is executed from.
df = pd.read_excel(
    '../../../data/tlp/Identia_UNED_TLP.xlsx'
)


In [None]:
df.head()

### 3. Preprocessing

In [6]:
def transfor_ed(row):
    """Collapse the expert-diagnosis label of a row into a two-class label.

    Args:
        row: Any mapping-like object (e.g. a DataFrame row) that can be
            indexed with the key 'etiq-diagExpTLP_R2'.

    Returns:
        'H' when the label is 'ED1', 'D' when it is 'ED2' or 'ED3', and
        None for any other value.
    """
    label = row['etiq-diagExpTLP_R2']
    if label == 'ED1':
        return 'H'
    if label in ('ED2', 'ED3'):
        return 'D'
    # Any other label (including missing values) maps to no class.
    return None

In [7]:
# Build the two-class target column by applying transfor_ed to every row
# (axis=1); rows whose label is not ED1/ED2/ED3 receive None.
df['ED_2Clases'] = df.apply(transfor_ed, axis=1)

In [8]:
# Columns removed before synthesis: the original label/identifier columns
# (prefix 'etiq-', including the one ED_2Clases was derived from) and a
# hand-picked set of 'clin-' columns. More names are appended to this list
# in the next cell before the drop is performed.
drop_columns = [
    'etiq-diagExpTLPcode',
    'etiq-diagExpTLPtext',
    'etiq-diagExpTLPcode_R2',
    'etiq-diagMMSE',
    'etiq-diagExpTLP',
    'etiq-diagExpTLP_R2',
    'etiq-diagExpTLPtext_R2',
    'etiq-id',
    'clin-Covid_numDosis',
    'clin-Covid_numVecesCovid',
    'clin-Covid_pasadoCovid',
    'clin-Covid_sentimientoAislado',
    'clin-Covid_sintomaCansancio',
    'clin-Covid_sintomaDiarrea',
    'clin-Covid_sintomaDifRespirar',
    'clin-Covid_sintomaDolCabeza',
    'clin-Covid_sintomaDolGarganta',
    'clin-Covid_sintomaDolMuscular',
    'clin-Covid_sintomaEscalofrios',
    'clin-Covid_sintomaFiebre',
    'clin-Covid_sintomaPerGusto',
    'clin-Covid_sintomaPerOlfato',
    'clin-Covid_vacunado',
    'clin-añosSinFumar_Tipos_y_Rangos',
    'clin-consumoAlcohol_UBEsemanal',
    'clin-enfermedadesAltTiroides',
    'clin-enfermedadesAntecedentesCardiacos',
    'clin-enfermedadesHepatica',
    'clin-enfermedadesRenal',
    'clin-entornoUrbano',
    'clin-familiaresAlzheimer',
    'clin-familiaresOtraDemencia',
    'clin-tipoFumador',
    'clin-tratCronico',
    'clin-tratPsicologico',
    'clin-tratPsiquiatrico',
    'clin-frecOlvidos',
    'clin-frecUsoEmail',
    'clin-frecUsoMovil',
    'clin-frecUsoOrdenador',
    'clin-frecUsoRSociales',
    'clin-frecUsoTele',
    'clin-nivelActFisica',
    'clin-nivelSociabilidad',
    'clin-reservaCognitiva_actividadLectora',
    'clin-reservaCognitiva_cursos',
    'clin-reservaCognitiva_escolaridadPadres',
    'clin-reservaCognitiva_formacionMusical',
    'clin-reservaCognitiva_juegos',
    'clin-numCigarros',
    'clin-añosSinFumar',
    'clin-tipoAlcohol'
]

In [9]:
# Gather every column whose name contains the substring 'PT', 'PZ' or 'PDC'
# (plain substring match, so a name embedding the tag anywhere is selected).
pt_columns = [name for name in df.columns if 'PT' in name]
pz_columns = [name for name in df.columns if 'PZ' in name]
pdc_columns = [name for name in df.columns if 'PDC' in name]

# Append the tagged columns to the manual list, then drop everything from df
# in place. NOTE: this cell is not idempotent — drop_columns keeps growing and
# a second run tries to drop columns that no longer exist.
drop_columns.extend([*pt_columns, *pz_columns, *pdc_columns])
df.drop(columns=drop_columns, inplace=True)

In [None]:
df.shape

In [None]:
# Show the columns that contain at least one missing value.
df.columns[df.isna().any()]

In [None]:
# Show the columns that contain an infinite value, detected by comparing the
# string form of every cell against "inf" / "-inf".
df.columns[df.astype(str).apply(lambda col: col.isin(["inf", "-inf"]).any())]

In [13]:
from sdv.metadata import Metadata

# Let SDV infer the metadata from df and register it under the table
# name 'TLP', which later metadata calls refer to.
metadata = Metadata.detect_from_dataframe(data=df, table_name='TLP')

In [14]:
# Override the auto-detected type of the target column so that SDV treats
# it as categorical.
categorical_columns = [
    'ED_2Clases'
]
metadata.update_columns(
    column_names=categorical_columns,
    sdtype='categorical',
    table_name='TLP'
)

In [None]:
print('Auto detected data:\n')
metadata.visualize()

In [16]:
# Check df against the (updated) metadata for table 'TLP'.
# NOTE(review): presumably raises when the data does not match — confirm.
metadata.validate_table(data=df, table_name='TLP')

In [None]:
metadata.visualize()

In [18]:
import os

# Remove the metadata file left by a previous run before saving a fresh one.
# EAFP: attempt the removal and ignore only the "file does not exist" case,
# instead of checking os.path.exists first (check-then-act can race with
# another process touching the file in between).
# NOTE(review): presumably needed because save_to_json refuses to overwrite
# an existing file — confirm against the installed SDV version.
try:
    os.remove("metadata.json")
except FileNotFoundError:
    pass

metadata.save_to_json('metadata.json')

### 4. Synthesizers

Constraints

In [19]:
# Look the class sizes up by label. value_counts() orders its result by
# frequency, so unpacking it positionally would silently swap the two counts
# if 'H' ever became the majority class, and would fail outright if only one
# class were present.
class_counts = df['ED_2Clases'].value_counts()
num_d = int(class_counts.get('D', 0))
num_h = int(class_counts.get('H', 0))

# Number of synthetic 'H' rows needed for 'H' to reach the size of 'D';
# clamped at 0 so that nothing is generated when 'H' is not the minority.
num_synthetic_data = max(num_d - num_h, 0)

In [None]:
df['ED_2Clases'].value_counts()

In [21]:
# Persist the preprocessed table (reduced column set plus ED_2Clases)
# without writing the DataFrame index.
df.to_csv(
    '../../../data/tlp/preprocesado.csv',
    index=False
)

#### 4.1. GaussianCopulaSynthesizer

In [22]:
from sdv.single_table import GaussianCopulaSynthesizer

# Gaussian-copula synthesizer built from the table metadata.
# numerical_distributions is an empty dict, so no per-column distribution is
# configured here; default_distribution='beta' is the only one named.
# NOTE(review): enforce_min_max_values / enforce_rounding presumably keep
# sampled values within the real ranges and precision — confirm in SDV docs.
gc_synthesizer = GaussianCopulaSynthesizer(
    metadata,
    enforce_min_max_values=True,
    enforce_rounding=True,
    locales=["es_ES"],
    numerical_distributions={},
    default_distribution='beta'
)

In [23]:
# Step 1 of the manual fit: let the synthesizer choose its column
# transformers based on df.
gc_synthesizer.auto_assign_transformers(df)

In [24]:
# Step 2: transform df with the assigned transformers. processed_df is
# reassigned by each synthesizer section in turn.
processed_df = gc_synthesizer.preprocess(df)

In [25]:
# Step 3: fit the model on the already-preprocessed data.
gc_synthesizer.fit_processed_data(processed_df)

In [26]:
synthetic_data_list = []

# Rejection sampling: draw batches of NUM_ROWS rows, keep only those labelled
# 'H', and stop once num_synthetic_data rows have been collected. Rows are
# stored as plain value lists (column names are re-attached afterwards).
while len(synthetic_data_list) < num_synthetic_data:
    remaining = num_synthetic_data - len(synthetic_data_list)
    samples = gc_synthesizer.sample(num_rows=NUM_ROWS)
    h_rows = samples.loc[samples['ED_2Clases'] == 'H'].values.tolist()
    # Never take more than what is still missing from this batch.
    synthetic_data_list.extend(h_rows[:remaining])


In [27]:
# Rebuild a DataFrame from the collected row lists, labelling the values
# positionally with df's column names.
# NOTE(review): assumes sampled columns come in the same order as df.columns.
synthetic_data_df = pd.DataFrame(synthetic_data_list, columns=df.columns)

In [28]:
# Augmented dataset: the real rows followed by the synthetic 'H' rows,
# re-indexed from 0.
gc_synthetic_data = pd.concat([df, synthetic_data_df], ignore_index=True)

In [None]:
gc_synthetic_data.shape

In [30]:
# Save the Gaussian-copula augmented dataset without the index.
gc_synthetic_data.to_csv(
    '../../../data/tlp/synthetic-gc.csv',
    index=False
)

#### 4.2. CTGANSynthesizer

In [31]:
from sdv.single_table import CTGANSynthesizer

# CTGAN synthesizer built from the same metadata; trained for NUM_EPOCHS
# epochs with progress output enabled (verbose=True).
# NOTE(review): cuda=True requests GPU training — confirm the behaviour on
# machines without CUDA.
ctgan_synthesizer = CTGANSynthesizer(
    metadata,
    enforce_min_max_values=True,
    enforce_rounding=True,
    locales=['es_ES'],
    epochs=NUM_EPOCHS,
    verbose=True,
    cuda=True
)

In [32]:
# Step 1 of the manual fit: let the synthesizer choose its column
# transformers based on df.
ctgan_synthesizer.auto_assign_transformers(df)

In [33]:
# Step 2: transform df with the assigned transformers (overwrites the
# processed_df produced for the previous synthesizer).
processed_df = ctgan_synthesizer.preprocess(df)

In [None]:
# Step 3: train CTGAN on the already-preprocessed data.
ctgan_synthesizer.fit_processed_data(processed_df)

In [None]:
synthetic_data_list = []

# Rejection sampling: draw batches of NUM_ROWS rows, keep only those labelled
# 'H', and stop once num_synthetic_data rows have been collected. Rows are
# stored as plain value lists (column names are re-attached afterwards).
while len(synthetic_data_list) < num_synthetic_data:
    remaining = num_synthetic_data - len(synthetic_data_list)
    samples = ctgan_synthesizer.sample(num_rows=NUM_ROWS)
    h_rows = samples.loc[samples['ED_2Clases'] == 'H'].values.tolist()
    # Never take more than what is still missing from this batch.
    synthetic_data_list.extend(h_rows[:remaining])


In [None]:
# Rebuild a DataFrame from the collected row lists, labelling the values
# positionally with df's column names (overwrites the previous section's
# synthetic_data_df).
synthetic_data_df = pd.DataFrame(synthetic_data_list, columns=df.columns)

In [None]:
# Augmented dataset: the real rows followed by the synthetic 'H' rows,
# re-indexed from 0.
ctgan_synthetic_data = pd.concat([df, synthetic_data_df], ignore_index=True)

In [None]:
ctgan_synthetic_data.shape

In [None]:
# Save the CTGAN augmented dataset without the index.
ctgan_synthetic_data.to_csv(
    '../../../data/tlp/synthetic-ctgan.csv',
    index=False
)

#### 4.3. TVAESynthesizer

In [None]:
from sdv.single_table import TVAESynthesizer

# TVAE synthesizer built from the same metadata; trained for NUM_EPOCHS
# epochs with progress output enabled (verbose=True).
# NOTE(review): unlike the other two synthesizers, no locales argument is
# passed here — confirm whether that is intentional.
tvaes_synthesizer = TVAESynthesizer(
    metadata,
    enforce_min_max_values=True,
    enforce_rounding=True,
    epochs=NUM_EPOCHS,
    verbose=True,
    cuda=True
)

In [None]:
# Step 1 of the manual fit: let the synthesizer choose its column
# transformers based on df.
tvaes_synthesizer.auto_assign_transformers(df)

In [None]:
# Step 2: transform df with the assigned transformers (overwrites the
# processed_df produced for the previous synthesizer).
processed_df = tvaes_synthesizer.preprocess(df)

In [None]:
# Step 3: train TVAE on the already-preprocessed data.
tvaes_synthesizer.fit_processed_data(processed_df)

In [None]:
synthetic_data_list = []

# Rejection sampling: draw batches of NUM_ROWS rows, keep only those labelled
# 'H', and stop once num_synthetic_data rows have been collected. Rows are
# stored as plain value lists (column names are re-attached afterwards).
while len(synthetic_data_list) < num_synthetic_data:
    remaining = num_synthetic_data - len(synthetic_data_list)
    samples = tvaes_synthesizer.sample(num_rows=NUM_ROWS)
    h_rows = samples.loc[samples['ED_2Clases'] == 'H'].values.tolist()
    # Never take more than what is still missing from this batch.
    synthetic_data_list.extend(h_rows[:remaining])


In [None]:
# Rebuild a DataFrame from the collected row lists, labelling the values
# positionally with df's column names (overwrites the previous section's
# synthetic_data_df).
synthetic_data_df = pd.DataFrame(synthetic_data_list, columns=df.columns)

In [None]:
# Augmented dataset: the real rows followed by the synthetic 'H' rows,
# re-indexed from 0.
tvaes_synthetic_data = pd.concat([df, synthetic_data_df], ignore_index=True)

In [None]:
tvaes_synthetic_data.shape

In [None]:
# Save the TVAE augmented dataset without the index.
tvaes_synthetic_data.to_csv(
    '../../../data/tlp/synthetic-tvaes.csv',
    index=False
)

### 5. Evaluating Real vs. Synthetic Data

In [None]:
from sdv.evaluation.single_table import get_column_plot

# Column names iterated by the per-column comparison plots in this section
# (those plotting loops are currently commented out).
plot_columns = list(df.columns)

#### 5.1. GaussianCopulaSynthesizer

In [None]:
from sdv.evaluation.single_table import run_diagnostic

# Run SDV's diagnostic checks for the Gaussian-copula output.
# NOTE(review): gc_synthetic_data is the real rows concatenated with the
# synthetic ones, so the "synthetic" side of this comparison also contains
# every real row — confirm this is the intended evaluation target.
gc_diagnostic = run_diagnostic(
    real_data=df,
    synthetic_data=gc_synthetic_data,
    metadata=metadata
)

In [None]:
from sdv.evaluation.single_table import evaluate_quality

# Compute SDV's quality report for the Gaussian-copula output; arguments are
# passed positionally as (real data, synthetic data, metadata).
# NOTE(review): gc_synthetic_data also contains every real row, which may
# inflate the similarity scores — confirm this is intended.
gc_quality_report = evaluate_quality(
    df,
    gc_synthetic_data,
    metadata
)

In [None]:
# gc_details = gc_quality_report.get_details('Column Pair Trends')
# gc_details[gc_details['Real Correlation'].notnull()]

In [None]:
#gc_quality_report.get_details('Column Shapes')

In [None]:
# for column in plot_columns:
#     fig = get_column_plot(
#         real_data=df,
#         synthetic_data=gc_synthetic_data,
#         column_name=column,
#         metadata=metadata
#     )
#     fig.show()

#### 5.2. CTGANSynthesizer

In [None]:
# Run SDV's diagnostic checks for the CTGAN output.
# NOTE(review): ctgan_synthetic_data is the real rows concatenated with the
# synthetic ones, so the "synthetic" side of this comparison also contains
# every real row — confirm this is the intended evaluation target.
ctgan_diagnostic = run_diagnostic(
    real_data=df,
    synthetic_data=ctgan_synthetic_data,
    metadata=metadata
)

In [None]:
# Compute SDV's quality report for the CTGAN output; arguments are passed
# positionally as (real data, synthetic data, metadata).
# NOTE(review): ctgan_synthetic_data also contains every real row, which may
# inflate the similarity scores — confirm this is intended.
ctgan_quality_report = evaluate_quality(
    df,
    ctgan_synthetic_data,
    metadata
)

In [None]:
# ctgan_details = ctgan_quality_report.get_details('Column Pair Trends')
# ctgan_details[ctgan_details['Real Correlation'].notnull()]

In [None]:
# ctgan_quality_report.get_details('Column Shapes')

In [None]:
#for column in plot_columns:
#    fig = get_column_plot(
#        real_data=df,
#        synthetic_data=ctgan_synthetic_data,
#        column_name=column,
#        metadata=metadata
#    )
#    fig.show()

#### 5.3. TVAESynthesizer

In [None]:
# Run SDV's diagnostic checks for the TVAE output.
# NOTE(review): tvaes_synthetic_data is the real rows concatenated with the
# synthetic ones, so the "synthetic" side of this comparison also contains
# every real row — confirm this is the intended evaluation target.
tvaes_diagnostic = run_diagnostic(
    real_data=df,
    synthetic_data=tvaes_synthetic_data,
    metadata=metadata
)

In [None]:
# Compute SDV's quality report for the TVAE output; arguments are passed
# positionally as (real data, synthetic data, metadata).
# NOTE(review): tvaes_synthetic_data also contains every real row, which may
# inflate the similarity scores — confirm this is intended.
tvaes_quality_report = evaluate_quality(
    df,
    tvaes_synthetic_data,
    metadata
)

In [None]:
# tvaes_details = tvaes_quality_report.get_details('Column Pair Trends')
# tvaes_details[tvaes_details['Real Correlation'].notnull()]

In [None]:
# tvaes_quality_report.get_details('Column Shapes')

In [None]:
#for column in plot_columns:
#    fig = get_column_plot(
#        real_data=df,
#        synthetic_data=tvaes_synthetic_data,
#        column_name=column,
#        metadata=metadata
#    )
#    fig.show()