# CardiCat Dev v0.4

CardiCat Only Walkthrough (both `ipynb` and `shell` compatible)

High-cardinality categorical features are a common characteristic of mixed-type tabular datasets.   
Existing generative model architectures struggle to learn the complexities of such data at scale,   
primarily due to the difficulty of parameterizing the categorical features.   
In this paper, we present a general variational autoencoder model, CardiCat, that can accurately fit  
imbalanced high-cardinality and heterogeneous tabular data. Our method substitutes one-hot encoding  
with regularized dual encoder-decoder embedding layers, which are jointly learned.  
This approach enables us to use embeddings that also depend on the other covariates,  
leading to a compact and homogenized parameterization of categorical features.  
Our model employs a considerably smaller trainable parameter space than competing methods,  
enabling learning at a large scale. CardiCat generates high-quality synthetic data that  
better represent high-cardinality and imbalanced features compared to competing VAE models for multiple real and simulated datasets.  

This walkthrough benchmarks CardiCat (or VAE, tVAE, tGAN, cCardiCatMask) using non-trivial datasets.  

**Make sure the `params_dict.txt` is populated according to your specifications**

## Imports

In [None]:
import ast
import os
import sys
from datetime import datetime
from time import process_time
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from cycler import cycler

import plotly.express as px
import plotly.io as pio

import tensorflow as tf

# Benchmarking & Evaluation:
from sdv.tabular import CTGAN, TVAE

# Global plotting defaults: white plotly template and the seaborn "ticks" theme.
pio.templates.default = "plotly_white"
sns.set_theme(style="ticks", color_codes=True)

# Machine user name; it is interpolated into the default location of the
# parameter file below.
comp_name = ""
_PARAM_FILE_TEMPLATE = "/Users/{}/Dropbox/School/gitRepos/CardiCat/params_dict.txt"
param_dict_path = _PARAM_FILE_TEMPLATE.format(comp_name)

When executing through the notebook, the cell below must be specified:

In [None]:
# Run configuration used when the notebook (rather than the shell) drives the run.
model = "CardiCat"  # possible values: CardiCat, VAE (VAE-vanilla), eVAE (VAE-encoder)
dataset = "Credit"  # possible values: PetFinder, Bank, Credit, Census, Medical, MIMIC, Criteo, Simulated
run_id = "test"  # free-form identifier; stored in the parameter dictionary as "run_id"

When executing through shell as a py file, the cell below must be commented-out:

In [None]:
# Notebook only: emulate the shell arguments (model, dataset, parameter-file path)
# by overwriting sys.argv; index 0 is just a placeholder for the script name.
# Comment-out if executed using shell!
sys.argv = [0, model, dataset, param_dict_path]
print(sys.argv)

NOTE: there is no need to change any of the code below this cell. 

## Settings

In [None]:
# Retrieving the user defined shell parameters:
param_dict_path = sys.argv[3]
# Reading the dictionary params from the dict file (a Python dict literal,
# parsed safely with ast.literal_eval):
with open(param_dict_path) as f:
    params_tmp = f.read()
param_dict = ast.literal_eval(params_tmp)
# Adding user defined params to the params dictionary:
param_dict["model"] = sys.argv[1]
param_dict["dataset_log_name"] = sys.argv[2]
param_dict["param_dict_path"] = sys.argv[3]
param_dict["run_id"] = run_id

# Setting logical model constraints; each model name forces its own flags:
if param_dict["model"] == "VAE":
    param_dict["embed_th"] = 5000  # just a big threshold to avoid
    param_dict["emb_loss"] = False
elif param_dict["model"] == "eVAE":
    param_dict["emb_loss"] = False
elif param_dict["model"] == "CardiCat":
    # "onehot_ind_cat" is taken from the parameter file unchanged.
    param_dict["emb_loss"] = True
else:
    # ValueError is still an Exception, so existing handlers keep working;
    # the offending name is reported to make misconfiguration obvious.
    raise ValueError(
        "Sorry, model name not recognized: {!r}".format(param_dict["model"])
    )

intro = """\n
########EXECUTING CARDICAT : {} WITH THE FOLLOWING:########
""".format(param_dict["model"])
print(intro)
print("var1 model:", sys.argv[1])
print("var2 dataset_log_name:", sys.argv[2])
print("var3 latent_dim:", param_dict["latent_dim"])
print("     param file path:", param_dict_path)
print("var4 CardiCat library path:", param_dict["lib_path"])
print("var5 CardiCat data path:", param_dict["data_path"])
# Copy param_dict to add to logs; the timestamp is reused in output file names.
logs = param_dict.copy()
logs["date_time"] = datetime.now().strftime("%Y%m%d_%H%M")


In [None]:
# Loading the CardiCat module:
# the library path from the parameter file is put on sys.path so that the
# project's `src` package can be imported.
sys.path.insert(1, param_dict["lib_path"])

from src import network as network
from src import postprocessing as postprocessing
from src import preprocessing as preprocessing
from src import reporting as reporting
from src import vae_model as vae

# Turn matplotlib's interactive mode off when figures should not be shown.
if not param_dict["show_figs"]:
    plt.ioff()
print("Logs:\n", logs)

## Preprocessing 

Loading Dataset:

In [None]:
# Load the raw dataset together with the flag telling whether it has a target
# column; the flag is kept in param_dict for the later preprocessing calls.
dataframe, is_target = preprocessing.load_dataset(
    param_dict["dataset_log_name"], param_dict["data_path"]
)
# if param_dict["dataset_log_name"]=="Simulated":
#     dataframe = dataframe.sample(20000)
param_dict["is_target"] = is_target

# Shuffle the rows, then hold out the last 30% as `dataframe_test` so the
# synthetic data can be compared to real data at the end of the notebook.
# Positional slicing is used instead of np.split, which relies on the
# deprecated DataFrame.swapaxes when handed a DataFrame.
shuffled = dataframe.sample(frac=1)
split_at = int(0.7 * len(shuffled))
dataframe, dataframe_test = shuffled.iloc[:split_at], shuffled.iloc[split_at:]
print("Train data size: ", dataframe.shape, "Test data size: ", dataframe_test.shape)
# Working copy that the preprocessing steps below modify.
df = dataframe.copy()

In [None]:
# Get cardinality and type per feature of the training copy (notebook display):
preprocessing.get_df_cardinality(df)

Detecting column types for preprocessing:

In [None]:
# Partition the features into categorical, integer and float column lists.
catCols, intCols, floatCols = preprocessing.get_col_types(
    df, is_y=param_dict["is_target"], verbose=True
)
# Number of distinct values observed for every categorical column.
col_tokens_all_tmp = {name: len(df[name].unique()) for name in catCols}

# In conditional (cVAE) mode keep the categorical columns whose cardinality
# reaches the embedding threshold; otherwise there are none.
tmpEmb = (
    [
        name
        for name, n_tokens in col_tokens_all_tmp.items()
        if n_tokens >= param_dict["embed_th"]
    ]
    if param_dict["cVAE"]
    else []
)


Calculating the marginal probabilities of each categorical feature:

In [None]:
# Marginal distribution of every categorical column of the training split,
# keyed by column name (each entry is later read via .keys()/.values()).
catMarginals = preprocessing.get_catMarginals(dataframe, catCols)
# catMarginals

In [None]:
# One bar chart per categorical feature showing (at most) its first 20
# marginal entries. `squeeze=False` keeps `axes` two-dimensional so that a
# dataset with a single categorical column is handled like any other, and the
# guard skips plotting when there are no categorical columns at all.
if catMarginals:
    fig, axes = plt.subplots(
        nrows=1, ncols=len(catMarginals), figsize=(10, 2), squeeze=False
    )
    for ax, key in zip(axes[0], catMarginals):
        pd.DataFrame.from_dict(catMarginals[key], orient="index")[:20].plot.bar(
            ax=ax, legend=False
        )
    plt.show()

Conditional VAE or MASK logic:

In [None]:
# Build the conditioning columns ("cond_<col>") for the conditional VAE.
if param_dict["cVAE"]:
    # marg_sample = lambda e: np.random.choice(
    #     list(catMarginals[e].keys()), p=list(catMarginals[e].values())
    #
    # np.random.rand() is in [0, 1), so with alpha = 1 the row's own value is
    # always kept and the marginal sample is never drawn.
    alpha = 1  # (if <= alpha then x, else marg sample)

    if param_dict["cVAE_mask"]:
        # Start with every conditioning column masked ...
        for col in tmpEmb:
            df["cond_" + col] = "mask"
        # ... then, row by row, reveal one randomly chosen conditioning column.
        for index, row in df.iterrows():
            # print(index,row)
            col_sampled = np.random.choice(tmpEmb)
            # df.loc[index, "cond_" + col_sampled] = preprocessing.marg_sample(
            #     col_sampled, catMarginals
            # )

            # df.loc[index, "cond_" + col_sampled] = df.loc[index, col_sampled]
            df.loc[index, "cond_" + col_sampled] = (
                df.loc[index, col_sampled]
                if np.random.rand() <= alpha
                else preprocessing.marg_sample(col_sampled, catMarginals)
            )
    else:
        # No masking: every conditioning column is filled for every row.
        for col in tmpEmb:
            df["cond_" + col] = dataframe[col].apply(
                lambda x: x
                if np.random.rand() <= alpha
                else preprocessing.marg_sample(col, catMarginals)
            )  # df[col].copy()
# Notebook display of the working frame.
df

Calculate Cramér's V and the mixed ANOVA statistics on the original (held-out) data for reporting purposes:

In [None]:
# Reference statistics computed on the held-out real data; the synthetic
# data of each model is scored against these further below.
anove_mixed_original = reporting.get_mixed_anova(
    catCols, intCols, floatCols, dataframe_test
)
cramersV_original = reporting.get_catCols_cramersV(catCols, dataframe_test)

Train/test split and `tf.data.Dataset` creation:

In [None]:
## Splitting into train/test:
# # this is to avoid the bug with tf.dataset:

# ### Label encoding (ordinal encoding) the categorical variables; the
# returned `label_encoder` is indexed per column below:
df, label_encoder = preprocessing.labelEncoding(
    df,
    catCols,
)

# presumably registers the "mask" token with the encoders so the masked
# conditioning columns can be transformed — TODO confirm in preprocessing
if param_dict["cVAE_mask"]:
    preprocessing.add_mask_labels(label_encoder)

# Encode each conditioning column with the encoder of its source column.
if param_dict["cVAE"]:
    for col in tmpEmb:
        df["cond_" + col] = label_encoder[col].transform(df["cond_" + col])


# Shuffle the encoded frame, split it into train/test and wrap both parts as
# batched datasets. Criteo is first cast to int64 and always split 80/20; all
# other datasets use the configured `train_ratio`.
is_criteo = param_dict["dataset_log_name"] == "Criteo"
if is_criteo:
    tst = pd.DataFrame(np.asarray(df).astype("int64"), columns=df.columns)
    split_source, split_ratio = tst, 0.8
else:
    split_source, split_ratio = df, param_dict["train_ratio"]

# Positional slicing replaces np.split, which relies on the deprecated
# DataFrame.swapaxes when handed a DataFrame.
shuffled = split_source.sample(frac=1)
split_at = int(split_ratio * len(shuffled))
train, test = shuffled.iloc[:split_at], shuffled.iloc[split_at:]

if is_criteo:
    print("length of train:", len(train))
    print("length of test:", len(test))
else:
    print("Total length: {:,}".format(len(df)))
    print(
        "length of train: {:,} which is {:.1%}".format(
            len(train), param_dict["train_ratio"]
        )
    )
    print(
        "length of test: {:,} which is {:.1%}".format(
            len(test), 1 - param_dict["train_ratio"]
        )
    )

train_ds = preprocessing.df_to_dset(
    train, is_y=param_dict["is_target"], batch_size=param_dict["batch_size"]
)
test_ds = preprocessing.df_to_dset(
    test, is_y=param_dict["is_target"], batch_size=param_dict["batch_size"]
)

## Encoding

In [None]:
# Build the encoder helper from the (un-encoded) training frame and the
# training dataset, then print the column bookkeeping it exposes.
data_encoder = preprocessing.data_encoder(
    original_df=dataframe, train_ds=train_ds, param_dict=param_dict
)
print(data_encoder.catCols)
print("Categorical column tokens (cardinality): ", data_encoder.col_tokens_all)
# emb_sizes = data_encoder.emb_sizes
print("Total Cardinality: {:,}".format(sum(data_encoder.col_tokens_all.values())))
print("Suggested embedding size for each cat column: ", data_encoder.emb_sizes_all)
print("embCols: ", data_encoder.embCols)
print("ohCols: ", data_encoder.ohCols)

## Defining Network

In [None]:
# Conditional mode only: append the concatenated conditioning features to the
# regular feature tensor; the result is passed to the encoder network below.
if param_dict["cVAE"]:
    all_features_cond = tf.keras.layers.concatenate(
        [
            data_encoder.all_features,
            tf.keras.layers.concatenate(data_encoder.condFeatures),
        ]
    )

# all_features_cond

In [None]:
def decayed_learning_rate(step):
    """Return the exponentially decayed learning rate at training step ``step``.

    The module-level ``initial_learning_rate`` is scaled by ``decay_rate``
    raised to ``step / decay_steps`` (fractional exponents allowed).
    """
    elapsed_periods = step / decay_steps
    decay_multiplier = decay_rate**elapsed_periods
    return initial_learning_rate * decay_multiplier


# Constants read by decayed_learning_rate(); the print shows the rate after
# 150 steps, i.e. three halvings of the configured learning rate.
# NOTE(review): the schedule is only printed here — the optimizer created in
# the next cell receives the constant param_dict["learn_rate"].
initial_learning_rate = param_dict["learn_rate"]
decay_rate = 0.5
decay_steps = 50
print("{:.6f}".format(decayed_learning_rate(150)))


In [None]:
## Complete Network:
# Adam optimizer with the constant learning rate from the parameter file.
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=param_dict["learn_rate"])
# optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=lr_schedule)
# `final` is later called as final([mean, log_var]) to obtain latent samples.
final = vae.sampling(param_dict["latent_dim"], param_dict["latent_dim"])
input_decoder = param_dict["latent_dim"]


# INCOMPLETE!! (probably only for conditional)
if param_dict["cVAE"]:
    # Conditional model: the decoder receives the latent input plus the
    # conditioning inputs; the encoder receives regular + conditioning inputs.
    dec_inputs_tmp = [
        tf.keras.layers.Input(shape=input_decoder, name="dec_latent_input")
    ]
    dec_inputs = dec_inputs_tmp + data_encoder.condInputs
    dec_features = dec_inputs_tmp + data_encoder.condFeatures
    dec_features = tf.keras.layers.concatenate(dec_features)

    enc = network.encoder(
        data_encoder.all_inputs + data_encoder.condInputs,
        all_features_cond,
        param_dict["latent_dim"],
    )
    dec = network.decoder(dec_inputs, dec_features, data_encoder.layer_sizes)
    # dec_inputs = tf.keras.Input(shape=input_decoder, name='decoder_input_layer')
    # dec = network.decoder(dec_inputs,dec_inputs,data_encoder.layer_sizes)
else:
    # Unconditional model: the decoder input is the latent vector only.
    dec_inputs = tf.keras.Input(shape=input_decoder, name="dec_latent_input")
    enc = network.encoder(
        data_encoder.all_inputs, data_encoder.all_features, param_dict["latent_dim"]
    )
    dec = network.decoder(dec_inputs, dec_inputs, data_encoder.layer_sizes)

# dec = network.decoder(input_decoder,data_encoder.ohCols,param_dict,oh_tokens,emb_tokens,
#                       col_tokens_all.values(),embCols,emb_sizes,intCols,
#                       floatCols,layer_sizes)

# The codex model is built from different inputs/features depending on
# whether the embedding loss is enabled (one-hot variants are used otherwise).
if param_dict["emb_loss"]:
    if param_dict["cVAE"]:
        cod = vae.codex(
            data_encoder.all_inputs + data_encoder.condInputs, data_encoder.all_features
        )
    else:
        # cod = vae.codex(data_encoder.all_inputs,data_encoder.all_features)
        cod = vae.codex(data_encoder.all_inputs, data_encoder.all_features_cod)
else:
    cod = vae.codex(data_encoder.all_inputs_1hot, data_encoder.all_features_1hot)

In [None]:
# cod.summary()
# for l in cod.layers:
#     print(l.name, l.trainable)

In [None]:
# Save network architecture plots (encoder, decoder, codex) as PNG files
# below the library path.
# NOTE: only enable `save_figs` if your system supports graphviz.
if param_dict["save_figs"]:

    def _arch_plot_path(template):
        """Fill timestamp and dataset name into ``template`` below lib_path."""
        return os.path.join(
            param_dict["lib_path"],
            template.format(logs["date_time"], logs["dataset_log_name"]),
        )

    tf.keras.utils.plot_model(
        enc,
        to_file=_arch_plot_path("output/model_architecture_graphs/{}_{}_encoder.png"),
        show_shapes=True,
        rankdir="LR",
        show_layer_activations=True,
    )

    tf.keras.utils.plot_model(
        dec,
        to_file=_arch_plot_path("output/model_architecture_graphs/{}_{}_decoder.png"),
        show_shapes=True,
        rankdir="LR",
        show_layer_activations=True,
    )

    # The codex plot is drawn without layer activations.
    tf.keras.utils.plot_model(
        cod,
        to_file=_arch_plot_path("output/model_architecture_graphs/{}_{}_codex.png"),
        show_shapes=True,
        rankdir="LR",
    )


In [None]:
# Inline (notebook) rendering of the encoder graph, including layer
# activations and trainable flags; nothing is written to the output folder.
tf.keras.utils.plot_model(
    enc,
    show_shapes=True,
    rankdir="LR",
    show_layer_activations=True,
    show_trainable=True,
)

In [None]:
# Count the trainable scalars of the encoder and decoder and log their sum.
def _n_trainable(model):
    """Return the number of scalar entries across a model's trainable weights."""
    return np.sum([np.prod(w.shape) for w in model.trainable_weights])


trainableParamsEnc = _n_trainable(enc)
trainableParamsDec = _n_trainable(dec)
_total_trainable = int(trainableParamsEnc + trainableParamsDec)

print("Traniable Parameters Encoder: {:,}".format(int(trainableParamsEnc)))
print("Traniable Parameters Decoder: {:,}".format(int(trainableParamsDec)))
print("Total-Traniable Parameters VAE: {:,}".format(_total_trainable))
# Stored as a thousands-separated string, exactly as printed above.
logs["trainable_params"] = "{:,}".format(_total_trainable)

## CardiCat Training

In [None]:
# Snapshot of the encoder weights whose name starts with the "emb" token
# (text before the first "_"), keyed by the part of the name before "/".
emb_weights = {
    weights.name.split("/")[0]: weights.numpy()
    for weights in enc.weights
    if weights.name.split("_")[0] == "emb"
}
# For every such weight: variance along axis 0, averaged to one scalar.
# The list is handed to the training routine as `emb_init_var`.
emb_init_var = [
    tf.math.reduce_mean(tf.math.reduce_variance(emb_weights[emb], axis=0))
    for emb in emb_weights.keys()
]

In [None]:
# Train the model and time the call.
# NOTE(review): process_time() reports CPU time of the process, not
# wall-clock time, although the message below says "Elapsed time".
CardiCat_start = process_time()
tf.config.run_functions_eagerly(True)
output_loss, loss_logs_all, emb_weights = vae.train_high_vae(
    train_ds=train_ds,
    enc=enc,
    dec=dec,
    cod=cod,
    final=final,
    optimizer=optimizer,
    layer_sizes=data_encoder.layer_sizes,
    param_dict=param_dict,
    weights=data_encoder.weights,
    emb_init_var=emb_init_var,
    tmpEmb=tmpEmb,
)
CardiCat_stop = process_time()
CardiCat_time = round(CardiCat_stop - CardiCat_start, 2)
print("Elapsed time in seconds:", CardiCat_time)
# Last row of the loss table (all columns except the first), rounded.
logs["losses"] = round(output_loss.iloc[-1, 1:], 2)

In [None]:
# Column groups as held by the data encoder.
print("one-hot columns: ", data_encoder.ohCols)
print("embedded columns: ", data_encoder.embCols)
print("numerical columns: ", data_encoder.numCols)

In [None]:
# Per-feature loss table: one row per epoch, one column per feature, in the
# order one-hot columns, embedded columns, numerical columns.
_feature_names = (
    [*data_encoder.ohCols.keys()]
    + [*data_encoder.embCols.keys()]
    + data_encoder.numCols
)
_per_epoch_losses = tf.reshape(
    loss_logs_all,
    shape=(param_dict["epochs"], len(_feature_names)),
)
feature_loss_df = pd.DataFrame(_per_epoch_losses, columns=_feature_names)
feature_loss_df

In [None]:
# Interactive line chart of the per-feature loss table (one line per column).
# NOTE(review): the title text contains the typo "feautre".
feature_loss = px.line(feature_loss_df, title="loss progression per feautre")
feature_loss

In [None]:
# Notebook display of the loss table returned by the training call.
output_loss

In [None]:
# Plot the training losses over epochs. The selected columns of `output_loss`
# depend on whether the mixed loss is enabled.
if param_dict["mixed_loss"]:
    # Left panel: aggregate losses; right panel: per-type reconstruction losses.
    fig, [ax1, ax2] = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
    df_melt = output_loss[
        ["epoch", "kl_loss", "mixed_loss_factor", "total_loss", "emb_reg_loss"]
    ].melt(id_vars=["epoch"], var_name="loss_type", value_name="loss")
    sns.lineplot(data=df_melt, x="epoch", y="loss", hue="loss_type", ax=ax1)

    df_melt = output_loss[
        ["epoch", "hot_loss", "emb_loss", "num_loss", "mixed_loss", "emb_reg_loss"]
    ].melt(id_vars=["epoch"], var_name="loss_type", value_name="loss")
    sns.lineplot(data=df_melt, x="epoch", y="loss", hue="loss_type", ax=ax2)
    fig

else:
    df_melt = output_loss[
        ["epoch", "total_loss", "mse_loss_factor", "kl_loss", "emb_reg_loss"]
    ].melt(id_vars=["epoch"], var_name="loss_type", value_name="loss")

    fig, ax = plt.subplots(figsize=(12, 7))
    # NOTE(review): the cycler defines three styles while four loss types are
    # selected above — confirm the intended styling.
    custom_cycler = (
        cycler(color=["r", "k", "y"])
        + cycler(linestyle=["--", "-.", ":"])
        + cycler(lw=[2, 1, 3])
    )
    ax.set_prop_cycle(custom_cycler)
    # Back to wide form (one column per loss type) and plot on a log scale.
    df_melt.set_index(["loss_type", "epoch"]).unstack("loss_type")["loss"].plot(ax=ax)
    ax.set_ylabel("loss")
    ax.set_yscale("log")
    # NOTE(review): "Epic" in the title is presumably meant to read "Epoch".
    ax.set_title("CardiCat Training Loss per Epic")
# fig

### Evaluation

In [None]:
# mean, log_var = enc.predict(train_ds)
# latents = final([mean, log_var])
# # dec.predict(latents)

In [None]:
# Short aliases for data-encoder attributes used by the prediction and
# logging cells below (dict values/keys are materialised as lists).
oh_tokens = list(data_encoder.ohCols.values())
emb_sizes = list(data_encoder.embCols.values())
embCols = list(data_encoder.embCols.keys())
numCols = data_encoder.numCols
numFeatures = data_encoder.numFeatures
numLookup = data_encoder.numLookup
# intCols = data_encoder.intCols
all_inputs = data_encoder.all_inputs
all_inputs_1hot = data_encoder.all_inputs_1hot
col_tokens_all = data_encoder.col_tokens_all
ohCols = list(data_encoder.ohCols.keys())


In [None]:
# ## Evaluation

# ### Predicting on the train set
# Encode the training data and draw latent samples from the predicted
# mean / log-variance.
mean, log_var = enc.predict(train_ds)
latents = final([mean, log_var])
if param_dict["cVAE"]:
    # Conditional decoder input: the train columns plus the latent samples
    # under the decoder's latent input name.
    df_y_dict = dict(train)
    df_y_dict["dec_latent_input"] = latents
else:
    df_y_dict = latents

# create a dataframe of synthetic data, get the categorical embeddings:
gen_df_train, emb_weights = postprocessing.get_pred(
    enc,
    dec,
    df_y_dict,
    param_dict,
    oh_tokens,
    emb_sizes,
    embCols,
    numFeatures,
    catCols,
    numCols,
    numLookup,
    intCols,
    all_inputs,
    # all_inputs_1hot,
    list(col_tokens_all.values()),
    label_encoder,
)


Train Set Evaluation:

In [None]:
# Transform the normalized and encoded splits back to their original state:
train_decoded = postprocessing.decode_splits(train, label_encoder)
test_decoded = postprocessing.decode_splits(test, label_encoder)

In [None]:
# Quality report of the data generated from the train set, scored against the
# held-out real data. The result is rounded and logged as "score_train" below.
report_cardicat_train = postprocessing.get_report(
    dataframe_test, gen_df_train, param_dict
)

Random Sample out of Learned Prior Set Evaluation:

In [None]:
# predicting on random set
# Generate as many synthetic rows as there are held-out real rows.
n = dataframe_test.shape[0]  # len(dataframe)
# Repeat one encoder weight vector n times into an (n, latent_dim) tensor.
# presumably enc.weights[-3] / enc.weights[-1] hold the learned prior mean
# and log-variance — TODO confirm against network.encoder
mean_rand = tf.reshape(
    tf.tile(
        enc.weights[-3],
        [
            n,
        ],
    ),
    (n, param_dict["latent_dim"]),
)
logvar_rand = tf.reshape(
    tf.tile(
        enc.weights[-1],
        [
            n,
        ],
    ),
    (n, param_dict["latent_dim"]),
)

cardicat_gen_rand = vae.sampling_model([mean_rand, logvar_rand])

if param_dict["cVAE"]:
    df_y_rand = pd.DataFrame()
    # marg_sample = lambda e: np.random.choice(list(catMarginals[e].keys()),p = list(catMarginals[e].values()))
    # np.random.rand() <= 0 is (almost surely) false, so the marginal sample
    # is used instead of the row's own value.
    alpha = 0  # we want only random sample from marg prob
    # tmpEmb = ['Breed1', 'Color1', 'Color2', 'MaturitySize', 'FurLength', 'Vaccinated', 'Sterilized', 'Health']
    if param_dict["cVAE_mask"]:
        # All conditioning columns start as "mask" (indexed like the test set);
        # one randomly chosen column per row is then drawn from its marginal.
        for col in tmpEmb:
            df_y_rand["cond_" + col] = dataframe_test[col].apply(lambda x: "mask")
        for index, row in df_y_rand.iterrows():
            # print(index,row)
            col_sampled = np.random.choice(tmpEmb)
            df_y_rand.loc[index, "cond_" + col_sampled] = preprocessing.marg_sample(
                col_sampled, catMarginals
            )
        for col in tmpEmb:
            df_y_rand["cond_" + col] = label_encoder[col].transform(
                df_y_rand["cond_" + col]
            )
    else:  # no mask
        # Every conditioning column is filled for every row, then encoded.
        for col in tmpEmb:
            df_y_rand["cond_" + col] = dataframe_test[col].apply(
                lambda x: x
                if np.random.rand() <= alpha
                else preprocessing.marg_sample(col, catMarginals)
            )  # df[col].copy()
            df_y_rand["cond_" + col] = label_encoder[col].transform(
                df_y_rand["cond_" + col]
            )
    df_y_rand_dict = dict(df_y_rand)
else:
    df_y_rand_dict = {}


# The latent samples are passed under the decoder's latent input name.
df_y_rand_dict["dec_latent_input"] = cardicat_gen_rand
# df_y_rand_ds = tmp1
# df_y_rand_ds = tf.data.Dataset.from_tensor_slices(tmp1)#.batch(param_dict['batch_size'])

In [None]:
# Decode the random latent samples into a synthetic dataframe. The name
# `cardicat_gen_rand` is rebound from the latent tensor to the generated
# data; the second return value is not used.
cardicat_gen_rand, trash = postprocessing.get_pred(
    enc,
    dec,
    df_y_rand_dict,
    param_dict,
    oh_tokens,
    emb_sizes,
    embCols,
    numFeatures,
    catCols,
    numCols,
    numLookup,
    intCols,
    all_inputs,
    # all_inputs_1hot,
    list(col_tokens_all.values()),
    label_encoder,
)


In [None]:
# Full quality report of the randomly generated data against the held-out
# real data; its "summary", "marginals" and "pairs" entries are read below.
report_cardicat_rand = postprocessing.get_report(
    dataframe_test, cardicat_gen_rand, param_dict, full=True
)


Logging Metrics:

In [None]:
# Mixed (categorical vs. numerical) ANOVA statistics of the synthetic data,
# scored against the same statistics of the real data.
anove_mixed_cardicat = reporting.get_mixed_anova(
    catCols, intCols, floatCols, cardicat_gen_rand
)
qScore_mixed_cardicat, mixed_cardicat = reporting.get_qScoreMixed(
    anove_mixed_original, anove_mixed_cardicat
)
# KS-based mixed statistics between synthetic and held-out real data; the
# first four results are averaged over their .values() in the score table.
ks_mixed_0_weighted, ks_mixed_1_weighted, ks_mixed_0, ks_mixed_1, ks_mixed_raw_stats = (
    reporting.get_mean_ks_mixed_stats(
        cardicat_gen_rand,
        dataframe_test,
        data_encoder.catCols,
        data_encoder.intCols,
        data_encoder.floatCols,
    )
)


# Cramér's V of the categorical pairs, scored against the real data.
cramersV_cardicat = reporting.get_catCols_cramersV(catCols, cardicat_gen_rand)
score_cat_pairs_cardicat, pairs_cardicat = reporting.get_qScoreMixed(
    cramersV_original, cramersV_cardicat
)

# Summary scores of the CardiCat random sample, rounded for logging.
_cc_marginals = report_cardicat_rand["marginals"]
_cc_pairs = report_cardicat_rand["pairs"]
# Rows of the pairs report where both columns are categorical. The mask is
# combined with the metric filter in a single boolean index so pandas does not
# have to re-align a full-length mask against an already filtered frame.
_cc_cat_pair_rows = _cc_pairs["Column 1"].isin(catCols) & _cc_pairs[
    "Column 2"
].isin(catCols)

CardiCat_scores = {
    "marginals_all": round(report_cardicat_rand["summary"].iloc[0, 1], 2),
    "pairs_all": round(report_cardicat_rand["summary"].iloc[1, 1], 2),
    "marginals_KS": round(
        _cc_marginals[_cc_marginals.Metric == "KSComplement"]["Quality Score"].mean(),
        2,
    ),
    "marginals_TV": round(
        _cc_marginals[_cc_marginals.Metric == "TVComplement"]["Quality Score"].mean(),
        2,
    ),
    "pairs_corr": round(
        _cc_pairs[_cc_pairs.Metric == "CorrelationSimilarity"]["Quality Score"].mean(),
        2,
    ),
    "pairs_cont": round(
        _cc_pairs[_cc_pairs.Metric == "ContingencySimilarity"]["Quality Score"].mean(),
        2,
    ),
    # Contingency similarity restricted to categorical-categorical pairs.
    "pairs_cont_fix": round(
        _cc_pairs[_cc_cat_pair_rows & (_cc_pairs.Metric == "ContingencySimilarity")][
            "Quality Score"
        ].mean(),
        2,
    ),
    "pairs_cat": round(score_cat_pairs_cardicat, 2),
    "pairs_mixed": round(qScore_mixed_cardicat, 2),
    "pairs_ks_mixed_0_weighted": round(np.mean(list(ks_mixed_0_weighted.values())), 3),
    "pairs_ks_mixed_1_weighted": round(np.mean(list(ks_mixed_1_weighted.values())), 3),
    "pairs_ks_mixed_0": round(np.mean(list(ks_mixed_0.values())), 3),
    # Uses ks_mixed_1 (it previously repeated the ks_mixed_0 value), matching
    # how the tVAE score table is built.
    "pairs_ks_mixed_1": round(np.mean(list(ks_mixed_1.values())), 3),
}


# Parameter/log keys that are copied into the CardiCat result record.
kys = [
    "loss_type",
    "oh_loss_fun",
    "embed_th",
    "epochs",
    "batch_size",
    "learn_rate",
    "recon_factor",
    "emb_regularization",
    "emb_reg_factor",
    "hot_factor",
    "emb_factor",
    "num_factor",
    "mse_factor",
    "dataset_log_name",
    "latent_dim",
    "weighted_loss",
    "emb_loss",
    "date_time",
    "trainable_params",
    "onehot_ind_cat",
    "comment",
    "run_id",
]
# Keys missing from `logs` are simply left out of the record.
CardiCat_output = {k: v for k, v in logs.items() if k in kys}
CardiCat_output["col_tokens_all"] = col_tokens_all
CardiCat_output["emb_sizes"] = emb_sizes
CardiCat_output["numCols"] = numCols
CardiCat_output["catCols"] = catCols
CardiCat_output["ohCols"] = ohCols
# Model label: conditional variants get their own names.
if param_dict["cVAE"]:
    if param_dict["cVAE_mask"]:
        CardiCat_output["model"] = "cCardiCatMask"
    else:
        CardiCat_output["model"] = "cCardiCat"
else:
    CardiCat_output["model"] = param_dict["model"]
CardiCat_output["score_train"] = round(report_cardicat_train, 2)
CardiCat_output["score_rand"] = round(report_cardicat_rand["summary"].Score.mean(), 2)
CardiCat_output["scores"] = CardiCat_scores
# Report tables are stored as plain dicts.
CardiCat_output["marginals"] = report_cardicat_rand["marginals"][
    ["Column", "Quality Score"]
].to_dict()
CardiCat_output["pairs"] = report_cardicat_rand["pairs"][
    ["Column 1", "Column 2", "Metric", "Quality Score"]
].to_dict()
CardiCat_output["cramersV"] = pairs_cardicat.to_dict()
CardiCat_output["mixed"] = mixed_cardicat.to_dict()
CardiCat_output["mixed_ks"] = ks_mixed_raw_stats
CardiCat_output["time"] = CardiCat_time
print(CardiCat_output["score_rand"])


In [None]:
# `display` is injected by IPython in notebooks only. Import it explicitly
# and fall back to print so the file also runs as a plain shell script.
try:
    from IPython.display import display
except ImportError:
    display = print

# One-column table of the CardiCat scores, indexed by score name.
cardicat_scores_df = pd.DataFrame(
    CardiCat_output["scores"].values(),
    index=CardiCat_output["scores"].keys(),
    columns=["score"],
)
display(cardicat_scores_df)
# Bar chart of the same table (rendered as the cell output in notebooks).
px.bar(
    cardicat_scores_df,
    text_auto=".2f",
)

### Results

In [None]:
# Runs with attention enabled are logged under their own model name; this
# overrides the label chosen in the logging cell above.
if param_dict["attention"]:
    CardiCat_output["model"] = "CardiCatAttention"

# CardiCat_output["dataset_log_name"] = "Simulated20k"
# logs["dataset_log_name"] = "Simulated20k"

Writing logs to output folder:

In [None]:
# Pickle the synthetic CardiCat sample (only when file saving is enabled).
if param_dict["save_files"]:
    cardicat_gen_rand.to_pickle(
        os.path.join(
            param_dict["lib_path"],
            "{}/synthetics/synthetics_pkl_{}_{}_{}".format(
                param_dict["output_path"],
                logs["dataset_log_name"],
                logs["date_time"],
                CardiCat_output["model"] + "_rand",
            ),
        )
    )

# One-row frame holding the whole result record.
outlog_cardicat = pd.DataFrame.from_dict(
    {key: [CardiCat_output[key]] for key in CardiCat_output}
)
# NOTE(review): the paths below are built by string concatenation with
# lib_path, whereas the pickle path above uses os.path.join; the two only
# agree if lib_path ends with a path separator — confirm.
if param_dict["save_files"]:
    outlog_cardicat.to_csv(
        param_dict["lib_path"]
        + "{}/outlogs/{}_{}_{}".format(
            param_dict["output_path"],
            logs["date_time"],
            CardiCat_output["model"],
            logs["dataset_log_name"],
        )
    )
    report_cardicat_rand["marginals"].to_csv(
        param_dict["lib_path"]
        + "{}/quality_scores/marginals/{}_{}_{}".format(
            param_dict["output_path"],
            logs["date_time"],
            CardiCat_output["model"],
            logs["dataset_log_name"],
        )
    )
    report_cardicat_rand["pairs"].to_csv(
        param_dict["lib_path"]
        + "{}/quality_scores/pairs/{}_{}_{}".format(
            param_dict["output_path"],
            logs["date_time"],
            CardiCat_output["model"],
            logs["dataset_log_name"],
        )
    )

# NOTE(review): the JSON report is written regardless of
# param_dict["save_files"] — confirm that this is intended.
with open(
    param_dict["lib_path"]
    + "{}/reports/{}_{}_{}.json".format(
        param_dict["output_path"],
        logs["date_time"],
        CardiCat_output["model"],
        logs["dataset_log_name"],
    ),
    "w",
) as outfile:
    json.dump(CardiCat_output, outfile)


## tVAE

In [None]:
# Refresh the timestamp; the tVAE record and output file names below read it.
logs["date_time"] = datetime.now().strftime("%Y%m%d_%H%M")

In [None]:
# Baseline: fit a TVAE with the shared epochs / batch size / latent size.
print("\n### Training tVAE ###")
tVAE_start = process_time()
# NOTE(review): `l2scale` receives the learning rate from the parameter
# file — confirm that this is intended.
# NOTE(review): field types are derived from the label-encoded `train` frame
# while the model is fitted on the un-encoded `dataframe` — confirm.
tvae = TVAE(
    epochs=param_dict["epochs"],
    batch_size=param_dict["batch_size"],
    embedding_dim=param_dict["latent_dim"],
    compress_dims=(128, 128, 128),
    decompress_dims=(128, 128, 128),
    l2scale=param_dict["learn_rate"],
    loss_factor=param_dict["recon_factor"],
    field_types=postprocessing.get_metadata(train),
)


tvae.fit(dataframe)
tVAE_stop = process_time()
# process_time() differences are CPU seconds of this process.
tVAE_time = round(tVAE_stop - tVAE_start, 2)
print("Elapsed time in seconds:", tVAE_time)
# ## Evaluation
# Sample as many synthetic rows as there are held-out real rows.
tvae_gen_rand = tvae.sample(dataframe_test.shape[0])

In [None]:
# Full quality report of the tVAE sample against the held-out real data.
report_tvae = postprocessing.get_report(
    dataframe_test, tvae_gen_rand, param_dict, full=True
)


In [None]:
# Same metric pipeline as for CardiCat, applied to the tVAE sample:
# mixed ANOVA score against the real-data statistics ...
anove_mixed_tvae = reporting.get_mixed_anova(catCols, intCols, floatCols, tvae_gen_rand)
qScore_mixed_tvae, mixed_tvae = reporting.get_qScoreMixed(
    anove_mixed_original, anove_mixed_tvae
)

# ... Cramér's V score of the categorical pairs ...
cramersV_tvae = reporting.get_catCols_cramersV(catCols, tvae_gen_rand)
score_cat_pairs_tvae, pairs_tvae = reporting.get_qScoreMixed(
    cramersV_original, cramersV_tvae
)

# ... and the KS-based mixed statistics.
(
    ks_mixed_0_weighted_tvae,
    ks_mixed_1_weighted_tvae,
    ks_mixed_0_tvae,
    ks_mixed_1_tvae,
    ks_mixed_raw_stats_tvae,
) = reporting.get_mean_ks_mixed_stats(
    tvae_gen_rand,
    dataframe_test,
    data_encoder.catCols,
    data_encoder.intCols,
    data_encoder.floatCols,
)

# Summary scores of the tVAE random sample, rounded for logging.
_tvae_marginals = report_tvae["marginals"]
_tvae_pairs = report_tvae["pairs"]
# Rows of the pairs report where both columns are categorical. The mask is
# combined with the metric filter in a single boolean index so pandas does not
# have to re-align a full-length mask against an already filtered frame.
_tvae_cat_pair_rows = _tvae_pairs["Column 1"].isin(catCols) & _tvae_pairs[
    "Column 2"
].isin(catCols)

tvae_scores = {
    "marginals_all": round(report_tvae["summary"].iloc[0, 1], 2),
    "pairs_all": round(report_tvae["summary"].iloc[1, 1], 2),
    "marginals_KS": round(
        _tvae_marginals[_tvae_marginals.Metric == "KSComplement"][
            "Quality Score"
        ].mean(),
        2,
    ),
    "marginals_TV": round(
        _tvae_marginals[_tvae_marginals.Metric == "TVComplement"][
            "Quality Score"
        ].mean(),
        2,
    ),
    "pairs_corr": round(
        _tvae_pairs[_tvae_pairs.Metric == "CorrelationSimilarity"][
            "Quality Score"
        ].mean(),
        2,
    ),
    "pairs_cont": round(
        _tvae_pairs[_tvae_pairs.Metric == "ContingencySimilarity"][
            "Quality Score"
        ].mean(),
        2,
    ),
    # Contingency similarity restricted to categorical-categorical pairs.
    "pairs_cont_fix": round(
        _tvae_pairs[
            _tvae_cat_pair_rows & (_tvae_pairs.Metric == "ContingencySimilarity")
        ]["Quality Score"].mean(),
        2,
    ),
    "pairs_mixed": round(qScore_mixed_tvae, 2),
    "pairs_cat": round(score_cat_pairs_tvae, 2),
    "pairs_ks_mixed_0_weighted": round(
        np.mean(list(ks_mixed_0_weighted_tvae.values())), 3
    ),
    "pairs_ks_mixed_1_weighted": round(
        np.mean(list(ks_mixed_1_weighted_tvae.values())), 3
    ),
    "pairs_ks_mixed_0": round(np.mean(list(ks_mixed_0_tvae.values())), 3),
    "pairs_ks_mixed_1": round(np.mean(list(ks_mixed_1_tvae.values())), 3),
}

# Parameter/log keys that are copied into the tVAE result record.
kys = [
    "epochs",
    "batch_size",
    "recon_factor",
    "mse_factor",
    "dataset_log_name",
    "latent_dim",
    "date_time",
    "comment",
    "run_id",
]

# Keys missing from `logs` are simply left out of the record.
tVAE_output = {k: v for k, v in logs.items() if k in kys}
tVAE_output["model"] = "tVAE"
tVAE_output["score_rand"] = round(report_tvae["summary"].Score.mean(), 2)
tVAE_output["scores"] = tvae_scores
# Report tables are stored as plain dicts.
tVAE_output["marginals"] = report_tvae["marginals"][
    ["Column", "Quality Score"]
].to_dict()
tVAE_output["pairs"] = report_tvae["pairs"][
    ["Column 1", "Column 2", "Metric", "Quality Score"]
].to_dict()
tVAE_output["cramersV"] = pairs_tvae.to_dict()
tVAE_output["mixed"] = mixed_tvae.to_dict()
tVAE_output["mixed_ks"] = ks_mixed_raw_stats_tvae
tVAE_output["time"] = tVAE_time
# tVAE_output

In [None]:
# Notebook display of the tVAE score table.
tVAE_output["scores"]

In [None]:
# Saving synthetic pickle, and outlogs:
if param_dict["save_files"]:
    tvae_gen_rand.to_pickle(
        os.path.join(
            param_dict["lib_path"],
            "{}/synthetics/synthetics_pkl_{}_{}_{}".format(
                param_dict["output_path"],
                logs["dataset_log_name"],
                logs["date_time"],
                "tVAE" + "_rand",
            ),
        )
    )

# One-row frame holding the whole tVAE result record.
outlog_tvae = pd.DataFrame.from_dict({key: [tVAE_output[key]] for key in tVAE_output})
# NOTE(review): this path is built by string concatenation with lib_path,
# whereas the pickle path above uses os.path.join — confirm that lib_path
# ends with a path separator.
if param_dict["save_files"]:
    outlog_tvae.to_csv(
        param_dict["lib_path"]
        + "{}/outlogs/{}_{}_{}".format(
            param_dict["output_path"],
            logs["date_time"],
            tVAE_output["model"],
            logs["dataset_log_name"],
        )
    )

# NOTE(review): the JSON report is written regardless of
# param_dict["save_files"] — confirm that this is intended.
with open(
    param_dict["lib_path"]
    + "{}/reports/{}_{}_{}.json".format(
        param_dict["output_path"],
        logs["date_time"],
        tVAE_output["model"],
        logs["dataset_log_name"],
    ),
    "w",
) as outfile:
    json.dump(tVAE_output, outfile)


## tGAN

In [None]:
# Train the CTGAN baseline ("tGAN") and time the construction + fit.
print("\n### Training tGAN ###")
# NOTE: process_time() reports CPU time (user + system) of this process, not
# wall-clock time, so the "Elapsed time" printed below excludes any time the
# process spends sleeping or waiting.
tGAN_start = process_time()
tGAN = CTGAN(
    epochs=param_dict["epochs"],
    batch_size=param_dict["batch_size"],
    embedding_dim=param_dict["latent_dim"],
    generator_dim=(128, 128, 128),
    discriminator_dim=(128, 128, 128),
    verbose=True,
    field_types=postprocessing.get_metadata(train),
    # Generator and discriminator share the same learning rate.
    generator_lr=param_dict["learn_rate"],
    discriminator_lr=param_dict["learn_rate"],
)


# NOTE(review): field metadata is derived from `train`, while the model is fit
# on `dataframe` -- confirm both carry the same columns.
tGAN.fit(dataframe)
tGAN_stop = process_time()
tGAN_time = round(tGAN_stop - tGAN_start, 2)
print("Elapsed time in seconds:", tGAN_time)
# Evaluation: draw as many synthetic rows as there are rows in the test frame.
tGAN_gen_rand = tGAN.sample(dataframe_test.shape[0])

In [None]:
# Quality report for the tGAN sample against `dataframe_test`. The result is
# indexed below by "summary", "marginals" and "pairs".
report_tgan = postprocessing.get_report(
    dataframe_test, tGAN_gen_rand, param_dict, full=True
)

In [None]:
# Mixed-pair statistics of the tGAN sample, scored against the statistics
# computed for the original data (`anove_mixed_original`).
anove_mixed_tgan = reporting.get_mixed_anova(catCols, intCols, floatCols, tGAN_gen_rand)
qScore_mixed_tgan, mixed_tgan = reporting.get_qScoreMixed(
    anove_mixed_original, anove_mixed_tgan
)

# Categorical-pair (Cramer's V) statistics of the tGAN sample, scored against
# those of the original data (`cramersV_original`).
cramersV_tgan = reporting.get_catCols_cramersV(catCols, tGAN_gen_rand)
score_cat_pairs_tgan, pairs_tgan = reporting.get_qScoreMixed(
    cramersV_original, cramersV_tgan
)

# KS-based mixed-pair statistics between the tGAN sample and the test data.
# The first four results are dicts (their values are averaged below); the
# fifth holds the raw statistics that are stored in the output record.
(
    ks_mixed_0_weighted_tgan,
    ks_mixed_1_weighted_tgan,
    ks_mixed_0_tgan,
    ks_mixed_1_tgan,
    ks_mixed_raw_stats_tgan,
) = reporting.get_mean_ks_mixed_stats(
    tGAN_gen_rand,
    dataframe_test,
    data_encoder.catCols,
    data_encoder.intCols,
    data_encoder.floatCols,
)

# Scalar score summary for tGAN. Every entry must be derived from the tGAN
# report/statistics only, so the record describes this model and no other.
tgan_scores = {
    # Rows 0 and 1 of the report summary, second column
    # (presumably the marginal and pair scores -- confirm against get_report).
    "marginals_all": round(report_tgan["summary"].iloc[0, 1], 2),
    "pairs_all": round(report_tgan["summary"].iloc[1, 1], 2),
    # Mean quality score per metric type.
    "marginals_KS": round(
        report_tgan["marginals"][report_tgan["marginals"].Metric == "KSComplement"][
            "Quality Score"
        ].mean(),
        2,
    ),
    "marginals_TV": round(
        report_tgan["marginals"][report_tgan["marginals"].Metric == "TVComplement"][
            "Quality Score"
        ].mean(),
        2,
    ),
    "pairs_corr": round(
        report_tgan["pairs"][report_tgan["pairs"].Metric == "CorrelationSimilarity"][
            "Quality Score"
        ].mean(),
        2,
    ),
    "pairs_cont": round(
        report_tgan["pairs"][report_tgan["pairs"].Metric == "ContingencySimilarity"][
            "Quality Score"
        ].mean(),
        2,
    ),
    # Contingency similarity restricted to pairs where BOTH columns are
    # categorical. Computed from the tGAN report, with all three conditions
    # combined into one mask so pandas does not have to re-align a
    # full-length boolean Series against an already-filtered frame.
    "pairs_cont_fix": round(
        report_tgan["pairs"][
            (report_tgan["pairs"]["Column 1"].isin(catCols))
            & (report_tgan["pairs"]["Column 2"].isin(catCols))
            & (report_tgan["pairs"].Metric == "ContingencySimilarity")
        ]["Quality Score"].mean(),
        2,
    ),
    "pairs_mixed": round(qScore_mixed_tgan, 2),
    "pairs_cat": round(score_cat_pairs_tgan, 2),
    # Averages over the values of the KS mixed-pair dicts.
    "pairs_ks_mixed_0_weighted": round(
        np.mean(list(ks_mixed_0_weighted_tgan.values())), 3
    ),
    "pairs_ks_mixed_1_weighted": round(
        np.mean(list(ks_mixed_1_weighted_tgan.values())), 3
    ),
    "pairs_ks_mixed_0": round(np.mean(list(ks_mixed_0_tgan.values())), 3),
    "pairs_ks_mixed_1": round(np.mean(list(ks_mixed_1_tgan.values())), 3),
}

# Run-level settings that are copied from `logs` into the model's output record.
kys = [
    "epochs",
    "batch_size",
    "recon_factor",
    "mse_factor",
    "dataset_log_name",
    "latent_dim",
    "date_time",
    "comment",
    "run_id",
]

# Start from the shared run settings, then append the tGAN-specific results.
# Insertion order is preserved on purpose: it is the key order of the record
# that is later turned into a one-row DataFrame and dumped to JSON.
tGAN_output = {name: logs[name] for name in logs if name in kys}
tGAN_output.update(
    {
        "model": "tGAN",
        # Mean of the `Score` column of the report summary.
        "score_rand": round(report_tgan["summary"].Score.mean(), 2),
        "scores": tgan_scores,
        # Per-column and per-pair quality scores, stored as plain dicts.
        "marginals": report_tgan["marginals"][["Column", "Quality Score"]].to_dict(),
        "pairs": report_tgan["pairs"][
            ["Column 1", "Column 2", "Metric", "Quality Score"]
        ].to_dict(),
        "cramersV": pairs_tgan.to_dict(),
        "mixed": mixed_tgan.to_dict(),
        "mixed_ks": ks_mixed_raw_stats_tgan,
        # Training time measured around the tGAN fit.
        "time": tGAN_time,
    }
)

In [None]:
# Notebook display of the tGAN score summary; as a bare expression this has
# no effect when the file is executed as a plain script.
tGAN_output["scores"]

In [None]:
# One-row frame: every entry of the tGAN record becomes a single-valued column.
outlog_tgan = pd.DataFrame.from_dict(
    {field: [value] for field, value in tGAN_output.items()}
)

# Filename components shared by the outlog (CSV) and the report (JSON).
_tgan_name_parts = (
    param_dict["output_path"],
    logs["date_time"],
    tGAN_output["model"],
    logs["dataset_log_name"],
)

# The synthetic sample (pickle) and the outlog (CSV) are optional artifacts,
# written only when `save_files` is set.
if param_dict["save_files"]:
    tGAN_gen_rand.to_pickle(
        os.path.join(
            param_dict["lib_path"],
            "{}/synthetics/synthetics_pkl_{}_{}_{}".format(
                param_dict["output_path"],
                logs["dataset_log_name"],
                logs["date_time"],
                "tGAN" + "_rand",
            ),
        )
    )
    outlog_tgan.to_csv(
        param_dict["lib_path"] + "{}/outlogs/{}_{}_{}".format(*_tgan_name_parts)
    )

# The JSON report is written regardless of the `save_files` flag.
with open(
    param_dict["lib_path"] + "{}/reports/{}_{}_{}.json".format(*_tgan_name_parts),
    "w",
) as report_file:
    json.dump(tGAN_output, report_file)

### END