# Synthcity tabular models training

#### For static data

We use synthcity `Benchmarks` to select the two best models and generate synthetic data (SD).

In [None]:
# Delete the local "workspace" directory before running; stale contents there
# sometimes cause errors (presumably left behind by earlier synthcity runs).
!rm -rf workspace

In [None]:
# stdlib
import sys
import warnings

# Silence all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

from datetime import datetime, timedelta

# third party
import numpy as np
import pandas as pd

# synthcity absolute
import synthcity.logger as log
from synthcity.plugins import Plugins
from synthcity.plugins.core.dataloader import GenericDataLoader
from synthcity.benchmark import Benchmarks
from synthcity.utils.serialization import load, load_from_file, save, save_to_file

# Send synthcity log output at level INFO to stderr.
log.add(sink=sys.stderr, level="INFO")

### Inputs
- Number of days (`days`) or an explicit data file name (`datafilename`)
- Epochs
- Peaks (number of max and min values of the time series)
- Size of sampled synthetic data
- Real data file name
- Generators

In [None]:
# Number of days; used to build the default input file name and the output file names.
days = 1
# Path prefix of the default real-data CSV.
data_dir = "../"
# NOTE(review): `epochs` is not referenced by any later cell shown here
# (the n_iter overrides below are hard-coded) — confirm whether it should be used.
epochs = 128
# Number of max/min pairs extracted from each time series by pick_peaks.
peaks = 1
# Size of the sampled synthetic data; when left as None it defaults to the
# number of unique datapoint_id values once the data is loaded.
sample_size = None
# Explicit CSV path; when set it is read instead of the days-based default file.
datafilename = None

In [None]:
# Names of the models to benchmark; uncomment an entry to include it in the test.
# The order of this list is also used as the column order of the results table.
generators = ['marginal_distributions',
              # 'aim',
              # 'bayesian_network',
              # 'privbayes',
              'adsgan',
              # 'pategan',
              'ctgan',
              # 'tvae',
              # 'rtvae',
              # 'nflow',
              # 'goggle',
              'ddpm',
              # 'arfpy',
              # 'great',
              # 'dpgan'
             ]

### Read real data

In [None]:
# An explicit file name wins; otherwise fall back to the days-based default path.
real_data = pd.read_csv(
    datafilename or f"{data_dir}real_data_sdv_{days}_days.csv",
    index_col=0,
)

# When no sample size was given, use the number of distinct datapoints.
sample_size = sample_size or len(real_data["datapoint_id"].unique())

In [None]:
# Display the loaded real data.
real_data

### Add peaks and valleys

Find the maximum and minimum values of each time series and add them, together with their timestamps, to the static columns; the rest of the time series data is discarded.

In [None]:
def _collect_extremes(grouped, value_col, prefix, count, flat_fill=None):
    """Collect the ``count`` largest and smallest samples of every time series.

    Parameters
    ----------
    grouped : pandas DataFrameGroupBy
        One group per time series; each group must contain the columns
        "Timestamp" and ``value_col``.
    value_col : str
        Name of the column holding the measurements.
    prefix : str
        Tag used to build the output column names ``t<prefix>max_<i>``,
        ``<prefix>max_<i>``, ``t<prefix>min_<i>`` and ``<prefix>min_<i>``.
    count : int
        Number of max/min pairs to extract per series.
    flat_fill : tuple, optional
        ``(timestamp, value)`` placeholder written for series that have at
        most two distinct values. When ``None`` every series is searched.

    Returns
    -------
    dict
        Column name -> list with one entry per group, in group iteration order.

    Raises
    ------
    ValueError
        If a searched series runs out of samples before ``count`` pairs
        were extracted.
    """
    # Insertion order defines the column order of the resulting dataframe;
    # change it here to adjust for a particular order.
    columns = {}
    for i in range(count):
        columns[f"t{prefix}max_{i}"] = []
        columns[f"{prefix}max_{i}"] = []
        columns[f"t{prefix}min_{i}"] = []
        columns[f"{prefix}min_{i}"] = []

    # Iterating the groupby object keeps the first-appearance order requested
    # with sort=False, which is the order of the static rows.
    for key, group in grouped:
        series = group[value_col]

        if flat_fill is not None and len(series.unique()) <= 2:
            fill_ts, fill_value = flat_fill
            for i in range(count):
                columns[f"t{prefix}max_{i}"].append(fill_ts)
                columns[f"{prefix}max_{i}"].append(fill_value)
                columns[f"t{prefix}min_{i}"].append(fill_ts)
                columns[f"{prefix}min_{i}"].append(fill_value)
            continue

        for i in range(count):
            if series.empty:
                raise ValueError(
                    f"Time series {key!r} has too few '{value_col}' samples "
                    f"to pick {count} peaks"
                )
            idx_max, idx_min = series.idxmax(), series.idxmin()
            columns[f"t{prefix}max_{i}"].append(group.loc[idx_max, "Timestamp"])
            columns[f"{prefix}max_{i}"].append(series.max())
            columns[f"t{prefix}min_{i}"].append(group.loc[idx_min, "Timestamp"])
            columns[f"{prefix}min_{i}"].append(series.min())
            # Max and min share one label on a flat series or when a single
            # sample is left; dropping that label twice would raise KeyError.
            used = [idx_max] if idx_max == idx_min else [idx_max, idx_min]
            series = series.drop(index=used)

    return columns


def pick_peaks(df: pd.DataFrame, count: int = 1) -> pd.DataFrame:
    """Select max and min values for each time series and add them to a dataframe along with the timestamp.

    The input must contain the columns "datapoint_id", "Timestamp",
    "energy_elec" and "energy_gas"; every other column is treated as a static
    feature. A progress line with the input/output column counts is printed.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe with static and time series values.
    count : int
        The number of max and min values to grab (must be >= 1).

    Returns
    -------
    pd.DataFrame
        One row per datapoint_id: the first static column, then the
        electricity and gas min/max columns with their timestamps, then the
        remaining static columns.

    Raises
    ------
    ValueError
        If ``count`` is smaller than 1, if the static features are not unique
        per datapoint_id, if a time series has too few samples, or if one of
        the consistency checks on the result fails.
    """
    if count < 1:
        raise ValueError("count must be a positive integer")

    # get static features
    static_df = (
        df.drop(columns=["Timestamp", "energy_elec", "energy_gas"])
        .drop_duplicates()
        .reset_index(drop=True)
    )

    # get timeseries for each utility
    elec_groups = df[["datapoint_id", "Timestamp", "energy_elec"]].groupby("datapoint_id", sort=False)
    gas_groups = df[["datapoint_id", "Timestamp", "energy_gas"]].groupby("datapoint_id", sort=False)

    # The concat below aligns rows by position, which is only correct when
    # there is exactly one static row per time series.
    if len(static_df) != elec_groups.ngroups:
        raise ValueError("Static features are not unique per datapoint_id")

    # create a new dataframe with mins and maxes and their timestamp
    edfp = pd.DataFrame(_collect_extremes(elec_groups, "energy_elec", "e", count))

    # Note 1: dont look for minmax if timeseries is flat or zero
    # often there is no gas measurements
    # Note 2: this doesnt handle all cases, this assumes that if there is a gas measurement
    # then there are measurements for each hour.
    # Note 3: mins and maxes for gas contributes to sparsity given that a lot of
    # buildings do not have gas consumption.
    gdfp = pd.DataFrame(
        _collect_extremes(
            gas_groups,
            "energy_gas",
            "g",
            count,
            flat_fill=("2000-01-01 00:00:00", 0),
        )
    )

    # concatenate static_df with min and maxes dataframes
    df_ = pd.concat([static_df, edfp, gdfp], axis=1)

    # Quick validation, the static emax_0 values should be the same as computing
    # the max of each time series. Compare row by row: de-duplicating the
    # values first breaks as soon as two series share the same maximum.
    # TODO: Validation needs to be done for all cases.
    expected_max = df.groupby("datapoint_id", sort=False)["energy_elec"].max().to_numpy()
    if not np.allclose(df_["emax_0"].to_numpy(), expected_max):
        raise ValueError("Max of time series mismatch")

    # let's reorder the columns
    cols = list(df_.columns)
    lencs = len(cols)
    print(f"Columns in {len(df.columns)}, columns out {lencs}")
    # 3 time series columns are removed and 8 columns are added per peak
    if lencs != len(df.columns) + 8*count - 3:
        raise ValueError("Input / output columns mismatch")

    return df_[cols[0:1]+cols[-8*count:]+cols[1:lencs-8*count]]


In [None]:
%%time
# Replace the time series rows with one row per datapoint (static features plus
# `peaks` max/min pairs). This rebinds real_data: pick_peaks drops the
# time series columns, so re-running this cell requires reloading the data first.
real_data = pick_peaks(real_data, peaks)

In [None]:
# Display the tabular result returned by pick_peaks.
real_data

In [None]:
# Save the tabular dataset; the relative path resolves against the current working directory.
real_data.to_csv(f"real_data_synthcity_{days}_days_{peaks}_peaks_tabular.csv")

## Model benchmarking and selection

In [None]:
# Wrap the tabular dataframe in a synthcity data loader; it is passed to the
# benchmark and to the model fit below.
loader = GenericDataLoader(real_data)

In [None]:
# Metrics passed to the benchmark, grouped by category. Only the 'stats' group
# is enabled; uncomment a group to include it. We may want to select the model
# using stats metrics first.
metrics = {
    #'sanity': ['data_mismatch', 'common_rows_proportion', 'nearest_syn_neighbor_distance', 'close_values_probability', 'distant_values_probability'],
    'stats': ['jensenshannon_dist', 'chi_squared_test', 'feature_corr', 'inv_kl_divergence', 'ks_test', 'max_mean_discrepancy', 'wasserstein_dist', 'prdc', 'alpha_precision'],
    #'performance': ['linear_model', 'mlp', 'xgb', 'feat_rank_distance'],
    #'detection': ['detection_xgb', 'detection_mlp', 'detection_gmm', 'detection_linear'],
    #'privacy': ['delta-presence', 'k-anonymization', 'k-map', 'distinct l-diversity', 'identifiability_score', 'DomiasMIA_BNAF', 'DomiasMIA_KDE', 'DomiasMIA_prior']
}

In [None]:
# One separate, initially empty keyword-argument dict per selected generator.
plugin_kwargs = {model:{} for model in generators}

In [None]:
# WARNING: for testing, n_iter is set to 100 for each of these models; comment
# these lines out for a full test using the default parameters.
# NOTE(review): these keys are set even when the model is commented out of
# `generators`, and models not listed here keep an empty dict (no 'n_iter').
plugin_kwargs['ctgan'] = {"n_iter": 100}
plugin_kwargs['adsgan'] = {"n_iter": 100}
plugin_kwargs['ddpm'] = {"n_iter": 100}
plugin_kwargs

In [None]:
%%time
# Benchmark every selected generator on the tabular data. Each test entry
# carries the model name twice plus its keyword arguments; the synthetic set
# has as many rows as the real one and each model is evaluated once.
score = Benchmarks.evaluate(
    [(model, model, plugin_kwargs[model]) for model in generators],
    loader,
    synthetic_size=len(real_data),
    repeats=1,
    metrics=metrics,
    use_metric_cache=False
)

In [None]:
# Print the benchmark results.
Benchmarks.print(score)

In [None]:
# Take the "mean" entry of every generator's score, in `generators` order.
means = [score[model]["mean"] for model in generators]

In [None]:
# Put the per-generator means side by side and label each column with its generator.
results = pd.concat(means, axis="columns")
results.columns = list(generators)
results

In [None]:
# WARNING: the Jensen-Shannon distance is used to select the best model (this
# criterion can be changed); the minimum value is the best.
# The selection is positional: it assumes jensenshannon is the first row of
# `results` — TODO confirm against the label displayed by this cell.
metric = results.index[0]
metric

In [None]:
# Rank the generators on the first metric row, lowest value first.
results.iloc[0].sort_values(ascending=True)

In [None]:
# Generator with the lowest value on the first metric row.
best_generator = results.iloc[0].sort_values(ascending=True).index[0]
best_generator

In [None]:
# The two generators with the lowest values on the first metric row
# (fewer than two if fewer generators were benchmarked).
best_two = results.iloc[0].sort_values(ascending=True).index[0:2].to_list()
best_two

## Retrain models and sample SD

In [None]:
%%time
for model in best_two:
    syn_model = Plugins().get(model)
    print(f"Fit and sampling {model}")
    # FIXME forcing plugins to use previous n_iter 
    syn_model.n_iter = plugin_kwargs[model]['n_iter']
    syn_model.fit(loader)#, plugin_kwargs[model])
    save_to_file(f"model_{model}_synthcity_{days}_days_{peaks}_peaks_tabular.pkl", syn_model)
    synthetic_data = syn_model.generate().dataframe()
    print(synthetic_data.head(5))
    synthetic_data.to_csv(f"synthetic_data_synthcity_{days}_days_{peaks}_peaks_tabular.csv")
    

## Done!