# Evaluation of Tabular Generators

In [None]:
# Delete the local `workspace` directory before anything else runs;
# sometimes we have to purge the workspace to avoid errors.
# NOTE(review): presumably this is the cache directory synthcity writes to - confirm.
!rm -rf workspace

In [None]:
# stdlib
import sys
import warnings

warnings.filterwarnings("ignore")

from datetime import datetime, timedelta

import numpy as np
import pandas as pd

# synthcity absolute
import synthcity.logger as log
from synthcity import metrics
from synthcity.plugins import Plugins
from synthcity.plugins.core.dataloader import GenericDataLoader
from synthcity.benchmark import Benchmarks
from synthcity.utils.serialization import load, load_from_file, save, save_to_file


# Send synthcity's log messages to stderr, using the INFO level.
log.add(sink=sys.stderr, level="INFO")

In [None]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from synthcity.metrics.plots import plot_marginal_comparison, plot_tsne

## Inputs

In [None]:
# `peaks`, `days` and `generator` are only used further down to build the
# names of the input files.
peaks = 1
n_iter = 10 # WARNING change this
# It should be 4000 but it takes 35 minutes on a GPU.
# NOTE(review): presumably this remark is about `n_iter` above (`num_seq` is
# already 4000) - confirm. Neither `n_iter` nor `num_seq` is referenced by any
# later cell of this notebook.
num_seq = 4000
days = 1
# Real data directory.
# NOTE(review): `data_dir` is not referenced by any later cell of this notebook.
data_dir = "../"
# Whether to also draw the t-SNE plot in the "Plots" section.
generate_tsne = False
generator = "adsgan"

In [None]:
# CSV file holding the real data.
rd_filename = f"real_data_synthcity_{days}_days_{peaks}_peaks_tabular.csv"
# CSV file holding the synthetic data to evaluate. Unlike the model file name,
# this one does not contain `generator`.
sd_filename = f"synthetic_data_synthcity_{days}_days_{peaks}_peaks_tabular_opt.csv"
# Serialized model of the selected generator.
model_filename = f"model_{generator}_synthcity_{days}_days_{peaks}_peaks_tabular.pkl"
# NOTE(review): `data_dir` is not prepended to any of these names, so the files
# are resolved relative to the current working directory - confirm this is intended.

### Load real data and instantiate the dataloader

In [None]:
# Real data; the first CSV column becomes the index.
real_data = pd.read_csv(rd_filename, index_col=0)

In [None]:
real_data.head(3)

In [None]:
# Synthetic data, read the same way as the real data.
synthetic_data = pd.read_csv(sd_filename, index_col=0)

In [None]:
synthetic_data.head(3)

In [None]:
# Wrap the real data in a synthcity data loader.
# NOTE(review): `loader` is not referenced by any later cell of this notebook.
loader = GenericDataLoader(real_data)

### Load the saved model

In [None]:
# NOTE(review): this deserializes a `.pkl` file (presumably pickle-based) -
# only load model files that come from a trusted source.
# `syn_model` is not referenced by any later cell of this notebook.
syn_model = load_from_file(model_filename)

In [None]:
# Histogram of the ":dcv_type" column in the real data ...
real_data[":dcv_type"].hist()

In [None]:
# ... and in the synthetic data, for a visual comparison.
synthetic_data[":dcv_type"].hist()

## Evaluation


In [None]:
# Metric groups to evaluate, keyed by group name. Each entry is passed on its
# own to `metrics.Metrics.evaluate` in the next cell.
metrics_dict = {
    'sanity': ['data_mismatch', 'common_rows_proportion', 'nearest_syn_neighbor_distance', 'close_values_probability', 'distant_values_probability'],
    'stats': ['jensenshannon_dist', 'chi_squared_test', 'feature_corr', 'inv_kl_divergence', 'ks_test', 'max_mean_discrepancy', 'wasserstein_dist', 'prdc', 'alpha_precision'],
    # The two groups below are disabled: these do not make sense here.
    #'performance': ['linear_model', 'mlp', 'xgb', 'feat_rank_distance'],
    #'detection': ['detection_xgb', 'detection_mlp', 'detection_gmm', 'detection_linear'],
    # The privacy group takes a long time to run.
    'privacy': ['delta-presence', 'k-anonymization', 'k-map', 'distinct l-diversity', 'identifiability_score']
}

In [None]:
%%time
scores = []
for k, v in metrics_dict.items():
    print(f"{k} metrics")
    score = metrics.Metrics.evaluate(real_data, synthetic_data, metrics={k: v})
    scores.append(score)
    #print(score)

### Synthcity can compare several batches of generated synthetic data (SD) and report summary statistics of each metric across those batches. Here we only generate metrics for a single batch.

Thus, in each table of the `scores` list, the `mean` column has a different meaning: it simply holds the value of the metric. All other columns can be discarded.

In [None]:
# `scores` follows the order of `metrics_dict`: index 0 holds the 'sanity' group.
scores[0][['mean']]

In [None]:
# Index 1: the 'stats' group.
scores[1][['mean']]

In [None]:
# Index 2: the 'privacy' group.
scores[2][['mean']]

## Plots

- Distributions
- tSNE
- Peaks and valleys

In [None]:
#real_data.dtypes

In [None]:
# Work on a copy so that the string casts applied to `static_data` further
# down leave `real_data` itself untouched.
static_data = real_data.copy()

In [None]:
# Both frames must expose the same columns with the same dtypes.
# `Series.equals` returns False when the column labels differ, where an
# element-wise `==` would raise a ValueError instead. A real exception class is
# raised: `raise("...")` with a bare string is itself an error in Python 3 and
# would hide the message.
if not static_data.dtypes.equals(synthetic_data.dtypes):
    raise TypeError("Types mismatch")

In [None]:
%%time
for col, dt in static_data.dtypes.items():
    if dt == "float64" or dt == "int64":
        if len(static_data[col].unique()) < 30:
            static_data[col] = static_data[[col]].astype(str)

In [None]:
%%time
for col, dt in synthetic_data.dtypes.items():
    if dt == "float64" or dt == "int64":
        if len(synthetic_data[col].unique()) < 30:
            synthetic_data[col] = synthetic_data[[col]].astype(str)

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
def convert_to_gdloader(df, ct=None):
    """Encode ``df`` numerically and wrap it in a ``GenericDataLoader``.

    ``object`` and ``string`` columns are ordinal-encoded, ``float64`` columns
    are passed through unchanged, and the original column order of ``df`` is
    restored afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode.
    ct : ColumnTransformer, optional
        Transformer returned by an earlier call. When given, it is applied
        as-is and NOT re-fitted, so a category receives the same code in every
        frame encoded with it. When omitted, a new transformer is built and
        fitted on ``df``.

    Returns
    -------
    tuple
        ``(loader, ct)``: the data loader over the encoded frame and the
        (fitted) transformer that produced it.

    Notes
    -----
    NOTE(review): columns of any other dtype (e.g. ``int64``) are not selected
    by the transformer, so restoring the original column order would fail with
    a ``KeyError`` - presumably the inputs only hold object/string/float64
    columns; confirm.
    """
    needs_fit = ct is None
    if needs_fit:
        # Categories that only show up in a frame encoded later are mapped to
        # -1 instead of raising, because a supplied transformer is reused
        # rather than re-fitted.
        ct = make_column_transformer(
            (
                OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
                make_column_selector(dtype_include="object"),
            ),
            (
                OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
                make_column_selector(dtype_include="string"),
            ),
            ("passthrough", make_column_selector(dtype_include="float64")),
        )

    # The transformer emits its outputs grouped per step (object, string,
    # float64); rebuild the matching labels so the columns can be reordered.
    column_order = (
        list(df.select_dtypes(include=["object"]).columns)
        + list(df.select_dtypes(include=["string"]).columns)
        + list(df.select_dtypes(include=["float64"]).columns)
    )

    # Fit only a freshly created transformer; re-fitting a supplied one would
    # give this frame its own category codes, unrelated to the first frame's.
    encoded = ct.fit_transform(df) if needs_fit else ct.transform(df)
    tr_df = pd.DataFrame(encoded, index=df.index, columns=column_order)[df.columns]

    loader = GenericDataLoader(tr_df)

    return loader, ct

In [None]:
# Encode the real data; `ct` is the column transformer created for it.
rd_loader, ct = convert_to_gdloader(static_data)

In [None]:
len(rd_loader.static_features)

In [None]:
# Encode the synthetic data, handing over the transformer obtained above.
sd_loader, _ = convert_to_gdloader(synthetic_data, ct)

In [None]:
len(static_data.columns)

In [None]:
# Shape of the encoded synthetic data, as a quick sanity check.
ct.transform(synthetic_data).shape

In [None]:
# Re-check that both frames still share the same columns and dtypes.
# `Series.equals` returns False when the column labels differ, where an
# element-wise `==` would raise a ValueError instead. A real exception class is
# raised: `raise("...")` with a bare string is itself an error in Python 3 and
# would hide the message.
if not static_data.dtypes.equals(synthetic_data.dtypes):
    raise TypeError("Types mismatch")

In [None]:
%%time
# Marginal comparison plot of the real versus the synthetic loader.
plot_marginal_comparison(plt, rd_loader, sd_loader)

In [None]:
%%time
# The t-SNE plot is only drawn when `generate_tsne` is enabled in the inputs.
if generate_tsne:
    plot_tsne(plt, rd_loader, sd_loader)

In [None]:
static_data.head(3)

In [None]:
# Columns of the real data used for the peaks-and-valleys comparison; the
# `t*` columns are parsed as timestamps in the next cell.
peak_df = static_data[["temax_0", "emax_0", "temin_0", "emin_0", "tgmax_0", "gmax_0", "tgmin_0", "gmin_0"]].copy()

In [None]:
# Replace every timestamp in the time columns by its hour component.
for time_col in ("temax_0", "temin_0", "tgmax_0", "tgmin_0"):
    peak_df[time_col] = peak_df[time_col].map(lambda stamp: pd.to_datetime(stamp).hour)

In [None]:
# Count, per time column, how many rows fall on each hour value. The hour
# value is used as the row position in `rt`; hours that never occur stay 0.
rt = np.zeros((24, 4))
for col_pos, time_col in enumerate(("temax_0", "temin_0", "tgmax_0", "tgmin_0")):
    hour_counts = peak_df[[time_col]].groupby(time_col).size()
    rt[hour_counts.index.to_numpy(), col_pos] = hour_counts.to_numpy()
rdf_ = pd.DataFrame(rt, columns=["emax_rd", "emin_rd", "gmax_rd", "gmin_rd"])

In [None]:
# Same column subset as for the real data, taken from the synthetic data.
sd_peak_df = synthetic_data[["temax_0", "emax_0", "temin_0", "emin_0", "tgmax_0", "gmax_0", "tgmin_0", "gmin_0"]].copy()

In [None]:
sd_peak_df

In [None]:
# Replace every timestamp in the time columns by its hour component.
for time_col in ("temax_0", "temin_0", "tgmax_0", "tgmin_0"):
    sd_peak_df[time_col] = sd_peak_df[time_col].map(lambda stamp: pd.to_datetime(stamp).hour)

In [None]:
# Count, per time column, how many rows fall on each hour value. The hour
# value is used as the row position in `st`; hours that never occur stay 0.
st = np.zeros((24, 4))
for col_pos, time_col in enumerate(("temax_0", "temin_0", "tgmax_0", "tgmin_0")):
    hour_counts = sd_peak_df[[time_col]].groupby(time_col).size()
    st[hour_counts.index.to_numpy(), col_pos] = hour_counts.to_numpy()
sdf_ = pd.DataFrame(st, columns=["emax_sd", "emin_sd", "gmax_sd", "gmin_sd"])

In [None]:
# Put the real (`*_rd`) and synthetic (`*_sd`) hourly counts side by side.
df = pd.concat([rdf_, sdf_], axis=1)

In [None]:
df.head()

In [None]:
# Column totals.
df.sum()

In [None]:
# Label the 24 rows "1" .. "24" and use those labels as the index.
# NOTE(review): the rows were filled by hour value (0-23), so label "1" sits on
# the counts of hour 0 - confirm that this 1-based labelling is intended.
df["hour"] = list(map(str, range(1, 25)))
df = df.set_index("hour")

In [None]:
df

In [None]:
# Long format for plotting: one row per (hour, series) pair of the "e" columns.
edf = (
    df[["emax_rd", "emax_sd", "emin_rd", "emin_sd"]]
    .reset_index()
    .melt(id_vars="hour")
    .rename(columns={"hour": "hour of the day", "value": "counts", "variable": "legend"})
)

In [None]:
fig, ax = plt.subplots(1, figsize=(14, 8))
sns.barplot(
    data=edf,
    x="hour of the day",
    y="counts",
    hue="legend",
    fill=False,
    width=1,
    ax=ax,
)
# Minor ticks at 0.5, 1.5, ..., 22.5 carry the grid lines.
minor_ticks = 0.5 + np.arange(23)
ax.set_xticks(minor_ticks, minor=True)
ax.grid(which="minor")
#ax.axvline(0.5, color='k')

In [None]:
# Long format for plotting: one row per (hour, series) pair of the "g" columns.
gdf = (
    df[["gmax_rd", "gmax_sd", "gmin_rd", "gmin_sd"]]
    .reset_index()
    .melt(id_vars="hour")
    .rename(columns={"hour": "hour of the day", "value": "counts", "variable": "legend"})
)

In [None]:
fig, ax = plt.subplots(1, figsize=(14, 8))
sns.barplot(
    data=gdf,
    x="hour of the day",
    y="counts",
    hue="legend",
    fill=False,
    width=1,
    ax=ax,
)
# Minor ticks at 0.5, 1.5, ..., 22.5 carry the grid lines.
minor_ticks = 0.5 + np.arange(23)
ax.set_xticks(minor_ticks, minor=True)
ax.grid(which="minor")

In [None]:
def abs_error(rdv, sdv):
    """Return the element-wise absolute difference between ``rdv`` and ``sdv``."""
    difference = np.subtract(rdv, sdv)
    return np.abs(difference)

In [None]:
# Separate copy, so that the error columns added next do not end up in `df`.
peaks_comparison_df = df.copy()

In [None]:
# One absolute-error column per series: |real counts - synthetic counts|.
for prefix in ("emax", "emin", "gmax", "gmin"):
    peaks_comparison_df[f"{prefix}_abs_err"] = abs_error(
        peaks_comparison_df[f"{prefix}_rd"],
        peaks_comparison_df[f"{prefix}_sd"],
    )

In [None]:
# Keep only the absolute-error columns; the raw count columns are dropped.
peaks_comparison_df = peaks_comparison_df[["emax_abs_err", "emin_abs_err", "gmax_abs_err", "gmin_abs_err"]].copy()

In [None]:
peaks_comparison_df

In [None]:
# Long format for plotting: one row per (hour, error series) pair.
pdf = (
    peaks_comparison_df
    .reset_index()
    .melt(id_vars="hour")
    .rename(columns={"hour": "hour of the day", "value": "absolute error", "variable": "legend"})
)

In [None]:
fig, ax = plt.subplots(1, figsize=(14, 8))
sns.barplot(
    data=pdf,
    x="hour of the day",
    y="absolute error",
    hue="legend",
    fill=False,
    width=1,
    ax=ax,
)
# Despite the variable's name, these positions (0.5, 1.5, ..., 22.5) are set
# as MINOR ticks; they carry the grid lines.
major_ticks = 0.5 + np.arange(23)
ax.set_xticks(major_ticks, minor=True)
ax.grid(which="minor")

In [None]:
# Total absolute error per series, summed over all 24 rows.
peaks_comparison_df.sum()

# Done!!!

In [None]:
!date