# Evaluation TimeGAN

In [None]:
# Delete the local `workspace` directory before running; stale contents
# sometimes cause errors (presumably cached synthcity artifacts -- TODO confirm).
!rm -rf workspace

In [None]:
# stdlib
import sys
import pickle
import random
import warnings

warnings.filterwarnings("ignore")

from datetime import datetime, timedelta

import numpy as np
import pandas as pd

# synthcity absolute
import synthcity.logger as log
from synthcity.plugins import Plugins
from synthcity.plugins.core.dataloader import TimeSeriesDataLoader
from synthcity import metrics
from synthcity.benchmark import Benchmarks
from synthcity.utils.serialization import load, load_from_file, save, save_to_file

log.add(sink=sys.stderr, level="INFO")

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import seaborn as sns
from synthcity.metrics.plots import plot_marginal_comparison, plot_tsne

In [None]:
from scipy.spatial import distance
from scipy.stats import ks_2samp

In [None]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

In [None]:
from IPython.utils import io

## Load SD generator & sample or load SD

In [None]:
# False: load previously generated synthetic data (SD) from CSV.
# True: load the saved model, sample new SD and write it to CSV.
generate = False
# Identifiers used to build the model / SD file names
generator = "timegan"
mode = "GRU"
n_iter = 100
# Number of sequences requested from the generator when `generate` is True.
# NOTE(review): the original note said "it should be 4000 but it takes 35 minutes
# on a GPU", yet the value is already 4000 -- confirm the intended value.
num_seq = 4000
days = 1
# Directory holding the real-data CSV (prefix of the path read below)
data_dir = "../"
# Set to True to also draw the t-SNE plot
generate_tsne = False

With 4 GB of VRAM or less, only 3900 samples can be generated; requesting more fails with:
```
OutOfMemoryError: CUDA out of memory. Tried to allocate 946.00 MiB (GPU 0; 3.81 GiB total capacity; 2.88 GiB already allocated; 684.19 MiB free; 2.98 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
```

The following message is related to the fact that the generator does not always sample time series of the fixed length of 24 steps (24 hours):
```
[2024-03-19T20:35:25.858358+0000][21133][INFO] [seq_time_id] quality loss for constraints ge = 0. Remaining 91881. prev length 93600. Original dtype int64.
```

The same message appears when sampling 1000 sequences with a patience of 1000:
```
[2024-03-19T21:35:19.135288+0000][22117][INFO] [seq_time_id] quality loss for constraints ge = 0. Remaining 11795. prev length 12000. Original dtype int64.
```

In [None]:
%%time
# Both branches must use the same CSV name; otherwise data written with
# `generate = True` cannot be found again when `generate = False`.
synthetic_csv = f"synthetic_data_model_{generator}_mode_{mode}_synthcity_days_{days}_niter_{n_iter}.csv"
if generate:
    # NOTE(review): the model comes from a .pkl file, presumably unpickled by
    # `load_from_file` -- only load files from a trusted source.
    syn_model = load_from_file(f"model_{generator}_mode_{mode}_synthcity_days_{days}_niter_{n_iter}.pkl")
    # Capture the console output produced while sampling
    with io.capture_output() as captured:
        synthetic_data = syn_model.generate(num_seq, sampling_patience=10000).dataframe()
    synthetic_data.to_csv(synthetic_csv)
else:
    synthetic_data = pd.read_csv(synthetic_csv, index_col=0)

In [None]:
synthetic_data.head(3)

## Load RD

In [None]:
real_data = pd.read_csv(f"{data_dir}real_data_sdv_{days}_days.csv", index_col=0)

In [None]:
real_data.head(3)

In [None]:
def extract_ts(df):
    """Split a long-format frame into static features and per-id time series.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a `datapoint_id` column, the `Timestamp`, `energy_elec`
        and `energy_gas` columns, and any number of further (static) columns.

    Returns
    -------
    pd.DataFrame
        De-duplicated rows of every column except `Timestamp` and the two
        energy columns, with a fresh 0..n-1 index.

    List
        One DataFrame per `datapoint_id` group holding `energy_elec` and
        `energy_gas`, re-indexed from 0 with the index named `hour`.

    Raises
    ------
    ValueError
        If the number of static rows differs from the number of time series.
    """
    energy_cols = ["energy_elec", "energy_gas"]

    # Static part: whatever is left once the time-varying columns are removed
    static_df = df.drop(columns=["Timestamp"] + energy_cols)
    static_df = static_df.drop_duplicates()
    static_df = static_df.reset_index(drop=True)

    # Time-series part: one group per datapoint_id (sort=False: no key sorting)
    per_id = df[["datapoint_id", "Timestamp"] + energy_cols].groupby("datapoint_id", sort=False)

    # WARNING, FIXME: timestamps are dropped for now; the positional index
    # (0, 1, 2, ...) stands in for the hour.
    timeseries_dfs = [
        per_id.get_group(key)[energy_cols].reset_index(drop=True).rename_axis("hour")
        for key in per_id.groups
    ]

    if len(timeseries_dfs) != len(static_df):
        raise ValueError(f"Number of datapoint_ids {len(static_df)} doesn't match the number of time series {len(timeseries_dfs)}")

    return static_df, timeseries_dfs


In [None]:
%%time
static_df, timeseries_dfs = extract_ts(real_data)

In [None]:
# One list of observation times per real time series: the index values of
# each series (usually 0 to 23, for each of the sequences).
observation_data = [list(series.index) for series in timeseries_dfs]

In [None]:
# Fake the outcome: we don't have/know the ML task for this dataset, so draw a
# random binary target. A seeded generator keeps the labels identical across
# Restart & Run All.
outcome = np.random.default_rng(0).integers(0, 2, size=len(static_df))

In [None]:
outcome_df = pd.DataFrame(outcome, columns=["y"])

In [None]:
# Work on a copy so the transformations below leave `static_df` untouched.
# NOTE(review): the original comment spoke of working with a subset of the
# features because "the code breaks a lot", but every column is copied here.
static_data = static_df.copy()

In [None]:
static_data.dtypes

#### Treat every numeric feature with fewer than 30 unique values as a string to make it categorical

In [None]:
%%time
# float64/int64 columns with fewer than 30 distinct values are cast to str so
# that they are handled as categorical features.
low_cardinality_cols = [
    col
    for col, dt in static_data.dtypes.items()
    if (dt == "float64" or dt == "int64") and len(static_data[col].unique()) < 30
]
for col in low_cardinality_cols:
    static_data[col] = static_data[col].astype(str)

In [None]:
static_data.head(3)

In [None]:
timeseries_dfs[2].head(3)

In [None]:
# Rebind instead of mutating in place, and tolerate an already-dropped column,
# so re-running this cell does not raise a KeyError.
static_data = static_data.drop(columns=["datapoint_id"], errors="ignore")

### SD loader

In [None]:
synthetic_data.head(1)

In [None]:
# Strip the prefixes found on the synthetic-data column names, one prefix at a
# time and in this order.
for prefix in ("seq_static_", "seq_temporal_", "seq_out_"):
    synthetic_data.columns = [name.removeprefix(prefix) for name in synthetic_data.columns]

In [None]:
synthetic_data.columns

In [None]:
def sd_postprocess(df):
    """Split a synthetic long-format frame into static data, series and outcome.

    Parameters
    ----------
    df : pd.DataFrame
        Synthetic frame with `seq_id`, `seq_time_id`, `energy_elec`,
        `energy_gas`, a `y` column and any number of further columns.

    Returns
    -------
    pd.DataFrame
        De-duplicated rows of every column except `seq_time_id` and the two
        energy columns, with a fresh 0..n-1 index.

    List
        One DataFrame per `seq_id` group holding `energy_elec` and
        `energy_gas`, re-indexed from 0 with the index named `hour`.

    pd.DataFrame
        Copy of the `y` column of the static frame (the outcomes / labels).

    Raises
    ------
    ValueError
        If the number of static rows differs from the number of time series.
    """
    energy_cols = ["energy_elec", "energy_gas"]

    # Static part: whatever is left once the time-varying columns are removed
    static_df = df.drop(columns=["seq_time_id"] + energy_cols)
    static_df = static_df.drop_duplicates()
    static_df = static_df.reset_index(drop=True)

    # Time-series part: one group per seq_id (sort=False: no key sorting)
    per_sequence = df[["seq_id", "seq_time_id"] + energy_cols].groupby("seq_id", sort=False)

    # WARNING, FIXME: the time ids are dropped for now; the positional index
    # (0, 1, 2, ...) stands in for the hour.
    timeseries_dfs = [
        per_sequence.get_group(key)[energy_cols].reset_index(drop=True).rename_axis("hour")
        for key in per_sequence.groups
    ]

    if len(timeseries_dfs) != len(static_df):
        raise ValueError(f"Number of datapoint_ids {len(static_df)} doesn't match the number of time series {len(timeseries_dfs)}")

    outcome_df = static_df[["y"]].copy()
    return static_df, timeseries_dfs, outcome_df

In [None]:
sd_static_data, sd_timeseries_dfs, sd_outcome_df = sd_postprocess(synthetic_data)

In [None]:
# One list of observation times per synthetic time series: the index values
# of each series (usually 0 to 23, for each of the sequences).
sd_observation_data = [list(series.index) for series in sd_timeseries_dfs]

#### Treat every numeric feature with fewer than 30 unique values as a string to make it categorical

In [None]:
%%time
# float64/int64 columns with fewer than 30 distinct values are cast to str so
# that they are handled as categorical features.
sd_low_cardinality_cols = [
    col
    for col, dt in sd_static_data.dtypes.items()
    if (dt == "float64" or dt == "int64") and len(sd_static_data[col].unique()) < 30
]
for col in sd_low_cardinality_cols:
    sd_static_data[col] = sd_static_data[col].astype(str)

In [None]:
sd_static_data.head(3)

In [None]:
# Preview a synthetic time series (this section inspects the SD, not the real data)
sd_timeseries_dfs[2].head(3)

In [None]:
# Rebind instead of mutating in place, and tolerate an already-dropped column,
# so re-running this cell does not raise a KeyError.
sd_static_data = sd_static_data.drop(columns=["seq_id"], errors="ignore")

In [None]:
# force column order as is in the real data
sd_static_data = sd_static_data[static_data.columns].copy()

In [None]:
def convert_to_tsdloaders(static_df, timeseries_dfs, observation_data, outcome_df, ct=None):
    """Ordinal-encode the static features and build a TimeSeriesDataLoader.

    Parameters
    ----------
    static_df : pd.DataFrame
        Static features; its object / string columns are ordinal-encoded.
        NOTE(review): columns of any other dtype are not selected by the
        encoder, so the final re-ordering by `static_df.columns` presumably
        requires every column to be object or string -- confirm.
    timeseries_dfs : list
        Time series passed to the loader as `temporal_data`.
    observation_data : list
        Observation times passed to the loader as `observation_times`.
    outcome_df : pd.DataFrame
        Outcome passed to the loader as `outcome`.
    ct : ColumnTransformer, optional
        Encoder to use. An already fitted transformer is applied as is, so a
        second data set (e.g. the synthetic data) receives exactly the same
        category -> code mapping as the data it was fitted on. An unfitted
        transformer, or the default built when `ct` is None, is fitted on
        `static_df`.

    Returns
    -------
    TimeSeriesDataLoader
        Loader wrapping the encoded static data, the series and the outcome.
    ColumnTransformer
        The fitted transformer, so it can be reused for another data set.
    """
    if ct is None:
        # Categories never seen during fit are encoded as -1 instead of
        # raising, so a reused encoder can still transform synthetic data.
        ct = make_column_transformer(
            (OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
             make_column_selector(dtype_include="object")),
            (OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
             make_column_selector(dtype_include="string")),
        )

    # A fitted ColumnTransformer exposes `transformers_`. Re-fitting a reused
    # encoder would silently give real and synthetic categories different codes.
    if hasattr(ct, "transformers_"):
        encoded = ct.transform(static_df)
    else:
        encoded = ct.fit_transform(static_df)

    # The transformer outputs the object columns first, then the string
    # columns; restore the original column order afterwards.
    column_order = list(static_df.select_dtypes(include=["object"]).columns) + list(static_df.select_dtypes(include=["string"]).columns)
    tr_df = pd.DataFrame(encoded, index=static_df.index, columns=column_order)[static_df.columns]

    loader = TimeSeriesDataLoader(
         temporal_data=timeseries_dfs,
         observation_times=observation_data,
         static_data=tr_df,
         outcome=outcome_df
    )
    return loader, ct

In [None]:
%%time
rd_loader, ct = convert_to_tsdloaders(static_data, timeseries_dfs, observation_data, outcome_df)

In [None]:
%%time
sd_loader, _ = convert_to_tsdloaders(sd_static_data, sd_timeseries_dfs, sd_observation_data, sd_outcome_df, ct=ct)

## Evaluation
- It takes 2 1/2 hours to run metrics on cpu (3 minutes on GPU): sanity, stats, and privacy
- We need to check whether the metrics are being computed correctly. The plots are right because we are using the SD dataloaders.

In [None]:
# Metric names grouped by category; each category is evaluated on its own.
metrics_dict = {
    'sanity': ['data_mismatch', 'common_rows_proportion', 'nearest_syn_neighbor_distance', 'close_values_probability', 'distant_values_probability'],
    'stats': ['jensenshannon_dist', 'chi_squared_test', 'feature_corr', 'inv_kl_divergence', 'ks_test', 'max_mean_discrepancy', 'wasserstein_dist', 'prdc', 'alpha_precision'],
    # Disabled: these do not make sense here.
    # NOTE(review): presumably because the outcome `y` is randomly generated -- confirm.
    #'performance': ['linear_model', 'mlp', 'xgb', 'feat_rank_distance'],
    #'detection': ['detection_xgb', 'detection_mlp', 'detection_gmm', 'detection_linear'],
    # The privacy metrics take a long time to run.
    'privacy': ['delta-presence', 'k-anonymization', 'k-map', 'distinct l-diversity', 'identifiability_score']
}

In [None]:
%%time
# Evaluate one metric category at a time on the static features (real vs.
# synthetic); `scores` keeps one result per category, in dictionary order.
scores = []
for category, metric_names in metrics_dict.items():
    print(f"{category} metrics")
    category_scores = metrics.Metrics.evaluate(static_data, sd_static_data, metrics={category: metric_names})
    scores.append(category_scores)

### Synthcity can compare several batches of generated SD and output statistics from the metrics for each evaluation. Here we only care about generating metrics for one batch.

Thus, in each DataFrame of the `scores` list, the `mean` column has a different meaning: it is simply the value of the metric. All other columns can be discarded.

In [None]:
scores[0][['mean']]

In [None]:
scores[1][['mean']]

In [None]:
scores[2][['mean']]

## Plots

- Distributions
- tSNE
- Time Series comparison and average

In [None]:
%%time
plot_marginal_comparison(plt, rd_loader, sd_loader)

In [None]:
%%time
if generate_tsne:
    plot_tsne(plt, rd_loader, sd_loader)

#### Time Series plots

In [None]:
# Set the style for the plots
sns.set_style("whitegrid")
sns.set_context("paper")  # Sets the scaling of elements such as the font size

# Formatter for the y-axis
def thousands(x, pos):
    """Format a tick value in thousands, e.g. 12000 -> '12k'.

    `x` is the tick value and `pos` the tick position (unused); this is the
    signature FuncFormatter calls.
    """
    return f'{x * 1e-3:,.0f}k'.replace(',', ' ')

formatter = FuncFormatter(thousands)

# Positions (in the lists of time series) of the sequences to plot;
# leave as None to pick one at random.
real_specific_id = None
synthetic_specific_id = None

# Test against None so that position 0 is a valid explicit choice, and bound
# the random pick by the number of series actually available: the synthetic
# set can hold fewer sequences than the configured `num_seq`.
if real_specific_id is None:
    real_specific_id = random.randrange(len(timeseries_dfs))
if synthetic_specific_id is None:
    synthetic_specific_id = random.randrange(len(sd_timeseries_dfs))

# Electric Energy Comparison
plt.figure(figsize=(10, 6))
ax = plt.subplot(111)
# x-axis labels "1".."24"; also reused by the later plotting cells
time = [str(i+1) for i in range(24)]
ax.plot(time, timeseries_dfs[real_specific_id]['energy_elec'], label='Real Data - Electric Energy', color='#0072B2', linewidth=2.5)
ax.plot(time, sd_timeseries_dfs[synthetic_specific_id]['energy_elec'], label='Synthetic Data - Electric Energy', color='#D55E00', linestyle='--', linewidth=2.5)
plt.title('Comparison of Electric Energy Usage Over Time', fontsize=16)
plt.xlabel('Timestamp', fontsize=14)
plt.ylabel('Electric Energy Usage (kWh)', fontsize=14)
plt.legend(fontsize=12, loc='upper right')
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
sns.despine(left=True)
plt.tight_layout()

# Save the plot with high resolution
#plt.savefig('electric_energy_comparison.png', dpi=300)  # Replace with your path

plt.show()

In [None]:
# Gas energy: same real / synthetic sequences as selected by
# `real_specific_id` and `synthetic_specific_id`.
fig, ax = plt.subplots(figsize=(10, 6))
gas_lines = (
    (timeseries_dfs[real_specific_id], 'Real Data - Gas Energy', {'color': '#0072B2'}),
    (sd_timeseries_dfs[synthetic_specific_id], 'Synthetic Data - Gas Energy', {'color': '#D55E00', 'linestyle': '--'}),
)
for series_df, line_label, line_style in gas_lines:
    ax.plot(time, series_df['energy_gas'], label=line_label, linewidth=2.5, **line_style)
ax.set_title('Comparison of Gas Energy Usage Over Time', fontsize=16)
ax.set_xlabel('Timestamp', fontsize=14)
ax.set_ylabel('Gas Energy Usage (kWh)', fontsize=14)
ax.legend(fontsize=12, loc='upper right')
ax.grid(True, which='both', linestyle='--', linewidth=0.5)
sns.despine(left=True)
fig.tight_layout()

plt.show()

In [None]:
# Mean energy usage per time step. Only the two energy columns are averaged:
# nothing else is plotted, and a non-numeric column elsewhere in the frame
# would make `.mean()` raise on recent pandas versions.
energy_cols = ['energy_elec', 'energy_gas']
real_data_avg = real_data.groupby('Timestamp')[energy_cols].mean().reset_index()
synthetic_data_avg = synthetic_data.groupby('seq_time_id')[energy_cols].mean().reset_index()

In [None]:
# Average electric energy per time step, real vs. synthetic
fig, ax = plt.subplots(figsize=(10, 6))
avg_elec_lines = (
    (real_data_avg, 'Real Data - Average Electric Energy', {'color': '#0072B2'}),
    (synthetic_data_avg, 'Synthetic Data - Average Electric Energy', {'color': '#D55E00', 'linestyle': '--'}),
)
for avg_df, line_label, line_style in avg_elec_lines:
    ax.plot(time, avg_df['energy_elec'], label=line_label, linewidth=2.5, **line_style)
# y ticks shown in thousands
ax.yaxis.set_major_formatter(formatter)
ax.set_title('Average Comparison of Electric Energy Usage Over Time', fontsize=16)
ax.set_xlabel('Hour of the day', fontsize=14)
ax.set_ylabel('Electric Energy Usage (Joules per hour)', fontsize=14)
ax.legend(fontsize=12, loc='upper right')
sns.despine(left=True)
fig.tight_layout()
plt.show()

In [None]:
# Average gas energy per time step, real vs. synthetic
fig, ax = plt.subplots(figsize=(10, 6))
avg_gas_lines = (
    (real_data_avg, 'Real Data - Average Gas Energy', {'color': '#0072B2'}),
    (synthetic_data_avg, 'Synthetic Data - Average Gas Energy', {'color': '#D55E00', 'linestyle': '--'}),
)
for avg_df, line_label, line_style in avg_gas_lines:
    ax.plot(time, avg_df['energy_gas'], label=line_label, linewidth=2.5, **line_style)
# y ticks shown in thousands
ax.yaxis.set_major_formatter(formatter)
ax.set_title('Average Comparison of Gas Energy Usage Over Time', fontsize=16)
ax.set_xlabel('Hour of the day', fontsize=14)
ax.set_ylabel('Gas Energy Usage (Joules per hour)', fontsize=14)
ax.legend(fontsize=12, loc='upper right')
ax.grid(True, which='both', linestyle='--', linewidth=0.5)
sns.despine(left=True)
fig.tight_layout()
plt.show()

In [None]:
%%time
# For each summary statistic, reduce every sequence to one value per energy
# column, then compare the real and synthetic distributions of those values
# with a two-sample KS test. The stored score is 1 - KS statistic.
ks_stats = {}
# Every reducer works column-wise (axis=0). Without the axis, max/min would
# return a single value taken over electricity AND gas together.
stats = [("mean", lambda x: np.mean(x, axis=0)),
         ("std", lambda x: np.std(x, axis=0)),
         ("median", lambda x: np.median(x, axis=0)),
         ("max", lambda x: np.amax(x, axis=0)),
         ("min", lambda x: np.amin(x, axis=0))
        ]
for name, op in stats:
    # Each set is reduced on its own, so a different number of real and
    # synthetic sequences neither truncates one sample nor pads it with zeros.
    rd_energy_dist = np.array([op(rdt.to_numpy()) for rdt in timeseries_dfs])
    sd_energy_dist = np.array([op(sdt.to_numpy()) for sdt in sd_timeseries_dfs])
    # column 0 = energy_elec, column 1 = energy_gas
    ks_stat_elec, _ = ks_2samp(rd_energy_dist[:, 0], sd_energy_dist[:, 0])
    ks_stat_gas, _ = ks_2samp(rd_energy_dist[:, 1], sd_energy_dist[:, 1])
    ks_stats[name] = {
        "electricity": 1.0 - ks_stat_elec,
        "gas": 1.0 - ks_stat_gas,
    }

In [None]:
ks_stats

In [None]:
%%time
# Cosine distance between the real and synthetic per-time-step profiles of
# electric energy, for several aggregations.
hourly_stats = {}

real_by_step = real_data[['Timestamp', 'energy_elec']].groupby('Timestamp')
synthetic_by_step = synthetic_data[['seq_time_id', 'energy_elec']].groupby('seq_time_id')

for stat_name in ('mean', 'std', 'median', 'max', 'min'):
    # Same as calling .mean(), .std(), ... on the grouped frames
    real_profile = getattr(real_by_step, stat_name)().to_numpy().flatten()
    synthetic_profile = getattr(synthetic_by_step, stat_name)().to_numpy().flatten()
    hourly_stats[stat_name] = distance.cosine(real_profile, synthetic_profile)

In [None]:
hourly_stats

In [None]:
def get_peaks_by_hour(timeseries_dfs, sd_timeseries_dfs):
    """Count, per row position, how often each series peaks or bottoms out there.

    Real and synthetic series are walked in pairs (stopping at the shorter
    list). For every series the row positions of the maximum and minimum of
    its two columns are found and the matching counters are incremented.

    Parameters
    ----------
    timeseries_dfs : list of pd.DataFrame
        Real series with two columns; the first series sets the row count.
    sd_timeseries_dfs : list of pd.DataFrame
        Synthetic series with the same layout.

    Returns
    -------
    pd.DataFrame
        One row per position with the columns `emax_*`, `emin_*`, `gmax_*`
        and `gmin_*` (first / second column, max / min), each in an `_rd`
        (real) and `_sd` (synthetic) variant.
    """
    n_slots = len(timeseries_dfs[0])
    # Counter columns: 0 = first-column max, 1 = second-column max,
    #                  2 = first-column min, 3 = second-column min
    rd_peaks = np.zeros((n_slots, 4))
    sd_peaks = np.zeros((n_slots, 4))

    def _tally(series, counters):
        """Add the max/min positions of one series to `counters`."""
        values = series.to_numpy()
        first_hi, second_hi = np.argmax(values, axis=0)
        counters[first_hi, 0] += 1
        counters[second_hi, 1] += 1
        first_lo, second_lo = np.argmin(values, axis=0)
        counters[first_lo, 2] += 1
        counters[second_lo, 3] += 1

    for real_ts, synthetic_ts in zip(timeseries_dfs, sd_timeseries_dfs):
        _tally(real_ts, rd_peaks)
        _tally(synthetic_ts, sd_peaks)

    # Real and synthetic counts side by side for each peak type
    return pd.DataFrame({
        "emax_rd": rd_peaks[:, 0], "emax_sd": sd_peaks[:, 0],
        "emin_rd": rd_peaks[:, 2], "emin_sd": sd_peaks[:, 2],
        "gmax_rd": rd_peaks[:, 1], "gmax_sd": sd_peaks[:, 1],
        "gmin_rd": rd_peaks[:, 3], "gmin_sd": sd_peaks[:, 3],
    })

In [None]:
df = get_peaks_by_hour(timeseries_dfs, sd_timeseries_dfs)

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
df.sum()

In [None]:
# Label the rows with 1-based hours; the count comes from the frame itself
# rather than a hard-coded 24, so other sequence lengths also work.
df.index = [str(hour + 1) for hour in range(len(df))]

In [None]:
# Long format for seaborn: one row per (hour, electricity count column)
edf = (
    df[["emax_rd", "emax_sd", "emin_rd", "emin_sd"]]
    .reset_index()
    .melt(id_vars='index')
    .rename(columns={"index": "Hour of the day", "value": "Min/max counts", "variable": "legend"})
)

In [None]:
# Electricity min/max counts per hour, real vs. synthetic
fig, ax = plt.subplots(figsize=(14, 8))
sns.set(font_scale=1.5, style="whitegrid")
sns.barplot(data=edf, x='Hour of the day', y='Min/max counts', hue='legend', fill=False, width=1, ax=ax)
# Minor ticks at 0.5, 1.5, ..., 22.5 carry the grid lines
minor_ticks = 0.5 + np.arange(23)
ax.set_xticks(minor_ticks, minor=True)
ax.grid(which='minor')

In [None]:

# Long format for seaborn: one row per (hour, gas count column)
gdf = (
    df[["gmax_rd", "gmax_sd", "gmin_rd", "gmin_sd"]]
    .reset_index()
    .melt(id_vars='index')
    .rename(columns={"index": "Hour of the day", "value": "Min/max counts", "variable": "legend"})
)

In [None]:
# Gas min/max counts per hour, real vs. synthetic
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(data=gdf, x='Hour of the day', y='Min/max counts', hue='legend', fill=False, width=1, ax=ax)
# Minor ticks at 0.5, 1.5, ..., 22.5 carry the grid lines
minor_ticks = 0.5 + np.arange(23)
ax.set_xticks(minor_ticks, minor=True)
ax.grid(which='minor')

In [None]:
def abs_error(rdv, sdv):
    """Return the element-wise absolute difference |rdv - sdv|."""
    difference = np.subtract(rdv, sdv)
    return np.absolute(difference)

In [None]:
peaks_comparison_df = df.copy()

In [None]:
# Absolute difference between the real and synthetic counts for each peak
# type; afterwards only the error columns are kept.
error_columns = []
for peak in ("emax", "emin", "gmax", "gmin"):
    error_col = f"{peak}_abs_err"
    peaks_comparison_df[error_col] = abs_error(peaks_comparison_df[f"{peak}_rd"], peaks_comparison_df[f"{peak}_sd"])
    error_columns.append(error_col)
peaks_comparison_df = peaks_comparison_df[error_columns].copy()

In [None]:
peaks_comparison_df.head()

In [None]:
# Long format for seaborn: one row per (hour, error column)
pdf = (
    peaks_comparison_df
    .reset_index()
    .melt(id_vars='index')
    .rename(columns={"index": "Hour of the day", "value": "Absolute error", "variable": "legend"})
)

In [None]:
# Absolute error of the peak counts per hour
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(data=pdf, x='Hour of the day', y='Absolute error', hue='legend', fill=False, width=1, ax=ax)
# Despite its name, this array is applied as MINOR ticks (0.5, 1.5, ..., 22.5),
# which carry the grid lines.
major_ticks = 0.5 + np.arange(23)
ax.set_xticks(major_ticks, minor=True)
ax.grid(which='minor')

In [None]:
peaks_comparison_df.sum()

## Done!