# Synthcity Time Series generator training


In [None]:
# Remove the local `workspace` directory left over from earlier runs; stale
# contents there sometimes cause errors (presumably cached artifacts -- TODO confirm).
# NOTE: the directory is deleted recursively, relative to the current working directory.
!rm -rf workspace

In [None]:
# stdlib
import sys
import pickle
import warnings

warnings.filterwarnings("ignore")

from datetime import datetime, timedelta

import numpy as np
import pandas as pd

# synthcity absolute
import synthcity.logger as log
from synthcity.plugins import Plugins
from synthcity.plugins.core.dataloader import TimeSeriesDataLoader
from synthcity.benchmark import Benchmarks
from synthcity.utils.serialization import load, load_from_file, save, save_to_file
from synthcity.plugins.core.constraints import Constraints

log.add(sink=sys.stderr, level="INFO")

### Inputs
- Number of days or datafilename
- Number of iterations / epochs
- Size of sampled synthetic data
- Real data file name
- Generators

In [None]:
# Number of days covered by the input file; it is interpolated into the input
# and output file names.
days = 1
# Path prefix of the default real-data CSV.
data_dir = "../"
# Training iterations handed to the generator plugin.
n_iter = 100  # 1000 is the default
# When None, falls back to the number of unique datapoint_ids once the data is read.
sample_size = None
# Explicit CSV path; when None the default name built from data_dir and days is read.
datafilename = None

In [None]:
# Display the plugins registered under the "time_series" category.
Plugins(categories=["time_series"]).list()

In [None]:
# Keep the result of the plugin listing (not referenced again in this notebook).
generators = Plugins(categories=["time_series"]).list()

In [None]:
# Generator plugin name and the TimeGAN `mode` (core network architecture;
# the accepted values are listed in the "Train generator" section).
generator = "timegan"
mode = "GRU"

### Read real data

In [None]:
# Read the real dataset: an explicit `datafilename` wins, otherwise fall back
# to the default CSV named after `days` inside `data_dir`.
real_data = pd.read_csv(
    datafilename if datafilename else f"{data_dir}real_data_sdv_{days}_days.csv",
    index_col=0,
)

# Without an explicit sample size, use the number of distinct datapoint_ids.
sample_size = sample_size or len(real_data["datapoint_id"].unique())

In [None]:
# Preview the first three rows of the real data.
real_data.head(3)

### Extract time series and instantiate TimeSeriesDataLoader

In [None]:
def extract_ts(df):
    """Split a long-format dataframe into static features and per-id time series.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe with static and time series values. It must contain
        the columns `datapoint_id`, `Timestamp`, `energy_elec` and
        `energy_gas`; every other column is treated as a static feature.

    Returns
    -------
    static_df : pd.DataFrame
        De-duplicated static features (including `datapoint_id`), in order of
        first appearance, with a fresh 0-based index.
    timeseries_dfs : list of pd.DataFrame
        One dataframe per `datapoint_id`, in order of first appearance, holding
        the columns `energy_elec` and `energy_gas` with a 0-based positional
        index named `hour`.

    Raises
    ------
    ValueError
        If the number of static rows differs from the number of time series,
        e.g. when one `datapoint_id` carries more than one combination of
        static values.
    """
    value_cols = ["energy_elec", "energy_gas"]

    # Static features: everything except the timestamp and the measured values.
    static_df = (
        df.drop(columns=["Timestamp", *value_cols])
        .drop_duplicates()
        .reset_index(drop=True)
    )

    # WARNING, FIXME: timestamps are ignored for now. Rows keep their input
    # order and the position inside each group becomes the "hour" index.
    timeseries_dfs = []
    for _, group in df.groupby("datapoint_id", sort=False):
        tdf = group[value_cols].reset_index(drop=True)
        tdf.index.name = "hour"
        timeseries_dfs.append(tdf)

    if len(timeseries_dfs) != len(static_df):
        raise ValueError(f"Number of datapoint_ids {len(static_df)} doesn't match the number of time series {len(timeseries_dfs)}")

    return static_df, timeseries_dfs


In [None]:
%%time
# Split the real data into static features and one time-series frame per datapoint_id.
static_df, timeseries_dfs = extract_ts(real_data)

In [None]:
#with open(f"timseries_dfs_synthcity_{days}_days.pkl", "wb") as f:
#    pickle.dump(timeseries_dfs, f)

In [None]:
#static_df.to_csv(f"static_data_synthcity_{days}_days.csv")

In [None]:
# One list of observation times per series, read straight from each series'
# index (per the original note: usually hours 0 to 24, for about 4000 series).
observation_data = [list(series.index) for series in timeseries_dfs]

In [None]:
# Placeholder target: the ML task for this dataset is unknown, so draw a random
# binary label (0 or 1) per static row. No seed is set here, so the labels
# differ between runs.
outcome = np.random.randint(2, size=(len(static_df)))

In [None]:
# Wrap the labels in a single-column dataframe named "y".
outcome_df = pd.DataFrame(outcome, columns=["y"])

In [None]:
# Work on a copy so later edits leave `static_df` untouched.
# NOTE: the original intent was to use only a subset of the features because
# the code breaks a lot, but all columns are currently kept.
static_data = static_df.copy()

In [None]:
# Inspect the column dtypes before the categorical conversion.
static_data.dtypes

#### Treat every numeric feature with fewer than 30 unique values as a string to make it categorical

In [None]:
%%time
for col, dt in static_data.dtypes.items():
    if dt == "float64" or dt == "int64":
        if len(static_data[col].unique()) < 30:
            static_data[col] = static_data[[col]].astype(str)

In [None]:
# Preview the static features after the conversion.
static_data.head(3)

In [None]:
# Preview the first rows of the third extracted time series.
timeseries_dfs[2].head(3)

In [None]:
# Persist the list of real time-series frames; the file is written to the
# current working directory and overwritten if it already exists.
with open(f"real_timeseries_dfs_synthcity_{days}_days.pkl", "wb") as f:
    pickle.dump(timeseries_dfs, f)

In [None]:
# Drop the identifier column in place. NOTE: re-running this cell raises a
# KeyError because the column is already gone.
static_data.drop(columns=["datapoint_id"], inplace=True)

In [None]:
# Save the static features (without datapoint_id) to the current working directory.
static_data.to_csv(f"real_static_data_synthcity_{days}_days.csv")

In [None]:
%%time
# instantiate time series data loader
# The four inputs are passed in matching order: one time-series frame, one list
# of observation times, one static row and one label per datapoint
# (assumes the loader aligns them by position -- TODO confirm).
loader = TimeSeriesDataLoader(
    temporal_data=timeseries_dfs,
    observation_times=observation_data,
    static_data=static_data,
    outcome=outcome_df,
)

In [None]:
# Show the loader's summary information.
loader.info()

## Train generator

Available "modes" for TimeGAN:
```
mode: str = "RNN"
    Core neural net architecture.
    Available models:
        - "LSTM"
        - "GRU"
        - "RNN"
        - "Transformer"
        - "MLSTM_FCN"
        - "TCN"
        - "InceptionTime"
        - "InceptionTimePlus"
        - "XceptionTime"
        - "ResCNN"
        - "OmniScaleCNN"
        - "XCM"
```

In [None]:
# Persist the data loader so it can be reloaded without repeating the preprocessing.
with open(f"tsdloader_synthcity_{days}_days.pkl", "wb") as f:
    pickle.dump(loader, f)

|mode| RAM| VRAM| Epochs|Fit| Gen|
|---|---|---|---|---|---|
|1|~14Gb|~900Mb| 50|~13min|~10min|
|1|~14Gb|~900Mb| 100|~22min|~22min|
|1|~14Gb|~900Mb| 150|~31min|~22min|
|1|~14Gb|~900Mb| 1000|~3h15min|~15min|
|*LSTM|~14Gb|~1Gb| 100|~28min|~15min|
|*GRU|~14Gb|~1Gb| 100|~28min|~8min|

*VRAM on generation for 1K samples

In [None]:
%%time
syn_model = Plugins().get("timegan",
                          n_iter=n_iter,
                          mode=mode)
syn_model.fit(loader)

In [None]:
# Save the trained generator; the file name encodes the plugin name, mode,
# number of days and number of training iterations.
save_to_file(f"model_{syn_model.name()}_mode_{mode}_synthcity_days_{days}_niter_{syn_model.n_iter}.pkl", syn_model)

In [None]:
%%time
synthetic_data = []
batch_size = 1000
all_ids = set()
count, lcount = 0, 0
iter = 0
if not sample_size:
    sample_size = syn_model.data_info['len']

while len(synthetic_data) < syn_model.data_info['len']:
    lsd = len(synthetic_data)
    sd_dfs = []
    sd_df = syn_model.generate(batch_size, sampling_patience=1000).dataframe()
    generated = len(sd_df["seq_id"].unique())
    #print(f"total samples generated {len(sd_df)}, uniques {generated}")

    for id, tidf in sd_df.groupby("seq_id", sort=False):
        if len(tidf) == 24:
            df = tidf.sort_values(by=["seq_time_id"]).copy()
            df["seq_time_id"] = list(range(24))
            sd_dfs.append(df)
        else:
            lcount += len(tidf)
            count += 1

    if len(sd_dfs) > 0:
        if lsd == 0:
            synthetic_data = pd.concat(sd_dfs)
            seq_ids = np.repeat(np.arange(0, len(synthetic_data["seq_id"].unique())), 24)
            #print(f"total samples {len(synthetic_data)}, {len(seq_ids)}")
            synthetic_data["seq_id"] = seq_ids
        else:
            #sd_dfs.insert(0, synthetic_data)
            sd_new_df = pd.concat(sd_dfs)
            uids_len_start = len(synthetic_data["seq_id"].unique())+1
            uids_len_stop = uids_len_start + len(sd_new_df["seq_id"].unique())
            seq_ids = np.arange(uids_len_start, uids_len_stop)
            #print(f"old samples {len(synthetic_data)}, new samples {len(sd_df)}")
            sd_new_df["seq_id"] = np.repeat(seq_ids, 24)
            synthetic_data = pd.concat([synthetic_data, sd_new_df])        
    iter += 1
    print(f"total samples {len(synthetic_data)}, generated in this step <{len(synthetic_data)-lsd}>, pct <{100*(len(synthetic_data)-lsd)/(len(sd_df)):.2f}%> total bad samples {lcount} -> {count}")


In [None]:
# Replace the index inherited from the concatenated batches with a fresh 0..n-1 range.
synthetic_data = synthetic_data.reset_index(drop=True)

In [None]:
# Trim to exactly `data_info["len"]` rows. The cut is row-based, so the last
# sequence may end up truncated unless that length falls on a sequence
# boundary; this could be improved by generating a larger sample and selecting
# whole seq_id groups instead.
synthetic_data = synthetic_data[:syn_model.data_info["len"]].copy()

In [None]:
# Inspect the last rows of the trimmed synthetic data.
synthetic_data.tail()

In [None]:
# Save the synthetic data; the file name mirrors the one used for the saved model.
synthetic_data.to_csv(f"synthetic_data_model_{syn_model.name()}_mode_{mode}_synthcity_days_{days}_niter_{syn_model.n_iter}.csv")

## Done!