# SDV CPAR model training

#### Here we add peaks and valleys to the static data (context)

- For each time series we select the n largest and n smallest values; n is set by the variable `peaks`.
- We pick also the timestamp for each of those values, and add it to the context (static features).
- We leave the time series and use it with the context to train the generator.
- The final output is time series data whose context contains the synthesized minima and maxima.

Issues:

- In the synthetic data (SD), the time at which an extremum occurs according to the context of a particular `datapoint_id` does not necessarily match the extremum of the time series generated for that `datapoint_id`.
- SDV has some problems manipulating datetime data types. The model can be fitted but throws errors when generating the SD. To work around this, we keep only the hour, stored as `str`; this therefore only makes sense for hourly data covering a specific day.
- For some reason the context columns have to be passed in the order in which they appear in the real data dataframe.

In [None]:
from datetime import datetime, timedelta

In [None]:
import numpy as np

In [None]:
import pandas as pd

In [None]:
import sdv
from sdv.sequential import PARSynthesizer
#from sdv.constraints import Unique

### Inputs
- Number of days or datafilename
- Epochs
- Peaks (number of max and min values of the time series)
- Size of sampled synthetic data
- Real data file name

In [None]:
# Notebook inputs (see the "Inputs" list above).
days = 1              # number of days; used to build the default input file name and the output file names
data_dir = "../"      # directory prefix of the default input CSV
epochs = 128          # passed to PARSynthesizer as `epochs`
peaks = 1             # number of max and min values taken from each time series
sample_size = None    # number of sequences to sample; when falsy, the number of unique datapoint_id values is used
datafilename = None   # explicit input CSV path; when falsy, the default name built from data_dir and days is used

### Read real data

In [None]:
# An explicitly given file wins; otherwise fall back to the CSV named after `days`.
csv_path = datafilename if datafilename else f"{data_dir}real_data_sdv_{days}_days.csv"
real_data = pd.read_csv(csv_path, index_col=0)

# Without an explicit sample size, sample one sequence per distinct datapoint_id.
if not sample_size:
    sample_size = len(pd.unique(real_data["datapoint_id"]))

In [None]:
real_data

### Add peaks and valleys

Find max and min on the time series and add it to the static columns, keeping time series data AS IS

In [None]:
def _hour_label(timestamp):
    """Return the hour of a ``'%Y-%m-%d %H:%M:%S'`` timestamp string, as a string."""
    # Only the hour is kept, and as text: per the notes at the top of this
    # notebook, datetime context values fail when sampling.
    return str(datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S').hour)


def _add_extrema_columns(out, source, value_col, tag, count, min_distinct):
    """Create and fill the ``t{tag}max{i}``, ``{tag}max{i}``, ``t{tag}min{i}``
    and ``{tag}min{i}`` columns of ``out`` (modified in place) from the
    ``value_col`` readings of ``source``.

    Series with ``min_distinct`` or fewer distinct readings keep the defaults
    (hour ``"0"``, value ``0``).
    """
    value_dtype = source[value_col].dtype
    for i in range(count):
        for time_col, peak_col in ((f"t{tag}max{i}", f"{tag}max{i}"),
                                   (f"t{tag}min{i}", f"{tag}min{i}")):
            # Give the columns their final dtype up front (text for the hour,
            # the source dtype for the value) so that the per-group writes
            # below never have to change a column's dtype.
            out[time_col] = "0"
            out[peak_col] = 0
            out[peak_col] = out[peak_col].astype(value_dtype)

    grouped = source[["datapoint_id", "Timestamp", value_col]].groupby('datapoint_id', sort=False)
    for key, group in grouped:
        series = group[value_col]
        if len(series.unique()) <= min_distinct:
            continue
        rows = series.index  # every row of this sequence receives the same static values
        remaining = series
        for i in range(count):
            if remaining.empty:
                raise ValueError(
                    f"count={count} needs more readings than datapoint_id {key!r} has in {value_col}"
                )
            top, bottom = remaining.idxmax(), remaining.idxmin()
            out.loc[rows, f"t{tag}max{i}"] = _hour_label(group.loc[top, "Timestamp"])
            out.loc[rows, f"{tag}max{i}"] = remaining.max()
            out.loc[rows, f"t{tag}min{i}"] = _hour_label(group.loc[bottom, "Timestamp"])
            out.loc[rows, f"{tag}min{i}"] = remaining.min()
            # Max and min are the same row for a flat or single-point series;
            # dropping that label twice would raise KeyError.
            remaining = remaining.drop(index=[top] if top == bottom else [top, bottom])


def pick_peaks(df, count=1):
    """ Select max and min values for each time series and add them to a dataframe along with the timestamp

    For every ``datapoint_id`` the ``count`` largest and ``count`` smallest
    ``energy_elec`` and ``energy_gas`` readings are found (the i-th pair is
    taken after removing the previous pairs) and written, together with the
    hour at which they occur, to every row of that sequence. The time series
    columns themselves are left unchanged.

    Gas series with two or fewer distinct readings are treated as "no gas":
    their hour columns are ``"0"`` and their value columns are ``0``.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe with static and time series values. Must contain
        ``datapoint_id``, ``Timestamp`` (strings formatted as
        ``%Y-%m-%d %H:%M:%S``), ``energy_elec`` and ``energy_gas``. Rows are
        addressed by index label, so the index is assumed to be unique.
        The input is not modified.
    count : int
        The number of max and min values to grab. Must be at least 1.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with ``8 * count`` extra columns inserted right after
        the first four input columns: ``temax{i}``, ``emax{i}``, ``temin{i}``,
        ``emin{i}`` for each ``i``, then ``tgmax{i}``, ``gmax{i}``,
        ``tgmin{i}``, ``gmin{i}`` for each ``i``. Hour columns hold strings;
        value columns have the dtype of the corresponding energy column.

    Raises
    ------
    ValueError
        If ``count`` is smaller than 1, if a series has too few readings for
        ``count``, if the stored ``emax0`` disagrees with the per-sequence
        maximum, or if the number of output columns is not the expected one
        (for example because ``df`` already contains one of the new columns).
    """
    if count < 1:
        raise ValueError("count must be at least 1")

    df_ = df.copy()

    # Electricity is always processed; gas only when it has more than two
    # distinct readings (often there are no gas measurements at all).
    _add_extrema_columns(df_, df, "energy_elec", "e", count, min_distinct=0)
    _add_extrema_columns(df_, df, "energy_gas", "g", count, min_distinct=2)

    # Quick validation: the static emax0 of each sequence must equal the max of
    # its time series. Compared per sequence (both groupbys keep first-seen
    # order) so that sequences sharing the same maximum are handled correctly.
    expected = df.groupby('datapoint_id', sort=False)["energy_elec"].max()
    stored = df_.groupby('datapoint_id', sort=False)["emax0"].first()
    if not np.allclose(stored.to_numpy(dtype=float), expected.to_numpy(dtype=float)):
        raise ValueError("Max of time series mismatch")

    # Reorder: first four input columns, then the new static columns, then the
    # remaining input columns.
    cols = list(df_.columns)
    lencs = len(cols)
    added = 8 * count
    print(f"Columns in {len(df.columns)}, columns out {lencs}")
    if lencs != len(df.columns) + added:
        raise ValueError("Input / output columns mismatch")

    return df_[cols[:4] + cols[-added:] + cols[4:lencs - added]]

In [None]:
%%time
real_data_mm = pick_peaks(real_data, peaks)

In [None]:
# Persist the peak-augmented table; the path is relative, so the file is written
# to the current working directory rather than to data_dir.
real_data_mm.to_csv(f"real_data_sdv_{days}_days_{peaks}_static_peaks.csv")

#### Manipulate data to conform to the SDV data flow
- Treat every numeric feature with fewer than 30 unique values as a string so that it is handled as categorical

In [None]:
# From here on `real_data` refers to the peak-augmented table. This is a rebinding,
# not a copy: later in-place column changes on `real_data` also affect `real_data_mm`.
real_data = real_data_mm

In [None]:
# Abort if any cell of the table is missing.
has_missing = real_data.isna().to_numpy().any()
if has_missing:
    raise ValueError("Real data has null values")

In [None]:
real_data

In [None]:
%%time
# Every column after the first four: float64/int64 columns with fewer than
# 30 distinct values are rewritten as strings.
for name in list(real_data.columns[4:]):
    column = real_data[name]
    if not (column.dtype == "float64" or column.dtype == "int64"):
        continue
    if len(column.unique()) < 30:
        real_data[name] = column.astype(str)

In [None]:
print(list(real_data.dtypes))

### Define and handle metadata

In [None]:
real_data.temax0.dtype

In [None]:
metadata = sdv.metadata.SingleTableMetadata()

In [None]:
metadata.detect_from_dataframe(real_data)

In [None]:
metadata.visualize()

#### Adjusting metadata for timeseries

In [None]:
# Declare datapoint_id as an 'id' column whose values follow a 6-character
# lowercase-hex pattern (the original note says this is to avoid duplications).
metadata.update_column(column_name='datapoint_id', sdtype='id', regex_format='[0-9a-f]{6}')#, regex='[0-9a-f]{32}')

In [None]:
#for i in range(peaks):
#    metadata.update_column(column_name=f"temax{i}", sdtype='datetime', datetime_format='%Y-%m-%d %H:%M:%S')
#     metadata.update_column(column_name=f"temin{i}", sdtype='datetime', datetime_format='%Y-%m-%d %H:%M:%S')
#     metadata.update_column(column_name=f"tgmax{i}", sdtype='datetime', datetime_format='%Y-%m-%d %H:%M:%S')
#     metadata.update_column(column_name=f"tgmin{i}", sdtype='datetime', datetime_format='%Y-%m-%d %H:%M:%S')

In [None]:
# Each datapoint_id (unique) holds a timeseries
metadata.set_sequence_key(column_name='datapoint_id')

In [None]:
# The index for timeseries is the Timestamp
metadata.set_sequence_index(column_name='Timestamp')

In [None]:
metadata.visualize()

In [None]:
metadata.to_dict()

#### Adapt the input data to the model
- Set context columns (features)

In [None]:
real_data.columns

In [None]:
# Every column after the first four is passed to the synthesizer as a context
# column, in dataframe order (see the ordering issue noted at the top).
context_cols = list(real_data.columns)[4:]

In [None]:
context_cols

## Fit synthesizer

In [None]:
#epochs = 10

In [None]:
# Build the PAR synthesizer from the adjusted metadata; `context_cols` are the
# context columns and `epochs` comes from the inputs cell.
synthesizer = PARSynthesizer(
    metadata,
    context_columns=context_cols,
    verbose=True,
    epochs=epochs)
    # segment_size is left unset (previously tried: segment_size=7)#128

|days| RAM| VRAM| Epochs|Fit| Gen|
|---|---|---|---|---|---|
|1|~3.5Gb|~0.6Gb| 10|~3min|~3min|
|1|~3.5Gb|~0.6Gb| 128|~42min|~9min|

In [None]:
%%time
synthesizer.fit(real_data)

## Sample SD and save model

In [None]:
%%time
# Generate `sample_size` synthetic sequences (defaults to the number of unique
# datapoint_id values in the real data when no size was given).
synthetic_data = synthesizer.sample(num_sequences=sample_size)

In [None]:
synthetic_data.head()

In [None]:
synthetic_data.describe()

In [None]:
real_data.describe()

In [None]:
synthetic_data[:24]

In [None]:
real_data

In [None]:
# Save the fitted synthesizer; the relative path puts the .pkl file in the
# current working directory, tagged with the `days` and `peaks` settings.
synthesizer.save(f'quick_test_PAR_full_cols_{days}_days_{peaks}_static_peaks.pkl')

In [None]:
# Write the sampled synthetic data to CSV in the current working directory,
# tagged with the `days` and `peaks` settings.
synthetic_data.to_csv(f"synthetic_data_sdv_{days}_days_{peaks}_static_peaks.csv")

## Done!