# SDV CPAR model training

In [None]:
from datetime import datetime, timedelta

In [None]:
import numpy as np

In [None]:
import pandas as pd

In [None]:
import sdv
from sdv.sequential import PARSynthesizer
#from sdv.constraints import Unique

### Inputs
- Number of days (selects the default input file name) or an explicit data file name (`datafilename`)
- Epochs
- Peaks (number of max and min values of the time series)
- Size of sampled synthetic data
- Real data file name

In [None]:
# Notebook inputs (see the "Inputs" list above).
days = 1  # number of days; used to build the default input and the output file names
data_dir = "../"  # directory prefix of the default real-data CSV
epochs = 128  # passed to PARSynthesizer as `epochs`
peaks = 1  # number of max and min values marked per time series by pick_peaks
sample_size = None  # number of sequences to sample; when falsy, the number of unique datapoint_id values is used
datafilename = None  # explicit CSV path; when falsy, the path is built from data_dir and days

### Read real data

In [None]:
# Use the explicit file when one is given, otherwise the default file name
# derived from `data_dir` and `days`. The first CSV column becomes the index.
real_data = pd.read_csv(
    datafilename or f"{data_dir}real_data_sdv_{days}_days.csv",
    index_col=0,
)

# When no sample size is given, sample as many sequences as there are
# distinct datapoint_id values in the real data.
sample_size = sample_size or len(real_data["datapoint_id"].unique())

In [None]:
real_data

### Add peaks and valleys

In [None]:
def _mark_extremes(df, value_col, peak_col, flag_col, count, min_unique=0):
    """Mark the largest and smallest values of every ``datapoint_id`` series.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data with a ``datapoint_id`` column and the ``value_col`` column.
    value_col : str
        Name of the column holding the time-series values.
    peak_col : str
        Name of the output column receiving the extreme value (0.0 elsewhere).
    flag_col : str
        Name of the output column receiving 1 for a max, -1 for a min and
        0 elsewhere.
    count : int
        The number of max and min values to grab per series.
    min_unique : int
        A series is only searched when it has more than this many distinct
        values.

    Returns
    -------
    pandas.DataFrame
        Columns ``peak_col`` and ``flag_col``, indexed like the rows of ``df``
        that belong to a group.
    """
    frames = []
    for _, series in df.groupby("datapoint_id", sort=False)[value_col]:
        # The peak column is float from the start so that writing float
        # extremes into it never needs an implicit dtype upcast.
        marks = pd.DataFrame({peak_col: 0.0, flag_col: 0}, index=series.index)

        if len(series.unique()) > min_unique:
            # idxmax/idxmin skip missing values anyway; dropping them up
            # front lets the emptiness check below end the search cleanly.
            remaining = series.dropna()
            for _ in range(count):
                if remaining.empty:
                    # Fewer points than requested extremes: nothing left.
                    break
                imax, imin = remaining.idxmax(), remaining.idxmin()
                marks.at[imax, peak_col] = remaining.max()
                marks.at[imax, flag_col] = 1
                # Written second on purpose: when max and min share a label
                # (flat series or a single remaining point) the row ends up
                # flagged as a minimum.
                marks.at[imin, peak_col] = remaining.min()
                marks.at[imin, flag_col] = -1
                # Drop each label only once; dropping the same label twice
                # would raise KeyError.
                labels = [imax] if imax == imin else [imax, imin]
                remaining = remaining.drop(labels)
        frames.append(marks)

    if not frames:
        # No groups at all (e.g. empty input): return empty, typed columns.
        return pd.DataFrame({
            peak_col: pd.Series(dtype="float64"),
            flag_col: pd.Series(dtype="int64"),
        })
    return pd.concat(frames)


def pick_peaks(df, count=1):
    """ Select max and min values for each time series and add them to a dataframe

    For every ``datapoint_id`` group the ``count`` largest and smallest
    values of ``energy_elec`` and ``energy_gas`` are marked in four new
    columns: ``epeak``/``gpeak`` hold the extreme value (0.0 elsewhere) and
    ``iepeak``/``igpeak`` hold 1 for a maximum, -1 for a minimum and 0
    elsewhere. The new columns are aligned with ``df`` on its index.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with ``datapoint_id``, ``energy_elec`` and ``energy_gas``
        columns.
    count : int
        The number of max and min values to grab.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the four peak columns inserted right after the
        first four columns of ``df``.
    """
    elec_marks = _mark_extremes(df, "energy_elec", "epeak", "iepeak", count)

    # Don't look for min/max if the gas series is flat or zero (often there
    # are no gas measurements): require more than two distinct values.
    gas_marks = _mark_extremes(df, "energy_gas", "gpeak", "igpeak", count,
                               min_unique=2)

    ndf = pd.concat([df, elec_marks, gas_marks], axis=1)

    # Reorder: first four original columns, the four peak columns, the rest.
    cols = list(ndf.columns)
    return ndf[cols[:4] + cols[-4:] + cols[4:-4]].copy()

In [None]:
%%time
real_data = pick_peaks(real_data, peaks)

In [None]:
real_data

In [None]:
real_data.to_csv(f"real_data_sdv_{days}_days_{peaks}_peaks.csv")

#### Manipulate data to conform to the SDV data flow
- Treat every numeric feature with fewer than 30 unique values as a string so that it is handled as categorical

In [None]:
%%time
# Every int64/float64 column after the first four that has fewer than 30
# distinct values is converted to strings so it is treated as categorical.
for col, dt in real_data.dtypes.iloc[4:].items():
    is_numeric = dt in ("float64", "int64")
    if is_numeric and len(real_data[col].unique()) < 30:
        real_data[col] = real_data[col].astype(str)

In [None]:
#data_in_df[":ext_roof_cond"].unique()

In [None]:
real_data.dtypes

### Define and handle metadata

In [None]:
metadata = sdv.metadata.SingleTableMetadata()

In [None]:
metadata.detect_from_dataframe(real_data)

In [None]:
metadata.visualize()

#### Adjusting metadata for timeseries

In [None]:
# update metadata for datapoint_id, set to id and hex string to avoid duplications
metadata.update_column(column_name='datapoint_id', sdtype='id', regex_format='[0-9a-f]{6}')#, regex='[0-9a-f]{32}')

In [None]:
# Each datapoint_id (unique) holds a timeseries
metadata.set_sequence_key(column_name='datapoint_id')

In [None]:
# The index for timeseries is the Timestamp
metadata.set_sequence_index(column_name='Timestamp')

In [None]:
metadata.visualize()

In [None]:
metadata.to_dict()

#### Adapt the input data to the model
- Set context columns (features)

In [None]:
# Columns that are not context: the sequence key, the sequence index, the
# measured series and the peak markers derived from them.
non_context_cols = {'datapoint_id', 'Timestamp',
                    'energy_elec', 'energy_gas',
                    'epeak', 'iepeak',
                    'gpeak', 'igpeak'}
# Every other column is passed to the synthesizer as a context column.
# Filtering in DataFrame column order (instead of taking a set difference)
# keeps the order of the list identical from run to run.
context_cols = [col for col in real_data.columns if col not in non_context_cols]
#-set([':datapoint_id', 'datapoint_id', 'Timestamp', 'energy_elec', 'energy_gas', 'Total Energy', 'bldg_conditioned_floor_area_m_sq']))

In [None]:
context_cols

## Fit synthesizer

In [None]:
#epochs = 10

In [None]:
# PAR synthesizer over the sequence metadata defined above; the non-sequence
# columns are supplied as context and training progress is printed.
synthesizer = PARSynthesizer(metadata, context_columns=context_cols, epochs=epochs, verbose=True)
    #segment_size=7)#128

|days| RAM| VRAM| Epochs|Fit| Gen|
|---|---|---|---|---|---|
|1|~3.5Gb|~0.6Gb| 10|~3min|~3min|
|1|~3.5Gb|~0.6Gb| 128|~42min|~9min|

In [None]:
%%time
synthesizer.fit(real_data)

## Sample synthetic data (SD) and save model

In [None]:
%%time
synthetic_data = synthesizer.sample(num_sequences=sample_size)

In [None]:
synthetic_data.head()

In [None]:
synthetic_data.describe()

In [None]:
real_data.describe()

In [None]:
synthetic_data[:24]

In [None]:
real_data

In [None]:
# save model
synthesizer.save(f'quick_test_PAR_full_cols_{days}_days_{peaks}_peaks.pkl')

In [None]:
# dump data
synthetic_data.to_csv(f"synthetic_data_sdv_{days}_days_{peaks}_peaks.csv")

## Done!