# SDV CPAR model training

This notebook trains the CPAR model and samples synthetic data (SD).

In [None]:
from datetime import datetime, timedelta

In [None]:
import numpy as np

In [None]:
import pandas as pd

In [None]:
import sdv
from sdv.sequential import PARSynthesizer
#from sdv.constraints import Unique

### Inputs
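- Number of days (`days`), used to build the default real data file name when `datafilename` is not set
- Directory containing the real data file (`data_dir`)
- Number of training epochs (`epochs`)
- Size of the sampled synthetic data (`sample_size`), i.e. the number of sequences; defaults to the number of unique `datapoint_id` values in the real data
- Real data file name (`datafilename`); when set, it is used instead of the `days`-based default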

In [None]:
# --- Run parameters -------------------------------------------------------
# Explicit path of the real-data CSV. When left as None, the path is built
# from `data_dir` and `days` in the "Read real data" cell.
datafilename = None
data_dir = "../"
days = 1

# Number of synthetic sequences to sample. None means "one per unique
# datapoint_id in the real data" (resolved after the data is loaded).
sample_size = None

# Number of training epochs handed to the synthesizer.
epochs = 128

### Read real data

In [None]:
# An explicit `datafilename` takes precedence; otherwise fall back to the
# per-`days` default file located under `data_dir`.
csv_path = (
    datafilename
    if datafilename
    else f"{data_dir}real_data_sdv_{days}_days.csv"
)
# The first CSV column is used as the DataFrame index.
real_data = pd.read_csv(csv_path, index_col=0)

# No sample size requested: sample as many sequences as there are distinct
# datapoint_id values in the real data.
if not sample_size:
    distinct_ids = real_data["datapoint_id"].unique()
    sample_size = len(distinct_ids)

In [None]:
# Display the loaded real data for a quick visual check.
real_data

#### Manipulate data to conform to the SDV data flow
- Treat every numeric feature (`int64`/`float64`) with fewer than 30 unique values as a string to make it categorical

In [None]:
%%time
for col, dt in real_data.dtypes[4:].items():
    if dt == "float64" or dt == "int64":
        if len(real_data[col].unique()) < 30:
            real_data[col] = real_data[[col]].astype(str)

In [None]:
#data_in_df[":ext_roof_cond"].unique()

In [None]:
# Inspect the column dtypes of the real data.
real_data.dtypes

### Define and handle metadata

In [None]:
# Create an empty single-table metadata object; it is populated from the
# real data and adjusted in the following cells.
metadata = sdv.metadata.SingleTableMetadata()

In [None]:
# Let SDV infer the column metadata from the real data.
metadata.detect_from_dataframe(real_data)

In [None]:
# Render the auto-detected metadata for inspection.
metadata.visualize()

#### Adjusting metadata for timeseries

In [None]:
# Declare datapoint_id as an 'id' column whose values follow a 6-character
# lowercase hexadecimal pattern, to avoid duplicated ids.
metadata.update_column(column_name='datapoint_id', sdtype='id', regex_format='[0-9a-f]{6}')#, regex='[0-9a-f]{32}')

In [None]:
# Each unique datapoint_id identifies one time series, so it is used as the
# sequence key.
metadata.set_sequence_key(column_name='datapoint_id')

In [None]:
# The Timestamp column is the index of each time series, so it is used as
# the sequence index.
metadata.set_sequence_index(column_name='Timestamp')

In [None]:
# Render the metadata again to check the sequence key / index settings.
metadata.visualize()

In [None]:
# Show the metadata as a plain dictionary for inspection.
metadata.to_dict()

#### Adapt the input data to the model
- Set context columns (features)

In [None]:
# Every column from position 4 onwards is used as a context column.
context_cols = real_data.columns[4:].tolist()

In [None]:
# List the selected context columns.
context_cols

## Fit synthesizer

In [None]:
#epochs=10

In [None]:
# Build the PAR synthesizer from the sequence metadata.
# - context_columns: the feature columns selected in `context_cols`.
#   NOTE(review): SDV expects context columns to stay constant within each
#   sequence -- confirm the real data satisfies this.
# - verbose=True: verbose output is enabled.
# - epochs: number of training epochs, taken from the inputs cell.
synthesizer = PARSynthesizer(
    metadata,
    context_columns=context_cols,
    verbose=True,
    epochs=epochs)
    #segment_size=7)#128

|days| RAM| VRAM| Epochs|Fit| Gen|
|---|---|---|---|---|---|
|7|~14Gb|~2.5Gb| 64|~30min|~35min|
|6|~14.2Gb|~2Gb| 64|~15min|~min|
|3|~13Gb|~1.2Gb| 64|~13min|~min|
|1|~13Gb|~0.6Gb| 64|~13min|~min|
|1|~11Gb|~0.6Gb| 128|~26min|~min|
|1|~11Gb|~0.6Gb| 10|~3min|~3min|

In [None]:
%%time
# Train the synthesizer on the prepared real data; %%time reports how long
# the cell takes to run.
synthesizer.fit(real_data)

## Sample SD and save model

In [None]:
%%time
# Sample `sample_size` synthetic sequences from the fitted synthesizer;
# %%time reports how long the cell takes to run.
synthetic_data = synthesizer.sample(num_sequences=sample_size)

In [None]:
# Preview the first rows of the synthetic data.
synthetic_data.head()

In [None]:
# Summary statistics of the synthetic data, to compare against the real data.
synthetic_data.describe()

In [None]:
# Summary statistics of the real data, to compare against the synthetic data.
real_data.describe()

In [None]:
# Display the synthetic data.
synthetic_data

In [None]:
# Display the real data for a side-by-side look with the synthetic data.
real_data

In [None]:
# Save the fitted synthesizer to a pickle file in the current working
# directory; the file name is tagged with `days`.
synthesizer.save(f'quick_test_PAR_full_cols_{days}_days.pkl')

In [None]:
# Write the synthetic data to a CSV file in the current working directory;
# the file name is tagged with `days`.
synthetic_data.to_csv(f"synthetic_data_sdv_{days}_days.csv")

## Done!