In [44]:
# !git clone https://github.com/jorgpg5/synthetic_data.git

fatal: destination path 'synthetic_data' already exists and is not an empty directory.


# Synthesize Time Series data from your own DataFrame

This Blueprint demonstrates how to create synthetic time series data with Gretel. We assume that within the dataset
there is at least:

1) A specific column holding time data points

2) One or more columns that contain measurements or numerical observations for each point in time.

For this Blueprint, we will generate a very simple sine wave as our time series data.

In [45]:
# %%capture

# !pip install pyyaml smart_open numpy pandas optuna matplotlib
# !pip install -U gretel-client

In [1]:
# Specify your Gretel API key

from getpass import getpass
import pandas as pd
from gretel_client import configure_session, ClientConfig

pd.set_option('max_colwidth', None)

configure_session(ClientConfig(api_key=getpass(prompt="Enter Gretel API key"), 
                               endpoint="https://api.gretel.cloud"))

Enter Gretel API key········


In [2]:
import datetime
import optuna
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv('dp_1.csv')

df

Unnamed: 0,Timestamp,BlinkL,BlinkR,Close_car,Dist_laneL,Dist_laneR,Ego_speed,FCW,Gral_pedestrian,GPS_data_type,...,Right_thumb_z,Right_thumb_vis,Left_hip_x,Left_hip_y,Left_hip_z,Left_hip_vis,Right_hip_x,Right_hip_y,Right_hip_z,Right_hip_vis
0,1620700292389898,0,0,0,0.00,0.00,0,0,0,$GPRMC,...,-0.002471,0.622854,0.469241,0.158402,0.094456,0.996623,0.450585,0.290386,0.277914,0.998372
1,1620700292444904,0,0,0,-1.88,1.88,0,0,0,$GPRMC,...,-0.002730,0.671047,0.476152,0.174349,0.070491,0.994896,0.457167,0.305354,0.257861,0.997148
2,1620700292520406,0,0,0,-1.88,1.88,0,0,0,$GPRMC,...,-0.002738,0.590319,0.474365,0.167934,0.084372,0.997750,0.453799,0.303305,0.273701,0.998800
3,1620700292589380,0,0,0,-1.88,1.88,0,0,0,$GPRMC,...,-0.002750,0.585817,0.475759,0.174586,0.081895,0.997823,0.453397,0.304718,0.269305,0.998857
4,1620700292670665,0,0,0,-1.88,1.88,0,0,0,$GPRMC,...,-0.002744,0.688776,0.471799,0.170256,0.068155,0.991953,0.453873,0.308401,0.238535,0.996843
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22598,1620702606662417,0,0,0,-1.88,1.88,0,0,0,$GPRMC,...,-0.002499,0.225026,0.468305,0.250714,0.215763,0.990552,0.455707,0.387940,0.350800,0.991968
22599,1620702606754876,0,0,0,-1.88,1.88,0,0,0,$GPRMC,...,-0.002487,0.354484,0.475765,0.261941,0.170480,0.999026,0.459822,0.408762,0.331405,0.999169
22600,1620702606847360,0,0,0,-1.88,1.88,0,0,0,$GPRMC,...,-0.002497,0.234411,0.475760,0.272552,0.099595,0.987154,0.463147,0.433973,0.267212,0.986010
22601,1620702606938313,0,0,0,-1.88,1.88,0,0,0,$GPRMC,...,-0.002495,0.445996,0.478145,0.272602,0.130408,0.970782,0.463505,0.435549,0.289939,0.971532


In [3]:
all_cols = list(df.columns.values.tolist())
len(all_cols)

171

In [4]:
cols_to_remove = ['Timestamp', 'GPS_data_type', 'Validity', 'North_South', 
                  'East_West', 'Checksum', 'Date_stamp', 'Speed']
cols = all_cols.copy()
for elem in cols_to_remove:
  cols.remove(elem)
len(cols)

163

In [5]:

from plotly.subplots import make_subplots
import plotly.graph_objects as go

fig = make_subplots(rows=9, cols=1)

fig.append_trace(go.Scatter(
    y=df.Ego_speed,
    name='Ego_speed',
), row=1, col=1)

fig.append_trace(go.Scatter(
    y=df.Lattitude,
    name='Lattitude',
), row=2, col=1)

fig.append_trace(go.Scatter(
    y=df.Longitude,
    name='Longitude',
), row=3, col=1)

fig.append_trace(go.Scatter(
    y=df.Accel_X,
    name='Accel_X',
), row=4, col=1)

fig.append_trace(go.Scatter(
    y=df.Accel_Y,
    name='Accel_Y',
), row=5, col=1)

fig.append_trace(go.Scatter(
    y=df.Accel_Z,
    name='Accel_Z',
), row=6, col=1)

fig.append_trace(go.Scatter(
    y=df.Headway,
    name='Headway',
), row=7, col=1)

fig.append_trace(go.Scatter(
    y=df.Number_obstacles,
    name='Number_obstacles',
), row=8, col=1)

fig.append_trace(go.Scatter(
    y=df.Left_elbow_x,
    name='Left_elbow_x',
), row=9, col=1)

fig.update_layout(height=1200, width=1200, title_text="Individual channels")
fig.show()

## Reducing decimal places

In [6]:
# round decimals to 4 places

cols_to_reduce_decimal_places = ['Lane_curvature',  'Nose_x', 'Nose_y', 'Nose_z', 'Nose_vis', 'Left_eye_inner_x',
 'Left_eye_inner_y', 'Left_eye_inner_z', 'Left_eye_inner_vis', 'Left_eye_x', 'Left_eye_y', 'Left_eye_z',
 'Left_eye_vis', 'Left_eye_outer_x', 'Left_eye_outer_y', 'Left_eye_outer_z', 'Left_eye_outer_vis', 'Right_eye_inner_x',
 'Right_eye_inner_y', 'Right_eye_inner_z', 'Right_eye_inner_vis', 'Right_eye_x', 'Right_eye_y', 'Right_eye_z',
 'Right_eye_vis', 'Right_eye_outer_x', 'Right_eye_outer_y', 'Right_eye_outer_z', 'Right_eye_outer_vis', 'Left_ear_x',
 'Left_ear_y', 'Left_ear_z', 'Left_ear_vis', 'Right_ear_x', 'Right_ear_y', 'Right_ear_z', 'Right_ear_vis', 'Mouth_left_x',
 'Mouth_left_y', 'Mouth_left_z', 'Mouth_left_vis', 'Mouth_right_x', 'Mouth_right_y', 'Mouth_right_z', 'Mouth_right_vis',
 'Left_shoulder_x', 'Left_shoulder_y', 'Left_shoulder_z', 'Left_shoulder_vis', 'Right_shoulder_x', 'Right_shoulder_y',
 'Right_shoulder_z', 'Right_shoulder_vis', 'Left_elbow_x', 'Left_elbow_y', 'Left_elbow_z', 'Left_elbow_vis', 
 'Right_elbow_x', 'Right_elbow_y', 'Right_elbow_z', 'Right_elbow_vis', 'Left_wrist_x', 'Left_wrist_y', 'Left_wrist_z',
 'Left_wrist_vis', 'Right_wrist_x', 'Right_wrist_y', 'Right_wrist_z', 'Right_wrist_vis', 'Left_pinky_x',
 'Left_pinky_y', 'Left_pinky_z', 'Left_pinky_vis', 'Right_pinky_x', 'Right_pinky_y', 'Right_pinky_z', 'Right_pinky_vis',
 'Left_index_x', 'Left_index_y', 'Left_index_z', 'Left_index_vis', 'Right_index_x', 'Right_index_y', 'Right_index_z',
 'Right_index_vis', 'Left_thumb_x', 'Left_thumb_y', 'Left_thumb_z', 'Left_thumb_vis', 'Right_thumb_x', 'Right_thumb_y',
 'Right_thumb_z', 'Right_thumb_vis', 'Left_hip_x', 'Left_hip_y', 'Left_hip_z', 'Left_hip_vis', 'Right_hip_x',
 'Right_hip_y', 'Right_hip_z', 'Right_hip_vis']

In [7]:
decimal_places = 4
train_df = df.copy()
for name in cols_to_reduce_decimal_places:
    train_df[name] = train_df[name].round(decimals=decimal_places)

In [8]:
train_df

Unnamed: 0,Timestamp,BlinkL,BlinkR,Close_car,Dist_laneL,Dist_laneR,Ego_speed,FCW,Gral_pedestrian,GPS_data_type,...,Right_thumb_z,Right_thumb_vis,Left_hip_x,Left_hip_y,Left_hip_z,Left_hip_vis,Right_hip_x,Right_hip_y,Right_hip_z,Right_hip_vis
0,1620700292389898,0,0,0,0.00,0.00,0,0,0,$GPRMC,...,-0.0025,0.6229,0.4692,0.1584,0.0945,0.9966,0.4506,0.2904,0.2779,0.9984
1,1620700292444904,0,0,0,-1.88,1.88,0,0,0,$GPRMC,...,-0.0027,0.6710,0.4762,0.1743,0.0705,0.9949,0.4572,0.3054,0.2579,0.9971
2,1620700292520406,0,0,0,-1.88,1.88,0,0,0,$GPRMC,...,-0.0027,0.5903,0.4744,0.1679,0.0844,0.9977,0.4538,0.3033,0.2737,0.9988
3,1620700292589380,0,0,0,-1.88,1.88,0,0,0,$GPRMC,...,-0.0028,0.5858,0.4758,0.1746,0.0819,0.9978,0.4534,0.3047,0.2693,0.9989
4,1620700292670665,0,0,0,-1.88,1.88,0,0,0,$GPRMC,...,-0.0027,0.6888,0.4718,0.1703,0.0682,0.9920,0.4539,0.3084,0.2385,0.9968
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22598,1620702606662417,0,0,0,-1.88,1.88,0,0,0,$GPRMC,...,-0.0025,0.2250,0.4683,0.2507,0.2158,0.9906,0.4557,0.3879,0.3508,0.9920
22599,1620702606754876,0,0,0,-1.88,1.88,0,0,0,$GPRMC,...,-0.0025,0.3545,0.4758,0.2619,0.1705,0.9990,0.4598,0.4088,0.3314,0.9992
22600,1620702606847360,0,0,0,-1.88,1.88,0,0,0,$GPRMC,...,-0.0025,0.2344,0.4758,0.2726,0.0996,0.9872,0.4631,0.4340,0.2672,0.9860
22601,1620702606938313,0,0,0,-1.88,1.88,0,0,0,$GPRMC,...,-0.0025,0.4460,0.4781,0.2726,0.1304,0.9708,0.4635,0.4355,0.2899,0.9715


# Data As-Is

This part of the code generates synthetic data using the time series task with the data as-is. The hyperparameters we found that work best have been included for this run.

In [9]:
from smart_open import open
import yaml

from gretel_client import create_project
from gretel_client.helpers import poll

# train_df = pd.read_csv('dp_1.csv')

# Create a project and model configuration.
project = create_project(display_name="time-series-synthetic-data-as-is")

# Pull down the default synthetic config.  We will modify it slightly.
with open("https://raw.githubusercontent.com/gretelai/gretel-blueprints/main/config_templates/gretel/synthetics/default.yml", 'r') as stream:
    config = yaml.safe_load(stream)

# Here we create an object to specify the timeseries task.
time_field = "Timestamp"
trend_fields = cols

task = {
    'type': 'time_series',
    'attrs': {
        'time_field': time_field,
        'trend_fields': trend_fields
    }
}

config['models'][0]['synthetics']['task'] = task

config['models'][0]['synthetics']['params']['vocab_size'] = 0
config['models'][0]['synthetics']['params']['predict_batch_size'] = 1
config['models'][0]['synthetics']['params']['reset_states'] = True
config['models'][0]['synthetics']['params']['overwrite'] = True
config['models'][0]['synthetics']['params']['validation_split'] = False #Our validation split does not support time series tasks yet. Updated config will be uploaded soon.
config['models'][0]['synthetics']['params']['dropout_rate'] = .25 #0.5
config['models'][0]['synthetics']['params']['gen_temp'] = .750 #.898
config['models'][0]['synthetics']['params']['learning_rate'] = .001 #0.0035
config['models'][0]['synthetics']['params']['rnn_units'] = 128 #64

model = project.create_model_obj(model_config=config)

# Get a csv to work with, just dump out the train_df.
train_df.to_csv('train.csv', index=False)
model.data_source = 'train.csv'

# Upload the training data.  Train the model.
model.submit(upload_data_source=True)

poll(model)

# Use the model to generate synthetic data.
record_handler = model.create_record_handler_obj()

# For time series data we dump out the date column to seed the record handler.
train_df.Timestamp.to_csv('idx_seeds.csv', index=False)

record_handler.submit(
    action="generate",
    params={"num_records": 22600, "max_invalid": 20000},
    data_source='idx_seeds.csv',
    upload_data_source=True
)

poll(record_handler)

synthetic = pd.read_csv(record_handler.get_artifact_link("data"), compression='gzip')

synthetic.head()

train_df.Ego_speed.head(1000).plot(figsize=(12, 8))
plt.show()

synthetic.Ego_speed.head(1000).plot(figsize=(12, 8))
plt.show()

INFO: Starting poller


{
    "uid": "615991f939ee90d6a7bb208e",
    "model_name": "sneaky-honorable-dinasaur",
    "runner_mode": "cloud",
    "user_id": "60f7ffc2bff621796155eab8",
    "project_id": "615991e0b7179bdd8128daf1",
    "logs": null,
    "status_history": {
        "created": "2021-10-03T11:20:25.969067Z"
    },
    "last_modified": "2021-10-03T11:20:26.205263Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "error_id": null,
    "traceback": null,
    "container_image": "074762682575.dkr.ecr.us-west-2.amazonaws.com/gretelai/synthetics@sha256:42c01cffaa364d53cfc00e91e5df33050451079e6c12b62a99c438f7dbe6743d",
    "model_type": "synthetics",
    "config": {
        "schema_version": "1.0",
        "name": null,
        "models": [
            {
                "synthetics": {
                    "params": {
                        "field_delimiter": null,
                        "epochs": 100,
              

INFO: Status is created. Model creation has been queued.
INFO: Status is pending. A Gretel Cloud worker is being allocated to begin model creation.
INFO: Status is active. A worker has started creating your model!
2021-10-03T11:20:46.814991Z  Starting synthetic model training
2021-10-03T11:20:46.817101Z  Loading training data
2021-10-03T11:20:47.730836Z  Training data loaded
{
    "record_count": 22603,
    "field_count": 171
}
2021-10-03T11:21:00.102742Z  Creating semantic validators and preparing training data
2021-10-03T11:24:28.652472Z  Beginning ML model training
2021-10-03T11:25:06.562070Z  Training epoch completed
{
    "epoch": 0,
    "accuracy": 0.7066,
    "loss": 0.9625,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2021-10-03T11:25:20.334930Z  Training epoch completed
{
    "epoch": 1,
    "accuracy": 0.7577,
    "loss": 0.7192,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2021-10-03T11:25:34.128492Z  Training epoch completed
{
    "epoch": 2,
   

2021-10-03T11:35:11.442023Z  Training epoch completed
{
    "epoch": 44,
    "accuracy": 0.8202,
    "loss": 0.5185,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2021-10-03T11:35:25.214144Z  Training epoch completed
{
    "epoch": 45,
    "accuracy": 0.8203,
    "loss": 0.5179,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2021-10-03T11:35:38.940625Z  Training epoch completed
{
    "epoch": 46,
    "accuracy": 0.8202,
    "loss": 0.5175,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2021-10-03T11:35:52.699266Z  Training epoch completed
{
    "epoch": 47,
    "accuracy": 0.8203,
    "loss": 0.5174,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2021-10-03T11:36:06.406662Z  Training epoch completed
{
    "epoch": 48,
    "accuracy": 0.8203,
    "loss": 0.5173,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2021-10-03T11:36:20.132078Z  Training epoch completed
{
    "epoch": 49,
    "accuracy": 0.8206,
    "loss": 0.5169,
   

2021-10-03T11:41:37.688967Z  Training epoch completed
{
    "epoch": 5,
    "accuracy": 0.9238,
    "loss": 0.2634,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 2
}
2021-10-03T11:41:43.125729Z  Training epoch completed
{
    "epoch": 6,
    "accuracy": 0.926,
    "loss": 0.2545,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 2
}
2021-10-03T11:41:48.518578Z  Training epoch completed
{
    "epoch": 7,
    "accuracy": 0.9279,
    "loss": 0.247,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 2
}
2021-10-03T11:41:53.940986Z  Training epoch completed
{
    "epoch": 8,
    "accuracy": 0.9291,
    "loss": 0.2417,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 2
}
2021-10-03T11:41:59.348099Z  Training epoch completed
{
    "epoch": 9,
    "accuracy": 0.9303,
    "loss": 0.2368,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 2
}
2021-10-03T11:42:04.758483Z  Training epoch completed
{
    "epoch": 10,
    "accuracy": 0.9311,
    "loss": 0.2336,
    "val_a

2021-10-03T11:45:56.540985Z  Training epoch completed
{
    "epoch": 13,
    "accuracy": 0.9893,
    "loss": 0.0401,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 3
}
2021-10-03T11:46:01.837059Z  Training epoch completed
{
    "epoch": 14,
    "accuracy": 0.9893,
    "loss": 0.0401,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 3
}
2021-10-03T11:46:07.066676Z  Training epoch completed
{
    "epoch": 15,
    "accuracy": 0.9893,
    "loss": 0.0401,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 3
}
2021-10-03T11:46:12.395758Z  Training epoch completed
{
    "epoch": 16,
    "accuracy": 0.9894,
    "loss": 0.0402,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 3
}
2021-10-03T11:46:51.370338Z  Training epoch completed
{
    "epoch": 0,
    "accuracy": 0.7675,
    "loss": 0.7211,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 4
}
2021-10-03T11:47:12.354932Z  Training epoch completed
{
    "epoch": 1,
    "accuracy": 0.8066,
    "loss": 0.5484,
    "

2021-10-03T12:01:59.194660Z  Training epoch completed
{
    "epoch": 43,
    "accuracy": 0.86,
    "loss": 0.3937,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 4
}
2021-10-03T12:02:23.371020Z  Training epoch completed
{
    "epoch": 44,
    "accuracy": 0.86,
    "loss": 0.3934,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 4
}
2021-10-03T12:02:47.367723Z  Training epoch completed
{
    "epoch": 45,
    "accuracy": 0.86,
    "loss": 0.3931,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 4
}
2021-10-03T12:03:11.586555Z  Training epoch completed
{
    "epoch": 46,
    "accuracy": 0.8601,
    "loss": 0.393,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 4
}
2021-10-03T12:03:35.669466Z  Training epoch completed
{
    "epoch": 47,
    "accuracy": 0.8603,
    "loss": 0.3928,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 4
}
2021-10-03T12:03:59.940541Z  Training epoch completed
{
    "epoch": 48,
    "accuracy": 0.8604,
    "loss": 0.3926,
    "val_a

2021-10-03T12:15:56.958553Z  Training epoch completed
{
    "epoch": 37,
    "accuracy": 0.8861,
    "loss": 0.329,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 5
}
2021-10-03T12:16:13.151558Z  Training epoch completed
{
    "epoch": 38,
    "accuracy": 0.8863,
    "loss": 0.3286,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 5
}
2021-10-03T12:16:29.285231Z  Training epoch completed
{
    "epoch": 39,
    "accuracy": 0.8864,
    "loss": 0.3287,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 5
}
2021-10-03T12:16:45.471790Z  Training epoch completed
{
    "epoch": 40,
    "accuracy": 0.8864,
    "loss": 0.3281,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 5
}
2021-10-03T12:17:01.692874Z  Training epoch completed
{
    "epoch": 41,
    "accuracy": 0.8865,
    "loss": 0.3277,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 5
}
2021-10-03T12:17:17.828541Z  Training epoch completed
{
    "epoch": 42,
    "accuracy": 0.8865,
    "loss": 0.3277,
    

2021-10-03T12:23:47.038836Z  Training epoch completed
{
    "epoch": 35,
    "accuracy": 0.8798,
    "loss": 0.3402,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 6
}
2021-10-03T12:23:54.927867Z  Training epoch completed
{
    "epoch": 36,
    "accuracy": 0.8798,
    "loss": 0.3399,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 6
}
2021-10-03T12:24:02.832453Z  Training epoch completed
{
    "epoch": 37,
    "accuracy": 0.88,
    "loss": 0.3392,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 6
}
2021-10-03T12:24:10.692198Z  Training epoch completed
{
    "epoch": 38,
    "accuracy": 0.8801,
    "loss": 0.3391,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 6
}
2021-10-03T12:24:18.554866Z  Training epoch completed
{
    "epoch": 39,
    "accuracy": 0.8802,
    "loss": 0.3386,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 6
}
2021-10-03T12:24:26.444394Z  Training epoch completed
{
    "epoch": 40,
    "accuracy": 0.8803,
    "loss": 0.3384,
    "

2021-10-03T12:29:50.132808Z  Training epoch completed
{
    "epoch": 27,
    "accuracy": 0.9346,
    "loss": 0.1931,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 7
}
2021-10-03T12:29:57.445413Z  Training epoch completed
{
    "epoch": 28,
    "accuracy": 0.9345,
    "loss": 0.1929,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 7
}
2021-10-03T12:30:04.800224Z  Training epoch completed
{
    "epoch": 29,
    "accuracy": 0.9347,
    "loss": 0.1928,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 7
}
2021-10-03T12:30:12.133055Z  Training epoch completed
{
    "epoch": 30,
    "accuracy": 0.9346,
    "loss": 0.1924,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 7
}
2021-10-03T12:30:19.478311Z  Training epoch completed
{
    "epoch": 31,
    "accuracy": 0.9347,
    "loss": 0.1922,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 7
}
2021-10-03T12:30:26.820771Z  Training epoch completed
{
    "epoch": 32,
    "accuracy": 0.9347,
    "loss": 0.1919,
   

2021-10-03T14:13:07.130570Z  Training epoch completed
{
    "epoch": 20,
    "accuracy": 0.884,
    "loss": 0.3308,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 8
}
2021-10-03T14:13:35.062836Z  Training epoch completed
{
    "epoch": 21,
    "accuracy": 0.8842,
    "loss": 0.3303,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 8
}
2021-10-03T14:14:02.900127Z  Training epoch completed
{
    "epoch": 22,
    "accuracy": 0.8842,
    "loss": 0.3302,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 8
}
2021-10-03T14:14:30.913839Z  Training epoch completed
{
    "epoch": 23,
    "accuracy": 0.8844,
    "loss": 0.3294,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 8
}
2021-10-03T14:14:58.725937Z  Training epoch completed
{
    "epoch": 24,
    "accuracy": 0.8844,
    "loss": 0.3292,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 8
}
2021-10-03T14:15:25.620538Z  Training epoch completed
{
    "epoch": 25,
    "accuracy": 0.8845,
    "loss": 0.3288,
    

2021-10-03T14:29:28.120615Z  Training epoch completed
{
    "epoch": 29,
    "accuracy": 0.8676,
    "loss": 0.3564,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 9
}
2021-10-03T14:29:47.720622Z  Training epoch completed
{
    "epoch": 30,
    "accuracy": 0.8677,
    "loss": 0.3559,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 9
}
2021-10-03T14:30:04.942433Z  Training epoch completed
{
    "epoch": 31,
    "accuracy": 0.8677,
    "loss": 0.356,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 9
}
2021-10-03T14:30:22.103818Z  Training epoch completed
{
    "epoch": 32,
    "accuracy": 0.8678,
    "loss": 0.3556,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 9
}
2021-10-03T14:30:39.303869Z  Training epoch completed
{
    "epoch": 33,
    "accuracy": 0.8681,
    "loss": 0.3553,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 9
}
2021-10-03T14:30:56.519809Z  Training epoch completed
{
    "epoch": 34,
    "accuracy": 0.868,
    "loss": 0.3552,
    "

2021-10-03T14:42:57.275305Z  Training epoch completed
{
    "epoch": 32,
    "accuracy": 0.9046,
    "loss": 0.2755,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 10
}
2021-10-03T14:43:14.056228Z  Training epoch completed
{
    "epoch": 33,
    "accuracy": 0.9046,
    "loss": 0.2754,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 10
}
2021-10-03T14:43:30.832759Z  Training epoch completed
{
    "epoch": 34,
    "accuracy": 0.9047,
    "loss": 0.2753,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 10
}
2021-10-03T14:43:47.604035Z  Training epoch completed
{
    "epoch": 35,
    "accuracy": 0.9048,
    "loss": 0.275,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 10
}
2021-10-03T14:44:04.314189Z  Training epoch completed
{
    "epoch": 36,
    "accuracy": 0.9047,
    "loss": 0.2751,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 10
}
2021-10-03T14:44:20.972478Z  Training epoch completed
{
    "epoch": 37,
    "accuracy": 0.9048,
    "loss": 0.2747,

2021-10-03T14:47:16.945480Z  Generation in progress
{
    "current_valid_count": 2,
    "current_invalid_count": 116,
    "new_valid_count": 1,
    "new_invalid_count": 8,
    "completion_percent": 0.04
}
ERROR: 	Record generation was terminated. The ratio of invalid records is above the set threshold of 0.6666666666666666. Current ratio: 0.98


ApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Content-Type': 'application/json', 'Content-Length': '76', 'Connection': 'keep-alive', 'Date': 'Sun, 03 Oct 2021 14:47:27 GMT', 'x-amzn-RequestId': 'c040cf1a-76f7-4ef7-9f35-857ac6e1d7c5', 'Access-Control-Allow-Origin': '*', 'x-amz-apigw-id': 'GotTgG_KPHcFSqQ=', 'X-Amzn-Trace-Id': 'Root=1-6159c27c-1263086f015165af73c1bea3;Sampled=0', 'Access-Control-Allow-Credentials': 'true', 'X-Cache': 'Error from cloudfront', 'Via': '1.1 c8a7df1b4956aa390fe495730eb3c9f4.cloudfront.net (CloudFront)', 'X-Amz-Cf-Pop': 'SYD62-P2', 'X-Amz-Cf-Id': '1DM0j2o1dBuVrxynwpSqMMrmpqvmRCcNexwK0tuTytCKGVJz1YsNag=='})
HTTP response body: {"message": "Model cannot be used, current status is: error", "context": {}}


# Hyperparameter Tuning

Our lead machine learning researcher, Amy, found this fantastic library called Optuna to help us tune the configs for your use case. Feel free to take this code, and play with it to find new params! This helped us make sure that the errors you were seeing went away.

In [None]:
import optuna
import yaml
import time

from smart_open import open
from gretel_client import create_project
from gretel_client.helpers import poll

# Pull down the default synthetic config.  We will modify it slightly.
with open("https://raw.githubusercontent.com/gretelai/gretel-blueprints/main/config_templates/gretel/synthetics/default.yml", 'r') as stream:
    config = yaml.safe_load(stream)

# Here we create an object to specify the timeseries task.
time_field = "Timestamp"
trend_fields = cols


task = {
    'type': 'time_series',
    'attrs': {
        'time_field': time_field,
        'trend_fields': trend_fields
    }
}

In [10]:

def objective(trial: optuna.Trial):
    
    config['models'][0]['synthetics']['task'] = task
    config['models'][0]['synthetics']['params']['predict_batch_size'] = 1
    config['models'][0]['synthetics']['params']['overwrite'] = True
    config['models'][0]['synthetics']['params']['validation_split'] = False #Our validation split does not support time series tasks yet. Updated config will be uploaded soon.

#     config['models'][0]['synthetics']['params']['vocab_size'] = trial.suggest_int(name="vocab_size", low=18, high=38, step=10)
    config['models'][0]['synthetics']['params']['rnn_units'] = trial.suggest_int(name="rnn_units", low=64, high=512, step=64)
    config['models'][0]['synthetics']['params']['dropout_rate'] = trial.suggest_float("dropout_rate", .25, .75)
    config['models'][0]['synthetics']['params']['gen_temp'] = trial.suggest_float("gen_temp", .5, 1.5)
    config['models'][0]['synthetics']['params']['learning_rate'] = trial.suggest_float("learning_rate",  .001, 0.01, log=True)
    config['models'][0]['synthetics']['params']['reset_states'] = trial.suggest_categorical(
        "reset_states", choices=[True, False])
        
    seconds = int(time.time())
    project_name = "Tuning Experiment" + str(seconds)
    project = create_project(display_name=project_name)
    
    model = project.create_model_obj(model_config=config)

    # Get a csv to work with, just dump out the train_df.
    train_df.to_csv('train.csv', index=False)
    model.data_source = 'train.csv'

    # Upload the training data.  Train the model.
    model.submit(upload_data_source=True)

    status = "active"
    sqs = 0
    while status == "active":
        #Sleep a bit here
        time.sleep(60)
        models = []
        for model in project.search_models():
            ms = model.__dict__['_data']['model']
            ms = {key: ms[key] for key in ['model_name', 'model_type', 'status'] }
            status = ms["status"]
            print("Status is: " + status)
            if status == "completed":
                report = model.peek_report()
                if report:
                    sqs = report['synthetic_data_quality_score']['score']
                    print("Retrieved report sqs: " + str(sqs))
                else:
                    sqs = 0
            elif status == "error":
                sqs = 0
            
    return sqs

In [11]:
import time
# Create study that maximizes


study = optuna.create_study(direction="maximize")

# Start optimizing with however many trials you want
# You can do just one to see that it works, and then use the cells below
# to run more trials on the same study

study.optimize(objective, n_trials=10)

print(f"Optimized SQS: {study.best_value:.5f}")

print("Best params:")
for key, value in study.best_params.items():
    print(f"\t{key}: {value}")

[32m[I 2021-10-04 00:57:14,660][0m A new study created in memory with name: no-name-e3e3f027-dc9f-49ae-b7df-a887d6defeaa[0m


Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active




Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is:

[32m[I 2021-10-04 03:37:08,750][0m Trial 0 finished with value: 0.0 and parameters: {'rnn_units': 320, 'dropout_rate': 0.28213559211687844, 'gen_temp': 0.5814971515946424, 'learning_rate': 0.001048111277247049, 'reset_states': False}. Best is trial 0 with value: 0.0.[0m


Status is: error
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: 



Status is: active




Status is: active




Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is:

[32m[I 2021-10-04 07:07:16,453][0m Trial 1 finished with value: 0.0 and parameters: {'rnn_units': 384, 'dropout_rate': 0.5233438725029607, 'gen_temp': 0.753264763732727, 'learning_rate': 0.001024961773066772, 'reset_states': False}. Best is trial 0 with value: 0.0.[0m


Status is: error
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: 

[32m[I 2021-10-04 11:01:14,270][0m Trial 2 finished with value: 0.0 and parameters: {'rnn_units': 448, 'dropout_rate': 0.49538895169540803, 'gen_temp': 1.0056285485735774, 'learning_rate': 0.0010313216508021378, 'reset_states': False}. Best is trial 0 with value: 0.0.[0m


Status is: error
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: 

[32m[I 2021-10-04 13:52:07,751][0m Trial 3 finished with value: 0.0 and parameters: {'rnn_units': 384, 'dropout_rate': 0.5139300584580571, 'gen_temp': 0.598708720239554, 'learning_rate': 0.001407107085667807, 'reset_states': False}. Best is trial 0 with value: 0.0.[0m


Status is: error
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: 



Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is:

[32m[I 2021-10-04 16:10:24,541][0m Trial 4 finished with value: 0.0 and parameters: {'rnn_units': 512, 'dropout_rate': 0.5533298478818384, 'gen_temp': 0.6056955911128048, 'learning_rate': 0.004501581450125818, 'reset_states': True}. Best is trial 0 with value: 0.0.[0m


Status is: error
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: 

[32m[I 2021-10-04 20:00:21,413][0m Trial 5 finished with value: 0.0 and parameters: {'rnn_units': 384, 'dropout_rate': 0.690145248103872, 'gen_temp': 0.7010402222377956, 'learning_rate': 0.0010120977233775384, 'reset_states': True}. Best is trial 0 with value: 0.0.[0m


Status is: error
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: 

[32m[I 2021-10-04 21:39:28,802][0m Trial 6 finished with value: 0.0 and parameters: {'rnn_units': 64, 'dropout_rate': 0.4899950303111545, 'gen_temp': 0.9964279730584428, 'learning_rate': 0.004703043830631083, 'reset_states': False}. Best is trial 0 with value: 0.0.[0m


Status is: error
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: 

[32m[I 2021-10-04 22:50:52,879][0m Trial 7 finished with value: 0.0 and parameters: {'rnn_units': 128, 'dropout_rate': 0.2896807376278733, 'gen_temp': 0.7648051856808841, 'learning_rate': 0.00443819090961483, 'reset_states': True}. Best is trial 0 with value: 0.0.[0m


Status is: error
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: 

[32m[I 2021-10-05 01:29:47,685][0m Trial 8 finished with value: 0.0 and parameters: {'rnn_units': 512, 'dropout_rate': 0.5405813730668523, 'gen_temp': 0.9483447821191541, 'learning_rate': 0.0028487001143307294, 'reset_states': True}. Best is trial 0 with value: 0.0.[0m


Status is: error
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: active
Status is: 

[32m[I 2021-10-05 03:01:40,147][0m Trial 9 finished with value: 0.0 and parameters: {'rnn_units': 384, 'dropout_rate': 0.4840435686608944, 'gen_temp': 1.233132467148896, 'learning_rate': 0.005844727401728454, 'reset_states': True}. Best is trial 0 with value: 0.0.[0m


Status is: error
Optimized SQS: 0.00000
Best params:
	rnn_units: 320
	dropout_rate: 0.28213559211687844
	gen_temp: 0.5814971515946424
	learning_rate: 0.001048111277247049
	reset_states: False
