In [44]:
# !git clone https://github.com/jorgpg5/synthetic_data.git

fatal: destination path 'synthetic_data' already exists and is not an empty directory.


# Synthesize Time Series data from your own DataFrame

This Blueprint demonstrates how to create synthetic time series data with Gretel. We assume that within the dataset
there is at least:

1) A specific column holding time data points

2) One or more columns that contain measurements or numerical observations for each point in time.

For this Blueprint, we will generate a very simple sine wave as our time series data.

In [45]:
# %%capture

# !pip install pyyaml smart_open numpy pandas optuna matplotlib
# !pip install -U gretel-client

In [1]:
# Specify your Gretel API key

from getpass import getpass
import pandas as pd
from gretel_client import configure_session, ClientConfig

pd.set_option('max_colwidth', None)

configure_session(ClientConfig(api_key=getpass(prompt="Enter Gretel API key"), 
                               endpoint="https://api.gretel.cloud"))

Enter Gretel API key········


In [16]:
import datetime
import optuna
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv('dp_1.csv')

df

Unnamed: 0,Timestamp,BlinkL,BlinkR,Close_car,Dist_laneL,Dist_laneR,Ego_speed,FCW,Gral_pedestrian,GPS_data_type,...,Right_thumb_z,Right_thumb_vis,Left_hip_x,Left_hip_y,Left_hip_z,Left_hip_vis,Right_hip_x,Right_hip_y,Right_hip_z,Right_hip_vis
0,1620700292389898,0,0,0,0.00,0.00,0,0,0,$GPRMC,...,-0.002471,0.622854,0.469241,0.158402,0.094456,0.996623,0.450585,0.290386,0.277914,0.998372
1,1620700292444904,0,0,0,-1.88,1.88,0,0,0,$GPRMC,...,-0.002730,0.671047,0.476152,0.174349,0.070491,0.994896,0.457167,0.305354,0.257861,0.997148
2,1620700292520406,0,0,0,-1.88,1.88,0,0,0,$GPRMC,...,-0.002738,0.590319,0.474365,0.167934,0.084372,0.997750,0.453799,0.303305,0.273701,0.998800
3,1620700292589380,0,0,0,-1.88,1.88,0,0,0,$GPRMC,...,-0.002750,0.585817,0.475759,0.174586,0.081895,0.997823,0.453397,0.304718,0.269305,0.998857
4,1620700292670665,0,0,0,-1.88,1.88,0,0,0,$GPRMC,...,-0.002744,0.688776,0.471799,0.170256,0.068155,0.991953,0.453873,0.308401,0.238535,0.996843
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22598,1620702606662417,0,0,0,-1.88,1.88,0,0,0,$GPRMC,...,-0.002499,0.225026,0.468305,0.250714,0.215763,0.990552,0.455707,0.387940,0.350800,0.991968
22599,1620702606754876,0,0,0,-1.88,1.88,0,0,0,$GPRMC,...,-0.002487,0.354484,0.475765,0.261941,0.170480,0.999026,0.459822,0.408762,0.331405,0.999169
22600,1620702606847360,0,0,0,-1.88,1.88,0,0,0,$GPRMC,...,-0.002497,0.234411,0.475760,0.272552,0.099595,0.987154,0.463147,0.433973,0.267212,0.986010
22601,1620702606938313,0,0,0,-1.88,1.88,0,0,0,$GPRMC,...,-0.002495,0.445996,0.478145,0.272602,0.130408,0.970782,0.463505,0.435549,0.289939,0.971532


In [28]:
all_cols = list(df.columns.values.tolist())
len(all_cols)

171

In [29]:
cols_to_remove = ['GPS_data_type', 'GPS_timestamp', 'Validity', 'North_South', 
                  'East_West', 'Checksum', 'Date_stamp', 'Speed']
cols = all_cols.copy()
for elem in cols_to_remove:
  cols.remove(elem)
len(cols)

163

In [30]:

from plotly.subplots import make_subplots
import plotly.graph_objects as go

fig = make_subplots(rows=9, cols=1)

fig.append_trace(go.Scatter(
    y=df.Ego_speed,
    name='Ego_speed',
), row=1, col=1)

fig.append_trace(go.Scatter(
    y=df.Lattitude,
    name='Lattitude',
), row=2, col=1)

fig.append_trace(go.Scatter(
    y=df.Longitude,
    name='Longitude',
), row=3, col=1)

fig.append_trace(go.Scatter(
    y=df.Accel_X,
    name='Accel_X',
), row=4, col=1)

fig.append_trace(go.Scatter(
    y=df.Accel_Y,
    name='Accel_Y',
), row=5, col=1)

fig.append_trace(go.Scatter(
    y=df.Accel_Z,
    name='Accel_Z',
), row=6, col=1)

fig.append_trace(go.Scatter(
    y=df.Headway,
    name='Headway',
), row=7, col=1)

fig.append_trace(go.Scatter(
    y=df.Number_obstacles,
    name='Number_obstacles',
), row=8, col=1)

fig.append_trace(go.Scatter(
    y=df.Left_elbow_x,
    name='Left_elbow_x',
), row=9, col=1)

fig.update_layout(height=1200, width=1200, title_text="Individual channels")
fig.show()

## Reducing decimal places

In [31]:
# round decimals to 4 places

cols_to_reduce_decimal_places = ['Lane_curvature',  'Nose_x', 'Nose_y', 'Nose_z', 'Nose_vis', 'Left_eye_inner_x',
 'Left_eye_inner_y', 'Left_eye_inner_z', 'Left_eye_inner_vis', 'Left_eye_x', 'Left_eye_y', 'Left_eye_z',
 'Left_eye_vis', 'Left_eye_outer_x', 'Left_eye_outer_y', 'Left_eye_outer_z', 'Left_eye_outer_vis', 'Right_eye_inner_x',
 'Right_eye_inner_y', 'Right_eye_inner_z', 'Right_eye_inner_vis', 'Right_eye_x', 'Right_eye_y', 'Right_eye_z',
 'Right_eye_vis', 'Right_eye_outer_x', 'Right_eye_outer_y', 'Right_eye_outer_z', 'Right_eye_outer_vis', 'Left_ear_x',
 'Left_ear_y', 'Left_ear_z', 'Left_ear_vis', 'Right_ear_x', 'Right_ear_y', 'Right_ear_z', 'Right_ear_vis', 'Mouth_left_x',
 'Mouth_left_y', 'Mouth_left_z', 'Mouth_left_vis', 'Mouth_right_x', 'Mouth_right_y', 'Mouth_right_z', 'Mouth_right_vis',
 'Left_shoulder_x', 'Left_shoulder_y', 'Left_shoulder_z', 'Left_shoulder_vis', 'Right_shoulder_x', 'Right_shoulder_y',
 'Right_shoulder_z', 'Right_shoulder_vis', 'Left_elbow_x', 'Left_elbow_y', 'Left_elbow_z', 'Left_elbow_vis', 
 'Right_elbow_x', 'Right_elbow_y', 'Right_elbow_z', 'Right_elbow_vis', 'Left_wrist_x', 'Left_wrist_y', 'Left_wrist_z',
 'Left_wrist_vis', 'Right_wrist_x', 'Right_wrist_y', 'Right_wrist_z', 'Right_wrist_vis', 'Left_pinky_x',
 'Left_pinky_y', 'Left_pinky_z', 'Left_pinky_vis', 'Right_pinky_x', 'Right_pinky_y', 'Right_pinky_z', 'Right_pinky_vis',
 'Left_index_x', 'Left_index_y', 'Left_index_z', 'Left_index_vis', 'Right_index_x', 'Right_index_y', 'Right_index_z',
 'Right_index_vis', 'Left_thumb_x', 'Left_thumb_y', 'Left_thumb_z', 'Left_thumb_vis', 'Right_thumb_x', 'Right_thumb_y',
 'Right_thumb_z', 'Right_thumb_vis', 'Left_hip_x', 'Left_hip_y', 'Left_hip_z', 'Left_hip_vis', 'Right_hip_x',
 'Right_hip_y', 'Right_hip_z', 'Right_hip_vis']

In [32]:
decimal_places = 4
train_df = df.copy(deep=True)
for name in cols_to_reduce_decimal_places:
    train_df[name] = train_df[name].round(decimals=decimal_places)

## Selecting relevant columns

In [37]:
train_df = train_df[cols]
train_df

Unnamed: 0,Timestamp,BlinkL,BlinkR,Close_car,Dist_laneL,Dist_laneR,Ego_speed,FCW,Gral_pedestrian,Lattitude,...,Right_thumb_z,Right_thumb_vis,Left_hip_x,Left_hip_y,Left_hip_z,Left_hip_vis,Right_hip_x,Right_hip_y,Right_hip_z,Right_hip_vis
0,1620700292389898,0,0,0,0.00,0.00,0,0,0,2727.1414,...,-0.0025,0.6229,0.4692,0.1584,0.0945,0.9966,0.4506,0.2904,0.2779,0.9984
1,1620700292444904,0,0,0,-1.88,1.88,0,0,0,2727.1416,...,-0.0027,0.6710,0.4762,0.1743,0.0705,0.9949,0.4572,0.3054,0.2579,0.9971
2,1620700292520406,0,0,0,-1.88,1.88,0,0,0,2727.1416,...,-0.0027,0.5903,0.4744,0.1679,0.0844,0.9977,0.4538,0.3033,0.2737,0.9988
3,1620700292589380,0,0,0,-1.88,1.88,0,0,0,2727.1416,...,-0.0028,0.5858,0.4758,0.1746,0.0819,0.9978,0.4534,0.3047,0.2693,0.9989
4,1620700292670665,0,0,0,-1.88,1.88,0,0,0,2727.1416,...,-0.0027,0.6888,0.4718,0.1703,0.0682,0.9920,0.4539,0.3084,0.2385,0.9968
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22598,1620702606662417,0,0,0,-1.88,1.88,0,0,0,2727.1359,...,-0.0025,0.2250,0.4683,0.2507,0.2158,0.9906,0.4557,0.3879,0.3508,0.9920
22599,1620702606754876,0,0,0,-1.88,1.88,0,0,0,2727.1359,...,-0.0025,0.3545,0.4758,0.2619,0.1705,0.9990,0.4598,0.4088,0.3314,0.9992
22600,1620702606847360,0,0,0,-1.88,1.88,0,0,0,2727.1359,...,-0.0025,0.2344,0.4758,0.2726,0.0996,0.9872,0.4631,0.4340,0.2672,0.9860
22601,1620702606938313,0,0,0,-1.88,1.88,0,0,0,2727.1359,...,-0.0025,0.4460,0.4781,0.2726,0.1304,0.9708,0.4635,0.4355,0.2899,0.9715


In [38]:
trend_cols = cols.copy()
trend_cols.remove('Timestamp')
trend_cols, len(trend_cols)

(['BlinkL',
  'BlinkR',
  'Close_car',
  'Dist_laneL',
  'Dist_laneR',
  'Ego_speed',
  'FCW',
  'Gral_pedestrian',
  'Lattitude',
  'Longitude',
  'Course',
  'Variation',
  'East_West_variation',
  'Headway',
  'Accel_X',
  'Accel_Y',
  'Accel_Z',
  'Lane_curvature',
  'Lane_heading',
  'Marker',
  'Number_obstacles',
  'Obstacle_lane0',
  'Obstacle_lane1',
  'Obstacle_lane2',
  'Obstacle_lane3',
  'Obstacle_lane4',
  'Obstacle_lane5',
  'Obstacle_lane6',
  'Obstacle_lane7',
  'Obstacle_lane8',
  'Obstacle_lane9',
  'Obstacle_type0',
  'Obstacle_type1',
  'Obstacle_type2',
  'Obstacle_type3',
  'Obstacle_type4',
  'Obstacle_type5',
  'Obstacle_type6',
  'Obstacle_type7',
  'Obstacle_type8',
  'Obstacle_type9',
  'Obstacle_VelX0',
  'Obstacle_VelX1',
  'Obstacle_VelX2',
  'Obstacle_VelX3',
  'Obstacle_VelX4',
  'Obstacle_VelX5',
  'Obstacle_VelX6',
  'Obstacle_VelX7',
  'Obstacle_VelX8',
  'Obstacle_VelX9',
  'Obstacle_VelY0',
  'Obstacle_VelY1',
  'Obstacle_VelY2',
  'Obstacle_VelY3'

# Data As-Is

This part of the code generates synthetic data using the time series task with the data as-is. The hyperparameters we found that work best have been included for this run.

In [39]:
train_set

Unnamed: 0,Timestamp,BlinkL,BlinkR,Close_car,Dist_laneL,Dist_laneR,Ego_speed,FCW,Gral_pedestrian,Lattitude,...,Right_thumb_z,Right_thumb_vis,Left_hip_x,Left_hip_y,Left_hip_z,Left_hip_vis,Right_hip_x,Right_hip_y,Right_hip_z,Right_hip_vis
0,1620700292389898,0,0,0,0.00,0.00,0,0,0,2727.1414,...,-0.0025,0.6229,0.4692,0.1584,0.0945,0.9966,0.4506,0.2904,0.2779,0.9984
1,1620700292444904,0,0,0,-1.88,1.88,0,0,0,2727.1416,...,-0.0027,0.6710,0.4762,0.1743,0.0705,0.9949,0.4572,0.3054,0.2579,0.9971
2,1620700292520406,0,0,0,-1.88,1.88,0,0,0,2727.1416,...,-0.0027,0.5903,0.4744,0.1679,0.0844,0.9977,0.4538,0.3033,0.2737,0.9988
3,1620700292589380,0,0,0,-1.88,1.88,0,0,0,2727.1416,...,-0.0028,0.5858,0.4758,0.1746,0.0819,0.9978,0.4534,0.3047,0.2693,0.9989
4,1620700292670665,0,0,0,-1.88,1.88,0,0,0,2727.1416,...,-0.0027,0.6888,0.4718,0.1703,0.0682,0.9920,0.4539,0.3084,0.2385,0.9968
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22598,1620702606662417,0,0,0,-1.88,1.88,0,0,0,2727.1359,...,-0.0025,0.2250,0.4683,0.2507,0.2158,0.9906,0.4557,0.3879,0.3508,0.9920
22599,1620702606754876,0,0,0,-1.88,1.88,0,0,0,2727.1359,...,-0.0025,0.3545,0.4758,0.2619,0.1705,0.9990,0.4598,0.4088,0.3314,0.9992
22600,1620702606847360,0,0,0,-1.88,1.88,0,0,0,2727.1359,...,-0.0025,0.2344,0.4758,0.2726,0.0996,0.9872,0.4631,0.4340,0.2672,0.9860
22601,1620702606938313,0,0,0,-1.88,1.88,0,0,0,2727.1359,...,-0.0025,0.4460,0.4781,0.2726,0.1304,0.9708,0.4635,0.4355,0.2899,0.9715


In [41]:
from smart_open import open
import yaml

from gretel_client import create_project
from gretel_client.helpers import poll

# Create a project and model configuration.
project = create_project(display_name="time-series-synthetic-data-as-is")

# Pull down the default synthetic config.  We will modify it slightly.
with open("https://raw.githubusercontent.com/gretelai/gretel-blueprints/main/config_templates/gretel/synthetics/default.yml", 'r') as stream:
    config = yaml.safe_load(stream)

# Here we create an object to specify the timeseries task.
time_field = "Timestamp"
trend_fields = trend_cols

task = {
    'type': 'time_series',
    'attrs': {
        'time_field': time_field,
        'trend_fields': trend_fields
    }
}

config['models'][0]['synthetics']['task'] = task

config['models'][0]['synthetics']['params']['vocab_size'] = 0
config['models'][0]['synthetics']['params']['predict_batch_size'] = 1
config['models'][0]['synthetics']['params']['reset_states'] = True
config['models'][0]['synthetics']['params']['overwrite'] = True
config['models'][0]['synthetics']['params']['validation_split'] = False #Our validation split does not support time series tasks yet. Updated config will be uploaded soon.
config['models'][0]['synthetics']['params']['dropout_rate'] = .25 #0.5
config['models'][0]['synthetics']['params']['gen_temp'] = .750 #.898
config['models'][0]['synthetics']['params']['learning_rate'] = .001 #0.0035
config['models'][0]['synthetics']['params']['rnn_units'] = 64 #64

model = project.create_model_obj(model_config=config)

# Get a csv to work with, just dump out the train_df.

train_df.to_csv('train.csv', index=False)
model.data_source = 'train.csv'

# Upload the training data.  Train the model.
model.submit(upload_data_source=True)

poll(model)

# Use the model to generate synthetic data.
record_handler = model.create_record_handler_obj()

# For time series data we dump out the date column to seed the record handler.
train_df.Timestamp.to_csv('idx_seeds.csv', index=False)

record_handler.submit(
    action="generate",
    params={"num_records": 22600, "max_invalid": 20000},
    data_source='idx_seeds.csv',
    upload_data_source=True
)

poll(record_handler)

synthetic = pd.read_csv(record_handler.get_artifact_link("data"), compression='gzip')

synthetic.head()

train_df.Ego_speed.head(1000).plot(figsize=(12, 8))
plt.show()

synthetic.Ego_speed.head(1000).plot(figsize=(12, 8))
plt.show()

INFO: Starting poller


{
    "uid": "6163e9b4c72b9bc7a6d37316",
    "model_name": "intelligent-entertaining-elephant",
    "runner_mode": "cloud",
    "user_id": "60f7ffc2bff621796155eab8",
    "project_id": "6163e9a3416bf7282862cf78",
    "status_history": {
        "created": "2021-10-11T07:37:24.595030Z"
    },
    "last_modified": "2021-10-11T07:37:24.770303Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "error_id": null,
    "traceback": null,
    "container_image": "074762682575.dkr.ecr.us-west-2.amazonaws.com/gretelai/synthetics@sha256:5188b73e1fc582fde1b3d77cac52d03a5e26a7bcc59b68e52fd04f4a8501b7d0",
    "model_type": "synthetics",
    "config": {
        "schema_version": "1.0",
        "name": null,
        "models": [
            {
                "synthetics": {
                    "params": {
                        "field_delimiter": null,
                        "epochs": 100,
                        "

INFO: Status is pending. A Gretel Cloud worker is being allocated to begin model creation.
INFO: Status is active. A worker has started creating your model!
2021-10-11T07:37:42.025047Z  Starting synthetic model training
2021-10-11T07:37:42.027238Z  Loading training data
2021-10-11T07:37:42.798089Z  Training data loaded
{
    "record_count": 22603,
    "field_count": 163
}
2021-10-11T07:37:54.414657Z  Creating semantic validators and preparing training data
2021-10-11T07:41:15.226577Z  Beginning ML model training
2021-10-11T07:41:41.449052Z  Training epoch completed
{
    "epoch": 0,
    "accuracy": 0.7074,
    "loss": 0.969,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2021-10-11T07:41:53.028376Z  Training epoch completed
{
    "epoch": 1,
    "accuracy": 0.7697,
    "loss": 0.6952,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2021-10-11T07:42:04.555527Z  Training epoch completed
{
    "epoch": 2,
    "accuracy": 0.7777,
    "loss": 0.6583,
    "val_accuracy

2021-10-11T07:50:09.699127Z  Training epoch completed
{
    "epoch": 44,
    "accuracy": 0.8219,
    "loss": 0.5188,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2021-10-11T07:50:21.187938Z  Training epoch completed
{
    "epoch": 45,
    "accuracy": 0.8222,
    "loss": 0.5184,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2021-10-11T07:50:32.759776Z  Training epoch completed
{
    "epoch": 46,
    "accuracy": 0.8223,
    "loss": 0.5177,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2021-10-11T07:50:44.291614Z  Training epoch completed
{
    "epoch": 47,
    "accuracy": 0.8226,
    "loss": 0.5171,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2021-10-11T07:50:55.906391Z  Training epoch completed
{
    "epoch": 48,
    "accuracy": 0.8228,
    "loss": 0.5164,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2021-10-11T07:51:07.500018Z  Training epoch completed
{
    "epoch": 49,
    "accuracy": 0.823,
    "loss": 0.5158,
    

2021-10-11T07:58:42.537486Z  Training epoch completed
{
    "epoch": 4,
    "accuracy": 0.9614,
    "loss": 0.1417,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 1
}
2021-10-11T07:58:47.401398Z  Training epoch completed
{
    "epoch": 5,
    "accuracy": 0.9614,
    "loss": 0.1389,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 1
}
2021-10-11T07:58:52.257696Z  Training epoch completed
{
    "epoch": 6,
    "accuracy": 0.9616,
    "loss": 0.1364,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 1
}
2021-10-11T07:58:57.153688Z  Training epoch completed
{
    "epoch": 7,
    "accuracy": 0.9622,
    "loss": 0.1337,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 1
}
2021-10-11T07:59:02.037510Z  Training epoch completed
{
    "epoch": 8,
    "accuracy": 0.9631,
    "loss": 0.1314,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 1
}
2021-10-11T07:59:06.917063Z  Training epoch completed
{
    "epoch": 9,
    "accuracy": 0.9637,
    "loss": 0.1294,
    "val_

2021-10-11T08:02:30.609958Z  Training epoch completed
{
    "epoch": 24,
    "accuracy": 0.8862,
    "loss": 0.3656,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 2
}
2021-10-11T08:02:35.136466Z  Training epoch completed
{
    "epoch": 25,
    "accuracy": 0.8864,
    "loss": 0.3657,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 2
}
2021-10-11T08:02:39.697835Z  Training epoch completed
{
    "epoch": 26,
    "accuracy": 0.8873,
    "loss": 0.3624,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 2
}
2021-10-11T08:02:44.219287Z  Training epoch completed
{
    "epoch": 27,
    "accuracy": 0.8879,
    "loss": 0.36,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 2
}
2021-10-11T08:02:48.852022Z  Training epoch completed
{
    "epoch": 28,
    "accuracy": 0.8885,
    "loss": 0.3581,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 2
}
2021-10-11T08:02:53.411840Z  Training epoch completed
{
    "epoch": 29,
    "accuracy": 0.889,
    "loss": 0.356,
    "va

2021-10-11T08:06:04.229195Z  Training epoch completed
{
    "epoch": 71,
    "accuracy": 0.8973,
    "loss": 0.3252,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 2
}
2021-10-11T08:06:08.865393Z  Training epoch completed
{
    "epoch": 72,
    "accuracy": 0.8974,
    "loss": 0.325,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 2
}
2021-10-11T08:06:13.389507Z  Training epoch completed
{
    "epoch": 73,
    "accuracy": 0.8976,
    "loss": 0.3247,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 2
}
2021-10-11T08:06:17.923337Z  Training epoch completed
{
    "epoch": 74,
    "accuracy": 0.8976,
    "loss": 0.3246,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 2
}
2021-10-11T08:06:22.463349Z  Training epoch completed
{
    "epoch": 75,
    "accuracy": 0.8975,
    "loss": 0.3244,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 2
}
2021-10-11T08:06:35.705568Z  Training epoch completed
{
    "epoch": 0,
    "accuracy": 0.8416,
    "loss": 0.5062,
    "

2021-10-11T08:11:36.158158Z  Training epoch completed
{
    "epoch": 2,
    "accuracy": 0.7952,
    "loss": 0.5767,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 4
}
2021-10-11T08:11:54.466441Z  Training epoch completed
{
    "epoch": 3,
    "accuracy": 0.8048,
    "loss": 0.5502,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 4
}
2021-10-11T08:12:12.767412Z  Training epoch completed
{
    "epoch": 4,
    "accuracy": 0.8125,
    "loss": 0.5283,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 4
}
2021-10-11T08:12:31.190113Z  Training epoch completed
{
    "epoch": 5,
    "accuracy": 0.8182,
    "loss": 0.5115,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 4
}
2021-10-11T08:12:49.685088Z  Training epoch completed
{
    "epoch": 6,
    "accuracy": 0.8221,
    "loss": 0.4998,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 4
}
2021-10-11T08:13:07.897169Z  Training epoch completed
{
    "epoch": 7,
    "accuracy": 0.8246,
    "loss": 0.4913,
    "val_

2021-10-11T08:25:52.411147Z  Training epoch completed
{
    "epoch": 49,
    "accuracy": 0.8414,
    "loss": 0.4358,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 4
}
2021-10-11T08:26:10.928408Z  Training epoch completed
{
    "epoch": 50,
    "accuracy": 0.8415,
    "loss": 0.4355,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 4
}
2021-10-11T08:26:29.214180Z  Training epoch completed
{
    "epoch": 51,
    "accuracy": 0.8415,
    "loss": 0.4351,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 4
}
2021-10-11T08:26:47.492106Z  Training epoch completed
{
    "epoch": 52,
    "accuracy": 0.8416,
    "loss": 0.435,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 4
}
2021-10-11T08:27:05.709962Z  Training epoch completed
{
    "epoch": 53,
    "accuracy": 0.8417,
    "loss": 0.4348,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 4
}
2021-10-11T08:27:23.939144Z  Training epoch completed
{
    "epoch": 54,
    "accuracy": 0.8417,
    "loss": 0.4347,
    

2021-10-11T08:38:33.814398Z  Training epoch completed
{
    "epoch": 34,
    "accuracy": 0.8724,
    "loss": 0.377,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 5
}
2021-10-11T08:38:49.088059Z  Training epoch completed
{
    "epoch": 35,
    "accuracy": 0.8725,
    "loss": 0.3764,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 5
}
2021-10-11T08:39:04.537602Z  Training epoch completed
{
    "epoch": 36,
    "accuracy": 0.8728,
    "loss": 0.3756,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 5
}
2021-10-11T08:39:20.136719Z  Training epoch completed
{
    "epoch": 37,
    "accuracy": 0.8729,
    "loss": 0.3753,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 5
}
2021-10-11T08:39:35.705283Z  Training epoch completed
{
    "epoch": 38,
    "accuracy": 0.873,
    "loss": 0.3745,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 5
}
2021-10-11T08:39:51.104007Z  Training epoch completed
{
    "epoch": 39,
    "accuracy": 0.8732,
    "loss": 0.374,
    "v

2021-10-11T08:50:29.158418Z  Training epoch completed
{
    "epoch": 1,
    "accuracy": 0.7734,
    "loss": 0.6633,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 6
}
2021-10-11T08:50:36.451058Z  Training epoch completed
{
    "epoch": 2,
    "accuracy": 0.7856,
    "loss": 0.6184,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 6
}
2021-10-11T08:50:43.726357Z  Training epoch completed
{
    "epoch": 3,
    "accuracy": 0.7951,
    "loss": 0.5858,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 6
}
2021-10-11T08:50:51.027370Z  Training epoch completed
{
    "epoch": 4,
    "accuracy": 0.8065,
    "loss": 0.5566,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 6
}
2021-10-11T08:50:58.356197Z  Training epoch completed
{
    "epoch": 5,
    "accuracy": 0.8161,
    "loss": 0.5314,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 6
}
2021-10-11T08:51:05.652158Z  Training epoch completed
{
    "epoch": 6,
    "accuracy": 0.8232,
    "loss": 0.5113,
    "val_

2021-10-11T08:56:15.973429Z  Training epoch completed
{
    "epoch": 48,
    "accuracy": 0.8662,
    "loss": 0.3811,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 6
}
2021-10-11T08:56:23.410805Z  Training epoch completed
{
    "epoch": 49,
    "accuracy": 0.8665,
    "loss": 0.3805,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 6
}
2021-10-11T08:56:30.835282Z  Training epoch completed
{
    "epoch": 50,
    "accuracy": 0.8667,
    "loss": 0.3797,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 6
}
2021-10-11T08:56:38.212650Z  Training epoch completed
{
    "epoch": 51,
    "accuracy": 0.8668,
    "loss": 0.3796,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 6
}
2021-10-11T08:56:45.621636Z  Training epoch completed
{
    "epoch": 52,
    "accuracy": 0.8671,
    "loss": 0.379,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 6
}
2021-10-11T08:56:53.052019Z  Training epoch completed
{
    "epoch": 53,
    "accuracy": 0.8674,
    "loss": 0.3781,
    

2021-10-11T09:02:08.739461Z  Training epoch completed
{
    "epoch": 10,
    "accuracy": 0.9191,
    "loss": 0.2475,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 7
}
2021-10-11T09:02:15.616007Z  Training epoch completed
{
    "epoch": 11,
    "accuracy": 0.9202,
    "loss": 0.2435,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 7
}
2021-10-11T09:02:22.495075Z  Training epoch completed
{
    "epoch": 12,
    "accuracy": 0.9215,
    "loss": 0.2393,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 7
}
2021-10-11T09:02:29.350862Z  Training epoch completed
{
    "epoch": 13,
    "accuracy": 0.9223,
    "loss": 0.2356,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 7
}
2021-10-11T09:02:36.227747Z  Training epoch completed
{
    "epoch": 14,
    "accuracy": 0.9232,
    "loss": 0.2329,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 7
}
2021-10-11T09:02:43.116329Z  Training epoch completed
{
    "epoch": 15,
    "accuracy": 0.9238,
    "loss": 0.23,
    "

2021-10-11T09:08:04.950909Z  Training epoch completed
{
    "epoch": 0,
    "accuracy": 0.7554,
    "loss": 0.7637,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 8
}
2021-10-11T09:08:25.172171Z  Training epoch completed
{
    "epoch": 1,
    "accuracy": 0.8039,
    "loss": 0.5578,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 8
}
2021-10-11T09:08:45.439007Z  Training epoch completed
{
    "epoch": 2,
    "accuracy": 0.8261,
    "loss": 0.4954,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 8
}
2021-10-11T09:09:05.646671Z  Training epoch completed
{
    "epoch": 3,
    "accuracy": 0.8393,
    "loss": 0.4594,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 8
}
2021-10-11T09:09:25.853601Z  Training epoch completed
{
    "epoch": 4,
    "accuracy": 0.846,
    "loss": 0.4387,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 8
}
2021-10-11T09:09:46.108268Z  Training epoch completed
{
    "epoch": 5,
    "accuracy": 0.85,
    "loss": 0.4257,
    "val_acc

2021-10-11T09:23:55.820466Z  Training epoch completed
{
    "epoch": 47,
    "accuracy": 0.8758,
    "loss": 0.3602,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 8
}
2021-10-11T09:24:16.026450Z  Training epoch completed
{
    "epoch": 48,
    "accuracy": 0.8759,
    "loss": 0.3598,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 8
}
2021-10-11T09:24:36.523375Z  Training epoch completed
{
    "epoch": 49,
    "accuracy": 0.876,
    "loss": 0.3596,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 8
}
2021-10-11T09:24:56.750010Z  Training epoch completed
{
    "epoch": 50,
    "accuracy": 0.8761,
    "loss": 0.3592,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 8
}
2021-10-11T09:25:17.011944Z  Training epoch completed
{
    "epoch": 51,
    "accuracy": 0.8763,
    "loss": 0.3587,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 8
}
2021-10-11T09:25:37.233271Z  Training epoch completed
{
    "epoch": 52,
    "accuracy": 0.8765,
    "loss": 0.3585,
    

2021-10-11T09:37:00.903718Z  Training epoch completed
{
    "epoch": 31,
    "accuracy": 0.859,
    "loss": 0.3876,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 9
}
2021-10-11T09:37:15.431359Z  Training epoch completed
{
    "epoch": 32,
    "accuracy": 0.8591,
    "loss": 0.3871,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 9
}
2021-10-11T09:37:30.005704Z  Training epoch completed
{
    "epoch": 33,
    "accuracy": 0.8592,
    "loss": 0.3867,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 9
}
2021-10-11T09:37:44.578754Z  Training epoch completed
{
    "epoch": 34,
    "accuracy": 0.8592,
    "loss": 0.3863,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 9
}
2021-10-11T09:37:59.160452Z  Training epoch completed
{
    "epoch": 35,
    "accuracy": 0.8595,
    "loss": 0.3857,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 9
}
2021-10-11T09:38:13.824887Z  Training epoch completed
{
    "epoch": 36,
    "accuracy": 0.8595,
    "loss": 0.3855,
    

2021-10-11T09:48:49.390889Z  Training epoch completed
{
    "epoch": 16,
    "accuracy": 0.8813,
    "loss": 0.3272,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 10
}
2021-10-11T09:49:04.164804Z  Training epoch completed
{
    "epoch": 17,
    "accuracy": 0.8816,
    "loss": 0.3258,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 10
}
2021-10-11T09:49:18.919028Z  Training epoch completed
{
    "epoch": 18,
    "accuracy": 0.8819,
    "loss": 0.3249,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 10
}
2021-10-11T09:49:33.712363Z  Training epoch completed
{
    "epoch": 19,
    "accuracy": 0.8825,
    "loss": 0.3238,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 10
}
2021-10-11T09:49:48.503483Z  Training epoch completed
{
    "epoch": 20,
    "accuracy": 0.8827,
    "loss": 0.3228,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 10
}
2021-10-11T09:50:03.279505Z  Training epoch completed
{
    "epoch": 21,
    "accuracy": 0.8829,
    "loss": 0.3219

2021-10-11T09:57:55.488075Z  Generation in progress
{
    "current_valid_count": 0,
    "current_invalid_count": 55,
    "new_valid_count": 0,
    "new_invalid_count": 7,
    "completion_percent": 0.0
}
2021-10-11T09:58:00.493302Z  Generation in progress
{
    "current_valid_count": 0,
    "current_invalid_count": 60,
    "new_valid_count": 0,
    "new_invalid_count": 5,
    "completion_percent": 0.0
}
2021-10-11T09:58:05.499094Z  Generation in progress
{
    "current_valid_count": 0,
    "current_invalid_count": 63,
    "new_valid_count": 0,
    "new_invalid_count": 3,
    "completion_percent": 0.0
}
2021-10-11T09:58:10.505120Z  Generation in progress
{
    "current_valid_count": 0,
    "current_invalid_count": 64,
    "new_valid_count": 0,
    "new_invalid_count": 1,
    "completion_percent": 0.0
}
2021-10-11T09:58:15.511155Z  Generation in progress
{
    "current_valid_count": 0,
    "current_invalid_count": 70,
    "new_valid_count": 0,
    "new_invalid_count": 6,
    "completion_p

ApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Content-Type': 'application/json', 'Content-Length': '94', 'Connection': 'keep-alive', 'Date': 'Mon, 11 Oct 2021 09:59:13 GMT', 'x-amzn-RequestId': '390542d3-cbea-4e99-9261-aa7b6a439b67', 'Access-Control-Allow-Origin': '*', 'x-amz-apigw-id': 'HCalPGhTPHcFd5g=', 'X-Amzn-Trace-Id': 'Root=1-61640aee-2057dfea3f4df1ae58ef709a;Sampled=0', 'Access-Control-Allow-Credentials': 'true', 'X-Cache': 'Error from cloudfront', 'Via': '1.1 35202ecfee8e63e178de36be1b541f0f.cloudfront.net (CloudFront)', 'X-Amz-Cf-Pop': 'SYD62-P2', 'X-Amz-Cf-Id': '-1bbVouX57ldgDipUeIUA-4JkInJ7a7h5kHrVk7QLvSj1epWzAvyqA=='})
HTTP response body: {"message": "Model cannot be used, current status is: error", "context": {}, "error_id": null}


# Data scaled

This section scales the data. The reason we scaled the data here is to avoid the invalid lines error, originally. But it could also help the LSTM find patterns more easily

In [42]:
from smart_open import open
import yaml

from gretel_client import create_project
from gretel_client.helpers import poll

# Create a project and model configuration.
project = create_project(display_name="time-series-synthetic-data-scaled")

# Pull down the default synthetic config.  We will modify it slightly.
with open("https://raw.githubusercontent.com/gretelai/gretel-blueprints/main/config_templates/gretel/synthetics/default.yml", 'r') as stream:
    config = yaml.safe_load(stream)

# Here we create an object to specify the timeseries task.
time_field = "Timestamp"
trend_fields = trend_cols

# modifying dataset: make sure to scale the datapoints appropriately,
# so the LSTM does not create more or less precision for each data point.

train_df[trend_cols] = train_df[trend_cols] * 1e4
train_df = train_df.astype(int)

task = {
    'type': 'time_series',
    'attrs': {
        'time_field': time_field,
        'trend_fields': trend_fields
    }
}

config['models'][0]['synthetics']['task'] = task

config['models'][0]['synthetics']['params']['vocab_size'] = 0
config['models'][0]['synthetics']['params']['predict_batch_size'] = 1
config['models'][0]['synthetics']['params']['reset_states'] = False
config['models'][0]['synthetics']['params']['overwrite'] = True
config['models'][0]['synthetics']['params']['validation_split'] = False #Our validation split does not support time series tasks yet. Updated config will be uploaded soon.
config['models'][0]['synthetics']['params']['dropout_rate'] = .6 #0.5
config['models'][0]['synthetics']['params']['gen_temp'] = 1.0 #.898
config['models'][0]['synthetics']['params']['learning_rate'] = .0099 #0.0035
config['models'][0]['synthetics']['params']['rnn_units'] = 64

model = project.create_model_obj(model_config=config)

# Get a csv to work with, just dump out the train_df.
train_df.to_csv('train.csv', index=False)
model.data_source = 'train.csv'

# Upload the training data.  Train the model.
model.submit(upload_data_source=True)

poll(model)

# Use the model to generate synthetic data.
record_handler = model.create_record_handler_obj()

# For time series data we dump out the date column to seed the record handler.
train_df.Timestamp.to_csv('idx_seeds.csv', index=False)

record_handler.submit(
    action="generate",
    params={"num_records": 22600, "max_invalid": 20000},
    data_source='idx_seeds.csv',
    upload_data_source=True
)

poll(record_handler)

synthetic_scaled = pd.read_csv(record_handler.get_artifact_link("data"), compression='gzip')

synthetic_scaled.head()

INFO: Starting poller


{
    "uid": "61640ff5c9704d3d27d0c188",
    "model_name": "fluffy-sweet-jackalope",
    "runner_mode": "cloud",
    "user_id": "60f7ffc2bff621796155eab8",
    "project_id": "61640fd69cdead02501b24d8",
    "status_history": {
        "created": "2021-10-11T10:20:37.118064Z"
    },
    "last_modified": "2021-10-11T10:20:37.373658Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "error_id": null,
    "traceback": null,
    "container_image": "074762682575.dkr.ecr.us-west-2.amazonaws.com/gretelai/synthetics@sha256:5188b73e1fc582fde1b3d77cac52d03a5e26a7bcc59b68e52fd04f4a8501b7d0",
    "model_type": "synthetics",
    "config": {
        "schema_version": "1.0",
        "name": null,
        "models": [
            {
                "synthetics": {
                    "params": {
                        "field_delimiter": null,
                        "epochs": 100,
                        "batch_size"

INFO: Status is created. Model creation has been queued.
INFO: Status is pending. A Gretel Cloud worker is being allocated to begin model creation.
INFO: Status is active. A worker has started creating your model!
2021-10-11T10:20:54.031783Z  Starting synthetic model training
2021-10-11T10:20:54.033812Z  Loading training data
2021-10-11T10:20:54.704691Z  Training data loaded
{
    "record_count": 22603,
    "field_count": 163
}
2021-10-11T10:21:06.449912Z  Creating semantic validators and preparing training data
2021-10-11T10:24:17.189517Z  Beginning ML model training
2021-10-11T10:24:32.785692Z  Training epoch completed
{
    "epoch": 0,
    "accuracy": 0.683,
    "loss": 1.0003,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2021-10-11T10:24:38.462600Z  Training epoch completed
{
    "epoch": 1,
    "accuracy": 0.7228,
    "loss": 0.8436,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2021-10-11T10:24:44.190251Z  Training epoch completed
{
    "epoch": 2,
    

2021-10-11T10:28:37.002149Z  Training epoch completed
{
    "epoch": 13,
    "accuracy": 0.9615,
    "loss": 0.1407,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 1
}
2021-10-11T10:28:41.813107Z  Training epoch completed
{
    "epoch": 14,
    "accuracy": 0.9614,
    "loss": 0.1403,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 1
}
2021-10-11T10:28:46.570773Z  Training epoch completed
{
    "epoch": 15,
    "accuracy": 0.9618,
    "loss": 0.1395,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 1
}
2021-10-11T10:28:51.353539Z  Training epoch completed
{
    "epoch": 16,
    "accuracy": 0.9616,
    "loss": 0.1393,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 1
}
2021-10-11T10:28:56.115593Z  Training epoch completed
{
    "epoch": 17,
    "accuracy": 0.9615,
    "loss": 0.1395,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 1
}
2021-10-11T10:29:03.987495Z  Training epoch completed
{
    "epoch": 0,
    "accuracy": 0.8214,
    "loss": 0.5988,
    

2021-10-11T10:31:46.818813Z  Training epoch completed
{
    "epoch": 13,
    "accuracy": 0.9659,
    "loss": 0.1301,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 3
}
2021-10-11T10:31:51.877547Z  Training epoch completed
{
    "epoch": 14,
    "accuracy": 0.9659,
    "loss": 0.1299,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 3
}
2021-10-11T10:32:03.697363Z  Training epoch completed
{
    "epoch": 0,
    "accuracy": 0.6552,
    "loss": 1.1288,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 4
}
2021-10-11T10:32:08.752725Z  Training epoch completed
{
    "epoch": 1,
    "accuracy": 0.6924,
    "loss": 0.9753,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 4
}
2021-10-11T10:32:13.801179Z  Training epoch completed
{
    "epoch": 2,
    "accuracy": 0.6941,
    "loss": 0.9612,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 4
}
2021-10-11T10:32:18.854749Z  Training epoch completed
{
    "epoch": 3,
    "accuracy": 0.6958,
    "loss": 0.9541,
    "va

2021-10-11T10:36:03.044313Z  Training epoch completed
{
    "epoch": 8,
    "accuracy": 0.7334,
    "loss": 0.8507,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 5
}
2021-10-11T10:36:08.702944Z  Training epoch completed
{
    "epoch": 9,
    "accuracy": 0.7334,
    "loss": 0.8494,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 5
}
2021-10-11T10:36:14.392265Z  Training epoch completed
{
    "epoch": 10,
    "accuracy": 0.7338,
    "loss": 0.8489,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 5
}
2021-10-11T10:36:20.085539Z  Training epoch completed
{
    "epoch": 11,
    "accuracy": 0.7338,
    "loss": 0.8484,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 5
}
2021-10-11T10:36:25.796529Z  Training epoch completed
{
    "epoch": 12,
    "accuracy": 0.7339,
    "loss": 0.8476,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 5
}
2021-10-11T10:36:31.513048Z  Training epoch completed
{
    "epoch": 13,
    "accuracy": 0.7341,
    "loss": 0.8474,
    "

2021-10-11T10:39:59.060882Z  Training epoch completed
{
    "epoch": 15,
    "accuracy": 0.7551,
    "loss": 0.7643,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 7
}
2021-10-11T10:40:04.657403Z  Training epoch completed
{
    "epoch": 16,
    "accuracy": 0.7554,
    "loss": 0.7643,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 7
}
2021-10-11T10:40:10.242221Z  Training epoch completed
{
    "epoch": 17,
    "accuracy": 0.7557,
    "loss": 0.7628,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 7
}
2021-10-11T10:40:15.801134Z  Training epoch completed
{
    "epoch": 18,
    "accuracy": 0.7558,
    "loss": 0.7626,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 7
}
2021-10-11T10:40:21.420049Z  Training epoch completed
{
    "epoch": 19,
    "accuracy": 0.7559,
    "loss": 0.7625,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 7
}
2021-10-11T10:40:26.972974Z  Training epoch completed
{
    "epoch": 20,
    "accuracy": 0.7558,
    "loss": 0.762,
    

2021-10-11T10:43:50.866798Z  Training epoch completed
{
    "epoch": 5,
    "accuracy": 0.7536,
    "loss": 0.7415,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 9
}
2021-10-11T10:43:54.127793Z  Training epoch completed
{
    "epoch": 6,
    "accuracy": 0.7543,
    "loss": 0.7342,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 9
}
2021-10-11T10:43:57.397249Z  Training epoch completed
{
    "epoch": 7,
    "accuracy": 0.7553,
    "loss": 0.7292,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 9
}
2021-10-11T10:44:00.668879Z  Training epoch completed
{
    "epoch": 8,
    "accuracy": 0.7558,
    "loss": 0.7251,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 9
}
2021-10-11T10:44:03.921372Z  Training epoch completed
{
    "epoch": 9,
    "accuracy": 0.7563,
    "loss": 0.7217,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 9
}
2021-10-11T10:44:07.185884Z  Training epoch completed
{
    "epoch": 10,
    "accuracy": 0.7568,
    "loss": 0.7176,
    "val

2021-10-11T10:46:31.959024Z  Generation in progress
{
    "current_valid_count": 11,
    "current_invalid_count": 161,
    "new_valid_count": 2,
    "new_invalid_count": 33,
    "completion_percent": 0.22
}
2021-10-11T10:46:36.964672Z  Generation in progress
{
    "current_valid_count": 13,
    "current_invalid_count": 189,
    "new_valid_count": 2,
    "new_invalid_count": 28,
    "completion_percent": 0.26
}
2021-10-11T10:46:41.969635Z  Generation in progress
{
    "current_valid_count": 16,
    "current_invalid_count": 212,
    "new_valid_count": 3,
    "new_invalid_count": 23,
    "completion_percent": 0.32
}
2021-10-11T10:46:46.974506Z  Generation in progress
{
    "current_valid_count": 19,
    "current_invalid_count": 241,
    "new_valid_count": 3,
    "new_invalid_count": 29,
    "completion_percent": 0.38
}
2021-10-11T10:46:51.979589Z  Generation in progress
{
    "current_valid_count": 21,
    "current_invalid_count": 270,
    "new_valid_count": 2,
    "new_invalid_count": 29

ApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Content-Type': 'application/json', 'Content-Length': '94', 'Connection': 'keep-alive', 'Date': 'Mon, 11 Oct 2021 10:48:09 GMT', 'x-amzn-RequestId': 'cc2c71dc-d5d6-4a88-8f29-24dc64fcd8a6', 'Access-Control-Allow-Origin': '*', 'x-amz-apigw-id': 'HChwCEzMPHcFg1A=', 'X-Amzn-Trace-Id': 'Root=1-61641666-52a8a41f44bf542740b4a54d;Sampled=0', 'Access-Control-Allow-Credentials': 'true', 'X-Cache': 'Error from cloudfront', 'Via': '1.1 7bda591fa44b42ef6384ae955fdd5d7d.cloudfront.net (CloudFront)', 'X-Amz-Cf-Pop': 'SYD62-P2', 'X-Amz-Cf-Id': '6sIiU42u4g8L7tIm0YPlPE2eLjq5GxmZyBtNjwBq3jqhvlP_8qNBWQ=='})
HTTP response body: {"message": "Model cannot be used, current status is: error", "context": {}, "error_id": null}


In [None]:
# Converts train_df back to the original values for evaluation purposes.
train_df[trend_cols] = train_df[trend_cols] * 1e-4
synthetic_scaled[trend_cols] = synthetic_scaled[trend_cols] * 1e-4

train_df.Ego_speed.head(1000).plot(figsize=(12, 8))
plt.show()

synthetic_scaled.Ego_speed.head(1000).plot(figsize=(12, 8))
plt.show()

# Taking first order of difference of the data (original data)

One last approach we tried is to take the first order difference of the trend columns. This is a traditional way of trying to model time series data. For this task, we utilize the seeding task instead of the time series task, since we already did some feature engineering on the time series dataset.

In [44]:
decimal_places = 4
train_df = df.copy(deep=True)
for name in cols_to_reduce_decimal_places:
    train_df[name] = train_df[name].round(decimals=decimal_places)

In [45]:
train_df = train_df[cols]
train_df

Unnamed: 0,Timestamp,BlinkL,BlinkR,Close_car,Dist_laneL,Dist_laneR,Ego_speed,FCW,Gral_pedestrian,Lattitude,...,Right_thumb_z,Right_thumb_vis,Left_hip_x,Left_hip_y,Left_hip_z,Left_hip_vis,Right_hip_x,Right_hip_y,Right_hip_z,Right_hip_vis
0,1620700292389898,0,0,0,0.00,0.00,0,0,0,2727.1414,...,-0.0025,0.6229,0.4692,0.1584,0.0945,0.9966,0.4506,0.2904,0.2779,0.9984
1,1620700292444904,0,0,0,-1.88,1.88,0,0,0,2727.1416,...,-0.0027,0.6710,0.4762,0.1743,0.0705,0.9949,0.4572,0.3054,0.2579,0.9971
2,1620700292520406,0,0,0,-1.88,1.88,0,0,0,2727.1416,...,-0.0027,0.5903,0.4744,0.1679,0.0844,0.9977,0.4538,0.3033,0.2737,0.9988
3,1620700292589380,0,0,0,-1.88,1.88,0,0,0,2727.1416,...,-0.0028,0.5858,0.4758,0.1746,0.0819,0.9978,0.4534,0.3047,0.2693,0.9989
4,1620700292670665,0,0,0,-1.88,1.88,0,0,0,2727.1416,...,-0.0027,0.6888,0.4718,0.1703,0.0682,0.9920,0.4539,0.3084,0.2385,0.9968
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22598,1620702606662417,0,0,0,-1.88,1.88,0,0,0,2727.1359,...,-0.0025,0.2250,0.4683,0.2507,0.2158,0.9906,0.4557,0.3879,0.3508,0.9920
22599,1620702606754876,0,0,0,-1.88,1.88,0,0,0,2727.1359,...,-0.0025,0.3545,0.4758,0.2619,0.1705,0.9990,0.4598,0.4088,0.3314,0.9992
22600,1620702606847360,0,0,0,-1.88,1.88,0,0,0,2727.1359,...,-0.0025,0.2344,0.4758,0.2726,0.0996,0.9872,0.4631,0.4340,0.2672,0.9860
22601,1620702606938313,0,0,0,-1.88,1.88,0,0,0,2727.1359,...,-0.0025,0.4460,0.4781,0.2726,0.1304,0.9708,0.4635,0.4355,0.2899,0.9715


In [46]:
from smart_open import open
import yaml

from gretel_client import create_project
from gretel_client.helpers import poll

# Create a project and model configuration.
project = create_project(display_name="time-series-synthetic-data-diff")

# Pull down the default synthetic config.  We will modify it slightly.
with open("https://raw.githubusercontent.com/gretelai/gretel-blueprints/main/config_templates/gretel/synthetics/default.yml", 'r') as stream:
    config = yaml.safe_load(stream)

# Here we create an object to specify the timeseries task.
fields=["Timestamp"]

#Leaving these here, in case you want to seed with the actual fields.
#trend_fields=["EDA", "ECG", "Left Pupil Diameter (m)", "Right Pupil Diameter (m)", "Eye Opening Left", "Eye Opening Right", "PERCLOS Value", "Blinking"]
#trend_fields = ['ECG'] #Used for individual experiments
#trend_fields=["EDA", "ECG", "Left Pupil Diameter (m)", "Right Pupil Diameter (m)", "Eye Opening Left", "Eye Opening Right", "PERCLOS Value"]

#modifying dataset
#train_df = train_df[['idx_col', 'ECG']] #Used for single column experiments
diff_df = train_df.copy(deep=True)
for col in trend_fields:
    diff_df[col] = diff_df[col].diff()
    diff_df[col].loc[[0]] = 0.0

task = {
    'type': 'seed',
    'attrs': {
        'fields': fields
    }
}

config['models'][0]['synthetics']['task'] = task
config['models'][0]['synthetics']['params']['vocab_size'] = 0
config['models'][0]['synthetics']['params']['predict_batch_size'] = 1
config['models'][0]['synthetics']['params']['reset_states'] = True
config['models'][0]['synthetics']['params']['overwrite'] = True
config['models'][0]['synthetics']['params']['validation_split'] = False #Our validation split does not support time series tasks yet. Updated config will be uploaded soon.
config['models'][0]['synthetics']['params']['dropout_rate'] = .51
config['models'][0]['synthetics']['params']['gen_temp'] = 1.05
config['models'][0]['synthetics']['params']['learning_rate'] = .0099
config['models'][0]['synthetics']['params']['rnn_units'] = 192

model = project.create_model_obj(model_config=config)

# Get a csv to work with, just dump out the train_df.
train_df.to_csv('train.csv', index=False)
model.data_source = 'train.csv'

# Upload the training data.  Train the model.
model.submit(upload_data_source=True)

poll(model)

# Use the model to generate synthetic data.
record_handler = model.create_record_handler_obj()

# For time series data we dump out the date column to seed the record handler.
train_df['Timestamp'].to_csv('idx_seeds.csv', index=False)

#    data_source='idx_seeds.csv', (taken out for experimentation)
record_handler.submit(
    action="generate",
    params={"num_records": 22600, "max_invalid": 20000},
    data_source='idx_seeds.csv',
    upload_data_source=True
)

poll(record_handler)

synthetic_diff = pd.read_csv(record_handler.get_artifact_link("data"), compression='gzip')

synthetic_diff.head()





A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

INFO: Starting poller


{
    "uid": "61641f6f79117449ed486a37",
    "model_name": "jazzy-glamorous-mule",
    "runner_mode": "cloud",
    "user_id": "60f7ffc2bff621796155eab8",
    "project_id": "61641f566d892a003550280d",
    "status_history": {
        "created": "2021-10-11T11:26:39.347118Z"
    },
    "last_modified": "2021-10-11T11:26:39.598060Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "error_id": null,
    "traceback": null,
    "container_image": "074762682575.dkr.ecr.us-west-2.amazonaws.com/gretelai/synthetics@sha256:5188b73e1fc582fde1b3d77cac52d03a5e26a7bcc59b68e52fd04f4a8501b7d0",
    "model_type": "synthetics",
    "config": {
        "schema_version": "1.0",
        "name": null,
        "models": [
            {
                "synthetics": {
                    "params": {
                        "field_delimiter": null,
                        "epochs": 100,
                        "batch_size": 

INFO: Status is created. Model creation has been queued.
INFO: Status is pending. A Gretel Cloud worker is being allocated to begin model creation.
INFO: Status is active. A worker has started creating your model!
2021-10-11T11:26:56.362927Z  Starting synthetic model training
2021-10-11T11:26:56.364755Z  Loading training data
2021-10-11T11:26:57.201666Z  Training data loaded
{
    "record_count": 22603,
    "field_count": 163
}
2021-10-11T11:27:07.570906Z  Creating semantic validators and preparing training data
2021-10-11T11:30:19.759611Z  Beginning ML model training
2021-10-11T11:30:36.984113Z  Training epoch completed
{
    "epoch": 0,
    "accuracy": 0.7597,
    "loss": 0.7125,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2021-10-11T11:30:44.620312Z  Training epoch completed
{
    "epoch": 1,
    "accuracy": 0.8088,
    "loss": 0.5216,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2021-10-11T11:30:52.279223Z  Training epoch completed
{
    "epoch": 2,
   

2021-10-11T11:37:14.613705Z  Training epoch completed
{
    "epoch": 0,
    "accuracy": 0.9132,
    "loss": 0.3014,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 3
}
2021-10-11T11:37:21.251938Z  Training epoch completed
{
    "epoch": 1,
    "accuracy": 0.9516,
    "loss": 0.1694,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 3
}
2021-10-11T11:37:27.883002Z  Training epoch completed
{
    "epoch": 2,
    "accuracy": 0.9585,
    "loss": 0.1422,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 3
}
2021-10-11T11:37:34.513478Z  Training epoch completed
{
    "epoch": 3,
    "accuracy": 0.962,
    "loss": 0.1348,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 3
}
2021-10-11T11:37:41.129917Z  Training epoch completed
{
    "epoch": 4,
    "accuracy": 0.9623,
    "loss": 0.1326,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 3
}
2021-10-11T11:37:47.745963Z  Training epoch completed
{
    "epoch": 5,
    "accuracy": 0.962,
    "loss": 0.1357,
    "val_ac

2021-10-11T11:40:45.558945Z  Training epoch completed
{
    "epoch": 13,
    "accuracy": 0.9582,
    "loss": 0.137,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 5
}
2021-10-11T11:41:03.422054Z  Training epoch completed
{
    "epoch": 0,
    "accuracy": 0.6118,
    "loss": 1.1368,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 6
}
2021-10-11T11:41:13.066252Z  Training epoch completed
{
    "epoch": 1,
    "accuracy": 0.7071,
    "loss": 0.8141,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 6
}
2021-10-11T11:41:22.699805Z  Training epoch completed
{
    "epoch": 2,
    "accuracy": 0.7173,
    "loss": 0.7828,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 6
}
2021-10-11T11:41:32.312839Z  Training epoch completed
{
    "epoch": 3,
    "accuracy": 0.7214,
    "loss": 0.7698,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 6
}
2021-10-11T11:41:41.934969Z  Training epoch completed
{
    "epoch": 4,
    "accuracy": 0.7238,
    "loss": 0.7625,
    "val_

2021-10-11T11:47:52.616780Z  Training epoch completed
{
    "epoch": 13,
    "accuracy": 0.6326,
    "loss": 0.9738,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 8
}
2021-10-11T11:47:58.889326Z  Training epoch completed
{
    "epoch": 14,
    "accuracy": 0.6324,
    "loss": 0.9741,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 8
}
2021-10-11T11:48:05.144924Z  Training epoch completed
{
    "epoch": 15,
    "accuracy": 0.6326,
    "loss": 0.9741,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 8
}
2021-10-11T11:48:11.437418Z  Training epoch completed
{
    "epoch": 16,
    "accuracy": 0.6329,
    "loss": 0.9731,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 8
}
2021-10-11T11:48:17.680201Z  Training epoch completed
{
    "epoch": 17,
    "accuracy": 0.6328,
    "loss": 0.9733,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 8
}
2021-10-11T11:48:23.932522Z  Training epoch completed
{
    "epoch": 18,
    "accuracy": 0.6326,
    "loss": 0.9743,
   

2021-10-11T11:53:13.152753Z  Generation in progress
{
    "current_valid_count": 3,
    "current_invalid_count": 10,
    "new_valid_count": 3,
    "new_invalid_count": 8,
    "completion_percent": 0.06
}
2021-10-11T11:53:18.157645Z  Generation in progress
{
    "current_valid_count": 7,
    "current_invalid_count": 19,
    "new_valid_count": 4,
    "new_invalid_count": 9,
    "completion_percent": 0.14
}
2021-10-11T11:53:23.162683Z  Generation in progress
{
    "current_valid_count": 10,
    "current_invalid_count": 31,
    "new_valid_count": 3,
    "new_invalid_count": 12,
    "completion_percent": 0.2
}
2021-10-11T11:53:28.167765Z  Generation in progress
{
    "current_valid_count": 14,
    "current_invalid_count": 37,
    "new_valid_count": 4,
    "new_invalid_count": 6,
    "completion_percent": 0.28
}
2021-10-11T11:53:33.173782Z  Generation in progress
{
    "current_valid_count": 16,
    "current_invalid_count": 50,
    "new_valid_count": 2,
    "new_invalid_count": 13,
    "comp

ApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Content-Type': 'application/json', 'Content-Length': '94', 'Connection': 'keep-alive', 'Date': 'Mon, 11 Oct 2021 11:55:12 GMT', 'x-amzn-RequestId': '63ede7df-482a-4c7b-b401-ad93c4f282cd', 'Access-Control-Allow-Origin': '*', 'x-amz-apigw-id': 'HCrktF8ePHcF9dA=', 'X-Amzn-Trace-Id': 'Root=1-6164261d-45e586fa2511727456277d47;Sampled=0', 'Access-Control-Allow-Credentials': 'true', 'X-Cache': 'Error from cloudfront', 'Via': '1.1 4ab519b4cd27a1b8a4b258d7f39bbc7f.cloudfront.net (CloudFront)', 'X-Amz-Cf-Pop': 'SYD62-P2', 'X-Amz-Cf-Id': 'CNiARWJs1AkR_2YN3H7cL6FNU0zhrCsECR3PZZjGFgzINXPrsF9iTg=='})
HTTP response body: {"message": "Model cannot be used, current status is: error", "context": {}, "error_id": null}


In [None]:
for col, initial_val in zip(trend_fields, list(train_df[trend_fields].loc[[0]].values[0])):
    synthetic_diff[col].loc[[0]] = initial_val
    synthetic_diff[col] = synthetic_diff[col].cumsum()

train_df.ECG.head(1000).plot(figsize=(12, 8))
plt.show()

synthetic_diff.ECG.head(1000).plot(figsize=(12, 8))
plt.show()

# Hyperparameter Tuning

Our lead machine learning researcher, Amy, found this fantastic library called Optuna to help us tune the configs for your use case. Feel free to take this code, and play with it to find new params! This helped us make sure that the errors you were seeing went away.

In [None]:
import optuna
import yaml
import time

from smart_open import open
from gretel_client import create_project
from gretel_client.helpers import poll

# Pull down the default synthetic config.  We will modify it slightly.
with open("https://raw.githubusercontent.com/gretelai/gretel-blueprints/main/config_templates/gretel/synthetics/default.yml", 'r') as stream:
    config = yaml.safe_load(stream)

# Here we create an object to specify the timeseries task.
time_field = "Timestamp"
trend_fields = cols


task = {
    'type': 'time_series',
    'attrs': {
        'time_field': time_field,
        'trend_fields': trend_fields
    }
}

In [None]:

def objective(trial: optuna.Trial):
    
    config['models'][0]['synthetics']['task'] = task
    config['models'][0]['synthetics']['params']['predict_batch_size'] = 1
    config['models'][0]['synthetics']['params']['overwrite'] = True
    config['models'][0]['synthetics']['params']['validation_split'] = False #Our validation split does not support time series tasks yet. Updated config will be uploaded soon.

#     config['models'][0]['synthetics']['params']['vocab_size'] = trial.suggest_int(name="vocab_size", low=18, high=38, step=10)
    config['models'][0]['synthetics']['params']['rnn_units'] = trial.suggest_int(name="rnn_units", low=64, high=512, step=64)
    config['models'][0]['synthetics']['params']['dropout_rate'] = trial.suggest_float("dropout_rate", .25, .75)
    config['models'][0]['synthetics']['params']['gen_temp'] = trial.suggest_float("gen_temp", .5, 1.5)
    config['models'][0]['synthetics']['params']['learning_rate'] = trial.suggest_float("learning_rate",  .001, 0.01, log=True)
    config['models'][0]['synthetics']['params']['reset_states'] = trial.suggest_categorical(
        "reset_states", choices=[True, False])
        
    seconds = int(time.time())
    project_name = "Tuning Experiment" + str(seconds)
    project = create_project(display_name=project_name)
    
    model = project.create_model_obj(model_config=config)

    # Get a csv to work with, just dump out the train_df.
    train_df.to_csv('train.csv', index=False)
    model.data_source = 'train.csv'

    # Upload the training data.  Train the model.
    model.submit(upload_data_source=True)

    status = "active"
    sqs = 0
    while status == "active":
        #Sleep a bit here
        time.sleep(60)
        models = []
        for model in project.search_models():
            ms = model.__dict__['_data']['model']
            ms = {key: ms[key] for key in ['model_name', 'model_type', 'status'] }
            status = ms["status"]
            print("Status is: " + status)
            if status == "completed":
                report = model.peek_report()
                if report:
                    sqs = report['synthetic_data_quality_score']['score']
                    print("Retrieved report sqs: " + str(sqs))
                else:
                    sqs = 0
            elif status == "error":
                sqs = 0
            
    return sqs

In [None]:
import time
# Create study that maximizes


study = optuna.create_study(direction="maximize")

# Start optimizing with however many trials you want
# You can do just one to see that it works, and then use the cells below
# to run more trials on the same study

study.optimize(objective, n_trials=10)

print(f"Optimized SQS: {study.best_value:.5f}")

print("Best params:")
for key, value in study.best_params.items():
    print(f"\t{key}: {value}")