In [1]:
!git clone https://github.com/jorgpg5/synthetic_data.git

Cloning into 'synthetic_data'...
remote: Enumerating objects: 13, done.[K
remote: Counting objects: 100% (13/13), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 13 (delta 4), reused 8 (delta 2), pack-reused 0[K
Unpacking objects: 100% (13/13), done.


# Synthesize Time Series data from your own DataFrame

This Blueprint demonstrates how to create synthetic time series data with Gretel. We assume that within the dataset
there is at least:

1) A specific column holding time data points

2) One or more columns that contain measurements or numerical observations for each point in time.

For this Blueprint, we will use an example multi-channel recording (EDA, ECG, pupil diameter, eye opening, PERCLOS and blinking) loaded from a CSV file as our time series data.

In [2]:
%%capture

!pip install pyyaml smart_open numpy pandas
!pip install -U gretel-client

In [3]:
# Authenticate with Gretel Cloud. The API key is requested interactively so
# it is never written into the notebook source.

from getpass import getpass
import pandas as pd
from gretel_client import configure_session, ClientConfig

# Remove the pandas limit on displayed column width.
pd.set_option('max_colwidth', None)

configure_session(
    ClientConfig(
        endpoint="https://api.gretel.cloud",
        api_key=getpass(prompt="Enter Gretel API key"),
    )
)

Enter Gretel API key··········


In [4]:
# Load the example multi-channel time series from the cloned repository

import datetime
import pandas as pd
import numpy as np

# Relative path: `git clone` puts the repository in the notebook's working
# directory, so this also works outside of Colab's /content directory.
train_df = pd.read_csv('synthetic_data/example_dataset.csv')

train_df

Unnamed: 0,EDA,ECG,Label,Left Pupil Diameter (m),Right Pupil Diameter (m),Eye Opening Left,Eye Opening Right,PERCLOS Value,Blinking
0,4.3114,-0.1822,2,0.002089,0.002242,0.4522,0.4111,0.5904,0
1,4.3129,-0.1535,2,0.002101,0.002183,0.4679,0.4372,0.5904,0
2,4.3120,-0.1209,2,0.002115,0.002224,0.4489,0.4276,0.5903,0
3,4.3129,-0.0918,2,0.002274,0.002392,0.5027,0.4294,0.5902,0
4,4.3116,-0.0696,2,0.002195,0.002318,0.4237,0.4388,0.5902,0
...,...,...,...,...,...,...,...,...,...
17995,5.0606,-0.0341,2,0.002624,0.002474,0.1496,0.0798,0.4570,0
17996,5.0598,0.0034,2,0.002624,0.002474,0.1496,0.0949,0.4570,0
17997,5.0613,0.0277,2,0.002624,0.002474,0.1496,0.0974,0.4571,0
17998,5.0589,0.0610,2,0.002624,0.002474,0.1496,0.0834,0.4571,0


In [5]:
# Expose the row index as a regular column; it is used later as the
# time field of the time-series task.
train_df = train_df.assign(idx_col=train_df.index)
train_df

Unnamed: 0,EDA,ECG,Label,Left Pupil Diameter (m),Right Pupil Diameter (m),Eye Opening Left,Eye Opening Right,PERCLOS Value,Blinking,idx_col
0,4.3114,-0.1822,2,0.002089,0.002242,0.4522,0.4111,0.5904,0,0
1,4.3129,-0.1535,2,0.002101,0.002183,0.4679,0.4372,0.5904,0,1
2,4.3120,-0.1209,2,0.002115,0.002224,0.4489,0.4276,0.5903,0,2
3,4.3129,-0.0918,2,0.002274,0.002392,0.5027,0.4294,0.5902,0,3
4,4.3116,-0.0696,2,0.002195,0.002318,0.4237,0.4388,0.5902,0,4
...,...,...,...,...,...,...,...,...,...,...
17995,5.0606,-0.0341,2,0.002624,0.002474,0.1496,0.0798,0.4570,0,17995
17996,5.0598,0.0034,2,0.002624,0.002474,0.1496,0.0949,0.4570,0,17996
17997,5.0613,0.0277,2,0.002624,0.002474,0.1496,0.0974,0.4571,0,17997
17998,5.0589,0.0610,2,0.002624,0.002474,0.1496,0.0834,0.4571,0,17998


In [6]:

from plotly.subplots import make_subplots
import plotly.graph_objects as go

# (DataFrame column, legend label) for every channel, listed in the order the
# subplots are stacked from top to bottom.
channel_traces = [
    ('EDA', 'EDA'),
    ('ECG', 'ECG'),
    ('Label', 'Label'),
    ('Left Pupil Diameter (m)', 'Left pupil diameter'),
    ('Right Pupil Diameter (m)', 'Right pupil diameter'),
    ('Eye Opening Left', 'Eye Opening Left'),
    ('Eye Opening Right', 'Eye Opening Right'),
    ('PERCLOS Value', 'PERCLOS'),
    ('Blinking', 'Blink'),
]

# One subplot row per channel, all in a single column.
fig = make_subplots(rows=len(channel_traces), cols=1)

# add_trace with row/col replaces the deprecated append_trace.
for subplot_row, (channel_column, channel_label) in enumerate(channel_traces, start=1):
    fig.add_trace(
        go.Scatter(y=train_df[channel_column], name=channel_label),
        row=subplot_row,
        col=1,
    )

fig.update_layout(height=1200, width=1200, title_text="Individual channels")
fig.show()

In [8]:
# NOTE: this import shadows the built-in open(); it is what lets the config
# template below be read directly from an HTTPS URL.
from smart_open import open
import yaml

from gretel_client import create_project
from gretel_client.helpers import poll

# Create a project and model configuration.
project = create_project(display_name="time-series-synthetic")

# Pull down the default synthetic config.  We will modify it slightly.
with open("https://raw.githubusercontent.com/gretelai/gretel-blueprints/main/config_templates/gretel/synthetics/default.yml", 'r') as stream:
    config = yaml.safe_load(stream)

# Describe the time-series task: which column orders the records and which
# columns hold the measurements to model.
time_field="idx_col"
trend_fields=["EDA", "ECG", "Left Pupil Diameter (m)", "Right Pupil Diameter (m)", "Eye Opening Left", "Eye Opening Right", "PERCLOS Value", "Blinking"]
# Alternative with fewer channels: trend_fields=["EDA", "ECG"]

task = {
    'type': 'time_series',
    'attrs': {
        'time_field': time_field,
        'trend_fields': trend_fields
    }
}

# All edits target the first (and only configured) synthetics model entry.
synthetics_config = config['models'][0]['synthetics']
synthetics_config['task'] = task

# Parameter overrides applied on top of the default template.
synthetics_config['params'].update({
    'vocab_size': 0,
    'predict_batch_size': 1,
    'reset_states': True,
    'overwrite': True,
    'rnn_units': 1024,
})

model = project.create_model_obj(model_config=config)

# Get a csv to work with, just dump out the train_df.
train_df.to_csv('train.csv', index=False)
model.data_source = 'train.csv'

# Upload the training data.  Train the model.
model.submit(upload_data_source=True)

poll(model)

# Use the model to generate synthetic data.
record_handler = model.create_record_handler_obj()

# For time series data we dump out the time field (idx_col) to seed the
# record handler.
train_df['idx_col'].to_csv('idx_seeds.csv', index=False)

# Request as many records as there are training rows (and therefore seeds)
# instead of hard-coding the size of the example dataset.
record_handler.submit(
    action="generate",
    params={"num_records": len(train_df), "max_invalid": 20000},
    data_source='idx_seeds.csv',
    upload_data_source=True
)

poll(record_handler)

# The generated records are downloaded as a gzip-compressed CSV.
synthetic = pd.read_csv(record_handler.get_artifact_link("data"), compression='gzip')

synthetic.head()

[32mINFO: [0mStarting poller


{
    "uid": "6113c0214485f832e6a39a51",
    "model_name": "bashful-clumsy-bear",
    "runner_mode": "cloud",
    "user_id": "60f7ffc2bff621796155eab8",
    "project_id": "6113c01fdb3e03a9250dfd46",
    "logs": null,
    "status_history": {
        "created": "2021-08-11T12:18:41.781285Z"
    },
    "last_modified": "2021-08-11T12:18:41.831124Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "traceback": null,
    "container_image": "074762682575.dkr.ecr.us-west-2.amazonaws.com/gretelai/synthetics@sha256:9cb2a0f32860dfffd00318473812aba06e84513464ac1818c9ed042b82e9aadc",
    "model_type": "synthetics",
    "config": {
        "schema_version": "1.0",
        "name": null,
        "models": [
            {
                "synthetics": {
                    "params": {
                        "field_delimiter": null,
                        "epochs": 100,
                        "batch_size": 64,
                        "voc

[32mINFO: [0mStatus is created. Model creation has been queued.
[32mINFO: [0mStatus is pending. A Gretel Cloud worker is being allocated to begin model creation.
[32mINFO: [0mStatus is active. A worker has started creating your model!
2021-08-11T12:18:52.045114Z  Starting synthetic model training
2021-08-11T12:18:52.046999Z  Loading training data
2021-08-11T12:18:52.318711Z  Training data loaded
{
    "record_count": 18000,
    "field_count": 10
}
2021-08-11T12:18:58.643689Z  Creating semantic validators and preparing training data
2021-08-11T12:19:06.888538Z  Beginning ML model training
2021-08-11T12:19:54.799781Z  Training epoch completed
{
    "epoch": 0,
    "accuracy": 0.3206,
    "loss": 2.5712,
    "val_accuracy": 0.2326,
    "val_loss": 2.2985,
    "batch": 0
}
2021-08-11T12:20:32.720855Z  Training epoch completed
{
    "epoch": 1,
    "accuracy": 0.3029,
    "loss": 2.3129,
    "val_accuracy": 0.3748,
    "val_loss": 2.1877,
    "batch": 0
}
2021-08-11T12:21:10.667257Z  

ApiException: ignored

In [None]:
# Plot the ECG channel of the training data.
train_df['ECG'].plot(figsize=(12, 8))

In [None]:
# Plot the synthetic ECG channel so it can be compared visually with the
# training ECG plotted in the previous cell.

synthetic['ECG'].plot(figsize=(12, 8))