# Create synthetic data with the Python SDK

This notebook will walk you through the process of creating your own synthetic data using Gretel's Python SDK from a CSV or a DataFrame of your choosing. 

To run this notebook, you will need an API key from the Gretel console at https://console.gretel.cloud.



In [1]:
%%capture
!pip install -U gretel-client

In [2]:
# Specify your Gretel API key

from getpass import getpass
import pandas as pd
from gretel_client import configure_session, ClientConfig

# Show full cell contents instead of truncating long strings in DataFrame output.
pd.set_option('display.max_colwidth', None)

# Ask for the API key interactively. The prompt text must never contain the key
# itself: the prompt is stored in the notebook source and echoed into the saved
# cell output, which would leak the credential to anyone who can read the file.
configure_session(
    ClientConfig(
        api_key=getpass(prompt="Enter your Gretel API key: "),
        endpoint="https://api.gretel.cloud",
    )
)

                            

Enter your Gretel API key: ········


In [3]:
# Create a project

from gretel_client import create_project

# Display name given to the newly created Gretel project.
project_display_name = "synthetic-data"
project = create_project(display_name=project_display_name)

## Create the synthetic data configuration
Load the default configuration template. This template will work well for most datasets. View other templates at https://github.com/gretelai/gretel-blueprints/tree/main/config_templates/gretel/synthetics

In [4]:
import json
from smart_open import open
import yaml

# Default synthetics configuration template from the gretel-blueprints repository.
config_url = "https://raw.githubusercontent.com/gretelai/gretel-blueprints/main/config_templates/gretel/synthetics/default.yml"

with open(config_url, 'r') as template_file:
    config = yaml.safe_load(template_file)

# Override the template's epoch count: train for 50 epochs.
synthetics_params = config['models'][0]['synthetics']['params']
synthetics_params['epochs'] = 50

# Pretty-print the final configuration for inspection.
print(json.dumps(config, indent=2))

{
  "schema_version": "1.0",
  "models": [
    {
      "synthetics": {
        "data_source": "__tmp__",
        "params": {
          "epochs": 50,
          "batch_size": 64,
          "vocab_size": 20000,
          "reset_states": false,
          "learning_rate": 0.01,
          "rnn_units": 256,
          "dropout_rate": 0.2,
          "overwrite": true,
          "early_stopping": true,
          "gen_temp": 1.0,
          "predict_batch_size": 64,
          "validation_split": false,
          "dp": false,
          "dp_noise_multiplier": 0.001,
          "dp_l2_norm_clip": 5.0,
          "dp_microbatches": 1,
          "data_upsample_limit": 10000
        },
        "validators": {
          "in_set_count": 10,
          "pattern_count": 10
        },
        "generate": {
          "num_records": 5000,
          "max_invalid": null
        },
        "privacy_filters": {
          "outliers": "medium",
          "similarity": "medium"
        }
      }
    }
  ]
}


## Load and preview the source dataset
Specify a data source to train the model on. This can be a local file, web location, or HDFS file.


In [5]:
# Load and preview the DataFrame to train the synthetic model on.
import pandas as pd

# Source CSV, and the copy written out (without the index) for model training.
dataset_path = "D:\\7th Semester\\FYP\\Dataset\\FilteredData.csv"
training_data_path = 'D:\\7th Semester\\FYP\\Dataset\\training_data.csv'

df = pd.read_csv(dataset_path)
df.to_csv(training_data_path, index=False)

# Leave the frame as the last expression so the notebook renders a preview.
df

Unnamed: 0,Conclusion,HGB(g/dL),PLT(10^3/uL),Judgment,Positive(Diff.),Positive(Morph.),Positive(Count),RBC Abnormal,RBC Suspect,PLT Abnormal,...,[MicroR(%)],[MacroR(%)],[RBC-O(10^6/uL)],[PLT-O(10^3/uL)],[RBC-He(pg)],[Delta-He(pg)],[RET-Y(ch)],[RET-RBC-Y(ch)],[IRF-Y(ch)],[FRC#(10^6/uL)]
0,HM,7.9,23,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,1.3,12.6,1.99,74,27.2,0.8,155.4,161.0,148.9,0.0420
1,SEP,8.6,4,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,5.3,4.6,2.71,2,25.4,8.6,175.3,153.9,162.8,0.0243
2,MDA,10.7,21,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,18.1,3.8,3.04,2,21.3,-2.1,117.0,136.0,122.8,0.1003
3,MDA,10.9,30,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,4.0,9.6,2.33,79,24.0,3.1,152.3,148.5,154.7,0.0405
4,AA,7.8,2,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,1.1,4.2,4.38,230,27.7,9.5,184.6,162.9,179.7,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81,IDA,11.1,436,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,3.6,3.8,4.13,205,24.5,3,153.6,150.5,165.5,0.0384
82,ITP,11.9,30,1.0,1.0,1.0,1.0,0.0,0.0,1.0,...,2.3,12.9,2.03,29,27.8,3.3,166.2,163.4,168.9,0.0342
83,IDA,9.7,475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.1,7.4,3.36,169,23.6,5.9,161.0,146.6,153.4,0.0029
84,Extremly Increased Iron deposition,9.5,269,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,6.9,3.1,2.90,41,22.1,6.6,157.9,140.0,155.2,0.0677


## Train the synthetic model
In this step, we will task the worker running in the Gretel cloud, or locally, to train a synthetic model on the source dataset.

In [6]:
from gretel_client.helpers import poll

# Training CSV to attach to the model as its data source.
training_csv = 'D:\\7th Semester\\FYP\\Dataset\\training_data.csv'

# Build the model object from the configuration, attach the data, and submit it
# with the data source uploaded alongside the job.
model = project.create_model_obj(model_config=config)
model.data_source = training_csv
model.submit(upload_data_source=True)

# Follow the job with the poller.
poll(model)

INFO: Starting poller


{
    "uid": "621fb77ca98a71d0af76650e",
    "guid": "model_25qDWk6B7EOjk0b7FNQhAOGFMVG",
    "model_name": "happy-remarkable-iguana",
    "runner_mode": "cloud",
    "user_id": "621f405ebff62130903b66e0",
    "user_guid": "user_25pDimzOCwoJGoaCuMdlnnoxDZU",
    "billing_domain": null,
    "billing_domain_guid": null,
    "project_id": "621fb76e3980c20fe77bf0fe",
    "project_guid": "proj_25qDV2BjSv2iO22Rk3Cb88OEZWm",
    "status_history": {
        "created": "2022-03-02T18:29:16.591594Z"
    },
    "last_modified": "2022-03-02T18:29:16.809278Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "error_id": null,
    "traceback": null,
    "container_image": "074762682575.dkr.ecr.us-west-2.amazonaws.com/gretelai/synthetics@sha256:717a68c0e4ef3000c8b650bbed308162ef10c1b2cb4bfc3026b773bc908ee577",
    "model_type": "synthetics",
    "config": {
        "schema_version": "1.0",
        "name": null,
   

INFO: Status is pending. A Gretel Cloud worker is being allocated to begin model creation.
INFO: Status is active. A worker has started creating your model!
2022-03-02T18:29:36.582404Z  Starting synthetic model training
2022-03-02T18:29:36.584457Z  Loading training data
2022-03-02T18:29:36.766488Z  Training data loaded, detected format: 'csv'
2022-03-02T18:29:36.887421Z  Training data loaded
{
    "record_count": 86,
    "field_count": 67,
    "upsample_count": 9914
}
2022-03-02T18:29:40.745012Z  Creating semantic validators and preparing training data
2022-03-02T18:30:18.626472Z  Beginning ML model training
2022-03-02T18:30:25.729629Z  Training epoch completed
{
    "epoch": 0,
    "accuracy": 0.5276,
    "loss": 1.41,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2022-03-02T18:30:26.623167Z  Training epoch completed
{
    "epoch": 1,
    "accuracy": 0.8494,
    "loss": 0.4641,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2022-03-02T18:30:27.517040Z  Trainin

2022-03-02T18:31:39.097964Z  Training epoch completed
{
    "epoch": 17,
    "accuracy": 0.9484,
    "loss": 0.1484,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 1
}
2022-03-02T18:31:41.637512Z  Training epoch completed
{
    "epoch": 18,
    "accuracy": 0.9485,
    "loss": 0.1479,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 1
}
2022-03-02T18:31:44.170663Z  Training epoch completed
{
    "epoch": 19,
    "accuracy": 0.949,
    "loss": 0.147,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 1
}
2022-03-02T18:31:46.698637Z  Training epoch completed
{
    "epoch": 20,
    "accuracy": 0.9487,
    "loss": 0.1478,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 1
}
2022-03-02T18:31:49.208380Z  Training epoch completed
{
    "epoch": 21,
    "accuracy": 0.9489,
    "loss": 0.1471,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 1
}
2022-03-02T18:31:51.725113Z  Training epoch completed
{
    "epoch": 22,
    "accuracy": 0.9489,
    "loss": 0.1473,
    "

2022-03-02T18:32:53.519131Z  Training epoch completed
{
    "epoch": 2,
    "accuracy": 0.7239,
    "loss": 0.9005,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 3
}
2022-03-02T18:32:55.872022Z  Training epoch completed
{
    "epoch": 3,
    "accuracy": 0.8688,
    "loss": 0.458,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 3
}
2022-03-02T18:32:58.223569Z  Training epoch completed
{
    "epoch": 4,
    "accuracy": 0.906,
    "loss": 0.3304,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 3
}
2022-03-02T18:33:00.622385Z  Training epoch completed
{
    "epoch": 5,
    "accuracy": 0.9211,
    "loss": 0.2797,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 3
}
2022-03-02T18:33:02.991672Z  Training epoch completed
{
    "epoch": 6,
    "accuracy": 0.9295,
    "loss": 0.2509,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 3
}
2022-03-02T18:33:05.356671Z  Training epoch completed
{
    "epoch": 7,
    "accuracy": 0.9342,
    "loss": 0.2337,
    "val_ac

2022-03-02T18:34:42.578665Z  Training epoch completed
{
    "epoch": 17,
    "accuracy": 0.9403,
    "loss": 0.2089,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 4
}
2022-03-02T18:34:44.601665Z  Training epoch completed
{
    "epoch": 18,
    "accuracy": 0.9408,
    "loss": 0.2066,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 4
}
2022-03-02T18:34:46.647851Z  Training epoch completed
{
    "epoch": 19,
    "accuracy": 0.941,
    "loss": 0.2052,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 4
}
2022-03-02T18:34:48.752204Z  Training epoch completed
{
    "epoch": 20,
    "accuracy": 0.9407,
    "loss": 0.2059,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 4
}
2022-03-02T18:34:50.766230Z  Training epoch completed
{
    "epoch": 21,
    "accuracy": 0.9414,
    "loss": 0.2019,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 4
}
2022-03-02T18:34:52.763199Z  Training epoch completed
{
    "epoch": 22,
    "accuracy": 0.9415,
    "loss": 0.2031,
    

2022-03-02T18:37:02.416333Z  Generation in progress
{
    "current_valid_count": 0,
    "current_invalid_count": 18,
    "new_valid_count": 0,
    "new_invalid_count": 0,
    "completion_percent": 0.0
}
2022-03-02T18:37:07.421112Z  Generation in progress
{
    "current_valid_count": 0,
    "current_invalid_count": 18,
    "new_valid_count": 0,
    "new_invalid_count": 0,
    "completion_percent": 0.0
}
2022-03-02T18:37:12.427185Z  Generation in progress
{
    "current_valid_count": 140,
    "current_invalid_count": 52,
    "new_valid_count": 140,
    "new_invalid_count": 34,
    "completion_percent": 2.8
}
2022-03-02T18:37:17.432921Z  Generation in progress
{
    "current_valid_count": 427,
    "current_invalid_count": 88,
    "new_valid_count": 287,
    "new_invalid_count": 36,
    "completion_percent": 8.54
}
2022-03-02T18:37:22.440720Z  Generation in progress
{
    "current_valid_count": 676,
    "current_invalid_count": 124,
    "new_valid_count": 249,
    "new_invalid_count": 36,


## View the generated synthetic data

In [7]:
# Load the model's "data_preview" artifact (a gzip-compressed CSV) into a DataFrame.
preview_link = model.get_artifact_link("data_preview")
synthetic_df = pd.read_csv(preview_link, compression='gzip')

# Last expression of the cell, so the frame is rendered.
synthetic_df

Unnamed: 0,Conclusion,HGB(g/dL),PLT(10^3/uL),Judgment,Positive(Diff.),Positive(Morph.),Positive(Count),RBC Abnormal,RBC Suspect,PLT Abnormal,...,[MicroR(%)],[MacroR(%)],[RBC-O(10^6/uL)],[PLT-O(10^3/uL)],[RBC-He(pg)],[Delta-He(pg)],[RET-Y(ch)],[RET-RBC-Y(ch)],[IRF-Y(ch)],[FRC#(10^6/uL)]
0,NM,8.6,4,1.0,0.0,1.0,1.0,1.0,0.0,1.0,...,2.5,4.4,4.68,35,24.8,4.3,159.5,151.5,160.8,0.0000
1,NM,8.7,4,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,22.4,5.9,4.68,35,22.1,4.3,161.1,139.9,151.9,0.0280
2,ACD,10.5,43,1.0,1.0,1.0,1.0,0.0,0.0,1.0,...,22.4,5.9,3.10,200,22.1,3.7,161.1,139.9,151.9,0.0280
3,ACD,8.5,8,1.0,0.0,1.0,1.0,1.0,0.0,1.0,...,18.6,3.1,3.10,200,20.0,3.7,118.1,129.9,119.0,0.1393
4,NM,7.9,23,1.0,0.0,1.0,1.0,1.0,0.0,1.0,...,10.1,5.1,3.02,17,23.2,5.6,168.4,144.9,166.4,0.1219
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,ITP,8.5,8,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,9.5,3.6,3.26,195,20.4,6,147.3,131.6,152.5,0.1656
4996,NM,8.6,4,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,4.5,10.8,4.68,35,24.6,4.3,156.3,151.0,127.4,0.0166
4997,NM,9.4,32,1.0,0.0,1.0,1.0,1.0,0.0,1.0,...,4.1,3.4,3.02,17,25.2,5.6,172.6,153.3,171.7,0.0151
4998,MA,10.9,30,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,6.1,7.4,3.13,179,23.6,2.4,161.0,146.6,153.4,0.0029


## View the synthetic data quality report

In [8]:
# Generate report that shows the statistical performance between the training and synthetic data

import IPython
from smart_open import open

# Read the report inside a context manager so the remote stream is closed once
# its contents have been read, instead of leaving the handle open.
with open(model.get_artifact_link("report")) as report_file:
    report_html = report_file.read()

# Last expression of the cell, so the HTML report is rendered inline.
IPython.display.HTML(data=report_html)

0,1,2,3,4,5
Synthetic Data Use Cases,Excellent,Good,Moderate,Poor,Very Poor
Significant tuning required to improve model,,,,,
Improve your model using our tips and advice,,,,,
Demo environments or mock data,,,,,
Pre-production testing environments,,,,,
Balance or augment machine learning data sources,,,,,
Machine learning or statistical analysis,,,,,

0,1,2,3,4
Data Sharing Use Case,Excellent,Very Good,Good,Normal
"Internally, within the same team",,,,
"Internally, across different teams",,,,
"Externally, with trusted partners",,,,
"Externally, public availability",,,,

Unnamed: 0,Training Data,Synthetic Data
Row Count,86,86
Column Count,67,67
Training Lines Duplicated,--,0

Default Privacy Protections,Advanced Protections

Field,Unique,Missing,Ave. Length,Type,Distribution Stability
[PLT-O(10^3/uL)],73,0,2.35,Other,
P-LCR(%),51,0,3.88,Other,
IP ABN(PLT)PLT Abn Scattergram,1,0,3.0,Numeric,Excellent
RET-He(pg),68,0,3.88,Other,
PCT(%),33,0,3.92,Other,
Conclusion,23,0,3.63,Other,
Positive(Morph.),2,0,3.0,Binary,Excellent
RDW-SD(fL),79,0,3.93,Other,
PDW(fL),45,0,3.74,Other,
MPV(fL),36,0,3.69,Other,


## Generate unlimited synthetic data
You can now use the trained synthetic model to generate as much synthetic data as you like.

In [9]:
# Request another batch of records from the trained model.
generation_params = {"num_records": 1000, "max_invalid": 500}

record_handler = model.create_record_handler_obj()
record_handler.submit(action="generate", params=generation_params)

# Follow the generation job with the poller.
poll(record_handler)

INFO: Starting poller


{
    "uid": "621fb9f7a3d7c1994b06d33d",
    "guid": "model_run_25qEocfkPNvYjkuAh6Vd0OJqD5S",
    "model_name": null,
    "runner_mode": "cloud",
    "user_id": "621f405ebff62130903b66e0",
    "user_guid": "user_25pDimzOCwoJGoaCuMdlnnoxDZU",
    "billing_domain": null,
    "billing_domain_guid": null,
    "project_id": "621fb76e3980c20fe77bf0fe",
    "project_guid": "proj_25qDV2BjSv2iO22Rk3Cb88OEZWm",
    "status_history": {
        "created": "2022-03-02T18:39:51.578000Z"
    },
    "last_modified": "2022-03-02T18:39:51.700000Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "error_id": null,
    "traceback": null,
    "container_image": "074762682575.dkr.ecr.us-west-2.amazonaws.com/gretelai/synthetics@sha256:717a68c0e4ef3000c8b650bbed308162ef10c1b2cb4bfc3026b773bc908ee577",
    "model_id": "621fb77ca98a71d0af76650e",
    "model_guid": "model_25qDWk6B7EOjk0b7FNQhAOGFMVG",
    "action": "generate",

INFO: Status is pending. A Gretel Cloud worker is being allocated to begin generating synthetic records.
INFO: Status is active. A worker has started!
2022-03-02T18:40:12.843580Z  Loading model to worker
2022-03-02T18:40:13.984284Z  Checking for synthetic smart seeds
2022-03-02T18:40:13.984725Z  No smart seeds provided, will attempt generation without them
2022-03-02T18:40:13.985598Z  Loading model
2022-03-02T18:40:23.357744Z  Generating records
{
    "num_records": 1000
}
2022-03-02T18:40:28.364115Z  Generation in progress
{
    "current_valid_count": 0,
    "current_invalid_count": 0,
    "new_valid_count": 0,
    "new_invalid_count": 0,
    "completion_percent": 0.0
}
2022-03-02T18:40:33.375449Z  Generation in progress
{
    "current_valid_count": 0,
    "current_invalid_count": 0,
    "new_valid_count": 0,
    "new_invalid_count": 0,
    "completion_percent": 0.0
}
2022-03-02T18:40:38.382186Z  Generation in progress
{
    "current_valid_count": 0,
    "current_invalid_count": 0,
  

In [10]:
# Load the record handler's "data" artifact (a gzip-compressed CSV) into a DataFrame.
generated_data_link = record_handler.get_artifact_link("data")
synthetic_df = pd.read_csv(generated_data_link, compression='gzip')

In [11]:
# Write the generated records to disk without the DataFrame index.
synthetic_output_path = "D:\\7th Semester\\FYP\\Dataset\\Synthetic_data.csv"
synthetic_df.to_csv(synthetic_output_path, index=False)
