# Create synthetic data with the Python SDK

This notebook will walk you through the process of creating your own synthetic data using Gretel's Python SDK from a CSV or a DataFrame of your choosing. 

To run this notebook, you will need an API key from the Gretel console at https://console.gretel.cloud.



In [1]:
%%capture
!pip install -U gretel-client

In [2]:
# Specify your Gretel API key

from getpass import getpass
import pandas as pd
from gretel_client import configure_session, ClientConfig

# Do not truncate long cell values when DataFrames are displayed.
pd.set_option('max_colwidth', None)

# SECURITY: `prompt` is only the label shown to the user. Never paste the API
# key itself into the source; type it at the hidden-input prompt at run time.
# Any key that was previously embedded in this notebook should be revoked and
# regenerated in the Gretel console.
configure_session(
    ClientConfig(
        api_key=getpass(prompt="Enter your Gretel API key: "),
        endpoint="https://api.gretel.cloud",
    )
)

                            

[API key redacted]········


In [3]:
# Create a project

from gretel_client import create_project

# Display name passed to create_project for the new project.
project_display_name = "synthetic-data"
project = create_project(display_name=project_display_name)

## Create the synthetic data configuration
Load the default configuration template. This template will work well for most datasets. View other templates at https://github.com/gretelai/gretel-blueprints/tree/main/config_templates/gretel/synthetics

In [4]:
import json
from smart_open import open
import yaml

# Location of the default synthetics configuration template (gretel-blueprints).
template_url = "https://raw.githubusercontent.com/gretelai/gretel-blueprints/main/config_templates/gretel/synthetics/default.yml"

# Fetch the template and parse it into a plain Python dictionary.
with open(template_url, 'r') as template_file:
    config = yaml.safe_load(template_file)

# Set the model epochs to 50
synthetics_params = config['models'][0]['synthetics']['params']
synthetics_params['epochs'] = 50

# Pretty-print the resulting configuration for inspection.
print(json.dumps(config, indent=2))

{
  "schema_version": "1.0",
  "models": [
    {
      "synthetics": {
        "data_source": "__tmp__",
        "params": {
          "epochs": 50,
          "batch_size": 64,
          "vocab_size": 20000,
          "reset_states": false,
          "learning_rate": 0.01,
          "rnn_units": 256,
          "dropout_rate": 0.2,
          "overwrite": true,
          "early_stopping": true,
          "gen_temp": 1.0,
          "predict_batch_size": 64,
          "validation_split": false,
          "dp": false,
          "dp_noise_multiplier": 0.001,
          "dp_l2_norm_clip": 5.0,
          "dp_microbatches": 1,
          "data_upsample_limit": 10000
        },
        "validators": {
          "in_set_count": 10,
          "pattern_count": 10
        },
        "generate": {
          "num_records": 5000,
          "max_invalid": null
        },
        "privacy_filters": {
          "outliers": "medium",
          "similarity": "medium"
        }
      }
    }
  ]
}


## Load and preview the source dataset
Specify a data source to train the model on. This can be a local file, web location, or HDFS file.


In [5]:
# Load and preview the DataFrame to train the synthetic model on.
import pandas as pd

# Source CSV and the copy that is written out for model training.
dataset_path = "D:\\7th Semester\\FYP\\Dataset\\FilteredParameters.csv"
training_csv_path = 'D:\\7th Semester\\FYP\\Dataset\\training_data.csv'

df = pd.read_csv(dataset_path)

# Persist the frame without the pandas index column.
df.to_csv(training_csv_path, index=False)

# Last expression: render the DataFrame as the cell output.
df

Unnamed: 0,Conclusion,HGB(g/dL),PLT(10^3/uL),Judgment,Positive(Diff.),Positive(Morph.),Positive(Count),RBC Abnormal,RBC Suspect,PLT Abnormal,...,[MicroR(%)],[MacroR(%)],[RBC-O(10^6/uL)],[PLT-O(10^3/uL)],[RBC-He(pg)],[Delta-He(pg)],[RET-Y(ch)],[RET-RBC-Y(ch)],[IRF-Y(ch)],[FRC#(10^6/uL)]
0,0,-0.696082,-0.683231,1,1,1,1,1,0,1,...,-0.782633,0.752464,-1.265063,-0.454417,1.139458,-1.057746,0.224564,1.070146,-0.054790,-0.258442
1,1,-0.397407,-0.775863,1,1,1,1,1,0,1,...,-0.469616,-0.359902,-0.601901,-0.872599,0.621279,1.435251,1.283004,0.628161,0.692027,-0.515784
2,2,0.498619,-0.692982,1,1,1,1,1,1,1,...,0.532038,-0.471139,-0.297952,-0.872599,-0.559017,-1.984629,-1.817852,-0.486141,-1.457087,0.589190
3,2,0.583955,-0.649103,1,1,1,1,0,1,1,...,-0.571347,0.335327,-0.951903,-0.425376,0.218251,-0.322631,0.059682,0.292003,0.256832,-0.280250
4,3,-0.738750,-0.785613,1,1,1,1,1,0,1,...,-0.798284,-0.415520,0.936267,0.451646,1.283396,1.722905,1.777651,1.188424,1.600028,-0.869085
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81,7,0.669290,1.330293,1,0,1,0,1,1,0,...,-0.602648,-0.471139,0.706002,0.306444,0.362190,-0.354593,0.128826,0.416506,0.837092,-0.310783
82,4,1.010633,-0.649103,1,1,1,1,0,0,1,...,-0.704379,0.794177,-1.228221,-0.715781,1.312184,-0.258708,0.798994,1.219550,1.019767,-0.371847
83,7,0.071940,1.520432,0,0,0,0,0,0,0,...,-0.407013,0.029426,-0.003213,0.097352,0.103100,0.572291,0.522417,0.173725,0.186985,-0.826922
84,22,-0.013396,0.516108,1,0,0,1,1,0,0,...,-0.344409,-0.568471,-0.426900,-0.646084,-0.328716,0.796021,0.357534,-0.237135,0.283696,0.115214


## Train the synthetic model
In this step, we will task the worker running in the Gretel cloud, or locally, to train a synthetic model on the source dataset.

In [6]:
from gretel_client.helpers import poll

# CSV written by the dataset-loading step; used as the model's data source.
training_data_path = 'D:\\7th Semester\\FYP\\Dataset\\training_data.csv'

# Build a model object from the configuration and attach the training data.
model = project.create_model_obj(model_config=config)
model.data_source = training_data_path

# Submit the job, asking for the data source to be uploaded with it.
model.submit(upload_data_source=True)

# Follow the job's status and log output.
poll(model)

INFO: Starting poller


{
    "uid": "621f89db25750dcf704fb8c4",
    "guid": "model_25ppqpVWa9hvYHfTMMHOBC2kjjV",
    "model_name": "remarkable-enthusiastic-hippopotamus",
    "runner_mode": "cloud",
    "user_id": "621f405ebff62130903b66e0",
    "user_guid": "user_25pDimzOCwoJGoaCuMdlnnoxDZU",
    "billing_domain": null,
    "billing_domain_guid": null,
    "project_id": "621f89d04459aef720f7f0e9",
    "project_guid": "proj_25pppQW2plJuYdrVaQWyOCJ0dQ1",
    "status_history": {
        "created": "2022-03-02T15:14:35.506929Z"
    },
    "last_modified": "2022-03-02T15:14:35.602903Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "error_id": null,
    "traceback": null,
    "container_image": "074762682575.dkr.ecr.us-west-2.amazonaws.com/gretelai/synthetics@sha256:717a68c0e4ef3000c8b650bbed308162ef10c1b2cb4bfc3026b773bc908ee577",
    "model_type": "synthetics",
    "config": {
        "schema_version": "1.0",
        "name

INFO: Status is created. Model creation has been queued.
INFO: Status is pending. A Gretel Cloud worker is being allocated to begin model creation.
INFO: Status is active. A worker has started creating your model!
2022-03-02T15:14:55.461795Z  Starting synthetic model training
2022-03-02T15:14:55.463851Z  Loading training data
2022-03-02T15:14:55.620954Z  Training data loaded, detected format: 'csv'
2022-03-02T15:14:55.657977Z  Training data loaded
{
    "record_count": 86,
    "field_count": 67,
    "upsample_count": 9914
}
2022-03-02T15:14:59.738077Z  Creating semantic validators and preparing training data
2022-03-02T15:15:36.277665Z  Beginning ML model training
2022-03-02T15:15:50.183275Z  Training epoch completed
{
    "epoch": 0,
    "accuracy": 0.2176,
    "loss": 4.4921,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2022-03-02T15:15:51.955568Z  Training epoch completed
{
    "epoch": 1,
    "accuracy": 0.523,
    "loss": 2.1421,
    "val_accuracy": 0,
    "val_loss"

2022-03-02T15:17:34.669795Z  Training epoch completed
{
    "epoch": 6,
    "accuracy": 0.9541,
    "loss": 0.1903,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 1
}
2022-03-02T15:17:39.659506Z  Training epoch completed
{
    "epoch": 7,
    "accuracy": 0.9559,
    "loss": 0.1781,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 1
}
2022-03-02T15:17:44.618967Z  Training epoch completed
{
    "epoch": 8,
    "accuracy": 0.9571,
    "loss": 0.1722,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 1
}
2022-03-02T15:17:49.592041Z  Training epoch completed
{
    "epoch": 9,
    "accuracy": 0.9581,
    "loss": 0.1641,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 1
}
2022-03-02T15:17:54.541226Z  Training epoch completed
{
    "epoch": 10,
    "accuracy": 0.9586,
    "loss": 0.1605,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 1
}
2022-03-02T15:17:59.522595Z  Training epoch completed
{
    "epoch": 11,
    "accuracy": 0.9595,
    "loss": 0.1571,
    "va

2022-03-02T15:20:57.504139Z  Training epoch completed
{
    "epoch": 21,
    "accuracy": 0.9578,
    "loss": 0.1585,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 2
}
2022-03-02T15:21:00.870069Z  Training epoch completed
{
    "epoch": 22,
    "accuracy": 0.9581,
    "loss": 0.1567,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 2
}
2022-03-02T15:21:04.263200Z  Training epoch completed
{
    "epoch": 23,
    "accuracy": 0.958,
    "loss": 0.1561,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 2
}
2022-03-02T15:21:07.622582Z  Training epoch completed
{
    "epoch": 24,
    "accuracy": 0.9579,
    "loss": 0.1564,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 2
}
2022-03-02T15:21:10.956727Z  Training epoch completed
{
    "epoch": 25,
    "accuracy": 0.9581,
    "loss": 0.1564,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 2
}
2022-03-02T15:21:14.290054Z  Training epoch completed
{
    "epoch": 26,
    "accuracy": 0.9584,
    "loss": 0.1551,
    

2022-03-02T15:23:19.453406Z  Training epoch completed
{
    "epoch": 30,
    "accuracy": 0.9482,
    "loss": 0.1784,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 3
}
2022-03-02T15:23:22.130363Z  Training epoch completed
{
    "epoch": 31,
    "accuracy": 0.9483,
    "loss": 0.1788,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 3
}
2022-03-02T15:23:24.864316Z  Training epoch completed
{
    "epoch": 32,
    "accuracy": 0.9484,
    "loss": 0.1776,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 3
}
2022-03-02T15:23:27.607469Z  Training epoch completed
{
    "epoch": 33,
    "accuracy": 0.9483,
    "loss": 0.1782,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 3
}
2022-03-02T15:23:30.307859Z  Training epoch completed
{
    "epoch": 34,
    "accuracy": 0.9485,
    "loss": 0.1777,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 3
}
2022-03-02T15:23:33.032111Z  Training epoch completed
{
    "epoch": 35,
    "accuracy": 0.9487,
    "loss": 0.1772,
   

2022-03-02T15:24:28.696889Z  Training epoch completed
{
    "epoch": 31,
    "accuracy": 0.884,
    "loss": 0.3224,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 4
}
2022-03-02T15:24:29.480194Z  Training epoch completed
{
    "epoch": 32,
    "accuracy": 0.8844,
    "loss": 0.3213,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 4
}
2022-03-02T15:24:30.264077Z  Training epoch completed
{
    "epoch": 33,
    "accuracy": 0.8845,
    "loss": 0.3198,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 4
}
2022-03-02T15:24:31.033028Z  Training epoch completed
{
    "epoch": 34,
    "accuracy": 0.8851,
    "loss": 0.3205,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 4
}
2022-03-02T15:24:31.817478Z  Training epoch completed
{
    "epoch": 35,
    "accuracy": 0.8848,
    "loss": 0.3179,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 4
}
2022-03-02T15:24:32.577548Z  Training epoch completed
{
    "epoch": 36,
    "accuracy": 0.885,
    "loss": 0.318,
    "v

2022-03-02T15:26:47.891095Z  Generation in progress
{
    "current_valid_count": 1159,
    "current_invalid_count": 130,
    "new_valid_count": 268,
    "new_invalid_count": 28,
    "completion_percent": 23.18
}
2022-03-02T15:26:52.896629Z  Generation in progress
{
    "current_valid_count": 1329,
    "current_invalid_count": 143,
    "new_valid_count": 170,
    "new_invalid_count": 13,
    "completion_percent": 26.58
}
2022-03-02T15:26:57.902228Z  Generation in progress
{
    "current_valid_count": 1524,
    "current_invalid_count": 169,
    "new_valid_count": 195,
    "new_invalid_count": 26,
    "completion_percent": 30.48
}
2022-03-02T15:27:02.910495Z  Generation in progress
{
    "current_valid_count": 1648,
    "current_invalid_count": 177,
    "new_valid_count": 124,
    "new_invalid_count": 8,
    "completion_percent": 32.96
}
2022-03-02T15:27:07.919986Z  Generation in progress
{
    "current_valid_count": 1903,
    "current_invalid_count": 195,
    "new_valid_count": 255,
    

## View the generated synthetic data

In [7]:
# View the synthetic data

# Link to the model's "data_preview" artifact, read as a gzip-compressed CSV.
preview_link = model.get_artifact_link("data_preview")
synthetic_df = pd.read_csv(preview_link, compression='gzip')

# Last expression: render the preview as the cell output.
synthetic_df

Unnamed: 0,Conclusion,HGB(g/dL),PLT(10^3/uL),Judgment,Positive(Diff.),Positive(Morph.),Positive(Count),RBC Abnormal,RBC Suspect,PLT Abnormal,...,[MicroR(%)],[MacroR(%)],[RBC-O(10^6/uL)],[PLT-O(10^3/uL)],[RBC-He(pg)],[Delta-He(pg)],[RET-Y(ch)],[RET-RBC-Y(ch)],[IRF-Y(ch)],[FRC#(10^6/uL)]
0,4,-0.098731,-0.649103,1,1,1,1,1,1,1,...,-0.720030,-0.443330,1.037583,1.049880,0.736430,0.412483,0.559648,0.727763,0.713518,-0.869085
1,20,-1.037425,1.340044,1,1,1,1,1,1,0,...,5.180337,-0.929990,-1.044009,1.142809,-2.343856,-1.217553,-2.307180,-2.658717,-1.682744,-0.828376
2,20,-1.037425,1.340044,1,1,1,1,1,1,0,...,5.180337,-0.929990,-1.044009,-0.808710,-2.343856,1.722905,-2.307180,-2.658717,-1.682744,-0.828376
3,20,0.029272,1.340044,1,1,1,1,1,0,0,...,0.242497,-0.526757,2.492856,0.248363,-1.307498,1.722905,-0.924295,-1.289185,-0.683406,5.795655
4,4,-1.549439,-0.614976,1,0,1,1,1,0,1,...,-0.469616,-0.359902,0.088893,0.620081,0.621279,0.412483,1.283004,0.628161,0.692027,-0.515784
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,13,0.242612,-0.566222,1,0,1,1,1,1,1,...,0.485085,-0.637994,-0.021634,-0.872599,-0.933258,-0.130862,-0.892382,-0.890775,-0.892944,-0.550678
4996,2,-1.378768,-0.000680,1,0,1,1,1,0,0,...,0.485085,-0.637994,-0.279531,-0.872599,-0.933258,-0.482439,-0.892382,-0.890775,-0.892944,-0.550678
4997,3,0.327947,-0.775863,1,1,1,1,0,0,1,...,-0.782633,0.752464,0.558633,-0.785478,1.139458,-0.066939,0.224564,1.070146,-0.054790,-0.258442
4998,2,1.010633,3.280438,1,0,1,1,0,0,1,...,-0.313108,-0.526757,-1.228221,-0.558962,-0.098414,-0.354593,-0.195620,-0.006804,-0.065535,-0.831284


## View the synthetic data quality report

In [8]:
# Generate report that shows the statistical performance between the training and synthetic data

import IPython
from smart_open import open

# Read the report inside a context manager so the stream is closed as soon as
# its contents are loaded, instead of leaving the handle open.
with open(model.get_artifact_link("report")) as report_stream:
    report_html = report_stream.read()

# Last expression: render the report HTML inline as the cell output.
IPython.display.HTML(data=report_html)

0,1,2,3,4,5
Synthetic Data Use Cases,Excellent,Good,Moderate,Poor,Very Poor
Significant tuning required to improve model,,,,,
Improve your model using our tips and advice,,,,,
Demo environments or mock data,,,,,
Pre-production testing environments,,,,,
Balance or augment machine learning data sources,,,,,
Machine learning or statistical analysis,,,,,

0,1,2,3,4
Data Sharing Use Case,Excellent,Very Good,Good,Normal
"Internally, within the same team",,,,
"Internally, across different teams",,,,
"Externally, with trusted partners",,,,
"Externally, public availability",,,,

Unnamed: 0,Training Data,Synthetic Data
Row Count,86,86
Column Count,67,67
Training Lines Duplicated,--,0

Default Privacy Protections,Advanced Protections

Field,Unique,Missing,Ave. Length,Type,Distribution Stability
IP ABN(PLT)PLT Abn Scattergram,1,0,1.0,Categorical,Excellent
IP SUS(RBC)Turbidity/HGB Interf?,2,0,1.0,Binary,Excellent
IP SUS(RBC)HGB Defect?,2,0,1.0,Binary,Excellent
PLT Suspect,1,0,1.0,Categorical,Excellent
MCV(fL),79,0,11.41,Numeric,Good
RET%(%),76,0,11.69,Numeric,Good
[IRF-Y(ch)],83,0,11.4,Numeric,Good
[MicroR(%)],72,0,11.56,Numeric,Good
MCHC(g/dL),53,0,11.36,Numeric,Good
P-LCR(%),51,0,11.45,Numeric,Good


## Generate unlimited synthetic data
You can now use the trained synthetic model to generate as much synthetic data as you like.

In [9]:
# Generate more records from the model

# Parameters for the generation job submitted below.
generation_params = {"num_records": 1000, "max_invalid": 500}

# Create a record handler on the trained model and submit a "generate" action.
record_handler = model.create_record_handler_obj()
record_handler.submit(action="generate", params=generation_params)

# Follow the generation job's status and log output.
poll(record_handler)

INFO: Starting poller


{
    "uid": "621f8d61d8ca7731eb771dc4",
    "guid": "model_run_25prgFEA06ZMO9c2xfLLHCbQyTk",
    "model_name": null,
    "runner_mode": "cloud",
    "user_id": "621f405ebff62130903b66e0",
    "user_guid": "user_25pDimzOCwoJGoaCuMdlnnoxDZU",
    "billing_domain": null,
    "billing_domain_guid": null,
    "project_id": "621f89d04459aef720f7f0e9",
    "project_guid": "proj_25pppQW2plJuYdrVaQWyOCJ0dQ1",
    "status_history": {
        "created": "2022-03-02T15:29:36.879000Z"
    },
    "last_modified": "2022-03-02T15:29:37.033000Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "error_id": null,
    "traceback": null,
    "container_image": "074762682575.dkr.ecr.us-west-2.amazonaws.com/gretelai/synthetics@sha256:717a68c0e4ef3000c8b650bbed308162ef10c1b2cb4bfc3026b773bc908ee577",
    "model_id": "621f89db25750dcf704fb8c4",
    "model_guid": "model_25ppqpVWa9hvYHfTMMHOBC2kjjV",
    "action": "generate",

INFO: Status is pending. A Gretel Cloud worker is being allocated to begin generating synthetic records.
INFO: Status is active. A worker has started!
2022-03-02T15:29:53.216124Z  Loading model to worker
2022-03-02T15:29:54.579405Z  Checking for synthetic smart seeds
2022-03-02T15:29:54.579798Z  No smart seeds provided, will attempt generation without them
2022-03-02T15:29:54.580639Z  Loading model
2022-03-02T15:30:04.202135Z  Generating records
{
    "num_records": 1000
}
2022-03-02T15:30:09.209014Z  Generation in progress
{
    "current_valid_count": 0,
    "current_invalid_count": 0,
    "new_valid_count": 0,
    "new_invalid_count": 0,
    "completion_percent": 0.0
}
2022-03-02T15:30:14.216362Z  Generation in progress
{
    "current_valid_count": 0,
    "current_invalid_count": 0,
    "new_valid_count": 0,
    "new_invalid_count": 0,
    "completion_percent": 0.0
}
2022-03-02T15:30:19.222918Z  Generation in progress
{
    "current_valid_count": 0,
    "current_invalid_count": 0,
  

In [10]:
# Read the record handler's "data" artifact (gzip-compressed CSV) into a DataFrame.
generated_data_link = record_handler.get_artifact_link("data")
synthetic_df = pd.read_csv(generated_data_link, compression='gzip')

In [11]:
# Write the generated records to a local CSV, omitting the DataFrame index.
synthetic_output_path = "D:\\7th Semester\\FYP\\Dataset\\Synthetic_data.csv"
synthetic_df.to_csv(synthetic_output_path, index=False)
