
_____
# GRETEL - Synthethic data

!pip install gretel-client

https://colab.research.google.com/github/gretelai/gretel-blueprints/blob/main/sdk_blueprints/Gretel_Advanced_Tabular_Blueprint.ipynb

https://colab.research.google.com/github/gretelai/gretel-blueprints/blob/main/sdk_blueprints/Gretel_101_Blueprint.ipynb#scrollTo=zFeKqpkunEo1

https://colab.research.google.com/github/gretelai/gretel-blueprints/blob/main/sdk_blueprints/Gretel_101_Blueprint.ipynb#scrollTo=zFeKqpkunEo1



## Getting data

Download the data and save it as csv file in desired location. It is not included in the repo as the file is too large for it. 

In [2]:
import pandas as pd
import numpy as np
url = 'https://data.wprdc.org/datastore/dump/2c13021f-74a9-4289-a1e5-fe0472c89881?bom=True'
print(f'Downloading original dataset.\n url = {url}')
df = pd.read_csv(url, low_memory = False)
df.shape

Downloading original dataset.
 url = https://data.wprdc.org/datastore/dump/2c13021f-74a9-4289-a1e5-fe0472c89881?bom=True


(238582, 191)

In [3]:
df.to_csv('../data/synthsata.csv')

## Reading original data from csv file

In [4]:
import pandas as pd
import numpy as np
df = pd.read_csv('../data/synthsata.csv', low_memory = False)
df.shape

(238582, 192)

## Generating synthetic data

Need an api with account from Gretel.ai



In [6]:
from gretel_client import Gretel
import sys, os
sys.path.append(os.path.abspath(os.path.join('..', 'secret')))
from secret_info import gretel

# connect to your Gretel account
gretel = Gretel(api_key=gretel)

# Training the model with the first 20K records of the original dataset 
trained = gretel.submit_train(
    base_config="tabular-actgan",
    data_source=df.head(20000),
    params={"epochs": 500},
)
print(trained.report)

# Getting synthethic data
generated = gretel.submit_generate(trained.model_id, num_records=1000)
generated.synthetic_data.shape

No project set -> creating a new one...
Project URL: https://console.gretel.ai/proj_2hTxuiFaWhs8YLOXNJQZm4szrn9
Submitting ACTGAN training job...
Model Docs:https://docs.gretel.ai/reference/synthetics/models/gretel-actgan
Console URL: https://console.gretel.ai/proj_2hTxuiFaWhs8YLOXNJQZm4szrn9/models/6660fc68bead8430933127ad/activity
Analyzing input data and checking for auto-params... 
Found 2 auto-params that were set based on input data. batch_size 600, force_conditioning False
Starting ACTGAN model training... num_epochs 500
Training data loaded. record_count 20000, field_count 192, upsample_count 0
Training: [██████████████████████████████████████████████████] 500/500 epochs.
ACTGAN model training complete. 
Sampling records for data preview... num_records 5000
Preparing privacy filters 
Loaded 0 privacy filters 
Starting privacy filtering 
Privacy filtering complete. 
Sampled 5000 records. 
Creating synthetic quality report (SQS)... 
Finished creating SQS 
Uploading artifacts to G

(1000, 192)

In [9]:
generated.synthetic_data

Unnamed: 0.1,Unnamed: 0,_id,CRASH_CRN,DISTRICT,CRASH_COUNTY,MUNICIPALITY,POLICE_AGCY,CRASH_YEAR,CRASH_MONTH,DAY_OF_WEEK,...,LANE_COUNT,RDWY_ORIENT,ROAD_OWNER,ROUTE,SPEED_LIMIT,SEGMENT,OFFSET,STREET_NAME,TOT_INJ_COUNT,SCHOOL_BUS_UNIT
0,12589,18251,2005124152,11,2,2449,02431,2005,12,5,...,3.0,E,2.0,1025,41.0,196.0,870.0,,,
1,11074,5892,2004169164,11,2,2386,02117,2004,4,7,...,2.0,N,2.0,0019,35.0,27.0,1446.0,,,
2,19999,8295,2005226625,11,2,2213,02301,2005,7,6,...,2.0,N,2.0,,35.0,259.0,52.0,AMITY ST,,
3,6932,10632,2005008264,11,2,2137,02505,2004,9,2,...,1.0,E,1.0,0376,54.0,927.0,0.0,RAYMOND P SHAFER HW,,
4,19999,1,2004170460,11,2,2295,02301,2004,6,5,...,2.0,W,4.0,,35.0,,0.0,EASTBUSWAY WY,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,13132,4130,2005143029,11,2,2133,02301,2005,7,3,...,3.0,W,4.0,,36.0,246.0,0.0,FROM 8007/0510 RD,,
996,6988,15398,2005093858,11,2,2295,02118,2004,4,4,...,3.0,N,2.0,,35.0,251.0,42.0,COAL VALLEY RD,,
997,12011,17917,2004962798,11,2,2397,68B03,2005,1,6,...,1.0,S,2.0,1025,25.0,9.0,,ROSE DR,,
998,9303,12320,2004079659,11,2,2490,68B03,2005,5,2,...,94.0,W,2.0,0019,43.0,,15.0,WASHINGTON RD,,


___ 

# WIP

### RELATIONAL DATABASE

https://github.com/gretelai/gretel-blueprints/blob/develop/docs/notebooks/synthesize_relational_database.ipynb

In [None]:
# Imports

import pandas as pd
import yaml
import time

from gretel_client import configure_session
from gretel_client import create_or_get_unique_project
from gretel_client.config import get_session_config
from gretel_client.rest_v1.api.connections_api import ConnectionsApi
from gretel_client.rest_v1.api.workflows_api import WorkflowsApi
from gretel_client.rest_v1.models import (
    CreateConnectionRequest,
    CreateWorkflowRunRequest,
    CreateWorkflowRequest,
)
from gretel_client.workflows.logs import print_logs_for_workflow_run

In [None]:

# @title Helper functions
# Helpers for running workflows from the notebook


def run_workflow(config: str):
    """Create a workflow, and workflow run from a given yaml config. Blocks and
    prints log lines until the workflow reaches a terminal state.

    Args:
        config: The workflow config to run.
    """
    print("Validating actions in the config...")
    config_dict = yaml.safe_load(config)

    for action in config_dict["actions"]:
        print(f"Validating action {action['name']}")
        response = workflow_api.validate_workflow_action(action)
        print(f"Validation response: {response}")

    workflow = workflow_api.create_workflow(
        CreateWorkflowRequest(project_id=project.project_guid, config=config_dict, name=config_dict["name"])
    )

    workflow_run = workflow_api.create_workflow_run(
        CreateWorkflowRunRequest(workflow_id=workflow.id)
    )

    print(f"workflow: {workflow.id}")
    print(f"workflow run id: {workflow_run.id}")

    print_logs_for_workflow_run(workflow_run.id, session)
     

In [None]:

# Log into Gretel
# configure_session(api_key="prompt", cache="yes", validate=True)

from secret.secret_info import gretel

# connect to your Gretel account
# gretel = Gretel(api_key=gretel)
configure_session(api_key=gretel, cache="yes", validate=True)


In [None]:
session = get_session_config()
connection_api = session.get_v1_api(ConnectionsApi)
workflow_api = session.get_v1_api(WorkflowsApi)

project = create_or_get_unique_project(name="Synthesize-Telecom-Database")

In [None]:

input_connection_uid = "sample_mysql_telecom" # @param {type:"string"}
connection_type = connection_api.get_connection(input_connection_uid).dict()['type']

In [None]:
workflow_config = f"""\
name: my-{connection_type}-workflow

actions:
  - name: {connection_type}-read
    type: {connection_type}_source
    connection: {input_connection_uid}

  - name: model-train-run
    type: gretel_tabular
    input: {connection_type}-read
    config:
      project_id: {project.project_guid}
      train:
        model: "synthetics/tabular-actgan"
        dataset: "{{{connection_type}-read.outputs.dataset}}"
      run:
        num_records_multiplier: 1.0

"""
print(workflow_config)

In [None]:
run_workflow(workflow_config)


# View Results


In [None]:
# @markdown Download output artifacts by clicking link:
output_url = project.get_artifact_link(project.artifacts[-1]['key'])
print(output_url)

In [None]:

# @markdown Or view the results within the notebook by running this cell.
import urllib.request
urllib.request.urlretrieve(project.get_artifact_link(project.artifacts[-1]['key']), "/content/workflow-output.tar.gz")
!gunzip /content/workflow-output.tar.gz
!tar -xzvf /content/workflow-output.tar

In [None]:

#@title Compare Source and Synthesized Table from Database
table = "invoice" #@param {type:"string"}
from IPython.display import display, HTML

source_table = pd.read_csv(f"https://gretel-blueprints-pub.s3.amazonaws.com/rdb/{table}.csv").head(10)
trans_table = pd.read_csv(f"/content/synth_{table}.csv").head(10)

print("\033[1m Source Table:")
display(source_table)
print("\n\n\033[1m Synthesized Table:")
display(trans_table)

In [None]:
# View relational report
import IPython
from smart_open import open

report_path = str("/content/relational_report.html")

IPython.display.HTML(data=open(report_path).read())