
_____
# GRETEL - Synthethic data

!pip install gretel-client

https://colab.research.google.com/github/gretelai/gretel-blueprints/blob/main/sdk_blueprints/Gretel_Advanced_Tabular_Blueprint.ipynb

https://colab.research.google.com/github/gretelai/gretel-blueprints/blob/main/sdk_blueprints/Gretel_101_Blueprint.ipynb#scrollTo=zFeKqpkunEo1

https://colab.research.google.com/github/gretelai/gretel-blueprints/blob/main/sdk_blueprints/Gretel_101_Blueprint.ipynb#scrollTo=zFeKqpkunEo1



## Getting data

Download the data and save it as csv file in desired location. It is not included in the repo as the file is too large for it. 

In [1]:
import pandas as pd
import numpy as np
url = 'https://data.wprdc.org/datastore/dump/2c13021f-74a9-4289-a1e5-fe0472c89881?bom=True'
print(f'Downloading original dataset.\n url = {url}')
df = pd.read_csv(url, low_memory = False)
df.shape

Downloading original dataset.
 url = https://data.wprdc.org/datastore/dump/2c13021f-74a9-4289-a1e5-fe0472c89881?bom=True


(238582, 191)

In [2]:
df.to_csv('../data/synthsata.csv')

## Reading original data from csv file

In [3]:
import pandas as pd
import numpy as np
df = pd.read_csv('../data/synthsata.csv', low_memory = False)
df.shape

(238582, 192)

## Generating synthetic data

Need an api with account from Gretel.ai



In [6]:
from gretel_client import Gretel
import sys, os
sys.path.append(os.path.abspath(os.path.join('..', 'secret')))
from secret_info import gretel

# connect to your Gretel account
gretel = Gretel(api_key=gretel)

# Training the model with the first 20K records of the original dataset 
trained = gretel.submit_train(
    base_config="tabular-actgan",
    data_source=df.head(20000),
    params={"epochs": 500},
)
print(trained.report)

# Getting synthethic data
generated = gretel.submit_generate(trained.model_id, num_records=1000)
generated.synthetic_data.shape

No project set -> creating a new one...
Project URL: https://console.gretel.ai/proj_2kqn1pjVAOscfXLD8z5vwijnbiG
Submitting ACTGAN training job...
Model Docs: https://docs.gretel.ai/create-synthetic-data/models/synthetics/gretel-actgan
Console URL: https://console.gretel.ai/proj_2kqn1pjVAOscfXLD8z5vwijnbiG/models/66c2743b7219962e052d2dd5/activity
Model ID: 66c2743b7219962e052d2dd5
Analyzing input data and checking for auto-params... 
Found 2 auto-params that were set based on input data. batch_size 600, force_conditioning False
Starting ACTGAN model training... num_epochs 500
Training data loaded. record_count 19000, field_count 192, upsample_count 0
Training: [██████████████████████████████████████████████████] 500/500 epochs.
ACTGAN model training complete. 
Sampling records for data preview... num_records 5000
Preparing privacy filters 
Loaded 0 privacy filters 
Starting privacy filtering 
Privacy filtering complete. 
Sampled 5000 records. 
Creating synthetic quality report (SQS)... 

(1000, 192)

In [8]:
generated.synthetic_data

Unnamed: 0.1,Unnamed: 0,_id,CRASH_CRN,DISTRICT,CRASH_COUNTY,MUNICIPALITY,POLICE_AGCY,CRASH_YEAR,CRASH_MONTH,DAY_OF_WEEK,...,LANE_COUNT,RDWY_ORIENT,ROAD_OWNER,ROUTE,SPEED_LIMIT,SEGMENT,OFFSET,STREET_NAME,TOT_INJ_COUNT,SCHOOL_BUS_UNIT
0,19999,17987,2004150063,11,2,2133,02302,2005,4,7,...,1.0,E,4.0,,35.0,,55.0,NAYLOR ST,,
1,15602,1692,2005095536,11,2,2101,68D02,2004,2,7,...,1.0,S,2.0,,25.0,255.0,690.0,DAVIS AV,,
2,15172,1,2005064668,11,2,2193,02472,2004,7,5,...,1.0,W,2.0,,24.0,,728.0,OLIVE ST,,
3,2991,16294,2005090449,11,2,2296,02476,2004,8,5,...,2.0,N,4.0,,35.0,,0.0,BROWNSHILL RD,,
4,19000,14743,2005031018,11,2,2227,02301,2004,1,3,...,2.0,N,2.0,79.0,41.0,179.0,1544.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,18771,6916,2005253231,11,2,2133,02424,2004,5,5,...,1.0,E,4.0,,24.0,251.0,0.0,FERN HOLLOW RD,,
996,7295,7986,2004096099,11,2,2403,02301,2004,8,7,...,2.0,E,5.0,,53.0,,1189.0,CLAIRTON BL,,
997,14539,16135,2005071491,11,2,2101,02204,2004,7,7,...,3.0,S,2.0,79.0,35.0,253.0,738.0,RAYMOND P SHAFER HW,,
998,18771,7428,2005149588,11,2,2392,02301,2004,11,4,...,2.0,E,4.0,,,,37.0,PENN LINCOLN PY,,


___ 

# WIP

### RELATIONAL DATABASE

https://github.com/gretelai/gretel-blueprints/blob/develop/docs/notebooks/synthesize_relational_database.ipynb

In [None]:
# Imports

import pandas as pd
import yaml
import time

from gretel_client import configure_session
from gretel_client import create_or_get_unique_project
from gretel_client.config import get_session_config
from gretel_client.rest_v1.api.connections_api import ConnectionsApi
from gretel_client.rest_v1.api.workflows_api import WorkflowsApi
from gretel_client.rest_v1.models import (
    CreateConnectionRequest,
    CreateWorkflowRunRequest,
    CreateWorkflowRequest,
)
from gretel_client.workflows.logs import print_logs_for_workflow_run

In [None]:

# @title Helper functions
# Helpers for running workflows from the notebook


def run_workflow(config: str):
    """Create a workflow, and workflow run from a given yaml config. Blocks and
    prints log lines until the workflow reaches a terminal state.

    Args:
        config: The workflow config to run.
    """
    print("Validating actions in the config...")
    config_dict = yaml.safe_load(config)

    for action in config_dict["actions"]:
        print(f"Validating action {action['name']}")
        response = workflow_api.validate_workflow_action(action)
        print(f"Validation response: {response}")

    workflow = workflow_api.create_workflow(
        CreateWorkflowRequest(project_id=project.project_guid, config=config_dict, name=config_dict["name"])
    )

    workflow_run = workflow_api.create_workflow_run(
        CreateWorkflowRunRequest(workflow_id=workflow.id)
    )

    print(f"workflow: {workflow.id}")
    print(f"workflow run id: {workflow_run.id}")

    print_logs_for_workflow_run(workflow_run.id, session)
     

In [None]:

# Log into Gretel
# configure_session(api_key="prompt", cache="yes", validate=True)

from secret.secret_info import gretel

# connect to your Gretel account
# gretel = Gretel(api_key=gretel)
configure_session(api_key=gretel, cache="yes", validate=True)


In [None]:
session = get_session_config()
connection_api = session.get_v1_api(ConnectionsApi)
workflow_api = session.get_v1_api(WorkflowsApi)

project = create_or_get_unique_project(name="Synthesize-Telecom-Database")

In [None]:

input_connection_uid = "sample_mysql_telecom" # @param {type:"string"}
connection_type = connection_api.get_connection(input_connection_uid).dict()['type']

In [None]:
workflow_config = f"""\
name: my-{connection_type}-workflow

actions:
  - name: {connection_type}-read
    type: {connection_type}_source
    connection: {input_connection_uid}

  - name: model-train-run
    type: gretel_tabular
    input: {connection_type}-read
    config:
      project_id: {project.project_guid}
      train:
        model: "synthetics/tabular-actgan"
        dataset: "{{{connection_type}-read.outputs.dataset}}"
      run:
        num_records_multiplier: 1.0

"""
print(workflow_config)

In [None]:
run_workflow(workflow_config)


# View Results


In [None]:
# @markdown Download output artifacts by clicking link:
output_url = project.get_artifact_link(project.artifacts[-1]['key'])
print(output_url)

In [None]:

# @markdown Or view the results within the notebook by running this cell.
import urllib.request
urllib.request.urlretrieve(project.get_artifact_link(project.artifacts[-1]['key']), "/content/workflow-output.tar.gz")
!gunzip /content/workflow-output.tar.gz
!tar -xzvf /content/workflow-output.tar

In [None]:

#@title Compare Source and Synthesized Table from Database
table = "invoice" #@param {type:"string"}
from IPython.display import display, HTML

source_table = pd.read_csv(f"https://gretel-blueprints-pub.s3.amazonaws.com/rdb/{table}.csv").head(10)
trans_table = pd.read_csv(f"/content/synth_{table}.csv").head(10)

print("\033[1m Source Table:")
display(source_table)
print("\n\n\033[1m Synthesized Table:")
display(trans_table)

In [None]:
# View relational report
import IPython
from smart_open import open

report_path = str("/content/relational_report.html")

IPython.display.HTML(data=open(report_path).read())