# Create Synthetic Data

## Initialize Workspace
> [!NOTE] This demo requires Python 3.10 and the Azure ML Python SDK v2 (the `azure.ai.ml` package imported below).

In [None]:
from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient

# Authenticate with the default Azure credential chain, then connect to the
# workspace described by the local workspace config file
credential = DefaultAzureCredential()
ml_client = MLClient.from_config(credential=credential)


## Load Existing Data
Load the existing diabetes data to understand its structure before generating synthetic data.

In [None]:
import mltable

# Fetch the version of the registered diabetes data asset labelled "latest"
data_asset = ml_client.data.get(name="diabetes-mltable-production", label="latest")

# Load the MLTable behind the asset and materialize it as a Pandas DataFrame
tbl = mltable.load(data_asset.path)
diabetes = tbl.to_pandas_dataframe()

# Preview the first rows to understand the data structure
diabetes.head(n=5)

## Generate Synthetic Data

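Create a synthetic dataset with the same number of rows and the same feature columns as the original diabetes dataset. Each feature is sampled independently from an assumed distribution (the parameters are hard-coded in the cell below), so the result only approximates the statistical properties of the original data.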

In [None]:
import pandas as pd
import numpy as np
import os

# Create a synthetic dataset with the same number of rows as the original dataset
num_rows = diabetes.shape[0]

# Use a dedicated, seeded generator so that "Restart Kernel -> Run All"
# reproduces exactly the same synthetic dataset on every run.
SEED = 42
rng = np.random.default_rng(SEED)


def _sample_normal(generator, mean, std, size, as_int=False):
    """Sample a non-negative, normally distributed feature column.

    Args:
        generator: The ``numpy.random.Generator`` to draw from.
        mean: Mean of the normal distribution.
        std: Standard deviation of the normal distribution.
        size: Number of values to draw.
        as_int: When True, round to the nearest integer and return an int array.

    Returns:
        A 1-D NumPy array of ``size`` values, all >= 0.
    """
    # A normal distribution is unbounded, so a wide one (e.g. SerumInsulin,
    # mean 140 / std 85) produces negative draws, which are impossible for
    # these clinical measurements. Clamp them at zero.
    values = np.clip(generator.normal(mean, std, size), 0, None)
    if as_int:
        # Round instead of truncating: astype(int) alone drops the fractional
        # part, which shifts the column mean down by roughly 0.5.
        return np.rint(values).astype(int)
    return values


# Build every column up front and create the frame once (column order is preserved).
synthetic_data = pd.DataFrame(
    {
        # PatientID - unique 7-digit numbers (1000000..9999999). Sampling offsets
        # without replacement guarantees uniqueness without first materializing
        # a 9-million-element list of candidates.
        "PatientID": 1_000_000 + rng.choice(9_000_000, size=num_rows, replace=False),
        # Pregnancies - assuming a range of 0 to 17 pregnancies (upper bound is exclusive)
        "Pregnancies": rng.integers(0, 18, size=num_rows),
        # PlasmaGlucose - assuming a mean of 120 and std deviation of 20
        "PlasmaGlucose": _sample_normal(rng, 120, 20, num_rows, as_int=True),
        # DiastolicBloodPressure - assuming a mean of 80 and std deviation of 10
        "DiastolicBloodPressure": _sample_normal(rng, 80, 10, num_rows, as_int=True),
        # TricepsThickness - assuming a mean of 25 and std deviation of 10
        "TricepsThickness": _sample_normal(rng, 25, 10, num_rows, as_int=True),
        # SerumInsulin - assuming a mean of 140 and std deviation of 85
        "SerumInsulin": _sample_normal(rng, 140, 85, num_rows, as_int=True),
        # BMI - assuming a mean of 30 and std deviation of 5
        "BMI": _sample_normal(rng, 30, 5, num_rows),
        # DiabetesPedigree - assuming a mean of 0.5 and std deviation of 0.3
        "DiabetesPedigree": _sample_normal(rng, 0.5, 0.3, num_rows),
        # Age - assuming a mean of 30 and std deviation of 10
        "Age": _sample_normal(rng, 30, 10, num_rows, as_int=True),
    }
)

# Sanity checks: same row count as the source data, IDs unique, no negative measurements
assert len(synthetic_data) == num_rows
assert synthetic_data["PatientID"].is_unique
assert (synthetic_data.drop(columns="PatientID") >= 0).all().all()

# Save the synthetic dataset to disk
os.makedirs("data/synthetic-diabetes-data/", exist_ok=True)
synthetic_data.to_csv("data/synthetic-diabetes-data/synthetic.csv", index=False)


## Convert to MLTable and Save

Convert the synthetic dataset to an MLTable object and save it to disk.

In [None]:
# Point the MLTable at the synthetic CSV file
synthetic_paths = [{"pattern": "data/synthetic-diabetes-data/synthetic.csv"}]
tbl_synthetic = mltable.from_delimited_files(paths=synthetic_paths)

# Save the MLTable into the same folder that holds the CSV
tbl_synthetic.save("data/synthetic-diabetes-data")

# Display the first five rows of the synthetic data
tbl_synthetic.show(5)

## Register Synthetic Data

Register the synthetic data as a new data asset in the Azure ML workspace.

In [None]:
import time
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

# Derive the asset version from the current UTC time (year.month.day.HHMMSS)
VERSION = time.strftime("%Y.%m.%d.%H%M%S", time.gmtime())

# Describe the synthetic-data folder as an MLTable data asset
my_data_synthetic = Data(
    name="diabetes-mltable-synthetic",
    version=VERSION,
    description="Synthetic data for the diabetes dataset",
    type=AssetTypes.MLTABLE,
    path="./data/synthetic-diabetes-data",
)

# Register the asset in the workspace; the result is shown as the cell output
ml_client.data.create_or_update(my_data_synthetic)