# Create Synthetic data

In [None]:
from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient

# Build the Azure credential first, then let MLClient resolve the workspace
# connection from it
credential = DefaultAzureCredential()
ml_client = MLClient.from_config(credential)


## Load Existing Data
Load the existing diabetes data to understand its structure before generating synthetic data.

In [None]:
import mltable

# Look up the most recent version of the production diabetes data asset
data_asset = ml_client.data.get(name="diabetes-mltable-production", label="latest")

# Open the asset as an MLTable and materialize it as a Pandas DataFrame
tbl = mltable.load(data_asset.path)
diabetes = tbl.to_pandas_dataframe()

# Preview the first rows to see which columns the synthetic data must provide
diabetes.head()

## Generate Synthetic Data

Create a synthetic dataset mimicking the structure and statistical properties of the original diabetes dataset.

In [None]:
import pandas as pd
import numpy as np
import os

# Seeded generator so that "Restart Kernel -> Run All" reproduces the same dataset
SEED = 42
rng = np.random.default_rng(SEED)

# Create a synthetic dataset with the same number of rows as the original dataset
num_rows = diabetes.shape[0]


def _non_negative_normal(mean, std, size):
    """Draw `size` samples from a normal distribution, clipping negative draws to 0.

    The measurements generated below cannot be negative, but an unclipped
    normal draw can be (e.g. roughly 5% of N(140, 85) samples fall below zero).
    """
    return np.clip(rng.normal(mean, std, size), 0, None)


synthetic_data = pd.DataFrame({
    # PatientID - unique 7-digit numbers (1000000..9999999 inclusive).
    # Sampling from an integer population avoids materializing the whole ID range.
    'PatientID': rng.choice(9_000_000, size=num_rows, replace=False) + 1_000_000,

    # Pregnancies - assuming a range of 0 to 17 pregnancies
    'Pregnancies': rng.integers(0, 18, num_rows),

    # PlasmaGlucose - assuming a mean of 120 and std deviation of 20
    'PlasmaGlucose': _non_negative_normal(120, 20, num_rows).astype(int),

    # DiastolicBloodPressure - assuming a mean of 80 and std deviation of 10
    'DiastolicBloodPressure': _non_negative_normal(80, 10, num_rows).astype(int),

    # TricepsThickness - assuming a mean of 25 and std deviation of 10
    'TricepsThickness': _non_negative_normal(25, 10, num_rows).astype(int),

    # SerumInsulin - assuming a mean of 140 and std deviation of 85
    'SerumInsulin': _non_negative_normal(140, 85, num_rows).astype(int),

    # BMI - assuming a mean of 30 and std deviation of 5
    'BMI': _non_negative_normal(30, 5, num_rows),

    # DiabetesPedigree - assuming a mean of 0.5 and std deviation of 0.3
    'DiabetesPedigree': _non_negative_normal(0.5, 0.3, num_rows),

    # Age - assuming a mean of 30 and std deviation of 10
    'Age': _non_negative_normal(30, 10, num_rows).astype(int),
})

# Save the synthetic dataset to disk
os.makedirs("data/synthetic-diabetes-data/", exist_ok=True)
synthetic_data.to_csv("data/synthetic-diabetes-data/synthetic.csv", index=False)


## Convert to MLTable and Save

Convert the synthetic dataset to an mltable object and save it to disk.

In [None]:
# Describe the synthetic CSV file as an MLTable
synthetic_csv_paths = [{"pattern": "data/synthetic-diabetes-data/synthetic.csv"}]
tbl_synthetic = mltable.from_delimited_files(paths=synthetic_csv_paths)

# Persist the MLTable definition into the folder that holds the CSV file
tbl_synthetic.save("data/synthetic-diabetes-data")

# Preview the first five rows of the synthetic data
tbl_synthetic.show(5)

## (Optional) Register Synthetic Data

In [None]:
# Optional: uncomment this cell to register the synthetic MLTable folder as a
# data asset named "diabetes-mltable-synthetic" in the workspace.
# NOTE: a later cell loads "diabetes-mltable-synthetic" with label="latest",
# so that cell only works if this asset has been registered.

# import time
# from azure.ai.ml.entities import Data
# from azure.ai.ml.constants import AssetTypes

# # Set the version number of the data asset to the current UTC time
# VERSION = time.strftime("%Y.%m.%d.%H%M%S", time.gmtime())

# my_data_synthetic = Data(
#     path="./data/synthetic-diabetes-data",
#     type=AssetTypes.MLTABLE,
#     description="Synthetic data for the diabetes dataset",
#     name="diabetes-mltable-synthetic",
#     version=VERSION,
# )

# ml_client.data.create_or_update(my_data_synthetic)

## Invoke the Endpoint

Initialize the variables with your endpoint name and deployment name. The API key does not need to be entered manually; it is retrieved from the workspace in the next cell.

In [None]:
# Name of the online endpoint to look up in the workspace (replace the placeholder)
online_endpoint_name = "<your_endpoint_name>"
# Name of the deployment that is passed to invoke_endpoint (replace the placeholder)
deployment = "<your_deployment_name>"

In [None]:
from utils.invoke import invoke_endpoint

# Fetch the endpoint definition and its primary authentication key
endpoint = ml_client.online_endpoints.get(name=online_endpoint_name)
api_key = ml_client.online_endpoints.get_keys(online_endpoint_name).primary_key

# Keep the scoring URI at hand for the requests sent later on
url = endpoint.scoring_uri

# Show the current traffic details and the scoring URI
print(f"Traffic Details: {endpoint.traffic}")
print(f"Scoring URI: {endpoint.scoring_uri}")

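Remember, we can load the MLTable as a pandas DataFrame. Note that this cell reads the `diabetes-mltable-synthetic` data asset from the workspace, so the synthetic data must have been registered first (see the optional registration step above):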

In [None]:
# Fetch the latest registered version of the synthetic data asset
data_asset = ml_client.data.get(name="diabetes-mltable-synthetic", label="latest")

# Open the asset as an MLTable and materialize it as a Pandas DataFrame
tbl = mltable.load(data_asset.path)
df = tbl.to_pandas_dataframe()

# Preview the first few rows of the DataFrame
df.head()

Prepare the synthetic data by separating the features (X) from the target variable (y), then split the features into batches that are sent to the endpoint one at a time.

In [None]:
import numpy as np

# Feature columns, in the order in which they are sent to the endpoint
FEATURE_COLUMNS = [
    'Pregnancies',
    'PlasmaGlucose',
    'DiastolicBloodPressure',
    'TricepsThickness',
    'SerumInsulin',
    'BMI',
    'DiabetesPedigree',
    'Age',
]
BATCH_SIZE = 5

# Separate features and target variable.
# `df.get` yields None when the data has no 'Diabetic' column (the synthetic
# data is generated without labels) instead of raising KeyError.
X = df[FEATURE_COLUMNS]
y = df.get('Diabetic')

# Split the features into consecutive batches of at most BATCH_SIZE rows.
# Positional slicing also handles frames with fewer than BATCH_SIZE rows and
# empty frames (which produce no batches).
batches = [
    X.iloc[start:start + BATCH_SIZE].values.tolist()
    for start in range(0, len(X), BATCH_SIZE)
]


Invoke the endpoint with each batch of data to get predictions.

In [None]:
from tqdm import tqdm

# Collects the predictions returned for every batch, in batch order
predictions = []

for rows in tqdm(batches):
    # Request body for the endpoint: one batch of feature rows, no extra parameters
    request_body = {"input_data": rows, "params": {}}
    predictions.extend(invoke_endpoint(url, deployment, api_key, request_body))

## Explore Collected Data from Production

### Load and Explore Data

Last but not least, we'll load the data assets, convert them to Pandas DataFrames, and inspect the last record of each dataset to understand the data.

In [None]:
import mltable

def load_and_inspect_data(asset_name, version):
    """Load a data asset made of JSON Lines files and print its last record.

    Args:
        asset_name: Name of the data asset to fetch through `ml_client`.
        version: Version of the data asset to fetch.

    Returns:
        The asset's contents as a Pandas DataFrame, so that the data can be
        explored further. Callers that ignore the return value are unaffected.
    """
    # Get the data asset
    data_asset = ml_client.data.get(asset_name, version=version)

    # The asset path is read as a folder of JSON Lines files
    tbl = mltable.from_json_lines_files(paths=[{'folder': data_asset.path}])

    # Convert the mltable to a Pandas DataFrame
    df = tbl.to_pandas_dataframe()

    # An asset without any records has no last row; `df.iloc[-1]` would raise IndexError
    if df.empty:
        print(f'No records found in {asset_name}.\n')
        return df

    # Inspect the last record of the DataFrame
    last_record = df.iloc[-1]
    print(f'Last record of {asset_name}:\n{last_record}\n')
    return df

# Deployment name that prefixes every collected data asset
deployment_base_name = "<deployment-name>" # Replace with your deployment name

# (asset name, version) pairs, derived from the deployment name
asset_info = [
    (f"{deployment_base_name}-{suffix}", "1")
    for suffix in ("model_outputs", "model_inputs_outputs", "model_inputs")
]

# Load each data asset in turn and print its last record
for asset_name, version in asset_info:
    load_and_inspect_data(asset_name, version)


## Next step

Congratulations, we have developed and understood the basics of how to
- train 
- deploy
- invoke
- monitor

a machine learning model.

Now we head towards automation. The `CLI` README section of this repository will guide our next steps.