# integrate.ai API Sample Notebook to run client on Local Python VirtualEnv

This task group implementation is meant to facilitate development - not to be used in a production setting.

## Set environment variables (or replace inline) with your IAI credentials
### Generate and manage this token in the UI, in the Tokens page

In [None]:
import os

# Access token for the integrate.ai API, read from the environment.
# The value is None when the IAI_TOKEN variable is not set.
IAI_TOKEN = os.getenv("IAI_TOKEN")

## Authenticate to the integrate.ai API client

In [None]:
from integrate_ai_sdk.api import connect

# Create the API client, authenticating with the IAI_TOKEN value.
# Every session in this notebook is created through this client.
client = connect(token=IAI_TOKEN)

## Sample model config and data schema
You can find the model config and data schema in the [integrate.ai end user tutorial](https://integrate-ai.gitbook.io/integrate.ai-user-documentation/tutorials/end-user-tutorials/model-training-with-a-sample-local-dataset)

In [None]:
# Model, strategy, optimizer and privacy settings used for the sample training session.
model_config = {
    "experiment_name": "test_synthetic_tabular",
    "experiment_description": "test_synthetic_tabular",
    "strategy": {
        "name": "FedAvg",
        "params": {},
    },
    "model": {
        "params": {
            "input_size": 15,  # matches the 15 predictor columns listed in data_schema
            "hidden_layer_sizes": [6, 6, 6],
            "output_size": 2,
        },
    },
    "balance_train_datasets": False,
    "ml_task": {
        "type": "classification",
        "params": {"loss_weights": None},
    },
    "optimizer": {
        "name": "SGD",
        "params": {
            "learning_rate": 0.2,
            "momentum": 0.0,
        },
    },
    "differential_privacy_params": {
        "epsilon": 4,
        "max_grad_norm": 7,
    },
    # Set "metric" to None to disable best-model saving and keep the model
    # from the last round instead.
    "save_best_model": {
        "metric": "loss",
        "mode": "min",
    },
    "seed": 23,  # fixed for reproducibility
}

# Column layout of the sample dataset: predictor columns x0 through x14 and the target column y.
data_schema = {
    "predictors": [f"x{index}" for index in range(15)],
    "target": "y",
}

## Import Local Python Environment SessionTaskGroup

In [None]:
from integrate_ai_sdk.taskgroup.taskbuilder import local
from integrate_ai_sdk.taskgroup.base import SessionTaskGroup
from concurrent.futures import ProcessPoolExecutor

# Use a ProcessPoolExecutor, and keep max_workers at least as large as the
# number of tasks that run concurrently; both are needed to avoid deadlocks.
executor = ProcessPoolExecutor(max_workers=10)

# Task builder bound to the executor and the API client; the cells below use it
# as tb.fls(...), tb.eda(...) and tb.hfl(...) to create the individual tasks.
tb = local.local_python(executor, client)

### Set up autoreload in notebook (jupyter specific)

This lets modules that are under development in the Python virtual environment be reloaded automatically whenever they change.

In [None]:
%load_ext autoreload
%autoreload 2


### Specifying optional AWS Credentials, path to datasets

In [None]:
# Directory holding the sample parquet files (a home-relative local path here)
# and the S3 location that is passed to the tasks as storage_path.
data_dir = "~/Downloads/synthetic"
storage_path = "s3://devel.integrate.ai"

# One training file per silo, plus a single shared test file.
train_path1 = data_dir + "/train_silo0.parquet"
train_path2 = data_dir + "/train_silo1.parquet"
test_path = data_dir + "/test.parquet"

## Create and Run EDA Session

In [None]:
# Dataset names for the EDA session, each mapped to an empty list.
# The same names are used when the EDA tasks are added to the task group.
dataset_config = {"dataset_one": [], "dataset_two": []}

# Create the EDA session, then start it and keep the value returned by start().
eda_session = client.create_eda_session(
    name="Testing notebook - EDA",
    description="I am testing EDA session creation through a notebook",
    data_config=dataset_config,
    startup_mode="external",
)
eda_session = eda_session.start()

# Last expression in the cell, so the notebook displays the session id.
eda_session.id

In [None]:
# Build the EDA task group step by step: one fls task configured with the
# storage path, then one eda task per dataset name.
eda_task_group = SessionTaskGroup(eda_session)
eda_task_group = eda_task_group.add_task(tb.fls(storage_path=storage_path))
eda_task_group = eda_task_group.add_task(
    tb.eda(dataset_name="dataset_one", dataset_path=train_path1)
)
eda_task_group = eda_task_group.add_task(
    tb.eda(dataset_name="dataset_two", dataset_path=train_path2)
)

In [None]:
# Start the EDA task group; the returned context is used in the next cell to wait on it.
eda_task_group_context = eda_task_group.start()

In [None]:
# Wait on the EDA task group. 30 is the time to wait
# (presumably seconds; confirm against the SDK). Adjust it to suit your task.
eda_task_group_context.wait(30)

In [None]:
# Fetch the results of the EDA session and call describe() on them;
# as the last expression in the cell, its return value is displayed by the notebook.
results = eda_session.results()
results.describe()

## Create a Training Session

The documentation for [creating a session](https://integrate-ai.gitbook.io/integrate.ai-user-documentation/tutorials/end-user-tutorials/model-training-with-a-sample-local-dataset#create-and-start-the-session) gives more context on the parameters used when creating a training session.<br />
For this session we are going to be using two training clients and two rounds. 

In [None]:
# Create the training session (two clients, two rounds), then start it and
# keep the value returned by start().
training_session = client.create_fl_session(
    name="Testing notebook",
    description="I am testing session creation through a notebook",
    min_num_clients=2,
    num_rounds=2,
    package_name="iai_ffnet",
    model_config=model_config,
    data_config=data_schema,
    startup_mode="external",
)
training_session = training_session.start()

# Last expression in the cell, so the notebook displays the session id.
training_session.id

### Create task_group with appropriate number of tasks
#### The number of training tasks added (`tb.hfl`) should match the `min_num_clients` value specified when creating the session

In [None]:
# Build the training task group step by step: one fls task configured with the
# storage path, then one hfl task per training dataset (two, matching
# min_num_clients=2), each using the same test set.
task_group = SessionTaskGroup(training_session)
task_group = task_group.add_task(tb.fls(storage_path=storage_path))
task_group = task_group.add_task(tb.hfl(train_path=train_path1, test_path=test_path))
task_group = task_group.add_task(tb.hfl(train_path=train_path2, test_path=test_path))

### Start the task group

In [None]:
# Start the training task group; the returned context is used in the cells
# below to inspect the submitted tasks and to wait on them.
task_group_context = task_group.start()

### Monitor submitted jobs

In [None]:
# Per-task contexts held by the group context once the task group has been started
print(task_group_context.contexts)

In [None]:
# Print each submitted task's future next to the value returned by its status() call
for task in task_group_context.contexts:
    print(f"{task.future} {task.status()}")

In [None]:
# Use to monitor if a session has completed successfully or has failed.
# You can modify the time to wait as per your specific task
# (30 here; presumably seconds; confirm against the SDK).
task_group_context.wait(30)

In [None]:
# Retrieve the training session's metrics; as the last expression in the cell,
# the return value is displayed by the notebook.
training_session.metrics()