# integrate.ai API Sample Notebook to run client on AWS Batch

## Set environment variables (or replace inline) with your IAI credentials
### Generate and manage this token in the UI, in the Tokens page
### Generate AWS session credentials or use the default profile

In [None]:
import os

# Read the integrate.ai access token from the environment (None when the variable is unset).
IAI_TOKEN = os.getenv("IAI_TOKEN")

## Authenticate to the integrate.ai api client

In [None]:
from integrate_ai_sdk.api import connect

# Build the API client, authenticating with the IAI_TOKEN value read from the environment.
client = connect(token=IAI_TOKEN)


### Specifying optional AWS Credentials

In [None]:
# Optional AWS credentials, read from the standard AWS environment variables.
# Set them if you are generating temporary credentials; otherwise the default
# profile credentials are used. Any variable that is unset yields None.
aws_creds = {
    cred_key: os.environ.get(env_var)
    for cred_key, env_var in (
        ("ACCESS_KEY", "AWS_ACCESS_KEY_ID"),
        ("SECRET_KEY", "AWS_SECRET_ACCESS_KEY"),
        ("SESSION_TOKEN", "AWS_SESSION_TOKEN"),
        ("REGION", "AWS_REGION"),
    )
}

### Specifying path to datasets and batch job definitions

In [None]:
# Sample data in S3: two training silos (one per client task) and a test set
# that is shared by both training tasks.
train_path1 = "s3://iai-client.sample-data-e2e.integrate.ai/train_silo0.parquet"
train_path2 = "s3://iai-client.sample-data-e2e.integrate.ai/train_silo1.parquet"
test_path = "s3://iai-client.sample-data-e2e.integrate.ai/test.parquet"
# Specify the name of your job_queue, job_definition
# (both are passed to the AWS Batch task builder created below).
job_queue = "iai-client-batch-job-queue"
job_def = "iai-client-batch-job"

### Create batch task builder object

In [None]:
from integrate_ai_sdk.taskgroup.taskbuilder import aws as taskbuilder_aws
from integrate_ai_sdk.taskgroup.base import SessionTaskGroup

In [None]:
# AWS Batch task builder, configured with the job queue, the optional AWS
# credentials and the CPU job definition set above. It is reused for both the
# EDA tasks and the training tasks later in this notebook.
tb = taskbuilder_aws.batch(job_queue=job_queue, aws_credentials=aws_creds, cpu_job_definition=job_def)

## Create an EDA Session for exploring the datasets

To create an EDA session, we specify a `dataset_config` dictionary indicating the columns to explore for each dataset. Here the empty list `[]` means to include all columns. The number of expected datasets will be inferred as the number of items in dataset_config (i.e., two). Alternatively, we can manually set it with the optional argument `num_datasets` in `client.create_eda_session()`

For more information on how to configure an EDA session from scratch, reference the documentation [here](https://integrate-ai.gitbook.io/integrate.ai-user-documentation/tutorials/exploratory-data-analysis-eda).

In [None]:
# One entry per dataset; an empty list means "include all columns" for that dataset.
dataset_config = {"dataset_one": [], "dataset_two": []}

In [None]:
# Collect the EDA session settings, then create the session and start it.
eda_session_kwargs = {
    "name": "Testing notebook - EDA",
    "description": "I am testing EDA session creation through a notebook",
    "data_config": dataset_config,
}
eda_session = client.create_eda_session(**eda_session_kwargs).start()

# Display the id of the started session.
eda_session.id

## Run EDA Client jobs on AWS Batch

### Create task_group with appropriate number of tasks
#### Number of tasks added should match number of datasets specified when creating the session

In [None]:
# One EDA task per dataset named in dataset_config, each pointing at its data file.
eda_datasets = (("dataset_one", train_path1), ("dataset_two", train_path2))

eda_task_group = SessionTaskGroup(eda_session)
for eda_dataset_name, eda_dataset_path in eda_datasets:
    eda_task_group = eda_task_group.add_task(
        tb.eda(
            dataset_name=eda_dataset_name,
            dataset_path=eda_dataset_path,
            vcpus="2",
            memory="16384",
            client=client,
        )
    )

# Submit the tasks; the returned context is used below to wait for completion.
eda_task_group_context = eda_task_group.start()

### Poll for status

In [None]:
# Wait for the submitted EDA tasks; the argument is the time to wait
# (presumably in seconds — confirm against the SDK documentation).
eda_task_group_context.wait(150)

## EDA Session Complete!
Now you can analyze the datasets.

The results object is a dataset collection, which is comprised of multiple datasets that can be retrieved by name. 

Each dataset is comprised of columns, which can be retrieved by column name. 

The same base analysis functions can be performed at the collection, dataset, or column level.

In [None]:
# Select both datasets by name from the session results.
results = eda_session.results()["dataset_one", "dataset_two"]
# Bare expression: displays the selected collection as the cell output.
results

In [None]:
# The base analysis functions (here mean()) also work on the multi-dataset
# selection; inspect the shape of the result.
results.mean().shape

In [None]:
# Retrieve a single dataset by name, then a single column ("x0") by name.
dataset_one = eda_session.results()["dataset_one"]
# Kept for the histogram-based mean estimate computed later in this notebook.
dataset_one_count = dataset_one["x0"].count()
# Column-level mean, displayed as the cell output.
dataset_one["x0"].mean()

Histogram plots can be created using the `.plot_hist()` function.

In [None]:
# Keep the returned object: its axes are inspected in the following cell
# via single_hist.gca().
single_hist = dataset_one["x0"].plot_hist()

# single_hist.legends

In [None]:
# grab histogram data for testing
bars = single_hist.gca().patches

# Best estimate of the mean for a histogram: Σ(m * n) / N
# m: The midpoint of the bin
# n: The frequency of the bin
# N: The total sample size

totalSumOfHisto = 0
for bar in bars:
    # Left edge of the bar plus half its width is the bin midpoint (m);
    # the bar height is taken as the bin frequency (n).
    bin_midpoint = bar.get_xy()[0] + bar.get_width() / 2
    totalSumOfHisto += bin_midpoint * bar.get_height()
# N comes from the column count computed earlier for dataset_one["x0"].
estimatedAvg = totalSumOfHisto / dataset_one_count
print(estimatedAvg)

## Create a Training Session

The documentation for [creating a session](https://integrate-ai.gitbook.io/integrate.ai-user-documentation/tutorials/end-user-tutorials/model-training-with-a-sample-local-dataset#create-and-start-the-session) gives a bit more context into the parameters that are used during training session creation.<br />
For this session we are going to be using two training clients and two rounds. 

### Sample model config and data schema
You can find the model config and data schema in the [integrate.ai end user tutorial](https://integrate-ai.gitbook.io/integrate.ai-user-documentation/tutorials/end-user-tutorials/model-training-with-a-sample-local-dataset)

In [None]:
# Model configuration passed as `model_config` when the training session is created.
model_config = {
    "experiment_name": "test_synthetic_tabular",
    "experiment_description": "test_synthetic_tabular",
    "strategy": {"name": "FedAvg", "params": {}},
    # input_size (15) equals the number of predictor columns listed in data_schema;
    # the same sizes are used when the model is rebuilt for loading the saved weights.
    "model": {"params": {"input_size": 15, "hidden_layer_sizes": [6, 6, 6], "output_size": 2}},
    "balance_train_datasets": False,
    "ml_task": {
        "type": "classification",
        "params": {
            "loss_weights": None,
        },
    },
    "optimizer": {"name": "SGD", "params": {"learning_rate": 0.2, "momentum": 0.0}},
    "differential_privacy_params": {"epsilon": 4, "max_grad_norm": 7},
    "save_best_model": {
        "metric": "loss",  # to disable this and save model from the last round, set to None
        "mode": "min",
    },
    "seed": 23,  # for reproducibility
}

# Data schema: the 15 predictor columns x0..x14 and the target column "y".
data_schema = {
    "predictors": [f"x{idx}" for idx in range(15)],
    "target": "y",
}

In [None]:
# Collect the training session settings (two clients, two rounds, the iai_ffnet
# package), then create the session and start it.
training_session_kwargs = {
    "name": "Testing notebook",
    "description": "I am testing session creation through a notebook",
    "min_num_clients": 2,
    "num_rounds": 2,
    "package_name": "iai_ffnet",
    "model_config": model_config,
    "data_config": data_schema,
}
training_session = client.create_fl_session(**training_session_kwargs).start()

# Display the id of the started session.
training_session.id

## Run Training Client jobs on AWS Batch

### Create task_group with appropriate number of tasks
#### Number of tasks added should match the `min_num_clients` value specified when creating the session

In [None]:
# One training task per silo; both tasks evaluate against the same test set.
training_task_group = SessionTaskGroup(training_session)
for silo_train_path in (train_path1, train_path2):
    training_task_group = training_task_group.add_task(
        tb.hfl(
            train_path=silo_train_path,
            test_path=test_path,
            vcpus="2",
            memory="16384",
            client=client,
        )
    )

# Submit the tasks; the returned context is used below for status and waiting.
task_group_context = training_task_group.start()

### Monitor submitted jobs

In [None]:
# session available in group context after submission
# (printed here so the submitted session can be identified)
print(task_group_context.session.id)

In [None]:
# status of tasks submitted
# status() is iterated and each entry printed on its own line
task_group_status = task_group_context.status()
for task_status in task_group_status:
    print(task_status)

In [None]:
# Use to monitor if a session has completed successfully or has failed
# You can modify the time to wait as per your specific task
# NOTE(review): 30 is much shorter than the 150 used for the EDA task group
# earlier — confirm the unit and that it is long enough for training to finish.
task_group_context.wait(30)

## Session Complete!
Now you can view the training metrics and start making predictions

In [None]:
# Display the session's training metrics in dictionary form.
training_session.metrics().as_dict()

In [None]:
# Plot the training metrics, keeping the returned object as `fig`.
fig = training_session.metrics().plot()

## Trained model parameters are accessible from the completed session

Model parameters can be retrieved using the model's state_dict method. These parameters can then be saved with torch.save().

In [None]:
import torch

# Retrieve the trained model from the completed session in PyTorch form.
model = training_session.model().as_pytorch()

save_state_dict_folder = "./saved_models"
# PyTorch conventional file type
file_name = f"{training_session.id}.pt"
os.makedirs(save_state_dict_folder, exist_ok=True)
saved_state_dict_path = os.path.join(save_state_dict_folder, file_name)

# torch.save opens and writes the destination itself when given a path, so no
# surrounding open() is needed. A text-mode handle on the same file would be a
# second, unused handle on the file being written.
torch.save(model.state_dict(), saved_state_dict_path)

## Load the saved model

To load a model saved previously, a model object needs to be initialized first. This can be done by directly importing one of the IAI-supported packages (e.g., FFNet) or using the model class defined in a custom package. 

In [None]:
from integrate_ai_sdk.packages.FFNet.nn_model import FFNet

# Rebuild the network with the same sizes used in model_config
# (input_size=15, hidden_layer_sizes=[6, 6, 6], output_size=2) so the saved
# parameters match the layer shapes.
model = FFNet(input_size=15, output_size=2, hidden_layer_sizes=[6, 6, 6])

# use torch.load to unpickle the state_dict
# NOTE: because this unpickles the file, only load state dicts you created or trust.
target_state_dict = torch.load(saved_state_dict_path)

# Copy the loaded parameters into the freshly constructed model.
model.load_state_dict(target_state_dict)

## Load test data

In [None]:
import pandas as pd

# NOTE(review): this reads a local ./test.parquet, whereas test_path above
# points at S3 — make sure a local copy of the test set exists, or adjust the path.
test_data = pd.read_parquet("./test.parquet")
# Preview the first rows of the test set.
test_data.head()

## Convert test data to tensors

In [None]:
# Target column ("y", as named in data_schema) converted to a tensor.
Y = torch.tensor(test_data["y"].values)

In [None]:
# Feature matrix: the 15 predictor columns x0..x14, in order, as a tensor.
feature_columns = [f"x{idx}" for idx in range(15)]
feature_values = test_data[feature_columns].values
X = torch.tensor(feature_values)

## Run model predictions

In [None]:
# Run the model on the test features and display its raw outputs
# (the model was built with output_size=2).
model(X)

In [None]:
# Predicted class per row: the index of the largest output along dim 1.
row_max = model(X).max(dim=1)
labels = row_max.indices
labels