# integrate.ai API Sample Notebook to run client on AWS Batch and AWS Fargate

## Set environment variables (or replace inline) with your IAI credentials
### Generate and manage this token in the UI, in the Tokens page
### Generate AWS session credentials or use the default profile

In [3]:
import os

# Read the integrate.ai access token from the environment. The value is None
# when IAI_TOKEN is not set, so export it first or assign the token inline here.
IAI_TOKEN = os.getenv("IAI_TOKEN")

## Authenticate to the integrate.ai api client

In [4]:
from integrate_ai_sdk.api import connect

# Create the API client, authenticating with the token held in IAI_TOKEN.
client = connect(token=IAI_TOKEN)

### Import required packages to run on AWS

In [5]:
from integrate_ai_sdk.taskgroup.taskbuilder import aws as taskbuilder_aws
from integrate_ai_sdk.taskgroup.base import SessionTaskGroup

### Specifying path to datasets and batch job definitions

In [6]:
# The example datasets all live under one S3 prefix; each path is built from it.
sample_data_prefix = "s3://iai-client.sample-data-e2e.integrate.ai/prl_vfl"
active_train_path = f"{sample_data_prefix}/active_train.parquet"
passive_train_path = f"{sample_data_prefix}/passive_train.parquet"
active_test_path = f"{sample_data_prefix}/active_test.parquet"
passive_test_path = f"{sample_data_prefix}/passive_test.parquet"

# AWS parameters: the job definition and job queue names used when building tasks.
job_def = "iai-client-batch-job"
job_queue = "iai-client-batch-job-queue"

### Specifying optional AWS Credentials

In [7]:
# Set your AWS credentials if you are generating temporary ones; otherwise the
# default profile credentials are meant to be used.
# Each credential field is read from the environment variable named beside it;
# a field is None when its variable is not set.
aws_env_vars = {
    'ACCESS_KEY': "AWS_ACCESS_KEY_ID",
    'SECRET_KEY': "AWS_SECRET_ACCESS_KEY",
    'SESSION_TOKEN': "AWS_SESSION_TOKEN",
    'REGION': "AWS_REGION",
}
aws_creds = {
    field: os.environ.get(env_var)
    for field, env_var in aws_env_vars.items()
}

### Create task builder object

In [8]:
# Build the AWS Batch task builder from the job queue, job definition and
# credentials configured in the cells above.
tb = taskbuilder_aws.batch(
    cpu_job_definition=job_def,
    job_queue=job_queue,
    aws_credentials=aws_creds,
)

## PRL Data Config

For this session, two clients are going to be providing data. The clients are named `active_client` and `passive_client`. Their datasets will be linked by the "id" column in any provided datasets.

In [9]:
# Both clients link their records on the same identifier column, "id".
prl_data_config = {
    "clients": {
        client_name: {"id_columns": ["id"]}
        for client_name in ("active_client", "passive_client")
    }
}

## Create a PRL Session for linking the datasets

To create a PRL session, specify a `data_config` dictionary indicating the client names and columns to use as identifiers to link the datasets to each other. The number of expected clients will be inferred as the number of clients listed in `data_config` (i.e., two). These client names are referenced for the compute on the PRL session and for any sessions that use the PRL session downstream.


In [None]:
# Create the PRL session from the linkage config defined above and start it
# in the same expression, keeping the started session in `prl_session`.
prl_session = client.create_prl_session(
    data_config=prl_data_config,
    name="Testing notebook - PRL",
    description="I am testing PRL session creation through a notebook",
).start()

# Last expression of the cell: shows the session id in the notebook output.
prl_session.id

### Create task_group with appropriate number of tasks
#### Number of tasks added should match the number of clients specified in the data config when creating the session

In [None]:
# Add one PRL task per client named in prl_data_config, then start the group.
# Both tasks use the same vcpus/memory values and differ only in their data
# paths and client_name.
task_group_context = (
    SessionTaskGroup(prl_session)
    .add_task(
        tb.prl(
            train_path=active_train_path,
            test_path=active_test_path,
            vcpus='2',
            memory='16384',
            client=client,
            client_name="active_client",
        )
    )
    .add_task(
        tb.prl(
            train_path=passive_train_path,
            test_path=passive_test_path,
            vcpus='2',
            memory='16384',
            client=client,
            client_name="passive_client",
        )
    )
    .start()
)

### Monitor submitted jobs

In [None]:
# The session is available on the task group context after submission;
# print its id.
print(task_group_context.session.id)

In [None]:
# Fetch the status of the submitted PRL tasks and print each entry on its own line.
task_group_status = task_group_context.status()
for task_status in task_group_status:
    print(task_status)

In [None]:
# Use to monitor if a session has completed successfully or has failed.
# You can modify the time to wait as per your specific task.
# NOTE(review): 300 is presumably a wait limit in seconds; confirm against the SDK docs.
task_group_context.wait(300)

## PRL Session Complete!
Now you can view the overlap stats for the datasets.

In [None]:
# Fetch the PRL session metrics as a dictionary; as the cell's last expression
# the result is shown in the notebook output.
prl_session.metrics().as_dict()

## Create a VFL Training Session
To create a VFL train session, specify the `prl_session_id` indicating the session above used to link the datasets together. The `vfl_mode` needs to be set to `'train'`.

In [None]:
# Model configuration for the VFL training session.
# Each client's feature model has an input_size equal to the number of
# predictors listed for that client in data_config below (7 passive, 8 active).
model_config = {
    "strategy": {"name": "SplitNN", "params": {}},
    "model": {
        "feature_models": {
            client_name: {
                "params": {
                    "input_size": n_inputs,
                    "hidden_layer_sizes": [6],
                    "output_size": 5,
                }
            }
            for client_name, n_inputs in (("passive_client", 7), ("active_client", 8))
        },
        "label_model": {"params": {"hidden_layer_sizes": [5], "output_size": 2}},
    },
    "ml_task": {
        "type": "classification",
        "params": {"loss_weights": None},
    },
    "optimizer": {
        "name": "SGD",
        "params": {"learning_rate": 0.2, "momentum": 0.0},
    },
    "seed": 23,  # for reproducibility
}

# Per-client data configuration: only the active client is marked as the label
# client and has a target column; the passive client contributes predictors only.
data_config = {
    "passive_client": {
        "label_client": False,
        "predictors": ["x1", "x3", "x5", "x7", "x9", "x11", "x13"],
        "target": None,
    },
    "active_client": {
        "label_client": True,
        "predictors": ["x0", "x2", "x4", "x6", "x8", "x10", "x12", "x14"],
        "target": "y",
    },
}

In [None]:
# Create the VFL training session on top of the PRL session that linked the
# datasets, and start it in the same expression.
vfl_train_session = client.create_vfl_session(
    name="Testing notebook - VFL Train",
    description="I am testing VFL Train session creation through a notebook",
    vfl_mode='train',
    prl_session_id=prl_session.id,
    package_name="iai_ffnet",
    min_num_clients=2,
    num_rounds=2,
    model_config=model_config,
    data_config=data_config,
).start()

# Last expression of the cell: shows the training session id in the notebook output.
vfl_train_session.id

### Create VFL task_group with appropriate number of tasks
#### Number of tasks added should match the number of clients specified in the data config when creating the session

In [None]:
# Add one VFL training task per client named in data_config, then start the group.
# Both tasks use the same resources, batch size and storage path and differ
# only in their data paths and client_name.
vfl_task_group_context = (
    SessionTaskGroup(vfl_train_session)
    .add_task(
        tb.vfl_train(
            train_path=active_train_path,
            test_path=active_test_path,
            vcpus='2',
            memory='16384',
            batch_size=1024,
            storage_path="s3://iai-client.sample-data-e2e.integrate.ai/prl_vfl/",
            client=client,
            client_name="active_client",
        )
    )
    .add_task(
        tb.vfl_train(
            train_path=passive_train_path,
            test_path=passive_test_path,
            vcpus='2',
            memory='16384',
            batch_size=1024,
            storage_path="s3://iai-client.sample-data-e2e.integrate.ai/prl_vfl/",
            client=client,
            client_name="passive_client",
        )
    )
    .start()
)

### Monitor submitted jobs

In [None]:
# The session is available on the task group context after submission;
# print its id.
print(vfl_task_group_context.session.id)

In [None]:
# Fetch the status of the submitted VFL training tasks and print each entry on its own line.
vfl_task_group_status = vfl_task_group_context.status()
for task_status in vfl_task_group_status:
    print(task_status)

In [None]:
# Wait on the VFL training task group; adjust the time to wait for your task.
# NOTE(review): 600 is presumably a wait limit in seconds; confirm against the SDK docs.
vfl_task_group_context.wait(600)

## Session Complete!
Now you can view the vfl training metrics and start making predictions

In [None]:
# Fetch the VFL training metrics as a dictionary; as the cell's last expression
# the result is shown in the notebook output.
vfl_train_session.metrics().as_dict()

In [None]:
# Plot the VFL training metrics, keeping the value returned by plot() in `fig`.
fig = vfl_train_session.metrics().plot()

## Make a Prediction on the trained VFL Model
### Create a VFL Prediction Session
To create a VFL predict session, specify the `prl_session_id` indicating the session above used to link the datasets together. You also need the `training_session_id` of the above VFL train session. The `vfl_mode` needs to be set to `'predict'`.

In [None]:
# Create the VFL prediction session, pointing it at both the PRL session that
# linked the datasets and the training session whose model is used, then start it.
vfl_predict_session = client.create_vfl_session(
    name="Testing notebook - VFL Predict",
    description="I am testing VFL Predict session creation through a notebook",
    vfl_mode='predict',
    prl_session_id=prl_session.id,
    training_session_id=vfl_train_session.id,
    data_config=data_config,
).start()

# Last expression of the cell: shows the prediction session id in the notebook output.
vfl_predict_session.id

### Specify the full path to store your predictions including file name

In [None]:
# Full path, including the file name, where the active client's predictions are stored.
active_predictions_storage_path = "s3://iai-client.sample-data-e2e.integrate.ai/prl_vfl/active_client_predictions.csv"

# Add one prediction task per client, then start the group.
# NOTE(review): the passive client passes the literal string "None" (not the
# Python None) as storage_path. Presumably this means it stores no predictions;
# confirm against the SDK docs.
vfl_predict_task_group_context = (
    SessionTaskGroup(vfl_predict_session)
    .add_task(
        tb.vfl_predict(
            client_name='active_client',
            dataset_path=active_test_path,
            vcpus='2',
            memory='16384',
            batch_size=1024,
            storage_path=active_predictions_storage_path,
            client=client,
            raw_output=True,
        )
    )
    .add_task(
        tb.vfl_predict(
            client_name='passive_client',
            dataset_path=passive_test_path,
            vcpus='2',
            memory='16384',
            batch_size=1024,
            storage_path="None",
            client=client,
            raw_output=True,
        )
    )
    .start()
)

### Monitor submitted jobs

In [None]:
# The session is available on the task group context after submission;
# print its id.
print(vfl_predict_task_group_context.session.id)

In [None]:
# Fetch the status of the submitted VFL prediction tasks and print each entry on its own line.
vfl_predict_task_group_status = vfl_predict_task_group_context.status()
for task_status in vfl_predict_task_group_status:
    print(task_status)

In [None]:
# Wait on the VFL prediction task group; adjust the time to wait for your task.
# NOTE(review): 300 is presumably a wait limit in seconds; confirm against the SDK docs.
vfl_predict_task_group_context.wait(300)

## Session Complete!
Now you can view the vfl predictions and evaluate the performance as needed

In [None]:
import pandas as pd

# Load the active client's predictions from the CSV path set in the prediction cell.
# NOTE(review): reading an s3:// path with pandas presumably requires an S3
# filesystem backend (e.g. s3fs) and AWS credentials in this environment; confirm.
df_pred = pd.read_csv(active_predictions_storage_path)
# Last expression of the cell: shows the first rows in the notebook output.
df_pred.head()