# integrate.ai API Sample Notebook

## Set environment variables (or replace inline) with your IAI credentials
### Generate and manage this token in the UI, on the Tokens page

In [None]:
import os

# Read the integrate.ai access token from the environment. The value is None
# when IAI_TOKEN is unset; replace it inline with your token if you prefer.
IAI_TOKEN = os.getenv("IAI_TOKEN")

## Authenticate to the integrate.ai API client

In [None]:
from integrate_ai_sdk.api import connect

# Build the API client, authenticating with the token read into IAI_TOKEN.
# The cells below use it to create the EDA session and to configure the tasks.
client = connect(token=IAI_TOKEN)

In [None]:
# AWS credentials handed to the task builder. Set these environment variables
# if you are generating temporary credentials; otherwise use the default
# profile credentials. Each value is None when its variable is unset.
aws_creds = {
    key: os.environ.get(env_name)
    for key, env_name in (
        ("ACCESS_KEY", "AWS_ACCESS_KEY_ID"),
        ("SECRET_KEY", "AWS_SECRET_ACCESS_KEY"),
        ("SESSION_TOKEN", "AWS_SESSION_TOKEN"),
        ("REGION", "AWS_REGION"),
    )
}

## Create an EDA Session for exploring the datasets

To create an EDA session, we specify a `dataset_config` dictionary indicating the columns to explore for each dataset. Here the empty list `[]` means to include all columns. 

For more information on how to configure an EDA session, see the documentation [here](https://integrate-ai.gitbook.io/integrate.ai-user-documentation/tutorials/hfl-exploratory-data-analysis).

In [None]:
# Columns to explore for each dataset; an empty list includes all columns.
# NOTE(review): these keys ("prl_silo0", "prl_silo1") differ from the
# dataset_name values ("client_1", "client_2") used when the tasks are added
# and when the results are indexed -- confirm which names the PRL session expects.
eda_data_config = {dataset: [] for dataset in ("prl_silo0", "prl_silo1")}

# NOTE(review): eda_config is not passed to create_eda_session in this
# notebook -- confirm whether it is still required.
eda_config = {
    "strategy": {
        "name": "EDAHistogram",
        "params": {},
    },
}

# Placeholder: replace with the ID of the PRL session to run EDA against.
prl_session_id = "<prl session id>"

In [None]:
# Settings for an EDA session that runs in "intersect" mode on top of the
# PRL session identified by prl_session_id.
eda_session_kwargs = {
    "name": "Testing notebook - EDA Intersect session",
    "description": "I am testing EDA on PRL session creation through a notebook",
    "data_config": eda_data_config,
    "eda_mode": "intersect",
    "prl_session_id": prl_session_id,
}

# Create the session and start it in one step; eda_session holds what start() returns.
eda_session = client.create_eda_session(**eda_session_kwargs).start()

# Show the session ID as the cell output.
eda_session.id

## Start an EDA Session using IAI client
Follow the documentation for directions on how to install the [integrate_ai](https://pypi.org/project/integrate-ai/) package and the [sample data](https://integrate-ai.gitbook.io/integrate.ai-user-documentation/tutorials/end-user-tutorials/model-training-with-a-sample-local-dataset#prerequisites).<br/>
Unzip the sample data to your `~/Downloads` directory; otherwise, update the dataset paths below (for example `train_path1` and `train_path2`) to point to the sample data.

In [None]:
# from integrate_ai_sdk.taskgroup.taskbuilder import aws as taskbuilder_fargate
from integrate_ai_sdk.taskgroup.taskbuilder import aws as taskbuilder_aws
from integrate_ai_sdk.taskgroup.base import SessionTaskGroup

In [None]:
# Example dataset locations in S3. The test paths point at the same files as
# the train paths.
train_path1, train_path2 = (
    "s3://sample-data.integrate.ai/prl/prl_silo0.csv",
    "s3://sample-data.integrate.ai/prl/prl_silo1.csv",
)
test_path1, test_path2 = (
    "s3://sample-data.integrate.ai/prl/prl_silo0.csv",
    "s3://sample-data.integrate.ai/prl/prl_silo1.csv",
)

# AWS Batch parameters consumed by the task builder.
job_queue = "iai-client-batch-job-queue"
job_def = "iai-client-batch-job"

# Remaining AWS parameters.
model_storage = "s3://iai-client.sample-data.integrate.ai"
security_group = "iai_fl_server_security_group"
subnet_id = "<subnet>"  # Public subnet (routed via IGW)

# Fargate parameters, currently disabled:
# cluster = "iai-fl-server-ecs-cluster"
# task_definition = "iai-fl-server-fargate-job"

In [None]:
# Fargate task builder, currently disabled:
# task_server = taskbuilder_aws.fargate(cluster=cluster, task_definition=task_definition)

# AWS Batch task builder configured with the job queue, the AWS credentials
# and the CPU job definition set in the cells above.
tb = taskbuilder_aws.batch(
    job_queue=job_queue,
    aws_credentials=aws_creds,
    cpu_job_definition=job_def,
)

**Important:** The `dataset_name` specified in the task must be identical to the `client_name` specified in the PRL session.

In [None]:
# Options shared by both EDA tasks.
eda_task_options = {"vcpus": "2", "memory": "16384", "client": client}

# Attach one EDA task per dataset to the session's task group, in order.
task_group = SessionTaskGroup(eda_session)
for eda_path, eda_name in ((train_path1, "client_1"), (train_path2, "client_2")):
    task_group = task_group.add_task(
        tb.eda(dataset_path=eda_path, dataset_name=eda_name, **eda_task_options)
    )

# Fargate server task, currently disabled:
# .add_task(task_server.fls(subnet_id, security_group, storage_path=model_storage, client=client))

# Submit the tasks; the returned context is used below to inspect and await them.
task_group_context = task_group.start()

In [None]:
# After submission the session is reachable through the task group context;
# print its ID.
print(task_group_context.session.id)

In [None]:
# Fetch the status of the submitted tasks and print one entry per line.
task_group_status = task_group_context.status()
for task_status in task_group_status:
    print(task_status)

In [None]:
# Use this to monitor whether the session has completed successfully or has failed.
# You can adjust the time to wait to suit your specific task.
# NOTE(review): 300 is presumably a timeout in seconds -- confirm against the SDK.
task_group_context.wait(300)

## EDA Session Complete!
Now you can analyze the datasets.


In [None]:
# Fetch the EDA results for the session; the bare name on the last line
# displays them as the cell output.
results = eda_session.results()
results

In [None]:
# Describe the full result set (presumably summary statistics for every dataset).
results.describe()

In [None]:
# Describe only the dataset registered under the name "client_2".
results["client_2"].describe()

For categorical columns, other statistics like `unique_count`, `mode`, and `uniques` can be used for further exploration.

In [None]:
# Call uniques() on the "x10" and "x11" columns of the "client_2" dataset.
results["client_2"][["x10", "x11"]].uniques()

Functions like `.mean()`, `.median()`, `.std()` can also be called individually. 

In [None]:
# Call mean() on the whole "client_2" dataset.
results["client_2"].mean()

In [None]:
# Call mean() on the single column "x1" of the "client_2" dataset.
results["client_2"]["x1"].mean()

Histogram plots can be created using the `.plot_hist()` function.

In [None]:
# Plot histograms for the "client_2" dataset and keep what plot_hist() returns.
saved_dataset_one_hist_plots = results["client_2"].plot_hist()

In [None]:
# Plot the histogram for the single column "x1" of the "client_2" dataset.
single_hist = results["client_2"]["x1"].plot_hist()