# integrate.ai HFL Gradient Boosting Methods Sample Notebook

## Set environment variables (or replace inline) with your IAI credentials
### Generate and manage this token in the UI, in the Tokens page

In [None]:
import os

# Access token for integrate.ai, read from the environment (None when the variable is unset).
IAI_TOKEN = os.getenv("IAI_TOKEN")

## Authenticate to the integrate.ai API client

In [None]:
from integrate_ai_sdk.api import connect

# Authenticate with the token value held in the IAI_TOKEN variable. Quoting the
# name would send the literal text "IAI_TOKEN" as the credential instead.
client = connect(token=IAI_TOKEN)

## Sample model config and data schema
You can find the model config and data schema in the [HFL-GBM tutorial](https://integrate-ai.gitbook.io/integrate.ai-user-documentation/tutorials/hfl-gradient-boosted-models-hfl-gbm#review-the-sample-model-configuration).

In [None]:
# Model configuration handed to the session: HistGradientBoosting strategy,
# classification task, and the model hyper-parameters below.
model_config = dict(
    strategy=dict(name="HistGradientBoosting", params={}),
    model=dict(
        params=dict(
            max_depth=4,
            learning_rate=0.05,
            random_state=23,  # for reproducibility
            max_bins=128,
            sketch_relative_accuracy=0.001,
        )
    ),
    ml_task=dict(type="classification", params={}),
    save_best_model=dict(metric=None, mode="min"),
)

# Data schema: predictor columns x0 through x14 (in order) and the target column "y".
data_schema = {
    "predictors": [f"x{index}" for index in range(15)],
    "target": "y",
}

## Create a Training Session

The documentation for [creating a session](https://integrate-ai.gitbook.io/integrate.ai-user-documentation/tutorials/end-user-tutorials/model-training-with-a-sample-local-dataset#create-and-start-the-session) gives more context on the parameters used when creating a training session.<br />
For this session we use two training clients and ten rounds.

In [None]:
# Settings for the training session: two training clients, ten rounds, and the
# model config / data schema defined in this notebook.
session_settings = dict(
    name="HFL session testing GBM",
    description="I am testing GBM session creation through a notebook",
    min_num_clients=2,
    num_rounds=10,
    package_name="iai_gbm",
    model_config=model_config,
    data_config=data_schema,
)

# Create the session and start it right away; keep whatever `start()` returns.
training_session = client.create_fl_session(**session_settings).start()

# Show the session id as the cell output.
training_session.id

## Start a training session using iai client
Make sure that the sample data you [downloaded](https://integrate-ai.gitbook.io/integrate.ai-user-documentation/tutorials/hfl-gradient-boosted-models-hfl-gbm#review-the-sample-model-configuration) is saved to your `~/Downloads` directory; otherwise, update the `data_path` below to point to the sample data.

In [None]:
import subprocess

data_path = "~/Downloads/synthetic"


def _launch_training_client(train_file, client_name):
    """Start one `iai client train` process for the current training session.

    Args:
        train_file: File name of this client's training data inside `data_path`.
        client_name: Value passed to the CLI's `--client-name` option.

    Returns:
        The `subprocess.Popen` handle, with stdout and stderr captured as pipes.
    """
    command = (
        f"iai client train --token {IAI_TOKEN} --session {training_session.id}"
        f" --train-path {data_path}/{train_file} --test-path {data_path}/test.parquet"
        f" --batch-size 1024 --client-name {client_name} --remove-after-complete"
    )
    # The command runs through the shell, which is what expands the leading
    # `~` in data_path.
    return subprocess.Popen(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )


# Each client trains on its own silo file; both use the same test file.
client_1 = _launch_training_client("train_silo0.parquet", "client-1")
client_2 = _launch_training_client("train_silo1.parquet", "client-2")

## Poll for session status

You can log whatever you would like about the session during this time. For now we are logging the current round and the session status. If you want to access the logs later, you can use the `iai client log` command.

In [None]:
import time

import queue
import threading


def _pump_lines(stream, sink):
    """Copy every line of a subprocess pipe into `sink` until the pipe closes."""
    for raw_line in iter(stream.readline, b""):
        sink.put(raw_line.decode("utf-8", errors="replace"))


def _drain(sink):
    """Return the lines currently waiting in `sink` without blocking."""
    lines = []
    while True:
        try:
            lines.append(sink.get_nowait())
        except queue.Empty:
            return lines


# (label used for live output, name used in the final summary, process handle)
watched = [("silo1", "client_1", client_1), ("silo2", "client_2", client_2)]
stdout_lines = {name: queue.Queue() for label, name, process in watched}
stderr_lines = {name: queue.Queue() for label, name, process in watched}

# Read both pipes of both clients on background threads. A blocking readline()
# in the polling loop would stall status reporting whenever a client is quiet,
# and a stderr pipe nobody reads can fill up and block the client itself.
readers = []
for label, name, process in watched:
    for stream, sink in (
        (process.stdout, stdout_lines[name]),
        (process.stderr, stderr_lines[name]),
    ):
        reader = threading.Thread(target=_pump_lines, args=(stream, sink), daemon=True)
        reader.start()
        readers.append(reader)

current_round = None
current_status = None
while any(process.poll() is None for label, name, process in watched):
    # Print everything the clients have written to stdout since the last pass.
    for label, name, process in watched:
        for line in _drain(stdout_lines[name]):
            text = line.strip()
            if text:
                print(f"{label}: ", text)

    # poll for status and round; read each attribute once per pass so the
    # value that is printed is the value that was compared
    status = training_session.status
    if status != current_status:
        print("Session status: ", status)
        current_status = status
    session_round = training_session.round
    # NOTE(review): the None guard is defensive - confirm whether `round` can
    # be unset before training begins.
    if session_round is not None and session_round > 0 and session_round != current_round:
        print("Session round: ", session_round)
        current_round = session_round
    time.sleep(1)

# Both processes have exited, so the pipes reach EOF and the readers finish.
for reader in readers:
    reader.join()

for label, name, process in watched:
    process.wait()
    process.stdout.close()
    process.stderr.close()
    # "output" is whatever stdout had not been printed live; stderr is shown in full.
    remaining_output = "".join(_drain(stdout_lines[name]))
    error_output = "".join(_drain(stderr_lines[name]))
    print(
        "%s finished with return code: %d\noutput: %s\n  %s"
        % (name, process.returncode, remaining_output, error_output)
    )

## Session Complete!
Now you can view the training metrics and start making predictions

In [None]:
# Show the metrics recorded for the session, in the form returned by `as_dict()`.
training_session.metrics().as_dict()

In [None]:
# Plot the session metrics; the return value of `plot()` is kept in `fig`.
fig = training_session.metrics().plot()

### Trained model parameters are accessible from the completed session

Model parameters can be retrieved using the model's `as_sklearn()` method.

In [None]:
# Retrieve the trained model through `as_sklearn()` and display it as the cell output.
model = training_session.model().as_sklearn()
model

## Load test data

In [None]:
import pandas as pd

# Load the test file that both training clients were pointed at, then preview it.
test_file = f"{data_path}/test.parquet"
test_data = pd.read_parquet(test_file)
test_data.head()

## Split test data into target and features

In [None]:
# Labels for evaluation: "y" is the target column named in data_schema.
Y = test_data["y"]

In [None]:
# Feature matrix: the predictor columns x0 through x14, in order.
X = test_data[[f"x{index}" for index in range(15)]]

## Run model predictions

In [None]:
# Predict on the test features; the result is shown as the cell output.
model.predict(X)

In [None]:
from sklearn.metrics import roc_auc_score

# Score the predicted probabilities against the true labels with ROC AUC.
# NOTE(review): indexing column 1 assumes a binary classifier whose second
# `predict_proba` column is the positive-class probability - confirm against
# the model's class order.
y_hat = model.predict_proba(X)
roc_auc_score(Y, y_hat[:, 1])