# Preliminaries 
Choose a dataset and set simulation parameters

In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
# from classes.params import simul_param, fl_param

# load a tabular dataset (example with scikitlearn datasets)
X, y = datasets.make_regression(n_samples=1000, n_features=10, n_informative=7, n_targets=1)
x_train, x_valid,  y_train, y_valid = train_test_split(X, y, test_size=0.4, random_state=42)

Sets the number of clients and the number of trees (xgboost) per client

In [2]:
num_clients = 5  # K
trees_client = 10 # M

# Centralized performance
Data are fused on the server, this is the classical distributed xgboost, privacy critical


In [3]:
import joblib
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import numpy as np

hyperparams = {
    "objective": "reg:squarederror",
    # Same number of trees as in the decentralized case
    "n_estimators": num_clients * trees_client,
    "max_depth": 5,
    "learning_rate": 0.1,
    "base_score": 0.5,
    "random_state": 34,
}

reg = xgb.XGBRegressor(**hyperparams)
reg.fit(x_train, y_train)
y_pred = reg.predict(x_valid)
error_centr = mean_squared_error(y_valid, y_pred)

print(f"MSE: {error_centr:.2f}") 

# save and store the centralized model
checkpointpath1 = 'xgb_models/XGB_centralized_model.h5'
joblib.dump(reg, checkpointpath1, compress=0)


MSE: 2644.02


['xgb_models/XGB_centralized_model.h5']

# Isolated case (no federation) 
Training of local xgboost models (base models of the ensemble)

Code below implements iid split (can be extened with sample/label/feature imbalance), saves training, validation data in data/client_i and server parameters in server folders


In [4]:
from classes.Datasets.dataset_client import Dataset
from classes.Datasets.data_partitioner import split_iid_sim
import os, json
# or run python -m classes.Datasets.data_generator.py to get a data distribution
samples = 100
niid_type = 'iid'
alpha = 1

print('Splitting IID')

# split the training dataset and create folders in data/client_#i/train
split_iid_sim(x_train, y_train, samples, num_clients, type='train')

# split the validation dataset and create folders in data/client_#i/valid
split_iid_sim(x_valid, y_valid, samples, num_clients, type='valid')

x_train_clients = []
y_train_clients = []
x_valid_clients = []
y_valid_clients = []

# create train and valid datasets for all clients
for k in range(num_clients):
    handle = Dataset(k) # get an handle to training dataset of client k
    x_train_clients.append(handle.x_train_local)
    y_train_clients.append(handle.y_train_local)
    x_valid_clients.append(handle.x_valid)
    y_valid_clients.append(handle.y_valid)

datasets = tuple(zip(x_train_clients, y_train_clients))


Splitting IID
Client 0 | Samples 100
Client 1 | Samples 100
Client 2 | Samples 100
Client 3 | Samples 100
Client 4 | Samples 100
Saved train data
Client 0 | Samples 80
Client 1 | Samples 80
Client 2 | Samples 80
Client 3 | Samples 80
Client 4 | Samples 80
Saved valid data


Train the xboost tree models locally. Decision tree models are the ensemble model (base models) for fedxbgoostllr. Save the ensembles and evaluate them separately (no federation)

In [5]:
# Hyperparameters for each of the clients
hyperparams = {
    "objective": "reg:squarederror",
    "n_estimators": trees_client,
    "max_depth": 5,
    "learning_rate": 0.1,
    "base_score": 0.5,  # np.mean(y_train)
    "random_state": 34,
}

errors_clients = []
for c, (x_train, y_train) in enumerate(
        datasets
):  # extract the dataset for the current client
    reg = xgb.XGBRegressor(**hyperparams) # train the model
    reg.fit(x_train, y_train)
    # save model
    checkpointpath = 'xgb_models/XGB_client_model_{}.h5'.format(c)
    joblib.dump(reg, checkpointpath, compress=0)
    
    # MSE test
    y_pred = reg.predict(x_valid)
    error = mean_squared_error(y_valid, y_pred)
    print(f"MSE, (Client {c}): {error :.2f}")
    errors_clients.append(error)


MSE, (Client 0): 10936.35
MSE, (Client 1): 12367.12
MSE, (Client 2): 11425.08
MSE, (Client 3): 11698.99
MSE, (Client 4): 11588.88


# Federated XGBoost 
The global model is a 1D-CNN type with specific filter sizes. The global model acts as an "ensemble model"

The pipeline is the following (XGB trees outputs-> 1D-CNN -> predictions)

Create FIRST a new "dataset" input to 1D-CNN which consists of XGB trees model outputs 

NOTE: During initialization, all xgboost models (of all clients) must be shared with all clients before starting the FL process. MQTT can be used for this (but also other methods apply). In the following xgboost base models are loaded from a shared folder. 

In [6]:
from utils import get_trees_predictions_xgb

# load all xgboost models and prepare the data
XGB_models = []
for c in range(num_clients):
    checkpointpath1 = 'xgb_models/XGB_client_model_{}.h5'.format(c)
    xgb = joblib.load(checkpointpath1)
    XGB_models.append(xgb)

# prepare the new dataset for training
objective = "regression"
x_xgb_trees_out = []
y_xgb_trees_out = []
for c, (x_train, y_train) in enumerate(datasets):  # for each client
    print("Converting the data of client", c, 100 * "-")
    x_xgb_trees_out.append(get_trees_predictions_xgb(x_train, objective, *XGB_models)) # XGB trees outputs (for all XGBoost trees!) corresponding to training data of client c
    y_xgb_trees_out.append(y_train) # true labels of client c

datasets_out = tuple(zip(x_xgb_trees_out, y_xgb_trees_out)) # dataset_out is the new federated dataset input to 1D-CNN (XGB trees output-> 1D-CNN -> accuracy)

# Validation data

xgb_valid_out = get_trees_predictions_xgb(x_valid, objective, *XGB_models) # XGB trees outputs corresponding to validation data: to simplify the reasoning, we apply same validation set for all (other options are also feasible)


Converting the data of client 0 ----------------------------------------------------------------------------------------------------
Converting the data of client 1 ----------------------------------------------------------------------------------------------------
Converting the data of client 2 ----------------------------------------------------------------------------------------------------
Converting the data of client 3 ----------------------------------------------------------------------------------------------------
Converting the data of client 4 ----------------------------------------------------------------------------------------------------


# FedXGBooost aggregator 
initialize the global model (or ensemble model)

In [8]:
from models import CNN

filters = 32 # convolutional filters (16, 32 ok, >32 too large, depends on tree structures) TO BE OPTIMIZED
filter_size = trees_client # CNN filter size MUST BE equal to the number of trees per client

params_cnn = (num_clients, filter_size, filters, objective)
models_clients = []  # list of models

model_global = CNN(*params_cnn)  # global model
num_layers = len(model_global.get_weights())

model_global.summary()



Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_1 (Conv1D)           (None, 5, 32)             352       
                                                                 
 flatten_1 (Flatten)         (None, 160)               0         
                                                                 
 dense_2 (Dense)             (None, 160)               25760     
                                                                 
 dense_3 (Dense)             (None, 1)                 161       
                                                                 
Total params: 26273 (102.63 KB)
Trainable params: 26273 (102.63 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


# Federated Learning process 
Federated Averaging with Adam optimizer simulator. No MQTT.

In [9]:
R = 25  # global FL rounds
E = 10  # local epochs (local training of xgboost models)

print(f"Round 0/{R}")  # init model

for r in range(R):  # for each round
    
    # update phase for each client
    for c, (x_train_c, y_train_c) in enumerate(datasets_out):  
        print(f"Round {r + 1}/{R}, Client {c + 1}/{num_clients}")
        model_client = CNN(*params_cnn)  # create a new model
        # set global weights (no memory of prev local weights)
        model_client.set_weights(model_global.get_weights())  
        # update phase
        model_client.fit(
            x_train_c, y_train_c, epochs=E, verbose=False
        )  # train the model on the client data
        models_clients.append(model_client)  # save the model
    
    # aggregation phase
    global_weights = []
    for i in range(num_layers):  # aggregate the weights, no memory of prev global weights
        global_weights.append(
            np.sum([model.get_weights()[i] for model in models_clients], axis=0)
            / len(models_clients)
        )
    model_global.set_weights(global_weights)

    model_global.evaluate(xgb_valid_out, y_valid)  # evaluate the global model


Round 0/25
Round 1/25, Client 1/5
Round 1/25, Client 2/5
Round 1/25, Client 3/5
Round 1/25, Client 4/5
Round 1/25, Client 5/5
Round 2/25, Client 1/5
Round 2/25, Client 2/5
Round 2/25, Client 3/5
Round 2/25, Client 4/5
Round 2/25, Client 5/5
Round 3/25, Client 1/5
Round 3/25, Client 2/5
Round 3/25, Client 3/5
Round 3/25, Client 4/5
Round 3/25, Client 5/5
Round 4/25, Client 1/5
Round 4/25, Client 2/5
Round 4/25, Client 3/5
Round 4/25, Client 4/5
Round 4/25, Client 5/5
Round 5/25, Client 1/5
Round 5/25, Client 2/5
Round 5/25, Client 3/5
Round 5/25, Client 4/5
Round 5/25, Client 5/5
Round 6/25, Client 1/5
Round 6/25, Client 2/5
Round 6/25, Client 3/5
Round 6/25, Client 4/5
Round 6/25, Client 5/5
Round 7/25, Client 1/5
Round 7/25, Client 2/5
Round 7/25, Client 3/5
Round 7/25, Client 4/5
Round 7/25, Client 5/5
Round 8/25, Client 1/5
Round 8/25, Client 2/5
Round 8/25, Client 3/5
Round 8/25, Client 4/5
Round 8/25, Client 5/5
Round 9/25, Client 1/5
Round 9/25, Client 2/5
Round 9/25, Client 3/5


Final testing 

In [10]:
import scipy.io as sio

y_hat_xgb = model_global.predict(xgb_valid_out)
error_fed = mean_squared_error(y_valid, y_hat_xgb)

# performance and mse
print(f"MSE (Centralized): {error_centr :.2f}")
for c, error in enumerate(errors_clients):
    print(f"MSE (Client {c}): {error :.2f}")
print(f"MSE (Federated): {error_fed :.2f}")

# saving results
checkpointpath = 'xgb_models/XGB_federated_model_regression.h5'
model_global.save(checkpointpath)
# joblib.dump(model_global, checkpointpath, compress=0)
dict_1 = {"MSE_centralized": error_centr,
          "MSE_clients": errors_clients,
          "MSE_federation": error_fed,
          }
sio.savemat(
    "results/fedXGboost_regression.mat", dict_1)

MSE (Centralized): 2644.02
MSE (Client 0): 10936.35
MSE (Client 1): 12367.12
MSE (Client 2): 11425.08
MSE (Client 3): 11698.99
MSE (Client 4): 11588.88
MSE (Federated): 3797.67


  saving_api.save_model(
