Preliminaries: choose a dataset and set simulation parameters

In [1]:
from classes.Datasets.data_loader import load_stroke, load_stroke_nprep
# from classes.params import simul_param, fl_param


data = "kaggle_stroke" # stroke data without SMOTE
run = 0

# load the data (only for centralized perf)
if data == "kaggle_stroke":
    x_train, y_train, x_valid, y_valid = load_stroke()
elif data == "kaggle_stroke_nprep":
    x_train, y_train, x_valid, y_valid = load_stroke_nprep()


Sets the number of clients and the number of trees (xgboost) per client

In [2]:
num_clients = 6  # K
trees_client = 15  # M

# Centralized performance
Data are fused on the server, this is the classical distributed xboost, privacy critical


In [3]:
import joblib
from utils import accuracy
import pandas as pd
from sklearn.metrics import confusion_matrix
import xgboost as xgb
import numpy as np


hyperparams = {
    "objective": "binary:logistic",
    # Same number of trees as in the decentralized case
    "n_estimators": num_clients * trees_client,
    "max_depth": 5,
    "learning_rate": 0.1,
    "base_score": 0.5,
    "random_state": 34,
}

reg = xgb.XGBClassifier(**hyperparams)
reg.fit(x_train, y_train)
y_pred = reg.predict(x_valid)

error_centr = accuracy(y_valid, y_pred)
cm = confusion_matrix(y_valid, y_pred)
TPR_centralized = cm[1,1]/(cm[1,0] + cm[1,1])
TNR_centralized = cm[0,0] / (cm[0,0] + cm[0,1])
print(f"Accuracy (Centralized), TPR, TNR: {100*error_centr :.5f} {100*TPR_centralized :.5f} {100*TNR_centralized :.5f}%")
# save and store the centralized model
checkpointpath1 = 'xgb_models/XGB_centralized_model.h5'
joblib.dump(reg, checkpointpath1, compress=0)


Accuracy (Centralized), TPR, TNR: 85.85209 85.80247 85.93482%


['xgb_models/XGB_centralized_model.h5']

# INDIVIDUAL CLIENTS (NO FEDERATION)
iid split (can be extened with sample/label/feature imbalance)


In [4]:
from classes.Datasets.dataset_client import Dataset
from classes.Datasets.data_partitioner import split_iid_sim
import os, json
# or run python -m classes.Datasets.data_generator.py to get a data distribution
samples = 100
niid_type = 'iid'
alpha = 1

print('Splitting IID')

# split the training dataset and create folders in data/client_#i/train
split_iid_sim(x_train, y_train, samples, type='train')

# split the validation dataset and create folders in data/client_#i/valid
split_iid_sim(x_valid, y_valid, samples, type='valid')

# save data info to json for PS only
n_classes = np.unique(y_valid, axis=0).shape[0] if np.unique(y_valid, axis=0).shape[0]>2 else 1
data_info = {
        'input_shape': x_train.shape[1:],
        'num_classes': n_classes, #np.unique(y_valid, axis=0).shape[0],
        'data': data,
        'niid_type': niid_type,
        'alpha': alpha
    }
# save data/server/
dir = "data/server/"
os.makedirs(dir, exist_ok=True)
with open(dir + "data_info.json", "w") as outfile:
    json.dump(data_info, outfile)

x_train_clients = []
y_train_clients = []
x_valid_clients = []
y_valid_clients = []

# create train and valid datasets for all clients
for k in range(num_clients):
    handle = Dataset(k) # get an handle to training dataset of client k
    x_train_clients.append(handle.x_train_local)
    y_train_clients.append(handle.y_train_local)
    x_valid_clients.append(handle.x_valid)
    y_valid_clients.append(handle.y_valid)

datasets = tuple(zip(x_train_clients, y_train_clients))


Splitting IID
Client 0: KL Divergence: 0.0255 | Wasserstein Distance: 0.0000 | Samples 100
Client 1: KL Divergence: 0.0152 | Wasserstein Distance: 0.0000 | Samples 100
Client 2: KL Divergence: -0.0340 | Wasserstein Distance: 0.0000 | Samples 100
Client 3: KL Divergence: -0.1116 | Wasserstein Distance: 0.0000 | Samples 100
Client 4: KL Divergence: 0.1133 | Wasserstein Distance: 0.0000 | Samples 100
Client 5: KL Divergence: -0.0050 | Wasserstein Distance: 0.0000 | Samples 100
Saved train data
Client 0: KL Divergence: -0.0073 | Wasserstein Distance: 0.0000 | Samples 259
Client 1: KL Divergence: 0.0043 | Wasserstein Distance: 0.0000 | Samples 259
Client 2: KL Divergence: 0.0160 | Wasserstein Distance: 0.0000 | Samples 259
Client 3: KL Divergence: 0.0004 | Wasserstein Distance: 0.0000 | Samples 259
Client 4: KL Divergence: 0.0082 | Wasserstein Distance: 0.0000 | Samples 259
Client 5: KL Divergence: -0.0186 | Wasserstein Distance: 0.0000 | Samples 259
Saved valid data


train the xboost tree models locally. Decision tree models are the ensemble model (base models) for fedxbgoostllr. Save the ensembles and evaluate them separately (no federation)

In [5]:
# Hyperparameters for each of the clients
hyperparams = {
    "objective": "binary:logistic",
    "n_estimators": trees_client,
    "max_depth": 5,
    "learning_rate": 0.1,
    "base_score": 0.5,  # np.mean(y_train)
    "random_state": 34,
}

errors_clients = []
TPR_clients = []
TNR_clients = []
for c, (x_train, y_train) in enumerate(
        datasets
):  # extract the dataset for the current client
    reg = xgb.XGBClassifier(**hyperparams)
    reg.fit(x_train, y_train)
    # save model
    checkpointpath = 'xgb_models/XGB_client_model_{}.h5'.format(c)
    joblib.dump(reg, checkpointpath, compress=0)
    # full performance tests (accuracy and confusion matrix)
    y_pred = reg.predict(x_valid)

    error = accuracy(y_valid, y_pred)
    cm = confusion_matrix(y_valid, y_pred)
    TPR_isolated = cm[1,1] / (cm[1,0] + cm[1,1])
    TNR_isolated = cm[0,0] / (cm[0,0] + cm[0,1])
    print(f"Accuracy, TPR, TNR (Client {c}): {100*error :.5f} {100*TPR_isolated :.5f} {100*TNR_isolated :.5f}%")
    errors_clients.append(error)
    TPR_clients.append(TPR_isolated)
    TNR_clients.append(TNR_isolated)
    # XGB_models.append(reg)

Accuracy, TPR, TNR (Client 0): 73.82637 78.80658 65.52316%
Accuracy, TPR, TNR (Client 1): 70.54662 73.35391 65.86621%
Accuracy, TPR, TNR (Client 2): 72.54019 78.80658 62.09262%
Accuracy, TPR, TNR (Client 3): 71.70418 64.71193 83.36192%
Accuracy, TPR, TNR (Client 4): 74.21222 81.99588 61.23499%
Accuracy, TPR, TNR (Client 5): 69.90354 77.16049 57.80446%


# FEDERATED XGBOOST 
The global model is a 1D-CNN type with specific filter sizes. The global model acts as an "ensemble model"

Create FIRST a new data for 1D-CNN which consists of XGB trees outputs 

The pipeline is the following (XGB trees output-> 1D-CNN -> accuracy)

DATA: input to FL is now the output of the XGB trees (base models)

NOTE: During initialization, all clients xgboost models must be shared among all clients before starting FL process. MQTT can be used for this or models can be loaded from a shared folder. Specific security measures should be probably applied here.

In [6]:
from utils import get_trees_predictions_xgb



# load all xgboost models and prepare the data
XGB_models = []
for c in range(num_clients):
    checkpointpath1 = 'xgb_models/XGB_client_model_{}.h5'.format(c)
    xgb = joblib.load(checkpointpath1)
    XGB_models.append(xgb)

# prepare the new dataset for training
objective = "binary"
x_xgb_trees_out = []
y_xgb_trees_out = []
for c, (x_train, y_train) in enumerate(datasets):  # for each client
    print("Converting the data of client", c, 100 * "-")
    x_xgb_trees_out.append(get_trees_predictions_xgb(x_train, objective, *XGB_models)) # XGB trees outputs (for all XGBoost trees!) corresponding to training data of client c
    y_xgb_trees_out.append(y_train) # true labels of client c

datasets_out = tuple(zip(x_xgb_trees_out, y_xgb_trees_out)) # dataset_out is the new federated dataset input to 1D-CNN (XGB trees output-> 1D-CNN -> accuracy)

# Validation data

xgb_valid_out = get_trees_predictions_xgb(x_valid, objective, *XGB_models) # XGB trees outputs corresponding to validation data: to simplify the reasoning, we apply same validation set for all (other options are also feasible)


Converting the data of client 0 ----------------------------------------------------------------------------------------------------
Converting the data of client 1 ----------------------------------------------------------------------------------------------------
Converting the data of client 2 ----------------------------------------------------------------------------------------------------
Converting the data of client 3 ----------------------------------------------------------------------------------------------------
Converting the data of client 4 ----------------------------------------------------------------------------------------------------
Converting the data of client 5 ----------------------------------------------------------------------------------------------------


# FEDXGBOOST aggregator 
initialize the global model (or ensemble model)

In [7]:
from models import CNN

filters = 16 # convolutional filters (16, 32 ok, >32 too large, depends on tree structures) TO BE OPTIMIZED
filter_size = trees_client # CNN filter size MUST BE equal to the number of trees per client

params_cnn = (num_clients, filter_size, filters, objective)
models_clients = []  # list of models

model_global = CNN(*params_cnn)  # global model
num_layers = len(model_global.get_weights())

model_global.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 6, 16)             256       
                                                                 
 flatten (Flatten)           (None, 96)                0         
                                                                 
 dense (Dense)               (None, 96)                9312      
                                                                 
 dense_1 (Dense)             (None, 1)                 97        
                                                                 
Total params: 9665 (37.75 KB)
Trainable params: 9665 (37.75 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


# FEDERATED LEARNING PROCESS 
Federated averaging with adam optimizer simulator. No MQTT.

In [8]:
R = 6  # global FL rounds
E = 5  # local epochs
print(f"Round 0/{R}")  # init model

for r in range(R):  # for each round
    
    # update phase for each client
    for c, (x_train_c, y_train_c) in enumerate(datasets_out):  
        print(f"Round {r + 1}/{R}, Client {c + 1}/{num_clients}")
        model_client = CNN(*params_cnn)  # create a new model
        # set global weights (no memory of prev local weights)
        model_client.set_weights(model_global.get_weights())  
        # update phase
        model_client.fit(
            x_train_c, y_train_c, epochs=E, verbose=False
        )  # train the model on the client data
        models_clients.append(model_client)  # save the model
    
    # aggregation phase
    global_weights = []
    for i in range(num_layers):  # aggregate the weights, no memory of prev global weights
        global_weights.append(
            np.sum([model.get_weights()[i] for model in models_clients], axis=0)
            / len(models_clients)
        )
    model_global.set_weights(global_weights)

    model_global.evaluate(xgb_valid_out, y_valid)  # evaluate the global model


Round 0/6
Round 1/6, Client 1/6
Round 1/6, Client 2/6
Round 1/6, Client 3/6
Round 1/6, Client 4/6
Round 1/6, Client 5/6
Round 1/6, Client 6/6
Round 2/6, Client 1/6
Round 2/6, Client 2/6
Round 2/6, Client 3/6
Round 2/6, Client 4/6
Round 2/6, Client 5/6
Round 2/6, Client 6/6
Round 3/6, Client 1/6
Round 3/6, Client 2/6
Round 3/6, Client 3/6
Round 3/6, Client 4/6
Round 3/6, Client 5/6
Round 3/6, Client 6/6
Round 4/6, Client 1/6
Round 4/6, Client 2/6
Round 4/6, Client 3/6
Round 4/6, Client 4/6
Round 4/6, Client 5/6
Round 4/6, Client 6/6
Round 5/6, Client 1/6
Round 5/6, Client 2/6
Round 5/6, Client 3/6
Round 5/6, Client 4/6
Round 5/6, Client 5/6
Round 5/6, Client 6/6
Round 6/6, Client 1/6
Round 6/6, Client 2/6
Round 6/6, Client 3/6
Round 6/6, Client 4/6
Round 6/6, Client 5/6
Round 6/6, Client 6/6


Final testing on stroke data

In [9]:
import scipy.io as sio

y_hat_xgbb = model_global.predict(xgb_valid_out)
y_hat_xgb = y_hat_xgbb >= 0.5 # binary estimator (CNN model has sigmoid output)

error_fed = accuracy(y_valid, y_hat_xgb)

# performance and confusion matrix
cm = confusion_matrix(y_valid, y_hat_xgb)
TPR_fed = cm[1,1] / (cm[1,0] + cm[1,1])
TNR_fed = cm[0,0] / (cm[0,0] + cm[0,1])


print(f"Accuracy (Centralized), TPR, TNR: {100*error_centr :.5f} {100*TPR_centralized :.5f} {100*TNR_centralized :.5f}%")
for c, error in enumerate(errors_clients):
    print(f"Accuracy, TPR, TNR: (Client {c}): {100*error :.5f} {100*TPR_clients[c] :.5f} {100*TNR_clients[c] :.5f}%")
print(f"Accuracy (Federated), TPR, TNR: {100*error_fed :.5f} {100*TPR_fed :.5f} {100*TNR_fed :.5f}%")

# saving results
checkpointpath = 'xgb_models/XGB_federated_model.h5'
model_global.save(checkpointpath)
# joblib.dump(model_global, checkpointpath, compress=0)
dict_1 = {"Accuracy_centralized": error_centr,
          "TPR_centralized":  TPR_centralized,
          "TNR_centralized":  TNR_centralized,
          "Accuracy_clients": errors_clients,
          "TPR_clients": TPR_clients,
          "TNR_clients": TNR_clients,
          "Accuracy_federation": error_fed,
          "TPR_federation": TPR_fed,
          "TNR_federation": TNR_fed,

          }
sio.savemat(
    "results/fedXGboost_{}_alpha_{}_samples_{}_run_{}.mat".format('iid',0,100,run), dict_1)


Accuracy (Centralized), TPR, TNR: 85.85209 85.80247 85.93482%
Accuracy, TPR, TNR: (Client 0): 73.82637 78.80658 65.52316%
Accuracy, TPR, TNR: (Client 1): 70.54662 73.35391 65.86621%
Accuracy, TPR, TNR: (Client 2): 72.54019 78.80658 62.09262%
Accuracy, TPR, TNR: (Client 3): 71.70418 64.71193 83.36192%
Accuracy, TPR, TNR: (Client 4): 74.21222 81.99588 61.23499%
Accuracy, TPR, TNR: (Client 5): 69.90354 77.16049 57.80446%
Accuracy (Federated), TPR, TNR: 75.24116 84.67078 59.51973%


  saving_api.save_model(
