In [1]:

import os.path
import wget
import bz2
import shutil
import numpy as np


In [2]:
# Process-wide switches, set before the framework imports in the following cells:
#   PROTOBUF_PYTHON_IMPLEMENTATION=python selects the pure-Python protobuf backend;
#   CUDA_LAUNCH_BLOCKING=1 makes CUDA kernel launches synchronous (easier debugging).
os.environ.update({
    "PROTOBUF_PYTHON_IMPLEMENTATION": "python",
    "CUDA_LAUNCH_BLOCKING": "1",
})

In [3]:

import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, mean_squared_error
from sklearn.model_selection import KFold



In [4]:
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

2023-07-26 22:08:07.332234: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [5]:
from utils.data_utils import load_data_cross_validation, load_movielens
from model.fl_model import VerticalFLModel
from model.single_party_model import SingleParty
from model.split_nn_model import SplitNNModel

In [6]:
# NOTE(review): num_parties is assigned but not forwarded to load_movielens in this
# cell (the SplitNN cell further down passes num_parties=...) — confirm the intent.
num_parties = 10
# Load the MovieLens train/validation pool, the test split and the `counts` list.
# test_rate=0.1 presumably requests a 10% test split — verify against load_movielens.
xs_train_val, y_train_val, xs_test, y_test, counts = load_movielens("data/movielens/", use_cache=True,
                                                            download=True, test_rate=0.1)

Loading MovieLens from cache


In [13]:
# How many entries the loader returned in `counts`.
len(counts)

8

In [14]:
# (rows, columns) of the first feature block — presumably one block per party.
xs_train_val[0].shape

(900188, 2)

In [15]:
# Length of the test label vector.
y_test.shape

(100021,)

In [17]:
# Row count of the second feature block.
# NOTE(review): the cell previously indexed a name `b` that is not defined anywhere
# in this notebook; xs_train_val is assumed to be what it referred to — confirm.
len(xs_train_val[1])

900188

In [8]:
# Raw `counts` list from the loader — presumably one cardinality per categorical field (TODO confirm).
counts

[6040, 3952, 2, 7, 21, 4, 81, 18]

In [19]:
# Collects one test RMSE per cross-validation fold.
rmse_list = []
# XGBoost sees a single feature matrix, so the feature blocks are joined column-wise.
x_train_val, x_test = (np.concatenate(blocks, axis=1) for blocks in (xs_train_val, xs_test))
# Seeded, shuffled 5-fold split over the train/validation pool.
kfold = KFold(n_splits=5, shuffle=True, random_state=0).split(y_train_val)

In [20]:
# SecureBoost without DP: plain XGBoost is used as a stand-in, since SecureBoost is lossless

In [21]:
# 5-fold cross-validation of the XGBoost baseline on the concatenated features.
# The fold generator and the score list are rebuilt here so that re-running this
# cell starts from a clean state rather than from an exhausted generator.
kfold = KFold(n_splits=5, shuffle=True, random_state=0).split(y_train_val)
rmse_list = []
for i, (train_idx, val_idx) in enumerate(kfold):
    print("Cross Validation Fold {}".format(i))
    x_train, y_train = x_train_val[train_idx], y_train_val[train_idx]
    x_val, y_val = x_train_val[val_idx], y_train_val[val_idx]
    # eval_metric belongs to the estimator: passing it to fit() is deprecated since
    # XGBoost 1.6 and no longer accepted by later releases.
    xg_cls = xgb.XGBRegressor(learning_rate=0.1,
                              max_depth=6,
                              n_estimators=200,
                              reg_alpha=10,
                              verbosity=2,
                              eval_metric='rmse')
    # Train and validation RMSE are both logged after every boosting round.
    xg_cls.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_val, y_val)])
    # Every fold's model is scored on the same held-out test set.
    y_pred = xg_cls.predict(x_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_list.append(rmse)
print("Finished training.")
print("-------------------------------------------------")
print("RMSE=" + str(rmse_list))

Cross Validation Fold 0




[21:14:40] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 116 extra nodes, 0 pruned nodes, max_depth=6
[0]	validation_0-rmse:2.98711	validation_1-rmse:2.98483
[21:14:41] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 112 extra nodes, 0 pruned nodes, max_depth=6
[1]	validation_0-rmse:2.72838	validation_1-rmse:2.72604
[21:14:41] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 108 extra nodes, 0 pruned nodes, max_depth=6
[2]	validation_0-rmse:2.49918	validation_1-rmse:2.49691
[21:14:41] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 112 extra nodes, 0 pruned nodes, max_depth=6
[3]	validation_0-rmse:2.29674	validation_1-rmse:2.29446
[21:14:41] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 118 extra nodes, 0 pruned nodes, max_depth=6
[4]	validation_0-rmse:2.11855	validation_1-rmse:2.11626
[21:14:41] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 116 extra nodes, 0 pruned nodes, max_depth=6
[5]	validation_0-rmse:1.96173	valida



[21:15:24] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 112 extra nodes, 0 pruned nodes, max_depth=6
[0]	validation_0-rmse:2.98676	validation_1-rmse:2.98622
[21:15:24] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 116 extra nodes, 0 pruned nodes, max_depth=6
[1]	validation_0-rmse:2.72809	validation_1-rmse:2.72751
[21:15:24] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 114 extra nodes, 0 pruned nodes, max_depth=6
[2]	validation_0-rmse:2.49887	validation_1-rmse:2.49824
[21:15:24] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 110 extra nodes, 0 pruned nodes, max_depth=6
[3]	validation_0-rmse:2.29642	validation_1-rmse:2.29574
[21:15:24] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 116 extra nodes, 0 pruned nodes, max_depth=6
[4]	validation_0-rmse:2.11820	validation_1-rmse:2.11750
[21:15:25] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 120 extra nodes, 0 pruned nodes, max_depth=6
[5]	validation_0-rmse:1.96138	valida



[21:16:11] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 112 extra nodes, 0 pruned nodes, max_depth=6
[0]	validation_0-rmse:2.98656	validation_1-rmse:2.98723
[21:16:11] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 116 extra nodes, 0 pruned nodes, max_depth=6
[1]	validation_0-rmse:2.72785	validation_1-rmse:2.72856
[21:16:11] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 112 extra nodes, 0 pruned nodes, max_depth=6
[2]	validation_0-rmse:2.49873	validation_1-rmse:2.49946
[21:16:12] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 114 extra nodes, 0 pruned nodes, max_depth=6
[3]	validation_0-rmse:2.29595	validation_1-rmse:2.29669
[21:16:12] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 112 extra nodes, 0 pruned nodes, max_depth=6
[4]	validation_0-rmse:2.11751	validation_1-rmse:2.11827
[21:16:12] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 116 extra nodes, 0 pruned nodes, max_depth=6
[5]	validation_0-rmse:1.96086	valida



[21:16:52] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 112 extra nodes, 0 pruned nodes, max_depth=6
[0]	validation_0-rmse:2.98631	validation_1-rmse:2.98823
[21:16:52] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 114 extra nodes, 0 pruned nodes, max_depth=6
[1]	validation_0-rmse:2.72764	validation_1-rmse:2.72963
[21:16:52] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 104 extra nodes, 0 pruned nodes, max_depth=6
[2]	validation_0-rmse:2.49844	validation_1-rmse:2.50056
[21:16:52] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 114 extra nodes, 0 pruned nodes, max_depth=6
[3]	validation_0-rmse:2.29593	validation_1-rmse:2.29820
[21:16:53] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 112 extra nodes, 0 pruned nodes, max_depth=6
[4]	validation_0-rmse:2.11744	validation_1-rmse:2.11981
[21:16:53] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 114 extra nodes, 0 pruned nodes, max_depth=6
[5]	validation_0-rmse:1.96071	valida



[21:17:40] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 114 extra nodes, 0 pruned nodes, max_depth=6
[0]	validation_0-rmse:2.98665	validation_1-rmse:2.98696
[21:17:40] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 112 extra nodes, 0 pruned nodes, max_depth=6
[1]	validation_0-rmse:2.72800	validation_1-rmse:2.72835
[21:17:41] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 106 extra nodes, 0 pruned nodes, max_depth=6
[2]	validation_0-rmse:2.49883	validation_1-rmse:2.49920
[21:17:41] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 112 extra nodes, 0 pruned nodes, max_depth=6
[3]	validation_0-rmse:2.29644	validation_1-rmse:2.29678
[21:17:41] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 112 extra nodes, 0 pruned nodes, max_depth=6
[4]	validation_0-rmse:2.11788	validation_1-rmse:2.11820
[21:17:41] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 118 extra nodes, 0 pruned nodes, max_depth=6
[5]	validation_0-rmse:1.96123	valida

In [6]:
# SplitNN

In [7]:
# Two-party setup for SplitNN; the party count is forwarded to the loader.
num_parties = 2
xs_train_val, y_train_val, xs_test, y_test, counts = load_movielens(
    "data/movielens/",
    use_cache=True,
    download=True,
    test_rate=0.1,
    num_parties=num_parties,
)

Loading MovieLens from cache


In [8]:
# Show both feature blocks (NumPy abbreviates the middle rows with "...").
xs_train_val

[array([[1995, 2405],
        [ 920, 2070],
        [ 322, 3808],
        ...,
        [1940, 2189],
        [3327, 2687],
        [3956, 1026]]),
 array([[ 1,  3,  0,  0, 64,  0],
        [ 1,  2, 17,  0, 73,  7],
        [ 1,  4, 12,  0, 71,  4],
        ...,
        [ 1,  3, 17,  0, 78,  7],
        [ 1,  6, 19,  0, 79,  7],
        [ 1,  3, 11,  0, 71,  7]])]

In [9]:
# Label vector of the train/validation pool.
y_train_val

array([4, 4, 4, ..., 3, 4, 4])

In [10]:
# Fresh score list for the SplitNN run (one test RMSE per fold).
rmse_list = []
# KFold.split() yields (train_idx, val_idx) pairs lazily; the generator can only be
# iterated once, so this cell must be re-run before the training loop is re-run.
kfold = KFold(n_splits=5, shuffle=True, random_state=0).split(y_train_val)

In [11]:
# `counts` is sliced into two per-party groups (counts[:2], counts[2:]) in the training cell below.
counts

[6040, 3952, 2, 7, 21, 4, 81, 18]

In [12]:
# 5-fold cross-validation of the SplitNN baseline.
# The fold generator and the score list are built in this cell so that it can be
# re-run (e.g. after an interrupt) without depending on a half-consumed generator.
kfold = KFold(n_splits=5, shuffle=True, random_state=0).split(y_train_val)
rmse_list = []
# Loop-invariant embedding configuration: the first two `counts` entries go with the
# first feature block and the remaining six with the second, one embedding size per
# entry (presumably one entry per input column — TODO confirm against the model).
ncf_counts = [counts[:2], counts[2:]]
embed_dims = [[32, 32], [1, 4, 10, 4, 15, 5]]
for i, (train_idx, val_idx) in enumerate(kfold):
    print("Cross Validation Fold {}".format(i))
    # Slice every feature block with the same row indices so the blocks stay aligned.
    xs_train = [data[train_idx] for data in xs_train_val]
    y_train = y_train_val[train_idx]
    xs_val = [data[val_idx] for data in xs_train_val]
    y_val = y_train_val[val_idx]
    start = datetime.now()
    name = "splitnn_movielens_party_{}_fold_{}".format(num_parties, i)
    # One TensorBoard run directory per fold.
    writer = SummaryWriter("runs/{}".format(name))
    aggregate_model = SplitNNModel(
        num_parties=num_parties,
        name=name,
        num_epochs=100,
        local_hidden_layers=[32, 16],
        local_output_size=3,
        lr=3e-5,
        agg_hidden_layers=[10],
        batch_size=128,
        weight_decay=1e-5,
        writer=writer,
        device='cuda:{}'.format("0"),
        update_target_freq=1,
        task='regression',
        n_classes=10,
        test_batch_size=1000,
        test_freq=1,
        cuda_parallel=False,
        n_channels=1,
        model_type='ncf',
        optimizer='sgd',
        privacy=None,
        batches_per_lot=5,
        epsilon=1,
        delta=1.0 / xs_train[0].shape[0],
        num_workers=0,
        ncf_counts=ncf_counts,
        ncf_embed_dims=embed_dims
    )
    _, _, rmse, _ = aggregate_model.train(xs_train, y_train, xs_val, y_val, xs_test, y_test, use_cache=False)
    # The reported score is recomputed on the test split from the model's predictions.
    y_pred_test, y_score_test = aggregate_model.predict(xs_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_score_test))
    rmse_list.append(test_rmse)
    # Flush and release the TensorBoard event file for this fold.
    writer.close()
    print(aggregate_model.params)
    print("-------------------------------------------------")
    # total_seconds() keeps the full duration; .seconds would drop whole days.
    time_min = (datetime.now() - start).total_seconds() / 60
    print("Time(min) {}: ".format(time_min))
print("Best RMSE={}".format(rmse_list))

Cross Validation Fold 0
Start training SplitNN
[SplitNN] Epoch 1: training loss 3.405515207338867
[Final] Epoch 1: train rmse 1.132574961054325, val rmse 1.1318933346380877
[Final] Epoch 1: best val rmse 1.1318933346380877
Epoch 1 duration 54 sec
[SplitNN] Epoch 2: training loss 1.2365889824875214
[Final] Epoch 2: train rmse 1.0967913663451994, val rmse 1.0963034008219397
[Final] Epoch 2: best val rmse 1.0963034008219397
Epoch 2 duration 53 sec
[SplitNN] Epoch 3: training loss 1.181764822754488
[Final] Epoch 3: train rmse 1.0775109112962094, val rmse 1.0773609686066374
[Final] Epoch 3: best val rmse 1.0773609686066374
Epoch 3 duration 53 sec
[SplitNN] Epoch 4: training loss 1.1446011290258724
[Final] Epoch 4: train rmse 1.061736504936834, val rmse 1.0618329052444018
[Final] Epoch 4: best val rmse 1.0618329052444018
Epoch 4 duration 53 sec
[SplitNN] Epoch 5: training loss 1.1131069746940285
[Final] Epoch 5: train rmse 1.047625650130199, val rmse 1.0478781590624313
[Final] Epoch 5: best 

KeyboardInterrupt: 

True

In [7]:
# FedOnce
# Two-party setup; the party count is forwarded to the loader explicitly, the same
# way the SplitNN cell does, instead of relying on the loader's default.
num_parties = 2
xs_train_val, y_train_val, xs_test, y_test, counts = load_movielens(
    "data/movielens/",
    use_cache=True,
    download=True,
    test_rate=0.1,
    num_parties=num_parties,
)
# One list of test RMSEs per active party is appended by the training cell.
score_summary = []
print("Start training")

Loading MovieLens from cache
Start training


In [15]:
# First feature block (two integer columns per row).
xs_train_val[0]

array([[1995, 2405],
       [ 920, 2070],
       [ 322, 3808],
       ...,
       [1940, 2189],
       [3327, 2687],
       [3956, 1026]])

In [16]:
# One-shot generator of (train_idx, val_idx) pairs.
# NOTE(review): the training cell below rebuilds its own `kfold` for every party,
# so this assignment looks redundant — confirm before removing.
kfold = KFold(n_splits=5, shuffle=True, random_state=0).split(y_train_val)

In [31]:
# Check of the party count currently held by the kernel.
num_parties

2

In [11]:
# FedOnce: train one model per choice of active party and collect the test RMSEs.
# The summary list is rebuilt here so re-running the cell does not append to old results.
score_summary = []
# Loop-invariant embedding configuration: the first two `counts` entries go with the
# first feature block and the remaining six with the second, one embedding size per
# entry (presumably one entry per input column — TODO confirm against the model).
ncf_counts = [counts[:2], counts[2:]]
embed_dims = [[32, 32], [1, 4, 10, 4, 15, 5]]
for party_id in range(num_parties):
    # KFold.split() is a one-shot generator, so it is recreated for every party.
    kfold = KFold(n_splits=5, shuffle=True, random_state=0).split(y_train_val)
    rmse_list = []
    for i, (train_idx, val_idx) in enumerate(kfold):
        print("Cross Validation Fold {}".format(i))
        # Slice every feature block with the same row indices so the blocks stay aligned.
        xs_train = [data[train_idx] for data in xs_train_val]
        y_train = y_train_val[train_idx]
        xs_val = [data[val_idx] for data in xs_train_val]
        y_val = y_train_val[val_idx]
        start = datetime.now()
        name = "fedonce_movielens_party_{}_active_{}_fold_{}".format(num_parties, party_id, i)
        # One TensorBoard run directory per (active party, fold).
        writer = SummaryWriter("runs/{}".format(name))
        # Hyper-parameters differ depending on which party is active (party 0 vs. the rest).
        aggregate_model = VerticalFLModel(
            num_parties=num_parties,
            active_party_id=party_id,
            name=name,
            full_name=name,
            num_epochs=40 if party_id == 0 else 100,
            num_local_rounds=30,  # same value for every active party
            local_lr=1e-4,
            local_hidden_layers=[32, 16] if party_id == 0 else [128],
            local_batch_size=64,
            local_weight_decay=1e-5,
            local_output_size=3 if party_id == 0 else 64,
            num_agg_rounds=1,
            agg_lr=1e-4 if party_id == 0 else 5e-4,
            agg_hidden_layers=[10] if party_id == 0 else [32, 32],
            agg_batch_size=64,
            agg_weight_decay=2e-4 if party_id == 0 else 1e-5,
            writer=writer,
            device='cuda:{}'.format("0"),
            update_target_freq=1,
            task='regression',
            n_classes=10,
            test_batch_size=1000,
            test_freq=1,
            cuda_parallel=False,
            n_channels=1,
            model_type='ncf',
            optimizer='adam',
            privacy=None,
            ncf_counts=ncf_counts,
            ncf_embed_dims=embed_dims,
            num_workers=0
        )
        _, _, rmse, _ = aggregate_model.train(xs_train, y_train, xs_val, y_val, use_cache=False)
        # The reported score is computed on the test split from the aggregated prediction.
        y_test_score = aggregate_model.predict_agg(xs_test)
        test_rmse = np.sqrt(mean_squared_error(y_test, y_test_score))
        rmse_list.append(test_rmse)
        # Flush and release the TensorBoard event file for this run.
        writer.close()
        print(aggregate_model.params)
        # total_seconds() keeps the full duration; .seconds would drop whole days.
        time_min = (datetime.now() - start).total_seconds() / 60
        print("Time(min) {}: ".format(time_min))
        # NOTE(review): only the first fold is trained for each party; remove this
        # `break` to run the full 5-fold cross-validation.
        break
    score_summary.append(rmse_list)
    print("RMSE for active party {}".format(party_id) + str(rmse_list))
    print("-------------------------------------------------")

# Final recap: one line per active party.
for i, result in enumerate(score_summary):
    print("Party {}: RMSE={}".format(i, result))

Cross Validation Fold 0
Initializing and local labels
Finished initializing
Start training local models
[Local] Party 1, Epoch 1: training loss 0.21204972893087054
Epoch 1 duration 71 sec
[Local] Party 1, Epoch 2: training loss 0.097382461225069
Epoch 2 duration 70 sec
[Local] Party 1, Epoch 3: training loss 0.06701953237041317
Epoch 3 duration 69 sec
[Local] Party 1, Epoch 4: training loss 0.052082669526077686
Epoch 4 duration 68 sec
[Local] Party 1, Epoch 5: training loss 0.04298054412920445
Epoch 5 duration 70 sec
[Local] Party 1, Epoch 6: training loss 0.03674565196246534
Epoch 6 duration 69 sec
[Local] Party 1, Epoch 7: training loss 0.03215482517354484
Epoch 7 duration 69 sec
[Local] Party 1, Epoch 8: training loss 0.02864839266846602
Epoch 8 duration 69 sec
[Local] Party 1, Epoch 9: training loss 0.02588511520776192
Epoch 9 duration 69 sec
[Local] Party 1, Epoch 10: training loss 0.023634075065641018
Epoch 10 duration 69 sec
[Local] Party 1, Epoch 11: training loss 0.02177250244

# Explore FedOnce

In [6]:
# Two-party setup; the party count is forwarded to the loader explicitly, the same
# way the SplitNN cell does, instead of relying on the loader's default.
num_parties = 2
xs_train_val, y_train_val, xs_test, y_test, counts = load_movielens(
    "data/movielens/",
    use_cache=True,
    download=True,
    test_rate=0.1,
    num_parties=num_parties,
)
score_summary = []

Loading MovieLens from cache


In [7]:
# First feature block (two integer columns per row).
xs_train_val[0]

array([[1995, 2405],
       [ 920, 2070],
       [ 322, 3808],
       ...,
       [1940, 2189],
       [3327, 2687],
       [3956, 1026]])

In [8]:
# Number of labels in the train/validation pool.
y_train_val.shape

(900188,)

In [9]:
# Seeded, shuffled 5-fold split; split() returns a generator that can be consumed once.
kfold = KFold(n_splits=5, shuffle=True, random_state=0).split(y_train_val)

In [10]:
# Exhaust the generator into a list of (train_idx, val_idx) tuples so folds can be indexed.
a = list(kfold)

In [11]:
# Index arrays of the first fold: (training indices, validation indices).
a[0]

(array([     0,      1,      2, ..., 900185, 900186, 900187]),
 array([     5,      6,      7, ..., 900172, 900179, 900183]))

In [12]:
# Materialise the first cross-validation fold so its arrays can be inspected.
# The split is rebuilt once per party, so the fold message is printed once per party.
for party_id in range(num_parties):
    kfold = KFold(n_splits=5, shuffle=True, random_state=0).split(y_train_val)
    rmse_list = []
    # Only the first fold is needed: pull it straight off the generator.
    i, (train_idx, val_idx) = next(enumerate(kfold))
    print("Cross Validation Fold {}".format(i))
    # Apply the same row indices to every feature block and to the labels.
    y_train, y_val = y_train_val[train_idx], y_train_val[val_idx]
    xs_train = [block[train_idx] for block in xs_train_val]
    xs_val = [block[val_idx] for block in xs_train_val]

Cross Validation Fold 0
Cross Validation Fold 0


In [13]:
# Training rows of each feature block for the fold extracted in the previous cell.
xs_train

[array([[1995, 2405],
        [ 920, 2070],
        [ 322, 3808],
        ...,
        [1940, 2189],
        [3327, 2687],
        [3956, 1026]]),
 array([[ 1,  3,  0,  0, 64,  0],
        [ 1,  2, 17,  0, 73,  7],
        [ 1,  4, 12,  0, 71,  4],
        ...,
        [ 1,  3, 17,  0, 78,  7],
        [ 1,  6, 19,  0, 79,  7],
        [ 1,  3, 11,  0, 71,  7]])]