# Install fed_rf_mk package

Package available at: https://pypi.org/project/fed-rf-mk/

Uncomment the following line to install the package

Using a virtual environment is recommended: run ``python -m venv venv``, then ``source venv/bin/activate``.


In [None]:
# !pip install fed-rf-mk

In [None]:
# !pip install -r requirements.txt

# Data Treatment

In [None]:
import random
import os
import pandas as pd
from ucimlrepo import fetch_ucirepo
from sklearn.utils import resample

Fetch the AIDS Clinical Trials Group Study 175 dataset from the UCI Machine Learning Repository using `ucimlrepo`.

In [None]:

# Fetch the AIDS Clinical Trials Group Study 175 dataset (UCI repository id 890)
aids_clinical_trials_group_study_175 = fetch_ucirepo(id=890)

# Split the fetched bundle into its feature and target DataFrames
X = aids_clinical_trials_group_study_175.data.features  # Features DataFrame
y = aids_clinical_trials_group_study_175.data.targets  # Target DataFrame

# Debug: print available column names
print("Features (X) columns:", X.columns.tolist())
print("Target (y) columns:", y.columns.tolist())

# Every later cell relies on the 'cid' target column, so stop here if it is
# absent. Raising (instead of calling exit()) aborts only this cell with a
# clear traceback rather than trying to terminate the interpreter/kernel.
if "cid" not in y.columns:
    raise ValueError(
        "'cid' column is missing from the target DataFrame! "
        f"Available target columns: {y.columns.tolist()}"
    )

# Combine X and y column-wise into a single DataFrame
df = pd.concat([X, y], axis=1)
all_features = [col for col in df.columns if col != "cid"]  # Exclude target

Simulate a distributed environment with 3 clients.

The last client is the test client and holds 20% of the data.

The remaining clients each hold 40% of the data, which is used for training.

In [None]:
from sklearn.utils import shuffle

# Number of datasites: N - 1 training sites plus one test site
N = 3
TRAIN_RATIO = 0.8  # 80% for training, 20% for testing

# At least one training site and one test site are required
if N < 2:
    raise ValueError(f"N must be at least 2 (got {N})")

n_train_sites = N - 1

# Show the overall class balance of 'cid' in percent.
# `df` is the combined features + target DataFrame built in the previous cell.
print("Overall 'cid' distribution (%):")
print(df["cid"].value_counts(normalize=True) * 100)

# Separate classes
df_majority = df[df["cid"] == 0]  # cid = 0 (majority)
df_minority = df[df["cid"] == 1]  # cid = 1 (minority)

print("Majority class (cid=0) count:", len(df_majority))
print("Minority class (cid=1) count:", len(df_minority))

# Shuffle each class with a fixed seed so the split is reproducible
df_majority = shuffle(df_majority, random_state=42).reset_index(drop=True)
df_minority = shuffle(df_minority, random_state=42).reset_index(drop=True)

# Split each class separately (80% train / 20% test) so that both splits
# keep the same class ratio
majority_train_size = int(len(df_majority) * TRAIN_RATIO)
minority_train_size = int(len(df_minority) * TRAIN_RATIO)

df_majority_train = df_majority.iloc[:majority_train_size]
df_majority_test = df_majority.iloc[majority_train_size:]

df_minority_train = df_minority.iloc[:minority_train_size]
df_minority_test = df_minority.iloc[minority_train_size:]

# Deal the training rows round-robin over the N - 1 training sites: site k
# receives rows k, k + (N - 1), k + 2 * (N - 1), ... of each class, majority
# rows first. Strided slicing keeps the column dtypes intact (building the
# partitions row by row would upcast every row to a common dtype) and avoids a
# Python-level loop over all rows.
train_partitions = [
    pd.concat(
        [
            df_majority_train.iloc[site::n_train_sites],
            df_minority_train.iloc[site::n_train_sites],
        ]
    )
    for site in range(n_train_sites)
]

# The last partition gets the 20% test data, shuffled so the classes are mixed
test_partition = (
    pd.concat([df_majority_test, df_minority_test])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)


In [None]:

# Destination folder for the per-datasite CSV files
output_dir = "train_datasets/aids_clinical"
os.makedirs(output_dir, exist_ok=True)


def _write_and_report(frame, index, role):
    """Write `frame` to part_<index>.csv in `output_dir` and print a summary.

    The summary shows the percentage distribution of the 'cid' column and the
    size of the written file. `role` is the label shown in the heading
    ("Training" or "Testing").
    """
    csv_path = os.path.join(output_dir, f"part_{index}.csv")
    frame.to_csv(csv_path, index=False)

    # Size of the file that was just written
    n_bytes = os.path.getsize(csv_path)
    n_megabytes = n_bytes / (1024 * 1024)

    print(f"\n📊 Distribution of 'cid' in part_{index} ({role}):")
    print(frame["cid"].value_counts(normalize=True) * 100)
    print(f"Space occupied by part_{index}: {n_bytes} bytes ({n_megabytes:.2f} MB)")


# Training partitions become part_0 ... part_{N-2}
for i, part in enumerate(train_partitions):
    _write_and_report(part, i, "Training")

# The test set is stored as the last partition, part_{N-1}
_write_and_report(test_partition, N - 1, "Testing")

print("\n✅ Partitioning with 80%-20% split completed successfully!")

# Launch Servers

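Start each datasite. In this notebook each datasite runs in its own background thread; alternatively, start each datasite in a different terminal.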

The `auto_accept` parameter can be set to True to automatically accept requests from clients. Otherwise, each data owner has to accept the requests manually.

The weight of each datasite can also be set. This is useful if the datasites have different numbers of samples or samples of different importance. If not set, the weight is distributed equally among all training datasites.


In [None]:
from fed_rf_mk.server import FLServer
import threading

# First datasite: presumably (name, port, CSV path) — serves part_0.csv on port 8080.
# auto_accept=False, so requests sent to this site must be approved manually.
server = FLServer("aids_clinical_part_0", 8080, "train_datasets/aids_clinical/part_0.csv", auto_accept=False)
# Run server.start in a daemon thread so this cell returns while the server keeps running.
server_thread = threading.Thread(target=server.start, daemon=True)
server_thread.start()


In [None]:

# Second datasite: serves part_1.csv on port 8081 and accepts requests automatically.
server2 = FLServer("aids_clinical_part_1", 8081, "train_datasets/aids_clinical/part_1.csv", auto_accept=True)
# NOTE: this rebinds `server_thread`; a thread started earlier under that name
# keeps running but is no longer reachable through it.
server_thread = threading.Thread(target=server2.start, daemon=True)
server_thread.start()


In [None]:

# Third datasite: serves part_2.csv (the test partition) on port 8082.
# auto_accept=False, so requests sent to this site must be approved manually.
server3 = FLServer("aids_clinical_part_2", 8082, "train_datasets/aids_clinical/part_2.csv", auto_accept=False)
# NOTE: this rebinds `server_thread`; a thread started earlier under that name
# keeps running but is no longer reachable through it.
server_thread = threading.Thread(target=server3.start, daemon=True)
server_thread.start()

In [None]:
# Pause so the datasites have time to come up before the client connects

import time

STARTUP_GRACE_SECONDS = 10  # fixed wait, in seconds
time.sleep(STARTUP_GRACE_SECONDS)

## Client Notebook

With the servers running, the client is initialized and connected to the datasites (2 training sites and 1 test site).


In [None]:
from fed_rf_mk.client import FLClient

rf_client = FLClient()

# The datasites listen on consecutive ports starting at 8080: the first N - 1
# are training sites and the last one is the evaluation (test) site.
# Each site is registered exactly once; registering the same URLs a second time
# with hard-coded values would only repeat the work done here.
# NOTE(review): the client-side labels are 1-based (aids_clinical_part_1..N)
# while the servers were launched as aids_clinical_part_0..N-1 — confirm
# whether `name` has to match the server name.

# Connect to the N - 1 training datasites
for i in range(N - 1):
    port = 8080 + i
    rf_client.add_train_client(
        name=f"aids_clinical_part_{i + 1}",
        url=f"http://localhost:{port}",
        email="fedlearning@rf.com",
        password="****",
    )

# Connect to the evaluation datasite (the last port)
eval_port = 8080 + N - 1
rf_client.add_eval_client(
    name=f"aids_clinical_part_{N}",
    url=f"http://localhost:{eval_port}",
    email="fedlearning@rf.com",
    password="****",
)


Specification of parameters for the training process.

dataParams are the parameters involving the data treatment, such as specifying the target variable and the features to be ignored.

modelParams are the parameters for the model, such as the number of initial trees, train/test size for each epoch, and the number of epochs.


In [None]:

# Data-handling parameters: the target variable and the columns to be ignored
dataParams = dict(
    target="cid",
    ignored_columns=["cid"],
)

# Model/training parameters (initial trees, per-epoch train/test sizes, epochs).
# NOTE(review): train_size + test_size exceed 1.0 — confirm how fed_rf_mk
# interprets these two values.
modelParams = dict(
    model=None,
    n_base_estimators=10,
    n_incremental_estimators=2,
    train_size=0.7,
    test_size=0.5,
    sample_size=None,
    fl_epochs=1,
)

# Hand both parameter sets to the client
rf_client.set_data_params(dataParams)
rf_client.set_model_params(modelParams)


After specifying the parameters, the request can be sent to the datasites and the status checked.

In [None]:

# Send the training request to the connected datasites
rf_client.send_request()

# Check the status of the requests that were just sent
rf_client.check_status_last_code_requests()


In [None]:
# Pending requests on the first datasite (`server`, started with auto_accept=False)
server.list_pending_requests()


In [None]:
# server.inspect_request(0)


Since `auto_accept` was set to False for datasites 1 and 3, their requests have to be accepted manually using the following commands.

In [None]:
# Manually approve request 0 on the two datasites started with auto_accept=False
server.approve_request(0)
# server2 was started with auto_accept=True, so this call is left disabled
# server2.approve_request(0)
server3.approve_request(0)

After checking that all the requests have been accepted, the training can be started.
If, for some reason, a request is not accepted, the training skips that datasite and continues with the others.

In [None]:
# Start the training; per the notes above, datasites whose request was not accepted are skipped
rf_client.run_model()

In [None]:
# rf_client.get_model_params()

Finally, the model can be evaluated on the test site.

In [None]:
# Evaluate the trained model on the test (evaluation) datasite
rf_client.run_evaluate()