In [None]:
# Mount Google Drive at /content/drive in this Colab runtime; the dataset
# paths used by the later cells all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import random

# SETTINGS
NUM_CLIENTS = 10
CLIENT_DATA_PATH = "/content/drive/Shareddrives/ML4Net/Seminar5/dataset_Seminar5/client_datasets/"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EPOCHS_LOCAL = 3
ROUNDS = 5
CLIENTS_PER_ROUND = 5
LR = 0.01
BATCH_SIZE = 32

# Step 1: gather the labels of every client so that all clients share a single
# raw-label -> class-index mapping.
all_labels = []

for client_id in range(1, NUM_CLIENTS + 1):
    labels_path = os.path.join(CLIENT_DATA_PATH, f"client_{client_id}_labels.csv")
    if not os.path.exists(labels_path):
        print(f"❌ Missing labels file for client {client_id} — Skipping")
        continue
    y = pd.read_csv(labels_path, header=None).values.flatten()
    all_labels.extend(y)

all_labels = np.array(all_labels)
global_unique_labels = np.unique(all_labels)
num_classes = len(global_unique_labels)
# np.unique returns sorted values, so class indices follow the sorted label order.
global_label_map = dict(zip(global_unique_labels, range(num_classes)))
print(f"Global unique labels ({num_classes} classes): {global_unique_labels}")

# Step 2: read each client's features/labels, remap the labels and
# standardise the features with a scaler fitted on that client only.
client_datasets = []
for client_id in range(1, NUM_CLIENTS + 1):
    features_path = os.path.join(CLIENT_DATA_PATH, f"client_{client_id}_features.csv")
    labels_path = os.path.join(CLIENT_DATA_PATH, f"client_{client_id}_labels.csv")

    if not (os.path.exists(features_path) and os.path.exists(labels_path)):
        print(f"❌ Missing files for client {client_id} — Skipping")
        continue

    X = pd.read_csv(features_path, header=None).values  # one row per sample
    y = pd.read_csv(labels_path, header=None).values.flatten()

    # A client whose feature rows and labels disagree in count is unusable.
    if X.shape[0] != len(y):
        print(f"⚠️ Size mismatch for client {client_id} — Skipping (X: {X.shape}, y: {len(y)})")
        continue

    y_mapped = np.array([global_label_map[label] for label in y])

    # Per-client standardisation: statistics come from this client's data only.
    scaler = StandardScaler().fit(X)
    X_scaled = scaler.transform(X)

    client_datasets.append((client_id, X_scaled, y_mapped))

if not client_datasets:
    raise RuntimeError("❌ No valid client data available. Check dataset paths and integrity.")

print(f"\n✅ Loaded clients: {[entry[0] for entry in client_datasets]}")

# All clients are expected to share the feature width of the first one.
input_size = client_datasets[0][1].shape[1]

# Define model
class MLPClassifier(nn.Module):
    """Feed-forward classifier: input -> 128 -> 64 -> num_classes, ReLU between layers."""

    def __init__(self, input_size, num_classes):
        super().__init__()
        # Layers are instantiated in forward order and wrapped in a Sequential
        # stored under ``model``, which fixes the state_dict key names.
        layers = [
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, num_classes),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the unnormalised class scores produced by the layer stack."""
        logits = self.model(x)
        return logits

# Local training function
def local_train(model, X, y):
    """Train ``model`` in place on one client's data and return its state dict.

    Runs EPOCHS_LOCAL passes of mini-batch SGD (learning rate LR, batch size
    BATCH_SIZE, reshuffled every epoch) with cross-entropy loss. ``X`` and
    ``y`` are converted to float32 / long tensors on DEVICE; the model is
    expected to already live on DEVICE.
    """
    model.train()
    sgd = optim.SGD(model.parameters(), lr=LR)
    criterion = nn.CrossEntropyLoss()

    features = torch.tensor(X, dtype=torch.float32, device=DEVICE)
    targets = torch.tensor(y, dtype=torch.long, device=DEVICE)
    batches = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(features, targets),
        batch_size=BATCH_SIZE,
        shuffle=True,
    )

    for _epoch in range(EPOCHS_LOCAL):
        for xb, yb in batches:
            sgd.zero_grad()
            criterion(model(xb), yb).backward()
            sgd.step()

    return model.state_dict()

# Federated averaging
def average_weights(weight_list):
    """Return the unweighted element-wise mean of several state dicts.

    Args:
        weight_list: non-empty sequence of state dicts; every dict must
            contain the keys of the first one.

    Returns:
        A dict with the keys of the first state dict, each mapped to the
        mean of that entry over all dicts in ``weight_list``.
    """
    n_models = len(weight_list)
    reference = weight_list[0]
    return {
        name: sum(state[name] for state in weight_list) / n_models
        for name in reference
    }

# Initialize global model
global_model = MLPClassifier(input_size, num_classes).to(DEVICE)
global_weights = global_model.state_dict()

# Training rounds: every round samples a subset of clients, trains a fresh
# model initialised from the current global weights on each of them, and
# replaces the global weights with the plain average of the results.
for rnd in range(ROUNDS):
    print(f"\n📡 --- Round {rnd + 1} ---")
    selected = random.sample(client_datasets, min(CLIENTS_PER_ROUND, len(client_datasets)))
    local_weights = []

    for client_id, X, y in selected:
        print(f" → Training on client {client_id} with {len(y)} samples")
        # A new model per client keeps local updates independent of each other.
        client_model = MLPClassifier(input_size, num_classes).to(DEVICE)
        client_model.load_state_dict(global_weights)
        local_weights.append(local_train(client_model, X, y))

    global_weights = average_weights(local_weights)
    global_model.load_state_dict(global_weights)

print("\n✅ Federated training complete!")


Global unique labels (12 classes): [ 1  2  3  4  5  6  7  8  9 10 11 12]

✅ Loaded clients: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

📡 --- Round 1 ---
 → Training on client 7 with 448 samples
 → Training on client 6 with 202 samples
 → Training on client 10 with 64 samples
 → Training on client 3 with 365 samples
 → Training on client 5 with 209 samples

📡 --- Round 2 ---
 → Training on client 9 with 412 samples
 → Training on client 10 with 64 samples
 → Training on client 3 with 365 samples
 → Training on client 5 with 209 samples
 → Training on client 7 with 448 samples

📡 --- Round 3 ---
 → Training on client 9 with 412 samples
 → Training on client 4 with 207 samples
 → Training on client 6 with 202 samples
 → Training on client 7 with 448 samples
 → Training on client 5 with 209 samples

📡 --- Round 4 ---
 → Training on client 9 with 412 samples
 → Training on client 5 with 209 samples
 → Training on client 4 with 207 samples
 → Training on client 2 with 113 samples
 → Training on client

A federated learning experiment was conducted using 10 clients and a simple MLP model over 5 rounds, with 5 randomly selected clients participating in each round. A global label mapping and client-specific feature scaling were applied. All clients contributed at least once, with client 7 appearing most frequently and having the highest number of samples. The global model was updated through weight averaging, and training completed successfully without centralizing any data.

In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import random

# SETTINGS
NUM_CLIENTS = 10
CLIENT_DATA_PATH = "/content/drive/Shareddrives/ML4Net/Seminar5/dataset_Seminar5/client_datasets/"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EPOCHS_LOCAL = 20
ROUNDS = 50
CLIENTS_PER_ROUND = 5
LR = 0.01
BATCH_SIZE = 32

# Step 1: Collect all labels globally to build global label mapping
all_labels = []

for client_id in range(1, NUM_CLIENTS + 1):
    labels_path = os.path.join(CLIENT_DATA_PATH, f"client_{client_id}_labels.csv")
    if os.path.exists(labels_path):
        y = pd.read_csv(labels_path, header=None).values.flatten()
        all_labels.extend(y)
    else:
        print(f"Missing labels file for client {client_id} — Skipping")

all_labels = np.array(all_labels)
global_unique_labels = np.unique(all_labels)
# np.unique returns sorted values, so class indices follow the sorted label order.
global_label_map = {old_label: new_label for new_label, old_label in enumerate(global_unique_labels)}
num_classes = len(global_unique_labels)
print(f"Global unique labels ({num_classes} classes): {global_unique_labels}")

# Step 2: Load client data, apply the global label map and standardise features.
#
# The training clients and any data the global model is later evaluated on
# must be standardised with the SAME statistics. Fitting one scaler per client
# and keeping only client 1's for later use scales training and evaluation
# inputs differently, and leaves `scaler_global` undefined whenever client 1
# is skipped. Instead, `scaler_global` accumulates mean/variance over every
# valid client via partial_fit, and is applied to all clients afterwards.
# NOTE(review): in this notebook the statistics are accumulated in one
# process; a real deployment would have to exchange them between clients.
scaler_global = StandardScaler()
raw_client_data = []
for client_id in range(1, NUM_CLIENTS + 1):
    features_path = os.path.join(CLIENT_DATA_PATH, f"client_{client_id}_features.csv")
    labels_path = os.path.join(CLIENT_DATA_PATH, f"client_{client_id}_labels.csv")

    if not (os.path.exists(features_path) and os.path.exists(labels_path)):
        print(f"Missing files for client {client_id} — Skipping")
        continue

    X = pd.read_csv(features_path, header=None).values  # one row per sample
    y = pd.read_csv(labels_path, header=None).values.flatten()

    if X.shape[0] != len(y):
        print(f"⚠️ Size mismatch for client {client_id} — Skipping (X: {X.shape}, y: {len(y)})")
        continue

    # Map labels using global mapping
    y_mapped = np.array([global_label_map[label] for label in y])

    # Only clients that pass validation contribute to the shared statistics.
    scaler_global.partial_fit(X)
    raw_client_data.append((client_id, X, y_mapped))

if not raw_client_data:
    raise RuntimeError("No valid client data available. Check dataset paths and integrity.")

# Second pass: the scaler is complete now, so every client is transformed
# with identical mean/variance.
client_datasets = [
    (cid, scaler_global.transform(X_raw), y_idx)
    for cid, X_raw, y_idx in raw_client_data
]

print(f"\n✅ Loaded clients: {[cid for cid, _, _ in client_datasets]}")

input_size = client_datasets[0][1].shape[1]

# Define model
class MLPClassifier(nn.Module):
    """Feed-forward classifier: input -> 256 -> 128 -> num_classes.

    Each hidden layer is followed by ReLU and Dropout(0.3).
    """

    def __init__(self, input_size, num_classes):
        super().__init__()
        # Layers are instantiated in forward order and wrapped in a Sequential
        # stored under ``model``, which fixes the state_dict key names.
        layers = [
            nn.Linear(input_size, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the unnormalised class scores produced by the layer stack."""
        logits = self.model(x)
        return logits

# Local training function
def local_train(model, X, y):
    """Train ``model`` in place on one client's data and return its state dict.

    Runs EPOCHS_LOCAL passes of mini-batch SGD (learning rate LR, batch size
    BATCH_SIZE, reshuffled every epoch) with cross-entropy loss. ``X`` and
    ``y`` are converted to float32 / long tensors on DEVICE; the model is
    expected to already live on DEVICE.
    """
    # train() matters here: it switches the model into training mode before
    # the optimisation loop.
    model.train()
    sgd = optim.SGD(model.parameters(), lr=LR)
    criterion = nn.CrossEntropyLoss()

    features = torch.tensor(X, dtype=torch.float32, device=DEVICE)
    targets = torch.tensor(y, dtype=torch.long, device=DEVICE)
    batches = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(features, targets),
        batch_size=BATCH_SIZE,
        shuffle=True,
    )

    for _epoch in range(EPOCHS_LOCAL):
        for xb, yb in batches:
            sgd.zero_grad()
            criterion(model(xb), yb).backward()
            sgd.step()

    return model.state_dict()

# Federated averaging with data-size-based weights (FedAvg)
def weighted_average_weights(weight_list, data_sizes):
    """Average state dicts, weighting each by its share of the total samples.

    Args:
        weight_list: non-empty sequence of state dicts; every dict must
            contain the keys of the first one.
        data_sizes: one sample count per entry of ``weight_list``, in the
            same order.

    Returns:
        A dict with the keys of the first state dict, each mapped to
        ``sum_i(weights_i[key] * n_i / sum(n))``.

    Raises:
        ValueError: if ``weight_list`` is empty, if the two arguments differ
            in length, or if the sample counts do not sum to a positive
            number.
    """
    sizes = list(data_sizes)
    if not weight_list:
        raise ValueError("weight_list must contain at least one state dict")
    # zip() would silently drop the surplus entries and yield an average
    # whose coefficients no longer sum to one, so reject mismatches up front.
    if len(weight_list) != len(sizes):
        raise ValueError(
            f"got {len(weight_list)} state dicts but {len(sizes)} data sizes"
        )
    total_data = sum(sizes)
    if total_data <= 0:
        raise ValueError("data_sizes must sum to a positive number")

    # The per-client coefficients do not depend on the key; compute them once.
    fractions = [n_samples / total_data for n_samples in sizes]
    return {
        key: sum(weights[key] * frac for weights, frac in zip(weight_list, fractions))
        for key in weight_list[0]
    }


# Initialize global model
global_model = MLPClassifier(input_size, num_classes).to(DEVICE)
global_weights = global_model.state_dict()

# Training rounds: every round samples a subset of clients, trains a fresh
# model initialised from the current global weights on each of them, and
# merges the results weighted by each client's sample count.
for rnd in range(ROUNDS):
    print(f"\n📡 --- Round {rnd + 1} ---")
    selected = random.sample(client_datasets, min(CLIENTS_PER_ROUND, len(client_datasets)))
    local_weights = []

    for client_id, X, y in selected:
        print(f" → Training on client {client_id} with {len(y)} samples")
        # A new model per client keeps local updates independent of each other.
        client_model = MLPClassifier(input_size, num_classes).to(DEVICE)
        client_model.load_state_dict(global_weights)
        local_weights.append(local_train(client_model, X, y))

    # Sample counts in the same order as local_weights.
    local_sizes = [len(labels) for _cid, _features, labels in selected]

    # Weighted FedAvg
    global_weights = weighted_average_weights(local_weights, local_sizes)
    global_model.load_state_dict(global_weights)


print("\n✅ Federated training complete!")

Global unique labels (12 classes): [ 1  2  3  4  5  6  7  8  9 10 11 12]

✅ Loaded clients: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

📡 --- Round 1 ---
 → Training on client 4 with 207 samples
 → Training on client 2 with 113 samples
 → Training on client 1 with 314 samples
 → Training on client 9 with 412 samples
 → Training on client 6 with 202 samples

📡 --- Round 2 ---
 → Training on client 8 with 142 samples
 → Training on client 10 with 64 samples
 → Training on client 2 with 113 samples
 → Training on client 6 with 202 samples
 → Training on client 5 with 209 samples

📡 --- Round 3 ---
 → Training on client 2 with 113 samples
 → Training on client 7 with 448 samples
 → Training on client 9 with 412 samples
 → Training on client 10 with 64 samples
 → Training on client 1 with 314 samples

📡 --- Round 4 ---
 → Training on client 5 with 209 samples
 → Training on client 10 with 64 samples
 → Training on client 3 with 365 samples
 → Training on client 6 with 202 samples
 → Training on client

We improved the Federated Learning setup by implementing weighted model aggregation using the FedAvg algorithm. Instead of averaging client models equally, each client's contribution was weighted based on the size of its local dataset. This ensured a more accurate and fair update of the global model. The approach was integrated into the training loop and executed successfully across multiple rounds.

In [None]:
# Load and prepare test data
test_features_path = "/content/drive/Shareddrives/ML4Net/Seminar5/dataset_Seminar5/test_features.csv"
test_labels_path = "/content/drive/Shareddrives/ML4Net/Seminar5/dataset_Seminar5/test_labels.csv"

X_test = pd.read_csv(test_features_path, header=None).values
y_test_raw = pd.read_csv(test_labels_path, header=None).values.flatten()

# Translate raw labels into the class indices used during training.
y_test = np.array([global_label_map[label] for label in y_test_raw])

# Standardise with scaler_global, which is defined by the training cell.
X_test_scaled = scaler_global.transform(X_test)

# Build the evaluation tensors directly on the target device.
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32, device=DEVICE)
y_test_tensor = torch.tensor(y_test, dtype=torch.long, device=DEVICE)

# Evaluation: a single forward pass over the whole test set without gradients.
global_model.eval()
with torch.no_grad():
    outputs = global_model(X_test_tensor)
    # The predicted class is the index of the largest score in each row.
    predicted = torch.max(outputs, dim=1).indices
    total = y_test_tensor.size(0)
    correct = (predicted == y_test_tensor).sum().item()
    accuracy = correct / total

print(f"\n📊 Test Accuracy of the global model: {accuracy * 100:.2f}%")



📊 Test Accuracy of the global model: 51.40%


In the final stage, we trained and evaluated the global model using Federated Learning (FL) across 10 clients with non-centralized Wi-Fi CSI data. We improved the model architecture with a deeper MLP and increased the training rounds to 50 and local epochs to 20. This setup significantly boosted performance, reaching a test accuracy of 51.40% on a 12-class classification task. The weighted averaging strategy (FedAvg) ensured fair contribution from clients based on their dataset sizes. Despite data heterogeneity, the model was able to learn meaningful patterns collaboratively. The results demonstrate the effectiveness of FL in preserving privacy while maintaining competitive performance.