In [46]:
%load_ext autoreload
%autoreload 2

import os, sys
import copy
import socket
from tqdm import tqdm
import torch
import pickle
from torch import optim
from torch.utils.tensorboard import SummaryWriter

#!pip install networkx matplotlib
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt

import sys, os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "../")))
from libs.topology_manager import *
from libs import fl, nn, data, log

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [47]:
# Initialise the project logger at "info" level.
# Level names listed by the original author: info | debug | warning | error | critical.
log.init("info")
# Optional alternatives that pass a second argument -- presumably the file the
# log is written to; confirm against libs.log before enabling one of them.
#log.init("info", "federated.log")
#log.init("debug", "flkafka.log")

In [48]:
# Star topology over six nodes: node 0 is linked in both directions to each of
# nodes 1-5, and no other pair of nodes is connected.
adj_mat = np.zeros((6, 6), dtype=int)
adj_mat[0, 1:] = 1   # hub -> every other node
adj_mat[1:, 0] = 1   # every other node -> hub

# Role name -> indices of the nodes that play that role.
node_types = dict(aggregator=[0, 1], trainer=[2, 3], broadcaster=[4, 5])

# Directed-graph view of the adjacency matrix. In the visible cells it is only
# referenced by the commented-out draw call below.
di_graph = nx.DiGraph(adj_mat)
#nx.draw(di_graph)

In [49]:
class FedArgs():
    """Run configuration shared by the server loop and the client tasks.

    Instantiating the class also opens a TensorBoard ``SummaryWriter``; the
    caller is responsible for closing ``tb`` when the run is over.
    """

    def __init__(self):
        # One client per row of the adjacency matrix.
        self.num_clients = len(adj_mat)
        # Number of federated rounds driven by the training loop.
        self.epochs = 10
        # Passed to fl.client_update together with the optimiser settings below.
        self.local_rounds = 1
        self.client_batch_size = 32
        self.test_batch_size = 128
        self.learning_rate = 1e-4
        self.weight_decay = 1e-5
        # Request CUDA; it is only used when torch also reports it as available.
        self.cuda = False
        self.seed = 1
        # Topic the clients produce to and the server consumes from.
        self.topic = 'pyflx-c'
        # Topic the server produces to and the clients consume from.
        self.server_topic = "pyflx-cs"
        # NOTE: per the SummaryWriter documentation `comment` is only a suffix
        # for the auto-generated default log_dir, so it has no effect when a
        # log_dir is passed explicitly as here; it is kept as a description.
        self.tb = SummaryWriter('../out/runs/flkafka', comment="Mnist Distributed Federated training")

# Re-running this cell used to leave the previous SummaryWriter open, leaking
# its event file and background writer thread once per re-run. Close it first.
_stale_fedargs = globals().get("fedargs")
if _stale_fedargs is not None and hasattr(_stale_fedargs, "tb"):
    _stale_fedargs.tb.close()

fedargs = FedArgs()

RuntimeError: can't start new thread

In [50]:
# Seed torch's RNG so model initialisation and shuffling are repeatable.
torch.manual_seed(fedargs.seed)

# CUDA is used only when it is both requested in the config and available.
use_cuda = fedargs.cuda and torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda")
    # Extra DataLoader options applied only on the CUDA path.
    kwargs = {"num_workers": 1, "pin_memory": True}
else:
    device = torch.device("cpu")
    kwargs = {}

In [51]:
# Participants are named after this machine: "<host>(server)" for the server
# and "<host>(1)" .. "<host>(num_clients)" for the clients.
host = socket.gethostname()
server = "{}(server)".format(host)
clients = ["{}({})".format(host, idx) for idx in range(1, fedargs.num_clients + 1)]
ctp = CentralizedTopology(adj_mat, server, clients, node_types)

In [52]:
# Build the global model once, then give every client its own independent
# deep copy so all participants start from identical weights.
global_model = nn.ModelMNIST()
client_models = dict((name, copy.deepcopy(global_model)) for name in clients)

# Function for training
def train_model(model, train_loader, fedargs, device):
    """Run a client update on ``model`` with the settings held in ``fedargs``.

    Forwards ``learning_rate``, ``weight_decay`` and ``local_rounds`` from
    ``fedargs`` to ``fl.client_update`` and hands back its two results.

    Args:
        model: model to update.
        train_loader: data loader passed through to ``fl.client_update``.
        fedargs: configuration object providing the three settings above.
        device: device passed through to ``fl.client_update``.

    Returns:
        The ``(model, loss)`` pair produced by ``fl.client_update``.
    """
    hyper_params = (fedargs.learning_rate, fedargs.weight_decay, fedargs.local_rounds)
    updated_model, losses = fl.client_update(model, train_loader, *hyper_params, device)
    return updated_model, losses

In [53]:
# Load MNIST and partition the training set across the clients.
train_data, test_data = data.load_dataset("mnist")
clients_data = data.split_data(train_data, clients)

# Per-client train/test loaders; 0.2 is presumably the fraction held out for
# each client's test loader -- confirm against data.load_client_data.
client_train_loaders, client_test_loaders = data.load_client_data(
    clients_data, fedargs.client_batch_size, 0.2, **kwargs
)

# Loader over the full test set, used for the global model's evaluation.
test_loader = torch.utils.data.DataLoader(
    test_data, shuffle=True, batch_size=fedargs.test_batch_size, **kwargs
)

# Client name -> the two loaders that belong to that client.
clients_info = {
    name: dict(train_loader=client_train_loaders[name],
               test_loader=client_test_loaders[name])
    for name in clients
}

In [44]:
import asyncio
import time

def background(f):
    """Decorator that runs ``f`` in the event loop's default executor.

    Calling the decorated function does not execute ``f`` inline; it schedules
    the call on the default executor of the current asyncio event loop and
    returns the resulting awaitable future straight away.

    Args:
        f: the blocking callable to run off the event loop.

    Returns:
        A wrapper with the same call signature as ``f`` that returns a future
        resolving to ``f``'s return value (or raising its exception).
    """
    import functools  # local import so this cell stays self-contained

    @functools.wraps(f)
    def wrapped(*args, **kwargs):
        # run_in_executor() only forwards positional arguments, so keyword
        # arguments have to be bound up front with functools.partial.
        call = functools.partial(f, *args, **kwargs)
        return asyncio.get_event_loop().run_in_executor(None, call)

    return wrapped

@background
def process(client, epoch, ctp, model, train_loader, test_loader, fedargs, device):
    """Run one federated round for a single client.

    Because of ``@background`` a call returns a future immediately and the body
    below runs in the event loop's default executor.

    Steps: consume the models published on the server topic, continue from
    their average if any arrived, run the client update, publish the updated
    model on the client topic, then log the losses and the test result.

    Args:
        client: client name; also used in log messages and TensorBoard tags.
        epoch: federated round number used to tag consumed and produced models.
        ctp: topology object providing ``consume_model`` / ``produce_model``.
        model: the client's current model, used when nothing is received.
        train_loader: loader passed to ``fl.client_update``.
        test_loader: loader passed to ``fl.eval``.
        fedargs: run configuration (topics, optimiser settings, ``tb`` writer).
        device: device passed to ``fl.client_update`` and ``fl.eval``.

    Returns:
        None; the future resolves once the round has been logged.
    """
    log.info("Processing Client {}".format(client))

    # Consume and Average. The training loop passes the same epoch value the
    # server used when producing to server_topic, so this reads that update.
    rcvd_models = ctp.consume_model(client, fedargs.server_topic, model, epoch)
    log.info("Client {} received {} model updates from {}".format(client, len(rcvd_models), list(rcvd_models.keys())))
    if len(rcvd_models) != 0:
        model = fl.federated_avg(rcvd_models)

    # Train
    model, loss = fl.client_update(model,
                                train_loader,
                                fedargs.learning_rate,
                                fedargs.weight_decay,
                                fedargs.local_rounds,
                                device)

    ctp.produce_model(client, fedargs.topic, model, epoch)

    # Plot and Log. The loop variable must not be called `loss`: that would
    # overwrite the loss mapping, so the debug log below would only receive
    # the last scalar. global_step is passed as an int, as the API documents.
    for local_epoch, local_loss in enumerate(loss.values(), start=1):
        fedargs.tb.add_scalars("Training Loss/" + client, {str(epoch): local_loss}, local_epoch)

    log.jsondebug(loss, "Epoch {} of {} : Federated Training loss, Client {}".format(epoch, fedargs.epochs, client))
    log.modeldebug(model, "Epoch {} of {} : Client {} Update".format(epoch, fedargs.epochs, client))

    # Test
    test_output = fl.eval(model, test_loader, device)
    fedargs.tb.add_scalar("Accuracy/" + client, test_output["accuracy"], epoch)
    log.jsoninfo(test_output, "Test Outut after Epoch {} of {} for Client {}".format(epoch, fedargs.epochs, client))

In [45]:
import time
start_time = time.time()

# Federated Training
for _epoch in tqdm(range(fedargs.epochs)):

    epoch = _epoch + 1
    log.info("Federated Training Epoch {} of {}".format(epoch, fedargs.epochs))
    
    # Server, Consume and Average, epoch passed is actually prev epoch, for which we want to consume updates
    rcvd_models = ctp.consume_model(server, fedargs.topic, global_model, epoch - 1)
    log.info("Server received {} model updates from {}".format(len(rcvd_models), list(rcvd_models.keys())))
    if len(rcvd_models) != 0:
        global_model = fl.federated_avg(rcvd_models)
        
    ctp.produce_model(server, fedargs.server_topic, global_model, epoch)
        
    # Gloabal Test
    log.modeldebug(global_model, "Epoch {} of {} : Server Update".format(epoch, fedargs.epochs))
    global_test_output = fl.eval(global_model, test_loader, device)
    fedargs.tb.add_scalar("Gloabl Accuracy/", global_test_output["accuracy"], epoch)
    log.jsoninfo(global_test_output, "Gloabl Test Outut after Epoch {} of {}".format(epoch, fedargs.epochs))
    
    # Clients
    tasks = [process(client, epoch, ctp, client_models[client],
                     clients_info[client]['train_loader'],
                     clients_info[client]['test_loader'],
                     fedargs, device) for client in clients]
    await asyncio.wait(tasks)

print(time.time() - start_time)

  0%|          | 0/10 [00:00<?, ?it/s]2021-08-16 23:22:04,125 - <ipython-input-45-94a59301310e>::<module>(l:8) : Federated Training Epoch 1 of 10 [MainProcess : MainThread (INFO)]
2021-08-16 23:22:14,159 - <ipython-input-45-94a59301310e>::<module>(l:12) : Server received 0 model updates from [] [MainProcess : MainThread (INFO)]
2021-08-16 23:22:14,479 - /home/harsh_1921cs01/hub/F3IA/fl/libs/protobuf_producer.py::produce(l:56) : Producing user records to topic pyflx-cs. ^C to exit. [MainProcess : MainThread (INFO)]
2021-08-16 23:22:14,636 - /home/harsh_1921cs01/hub/F3IA/fl/libs/protobuf_producer.py::produce(l:66) : Flushing records... [MainProcess : MainThread (INFO)]
2021-08-16 23:22:15,558 - /home/harsh_1921cs01/hub/F3IA/fl/libs/protobuf_producer.py::delivery_report(l:50) : User record b'bladecluster.iitp.org(server)' successfully produced to pyflx-cs [0] at offset 0 [MainProcess : MainThread (INFO)]
2021-08-16 23:22:21,407 - <ipython-input-45-94a59301310e>::<module>(l:22) : Gloabl Te

2021-08-16 23:22:56,174 - <ipython-input-44-2e03a472c65d>::process(l:40) : Test Outut after Epoch 1 of 10 for Client bladecluster.iitp.org(5) {
    "accuracy": 90.14999999999999,
    "attack": {
        "attack_success_count": 0,
        "attack_success_rate": 0,
        "instances": 0,
        "misclassification_rate": 0,
        "misclassifications": 0
    },
    "correct": 1803,
    "test_loss": 0.32805385929346087
} [MainProcess : asyncio_1 (INFO)]
2021-08-16 23:22:56,418 - <ipython-input-44-2e03a472c65d>::process(l:40) : Test Outut after Epoch 1 of 10 for Client bladecluster.iitp.org(3) {
    "accuracy": 91.45,
    "attack": {
        "attack_success_count": 0,
        "attack_success_rate": 0,
        "instances": 0,
        "misclassification_rate": 0,
        "misclassifications": 0
    },
    "correct": 1829,
    "test_loss": 0.32179259514808656
} [MainProcess : asyncio_3 (INFO)]
2021-08-16 23:22:56,694 - <ipython-input-44-2e03a472c65d>::process(l:40) : Test Outut after Epoch 

2021-08-16 23:23:47,915 - /home/harsh_1921cs01/hub/F3IA/fl/libs/protobuf_producer.py::delivery_report(l:50) : User record b'bladecluster.iitp.org(6)' successfully produced to pyflx-c [0] at offset 11 [MainProcess : asyncio_2 (INFO)]
2021-08-16 23:23:48,353 - <ipython-input-44-2e03a472c65d>::process(l:40) : Test Outut after Epoch 2 of 10 for Client bladecluster.iitp.org(2) {
    "accuracy": 92.85,
    "attack": {
        "attack_success_count": 0,
        "attack_success_rate": 0,
        "instances": 0,
        "misclassification_rate": 0,
        "misclassifications": 0
    },
    "correct": 1857,
    "test_loss": 0.23333732426166534
} [MainProcess : asyncio_0 (INFO)]
2021-08-16 23:23:49,598 - <ipython-input-44-2e03a472c65d>::process(l:40) : Test Outut after Epoch 2 of 10 for Client bladecluster.iitp.org(1) {
    "accuracy": 93.30000000000001,
    "attack": {
        "attack_success_count": 0,
        "attack_success_rate": 0,
        "instances": 0,
        "misclassification_rate": 

2021-08-16 23:24:39,004 - /home/harsh_1921cs01/hub/F3IA/fl/libs/protobuf_producer.py::delivery_report(l:50) : User record b'bladecluster.iitp.org(5)' successfully produced to pyflx-c [0] at offset 14 [MainProcess : asyncio_5 (INFO)]
2021-08-16 23:24:39,012 - /home/harsh_1921cs01/hub/F3IA/fl/libs/protobuf_producer.py::produce(l:56) : Producing user records to topic pyflx-c. ^C to exit. [MainProcess : asyncio_1 (INFO)]
2021-08-16 23:24:39,279 - /home/harsh_1921cs01/hub/F3IA/fl/libs/protobuf_producer.py::produce(l:66) : Flushing records... [MainProcess : asyncio_3 (INFO)]
2021-08-16 23:24:39,308 - /home/harsh_1921cs01/hub/F3IA/fl/libs/protobuf_producer.py::produce(l:66) : Flushing records... [MainProcess : asyncio_1 (INFO)]
2021-08-16 23:24:39,967 - /home/harsh_1921cs01/hub/F3IA/fl/libs/protobuf_producer.py::delivery_report(l:50) : User record b'bladecluster.iitp.org(3)' successfully produced to pyflx-c [0] at offset 15 [MainProcess : asyncio_3 (INFO)]
2021-08-16 23:24:39,975 - /home/hars

2021-08-16 23:25:28,042 - /home/harsh_1921cs01/hub/F3IA/fl/libs/protobuf_producer.py::produce(l:66) : Flushing records... [MainProcess : asyncio_4 (INFO)]
2021-08-16 23:25:28,910 - /home/harsh_1921cs01/hub/F3IA/fl/libs/protobuf_producer.py::delivery_report(l:50) : User record b'bladecluster.iitp.org(1)' successfully produced to pyflx-c [0] at offset 18 [MainProcess : asyncio_4 (INFO)]
2021-08-16 23:25:29,364 - /home/harsh_1921cs01/hub/F3IA/fl/libs/protobuf_producer.py::produce(l:56) : Producing user records to topic pyflx-c. ^C to exit. [MainProcess : asyncio_0 (INFO)]
2021-08-16 23:25:29,517 - /home/harsh_1921cs01/hub/F3IA/fl/libs/protobuf_producer.py::produce(l:66) : Flushing records... [MainProcess : asyncio_0 (INFO)]
2021-08-16 23:25:30,421 - /home/harsh_1921cs01/hub/F3IA/fl/libs/protobuf_producer.py::delivery_report(l:50) : User record b'bladecluster.iitp.org(2)' successfully produced to pyflx-c [0] at offset 19 [MainProcess : asyncio_0 (INFO)]
2021-08-16 23:25:30,753 - /home/hars

2021-08-16 23:25:53,084 - <ipython-input-44-2e03a472c65d>::process(l:12) : Processing Client bladecluster.iitp.org(5) [MainProcess : asyncio_5 (INFO)]
2021-08-16 23:26:03,069 - <ipython-input-44-2e03a472c65d>::process(l:16) : Client bladecluster.iitp.org(1) received 1 model updates from ['bladecluster.iitp.org(server)'] [MainProcess : asyncio_4 (INFO)]
2021-08-16 23:26:03,158 - <ipython-input-44-2e03a472c65d>::process(l:16) : Client bladecluster.iitp.org(2) received 1 model updates from ['bladecluster.iitp.org(server)'] [MainProcess : asyncio_0 (INFO)]
2021-08-16 23:26:03,193 - <ipython-input-44-2e03a472c65d>::process(l:16) : Client bladecluster.iitp.org(5) received 1 model updates from ['bladecluster.iitp.org(server)'] [MainProcess : asyncio_5 (INFO)]
2021-08-16 23:26:03,225 - <ipython-input-44-2e03a472c65d>::process(l:16) : Client bladecluster.iitp.org(6) received 1 model updates from ['bladecluster.iitp.org(server)'] [MainProcess : asyncio_2 (INFO)]
2021-08-16 23:26:03,226 - <ipytho

2021-08-16 23:26:38,494 - /home/harsh_1921cs01/hub/F3IA/fl/libs/protobuf_producer.py::delivery_report(l:50) : User record b'bladecluster.iitp.org(server)' successfully produced to pyflx-cs [0] at offset 5 [MainProcess : MainThread (INFO)]
2021-08-16 23:26:44,322 - <ipython-input-45-94a59301310e>::<module>(l:22) : Gloabl Test Outut after Epoch 6 of 10 {
    "accuracy": 97.13000000000001,
    "attack": {
        "attack_success_count": 0,
        "attack_success_rate": 0,
        "instances": 0,
        "misclassification_rate": 0,
        "misclassifications": 0
    },
    "correct": 9713,
    "test_loss": 0.09265403573513031
} [MainProcess : MainThread (INFO)]
2021-08-16 23:26:44,379 - <ipython-input-44-2e03a472c65d>::process(l:12) : Processing Client bladecluster.iitp.org(1) [MainProcess : asyncio_0 (INFO)]
2021-08-16 23:26:44,400 - <ipython-input-44-2e03a472c65d>::process(l:12) : Processing Client bladecluster.iitp.org(5) [MainProcess : asyncio_3 (INFO)]
2021-08-16 23:26:44,405 - <ip

2021-08-16 23:27:19,522 - <ipython-input-44-2e03a472c65d>::process(l:40) : Test Outut after Epoch 6 of 10 for Client bladecluster.iitp.org(3) {
    "accuracy": 97.2,
    "attack": {
        "attack_success_count": 0,
        "attack_success_rate": 0,
        "instances": 0,
        "misclassification_rate": 0,
        "misclassifications": 0
    },
    "correct": 1944,
    "test_loss": 0.09767933869361878
} [MainProcess : asyncio_5 (INFO)]
 60%|██████    | 6/10 [05:15<03:29, 52.30s/it]2021-08-16 23:27:19,543 - <ipython-input-45-94a59301310e>::<module>(l:8) : Federated Training Epoch 7 of 10 [MainProcess : MainThread (INFO)]
2021-08-16 23:27:29,582 - <ipython-input-45-94a59301310e>::<module>(l:12) : Server received 6 model updates from ['bladecluster.iitp.org(2)', 'bladecluster.iitp.org(4)', 'bladecluster.iitp.org(1)', 'bladecluster.iitp.org(6)', 'bladecluster.iitp.org(5)', 'bladecluster.iitp.org(3)'] [MainProcess : MainThread (INFO)]
2021-08-16 23:27:29,902 - /home/harsh_1921cs01/hub/F

2021-08-16 23:28:10,740 - <ipython-input-44-2e03a472c65d>::process(l:40) : Test Outut after Epoch 7 of 10 for Client bladecluster.iitp.org(1) {
    "accuracy": 97.15,
    "attack": {
        "attack_success_count": 0,
        "attack_success_rate": 0,
        "instances": 0,
        "misclassification_rate": 0,
        "misclassifications": 0
    },
    "correct": 1943,
    "test_loss": 0.10397138421237469
} [MainProcess : asyncio_1 (INFO)]
2021-08-16 23:28:10,824 - <ipython-input-44-2e03a472c65d>::process(l:40) : Test Outut after Epoch 7 of 10 for Client bladecluster.iitp.org(6) {
    "accuracy": 97.25,
    "attack": {
        "attack_success_count": 0,
        "attack_success_rate": 0,
        "instances": 0,
        "misclassification_rate": 0,
        "misclassifications": 0
    },
    "correct": 1945,
    "test_loss": 0.08601928333938122
} [MainProcess : asyncio_5 (INFO)]
2021-08-16 23:28:10,877 - <ipython-input-44-2e03a472c65d>::process(l:40) : Test Outut after Epoch 7 of 10 for 

2021-08-16 23:28:59,271 - /home/harsh_1921cs01/hub/F3IA/fl/libs/protobuf_producer.py::produce(l:66) : Flushing records... [MainProcess : asyncio_5 (INFO)]
2021-08-16 23:28:59,528 - /home/harsh_1921cs01/hub/F3IA/fl/libs/protobuf_producer.py::delivery_report(l:50) : User record b'bladecluster.iitp.org(1)' successfully produced to pyflx-c [0] at offset 46 [MainProcess : asyncio_0 (INFO)]
2021-08-16 23:28:59,992 - /home/harsh_1921cs01/hub/F3IA/fl/libs/protobuf_producer.py::delivery_report(l:50) : User record b'bladecluster.iitp.org(4)' successfully produced to pyflx-c [0] at offset 47 [MainProcess : asyncio_5 (INFO)]
2021-08-16 23:29:02,222 - <ipython-input-44-2e03a472c65d>::process(l:40) : Test Outut after Epoch 8 of 10 for Client bladecluster.iitp.org(2) {
    "accuracy": 97.15,
    "attack": {
        "attack_success_count": 0,
        "attack_success_rate": 0,
        "instances": 0,
        "misclassification_rate": 0,
        "misclassifications": 0
    },
    "correct": 1943,
    "t

2021-08-16 23:29:49,212 - /home/harsh_1921cs01/hub/F3IA/fl/libs/protobuf_producer.py::delivery_report(l:50) : User record b'bladecluster.iitp.org(4)' successfully produced to pyflx-c [0] at offset 49 [MainProcess : asyncio_0 (INFO)]
2021-08-16 23:29:49,222 - /home/harsh_1921cs01/hub/F3IA/fl/libs/protobuf_producer.py::produce(l:56) : Producing user records to topic pyflx-c. ^C to exit. [MainProcess : asyncio_1 (INFO)]
2021-08-16 23:29:49,576 - /home/harsh_1921cs01/hub/F3IA/fl/libs/protobuf_producer.py::produce(l:66) : Flushing records... [MainProcess : asyncio_3 (INFO)]
2021-08-16 23:29:49,634 - /home/harsh_1921cs01/hub/F3IA/fl/libs/protobuf_producer.py::produce(l:66) : Flushing records... [MainProcess : asyncio_5 (INFO)]
2021-08-16 23:29:49,717 - /home/harsh_1921cs01/hub/F3IA/fl/libs/protobuf_producer.py::produce(l:66) : Flushing records... [MainProcess : asyncio_1 (INFO)]
2021-08-16 23:29:50,016 - /home/harsh_1921cs01/hub/F3IA/fl/libs/protobuf_producer.py::produce(l:56) : Producing us

2021-08-16 23:30:20,943 - <ipython-input-44-2e03a472c65d>::process(l:16) : Client bladecluster.iitp.org(6) received 1 model updates from ['bladecluster.iitp.org(server)'] [MainProcess : asyncio_2 (INFO)]
2021-08-16 23:30:37,517 - /home/harsh_1921cs01/hub/F3IA/fl/libs/protobuf_producer.py::produce(l:56) : Producing user records to topic pyflx-c. ^C to exit. [MainProcess : asyncio_4 (INFO)]
2021-08-16 23:30:37,718 - /home/harsh_1921cs01/hub/F3IA/fl/libs/protobuf_producer.py::produce(l:66) : Flushing records... [MainProcess : asyncio_4 (INFO)]
2021-08-16 23:30:38,805 - /home/harsh_1921cs01/hub/F3IA/fl/libs/protobuf_producer.py::delivery_report(l:50) : User record b'bladecluster.iitp.org(1)' successfully produced to pyflx-c [0] at offset 54 [MainProcess : asyncio_4 (INFO)]
2021-08-16 23:30:38,807 - /home/harsh_1921cs01/hub/F3IA/fl/libs/protobuf_producer.py::produce(l:56) : Producing user records to topic pyflx-c. ^C to exit. [MainProcess : asyncio_5 (INFO)]
2021-08-16 23:30:39,033 - /home/

520.8064320087433



