In [1]:
%load_ext autoreload
%autoreload 2

import asyncio, copy, os, socket, sys, time
from functools import partial
from multiprocessing import Pool, Process
from pathlib import Path
from tqdm import tqdm

import torch
from torch import optim
from torch.utils.tensorboard import SummaryWriter

sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "../")))
from libs import agg, data, fl, log, nn, plot, poison, resnet, sim, wandb
from libs.distributed import *
from cfgs.fedargs import *

In [2]:
# --- Run identity: used for the TensorBoard directory and the wandb run ---
project = 'fl-kafka'
name = 'fedavg-cnn-cifar10-na'
# NOTE(review): the run name says "cnn" while the model below is ResNet18 — confirm the label is intended.

# --- Experiment settings stored on the shared fedargs object ---
fedargs.epochs = 10
fedargs.num_clients = 5
fedargs.model = resnet.ResNet18()
fedargs.dataset = 'cifar10'

# Logging level (info | debug | warning | error | critical).
# Passing `name` as a second argument additionally saves the logs to a file [optional].
log.init("info")
#log.init("info", name)
#log.init("debug", name)

# Metric sinks: TensorBoard writer kept on fedargs, wandb run kept in `wb`.
fedargs.tb = SummaryWriter('../out/runs/' + '/'.join((project, name)), comment="fl")
#plot = plot.init(name, project)
wb = wandb.init(name, project)

[34m[1mwandb[0m: Currently logged in as: [33mkasyah[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [3]:
# CUDA is used only when the config asks for it AND a device is actually available.
use_cuda = fedargs.cuda and torch.cuda.is_available()

# Seed torch's RNG from the config.
torch.manual_seed(fedargs.seed)

# `kwargs` is forwarded to the DataLoader constructors further down.
if use_cuda:
    device = torch.device("cuda")
    kwargs = {"num_workers": 1, "pin_memory": True}
else:
    device = torch.device("cpu")
    kwargs = {}

In [4]:
# Client ids are "<hostname>(<k>)" with k running from 1 to fedargs.num_clients.
host = socket.gethostname()
clients = ["{}({})".format(host, idx) for idx in range(1, fedargs.num_clients + 1)]

In [5]:
# Distributed topology: one object, shared by all clients, through which
# model updates are produced and consumed.
dt = Distributed(
    clients,
    fedargs.broker_ip,
    fedargs.schema_ip,
    fedargs.wait_to_consume,
)

In [6]:
# Global model: an independent deep copy of the configured architecture, so
# fedargs.model itself is never trained. Per-client copies are made later.
global_model = copy.deepcopy(fedargs.model)
# Load the train/test splits of the configured dataset
# (the per-client partitioning happens in a later cell).
train_data, test_data = data.load_dataset(fedargs.dataset)

Files already downloaded and verified
Files already downloaded and verified


In [7]:
# Per-client training data derived from the full training set.
# NOTE(review): presumably a mapping keyed by client id — confirm against data.split_data.
clients_data = data.split_data(train_data, clients)

In [8]:
# Per-client train/test loaders.
# NOTE(review): 0.2 is presumably the held-out (test) fraction — confirm against data.load_client_data.
client_train_loaders, client_test_loaders = data.load_client_data(
    clients_data, fedargs.client_batch_size, 0.2, **kwargs
)

# Loader over the global test set.
test_loader = torch.utils.data.DataLoader(
    test_data,
    batch_size=fedargs.test_batch_size,
    shuffle=True,
    **kwargs
)

# Per-client state: its loaders, its own private copy of the global model,
# and a slot for its latest model update.
client_details = {
    client_id: dict(
        train_loader=client_train_loaders[client_id],
        test_loader=client_test_loaders[client_id],
        model=copy.deepcopy(global_model),
        model_update=None,
    )
    for client_id in clients
}

In [9]:
def background(f):
    """Decorator: run the blocking callable ``f`` in the event loop's default executor.

    Calling the decorated function does not execute ``f`` inline; it schedules
    ``f(*args, **kwargs)`` on the default executor of the loop returned by
    ``asyncio.get_event_loop()`` and immediately returns the resulting
    awaitable future.

    Args:
        f: Any synchronous callable.

    Returns:
        A wrapper with the same name/docstring as ``f`` that returns an
        ``asyncio.Future`` resolving to ``f``'s return value (or raising
        its exception).
    """
    # Local import keeps this block self-contained (only `partial` is
    # imported at the top of the notebook).
    from functools import partial, wraps

    @wraps(f)
    def wrapped(*args, **kwargs):
        loop = asyncio.get_event_loop()
        # run_in_executor() forwards positional arguments only; passing
        # **kwargs to it raises TypeError. Bind everything with partial.
        return loop.run_in_executor(None, partial(f, *args, **kwargs))

    return wrapped

@background
def process(client, epoch, dt, model, train_loader, test_loader, fedargs, device):
    """Run one federated round for a single client.

    The round consists of: consuming the updates published for ``epoch``
    (the previous round), dropping the client's own entry, averaging the
    remaining updates into ``model`` when there are any, training locally,
    publishing the new update under ``epoch + 1``, and finally evaluating
    and logging the result to TensorBoard, wandb and the logger.

    Because of ``@background`` a call returns a future; the value it
    resolves to is the model returned by ``fedargs.train_func``.
    """
    log.info("Epoch: {}, Processing Client {}".format(epoch, client))

    # Updates are fetched for the epoch that was passed in, i.e. the previous round.
    peer_updates = dt.consume_model(client, fedargs.topic, model, epoch)

    # A client never averages with its own update.
    if client in peer_updates:
        peer_updates.pop(client)

    senders = list(peer_updates.keys())
    log.info("Epoch: {}, Client {} received {} model update(s) from {}".format(
        epoch, client, len(peer_updates), senders))

    # Aggregate only when at least one peer update arrived.
    if len(peer_updates) > 0:
        model = fl.federated_avg(peer_updates, model)

    # Local training
    update, trained_model, train_loss = fedargs.train_func(
        model,
        train_loader,
        fedargs.learning_rate,
        fedargs.weight_decay,
        fedargs.local_rounds,
        device,
    )

    # Publish under the next epoch number; everything below reports that number too.
    next_epoch = epoch + 1
    dt.produce_model(client, fedargs.topic, update, next_epoch)

    log.jsondebug(train_loss, "Epoch {} : Federated Training loss, Client {}".format(next_epoch, client))
    log.modeldebug(trained_model, "Epoch {}: Client {} Update".format(next_epoch, client))

    # Evaluate, then report to TensorBoard, wandb and the log
    metrics = fedargs.eval_func(trained_model, test_loader, device)
    accuracy = metrics["accuracy"]
    test_loss = metrics["test_loss"]
    fedargs.tb.add_scalar("Accuracy/" + client, accuracy, next_epoch)
    fedargs.tb.add_scalar("Test Loss/" + client, test_loss, next_epoch)
    #plot.alog(client, {epoch: {"time": time.time(), "acc": test_output["accuracy"], "loss": test_output["test_loss"]}})
    wb.log({client: {"epoch": next_epoch, "time": time.time(), "acc": accuracy, "loss": test_loss}})
    log.jsoninfo(metrics, "Test Outut after Epoch {} of {} for Client {}".format(next_epoch, fedargs.epochs, client))

    return trained_model

In [10]:
# Kept from the original cell (already imported at the top of the notebook);
# NOTE(review): possibly guards against a star-import rebinding `time` — confirm before removing.
import time
start_time = time.time()

# Federated Training: every epoch runs all clients concurrently and waits for
# all of them before the next epoch starts.
for epoch in tqdm(range(fedargs.epochs)):
    log.info("Federated Training Epoch {} of {}".format(epoch, fedargs.epochs))

    # Clients: process() is wrapped by @background, so each call returns a
    # future that is already scheduled on the default executor.
    tasks = [process(client, epoch, dt, client_details[client]['model'],
                     client_details[client]['train_loader'],
                     client_details[client]['test_loader'],
                     fedargs, device) for client in clients]

    # Keep a handle on the aggregate future: a plain list has no .cancel(),
    # the gather future does (and it propagates to its children).
    # NOTE(review): the futures are created on asyncio.get_event_loop();
    # this assumes fedargs.loop is that same loop — confirm.
    gathered = asyncio.gather(*tasks)
    try:
        updates = fedargs.loop.run_until_complete(gathered)
    except KeyboardInterrupt:
        log.error("Caught keyboard interrupt. Canceling tasks...")
        # Work already running in an executor thread cannot be interrupted;
        # cancelling only drops pending work and stops us waiting for results.
        gathered.cancel()
        # No results exist for this epoch, so stop training instead of
        # falling through with an unbound/stale `updates`.
        break

    # Carry each client's trained model over to the next epoch.
    for client, update in zip(clients, updates):
        client_details[client]['model'] = update

# Total wall-clock time in seconds
print(time.time() - start_time)

  0%|          | 0/10 [00:00<?, ?it/s]2021-11-08 13:22:16,272 - <ipython-input-10-d06f8893748b>::<module>(l:6) : Federated Training Epoch 0 of 10 [MainProcess : MainThread (INFO)]
2021-11-08 13:22:16,376 - <ipython-input-9-c3c9e283dc74>::process(l:9) : Epoch: 0, Processing Client bladecluster.iitp.org(4) [MainProcess : asyncio_3 (INFO)]
2021-11-08 13:22:16,377 - <ipython-input-9-c3c9e283dc74>::process(l:9) : Epoch: 0, Processing Client bladecluster.iitp.org(5) [MainProcess : asyncio_4 (INFO)]
2021-11-08 13:22:16,378 - <ipython-input-9-c3c9e283dc74>::process(l:9) : Epoch: 0, Processing Client bladecluster.iitp.org(3) [MainProcess : asyncio_2 (INFO)]
2021-11-08 13:22:16,381 - <ipython-input-9-c3c9e283dc74>::process(l:9) : Epoch: 0, Processing Client bladecluster.iitp.org(2) [MainProcess : asyncio_1 (INFO)]
2021-11-08 13:22:16,382 - <ipython-input-9-c3c9e283dc74>::process(l:9) : Epoch: 0, Processing Client bladecluster.iitp.org(1) [MainProcess : asyncio_0 (INFO)]
2021-11-08 13:22:17,453 -

Exception list index out of range occured consuming update for client bladecluster.iitp.org(1)
Exception list index out of range occured consuming update for client bladecluster.iitp.org(2)Exception list index out of range occured consuming update for client bladecluster.iitp.org(4)

Exception list index out of range occured consuming update for client bladecluster.iitp.org(5)
Exception list index out of range occured consuming update for client bladecluster.iitp.org(3)


2021-11-08 13:26:45,041 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:56) : Producing user records to topic pyflx. ^C to exit. [MainProcess : asyncio_1 (INFO)]
2021-11-08 13:26:45,406 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:64) : Exception raised [MainProcess : asyncio_1 (ERROR)]
2021-11-08 13:26:45,424 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:66) : Flushing records... [MainProcess : asyncio_1 (INFO)]
2021-11-08 13:26:47,856 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:56) : Producing user records to topic pyflx. ^C to exit. [MainProcess : asyncio_4 (INFO)]
2021-11-08 13:26:48,174 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:64) : Exception raised [MainProcess : asyncio_4 (ERROR)]
2021-11-08 13:26:48,196 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:66) : Flushing records... [MainProcess : asyncio_4 (I

Exception list index out of range occured consuming update for client bladecluster.iitp.org(2)
Exception list index out of range occured consuming update for client bladecluster.iitp.org(1)
Exception list index out of range occured consuming update for client bladecluster.iitp.org(5)Exception list index out of range occured consuming update for client bladecluster.iitp.org(3)
Exception list index out of range occured consuming update for client bladecluster.iitp.org(4)



2021-11-08 13:31:36,600 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:56) : Producing user records to topic pyflx. ^C to exit. [MainProcess : asyncio_0 (INFO)]
2021-11-08 13:31:36,982 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:64) : Exception raised [MainProcess : asyncio_0 (ERROR)]
2021-11-08 13:31:37,011 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:66) : Flushing records... [MainProcess : asyncio_0 (INFO)]
2021-11-08 13:31:39,666 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:56) : Producing user records to topic pyflx. ^C to exit. [MainProcess : asyncio_4 (INFO)]
2021-11-08 13:31:39,995 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:64) : Exception raised [MainProcess : asyncio_4 (ERROR)]
2021-11-08 13:31:40,015 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:66) : Flushing records... [MainProcess : asyncio_4 (I

Exception list index out of range occured consuming update for client bladecluster.iitp.org(5)
Exception list index out of range occured consuming update for client bladecluster.iitp.org(4)
Exception list index out of range occured consuming update for client bladecluster.iitp.org(2)
Exception list index out of range occured consuming update for client bladecluster.iitp.org(1)
Exception list index out of range occured consuming update for client bladecluster.iitp.org(3)


2021-11-08 13:32:11,814 - <ipython-input-9-c3c9e283dc74>::process(l:18) : Epoch: 2, Client bladecluster.iitp.org(3) received 0 model update(s) from [] [MainProcess : asyncio_1 (INFO)]
2021-11-08 13:36:21,266 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:56) : Producing user records to topic pyflx. ^C to exit. [MainProcess : asyncio_3 (INFO)]
2021-11-08 13:36:22,734 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:64) : Exception raised [MainProcess : asyncio_3 (ERROR)]
2021-11-08 13:36:23,776 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:66) : Flushing records... [MainProcess : asyncio_3 (INFO)]
2021-11-08 13:36:23,782 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:56) : Producing user records to topic pyflx. ^C to exit. [MainProcess : asyncio_0 (INFO)]
2021-11-08 13:36:24,212 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:64) : Exception raised [M

Exception list index out of range occured consuming update for client bladecluster.iitp.org(5)
Exception list index out of range occured consuming update for client bladecluster.iitp.org(2)
Exception list index out of range occured consuming update for client bladecluster.iitp.org(4)
Exception list index out of range occured consuming update for client bladecluster.iitp.org(3)
Exception list index out of range occured consuming update for client bladecluster.iitp.org(1)


2021-11-08 13:41:12,581 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:56) : Producing user records to topic pyflx. ^C to exit. [MainProcess : asyncio_3 (INFO)]
2021-11-08 13:41:12,885 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:64) : Exception raised [MainProcess : asyncio_3 (ERROR)]
2021-11-08 13:41:12,911 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:66) : Flushing records... [MainProcess : asyncio_3 (INFO)]
2021-11-08 13:41:18,557 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:56) : Producing user records to topic pyflx. ^C to exit. [MainProcess : asyncio_0 (INFO)]
2021-11-08 13:41:18,984 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:64) : Exception raised [MainProcess : asyncio_0 (ERROR)]
2021-11-08 13:41:19,014 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:66) : Flushing records... [MainProcess : asyncio_0 (I

Exception list index out of range occured consuming update for client bladecluster.iitp.org(5)Exception list index out of range occured consuming update for client bladecluster.iitp.org(2)

Exception list index out of range occured consuming update for client bladecluster.iitp.org(4)Exception list index out of range occured consuming update for client bladecluster.iitp.org(1)

Exception list index out of range occured consuming update for client bladecluster.iitp.org(3)


2021-11-08 13:45:59,961 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:56) : Producing user records to topic pyflx. ^C to exit. [MainProcess : asyncio_0 (INFO)]
2021-11-08 13:46:00,332 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:64) : Exception raised [MainProcess : asyncio_0 (ERROR)]
2021-11-08 13:46:00,359 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:66) : Flushing records... [MainProcess : asyncio_0 (INFO)]
2021-11-08 13:46:04,565 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:56) : Producing user records to topic pyflx. ^C to exit. [MainProcess : asyncio_3 (INFO)]
2021-11-08 13:46:04,765 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:64) : Exception raised [MainProcess : asyncio_3 (ERROR)]
2021-11-08 13:46:04,780 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:66) : Flushing records... [MainProcess : asyncio_3 (I

2021-11-08 13:51:23,241 - <ipython-input-9-c3c9e283dc74>::process(l:44) : Test Outut after Epoch 6 of 10 for Client bladecluster.iitp.org(1) {
    "accuracy": 42.1,
    "correct": 842,
    "test_loss": -130.49165563964843
} [MainProcess : asyncio_0 (INFO)]
2021-11-08 13:51:27,261 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:56) : Producing user records to topic pyflx. ^C to exit. [MainProcess : asyncio_2 (INFO)]
2021-11-08 13:51:27,598 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:64) : Exception raised [MainProcess : asyncio_2 (ERROR)]
2021-11-08 13:51:27,616 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:66) : Flushing records... [MainProcess : asyncio_2 (INFO)]
2021-11-08 13:51:32,226 - <ipython-input-9-c3c9e283dc74>::process(l:44) : Test Outut after Epoch 6 of 10 for Client bladecluster.iitp.org(4) {
    "accuracy": 42.449999999999996,
    "correct": 849,
    "test_loss": -130.22928894042968
} [M

2021-11-08 13:56:41,598 - <ipython-input-9-c3c9e283dc74>::process(l:18) : Epoch: 7, Client bladecluster.iitp.org(5) received 0 model update(s) from [] [MainProcess : asyncio_2 (INFO)]
2021-11-08 13:56:41,613 - <ipython-input-9-c3c9e283dc74>::process(l:18) : Epoch: 7, Client bladecluster.iitp.org(3) received 0 model update(s) from [] [MainProcess : asyncio_3 (INFO)]
2021-11-08 13:56:41,618 - <ipython-input-9-c3c9e283dc74>::process(l:18) : Epoch: 7, Client bladecluster.iitp.org(1) received 0 model update(s) from [] [MainProcess : asyncio_4 (INFO)]
2021-11-08 14:00:53,829 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:56) : Producing user records to topic pyflx. ^C to exit. [MainProcess : asyncio_3 (INFO)]
2021-11-08 14:00:54,186 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:64) : Exception raised [MainProcess : asyncio_3 (ERROR)]
2021-11-08 14:00:54,217 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:66) 

2021-11-08 14:06:07,232 - <ipython-input-9-c3c9e283dc74>::process(l:44) : Test Outut after Epoch 9 of 10 for Client bladecluster.iitp.org(3) {
    "accuracy": 43.6,
    "correct": 872,
    "test_loss": -225.0173839111328
} [MainProcess : asyncio_4 (INFO)]
2021-11-08 14:06:09,572 - <ipython-input-9-c3c9e283dc74>::process(l:44) : Test Outut after Epoch 9 of 10 for Client bladecluster.iitp.org(2) {
    "accuracy": 45.4,
    "correct": 908,
    "test_loss": -214.16035278320314
} [MainProcess : asyncio_0 (INFO)]
2021-11-08 14:06:11,404 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:56) : Producing user records to topic pyflx. ^C to exit. [MainProcess : asyncio_2 (INFO)]
2021-11-08 14:06:11,762 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:64) : Exception raised [MainProcess : asyncio_2 (ERROR)]
2021-11-08 14:06:11,780 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:66) : Flushing records... [MainProcess : as

2945.708108663559



[34m[1mwandb[0m: Network error resolved after 0:00:42.729678, resuming normal operation.


<h1> End </h1>