In [1]:
%load_ext autoreload
%autoreload 2

import asyncio, copy, os, socket, sys, time
from functools import partial
from multiprocessing import Pool, Process
from pathlib import Path
from tqdm import tqdm

import torch
from torch import optim
from torch.utils.tensorboard import SummaryWriter

sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "../")))
from libs import agg, data, fl, hdc, log, nn, plot, poison, resnet, sim, wandb
from libs.distributed import *
from cfgs.fedargs import *

In [2]:
# Experiment identity: `project` groups runs, `name` labels this run in the
# TensorBoard directory and in the wandb wrapper call below.
# NOTE(review): `name` says "cnn" although the model built later is HDC — confirm the label.
project = "fl-hdc-kafka"
name = "fedavg-cnn-cifar10-na"

# Per-notebook overrides of the settings imported from cfgs.fedargs.
fedargs.epochs = 10
fedargs.topic = "hdc"
fedargs.num_clients = 5
fedargs.one_d_len = 3072  # presumably a flattened 32x32x3 CIFAR-10 image — TODO confirm
fedargs.dataset = "cifar10"

# Log level is one of: info | debug | warning | error | critical.
# Optionally pass `name` as a second argument (e.g. log.init("debug", name))
# to save logs to a file as well.
log.init("info")

# Metric sinks: a TensorBoard writer stored on fedargs, plus the project's wandb wrapper.
# (The alternative `plot.init(name, project)` sink is currently disabled.)
fedargs.tb = SummaryWriter("../out/runs/{}/{}".format(project, name), comment="fl")
wb = wandb.init(name, project)

[34m[1mwandb[0m: Currently logged in as: [33mkasyah[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [3]:
# CUDA is used only when the configuration requests it and a device is available.
use_cuda = fedargs.cuda and torch.cuda.is_available()

# Seed torch's RNG from the configured seed before any model or loader is created.
torch.manual_seed(fedargs.seed)

# Pick the device and the matching extra DataLoader options in one place.
if use_cuda:
    device = torch.device("cuda")
    kwargs = {"num_workers": 1, "pin_memory": True}
else:
    device = torch.device("cpu")
    kwargs = {}

In [4]:
# Client identifiers are "<hostname>(<k>)" with k counting from 1 to num_clients.
host = socket.gethostname()
clients = ["{}({})".format(host, index) for index in range(1, fedargs.num_clients + 1)]

In [5]:
# Distributed topology shared by all clients; configured with the broker and
# schema addresses and the consume wait setting taken from fedargs.
dt = Distributed(
    clients,
    fedargs.broker_ip,
    fedargs.schema_ip,
    fedargs.wait_to_consume,
)

In [6]:
# Build the HDC model template on fedargs, then deep-copy it so the global
# model does not share state with the template.
fedargs.model = hdc.HDC(
    fedargs.one_d_len,
    fedargs.hdc_proj_len,
    len(fedargs.labels),
    device,
)
global_model = copy.deepcopy(fedargs.model)

# Load the train and test sets for the configured dataset.
train_data, test_data = data.load_dataset(fedargs.dataset, only_to_tensor=True)

Files already downloaded and verified
Files already downloaded and verified


In [7]:
# Split the training set among the client names; the split strategy itself
# lives in libs.data.split_data (not shown here).
clients_data = data.split_data(train_data, clients)

In [8]:
# Build per-client train/test loaders.
# Both values are passed positionally: the previous `batch_size=-1, 0.2` form was a
# SyntaxError (positional argument after keyword argument).
# NOTE(review): assumes batch_size is the second parameter of load_client_data, that -1
# selects a full batch, and that 0.2 is the per-client test fraction — confirm against libs.data.
client_train_loaders, client_test_loaders = data.load_client_data(clients_data, -1, 0.2, **kwargs)

# Global test loader that serves the whole test set as a single batch.
test_loader = torch.utils.data.DataLoader(test_data, batch_size=len(test_data), shuffle=True, **kwargs)

# Per-client state: its loaders, an independent copy of the global model and a
# placeholder for its latest model update.
client_details = {
    client: {
        "train_loader": client_train_loaders[client],
        "test_loader": client_test_loaders[client],
        "model": copy.deepcopy(global_model),
        "model_update": None,
    }
    for client in clients
}

In [9]:
def background(f):
    """Decorator that runs the blocking callable ``f`` in the loop's default executor.

    Calling the decorated function does not execute ``f`` inline; it schedules
    ``f(*args, **kwargs)`` on the current event loop's default executor and
    returns the resulting awaitable future.

    Args:
        f: A synchronous callable.

    Returns:
        A wrapper with the same call signature as ``f`` that returns a future
        resolving to ``f``'s return value.
    """
    def wrapped(*args, **kwargs):
        # run_in_executor() forwards positional arguments only, so keyword
        # arguments must be bound up front with functools.partial.
        return asyncio.get_event_loop().run_in_executor(None, partial(f, *args, **kwargs))

    return wrapped

@background
def process(client, epoch, dt, model, train_loader, test_loader, fedargs, device):
    """Run one federated round for a single client.

    The round consumes the updates published for ``epoch``, averages them into
    ``model`` when any were received, trains, publishes the model under
    ``epoch + 1``, then tests and logs the result. Because of ``@background``
    a call returns a future rather than the model itself.

    Args:
        client: Client identifier, also used as the metric key.
        epoch: Round whose published updates are consumed (the previous round).
        dt: Distributed topology used to consume and produce models.
        model: The client's model; updated in place by averaging and training.
        train_loader: Loader passed to ``model.train``.
        test_loader: Loader passed to ``model.test``.
        fedargs: Run configuration (reads ``topic``, ``tb`` and ``epochs``).
        device: Device handed to the train and test calls.

    Returns:
        The same ``model`` object after this round.
    """
    log.info("Epoch: {}, Processing Client {}".format(epoch, client))

    # Pull the updates that were published for the round that just finished.
    received = dt.consume_model(client, fedargs.topic, model, epoch)
    received_count = len(received)
    senders = list(received.keys())
    log.info("Epoch: {}, Client {} received {} model update(s) from {}".format(
        epoch, client, received_count, senders))

    # Average only when at least one update arrived.
    if len(received) > 0:
        model.avg(list(received.values()))

    # Local training on this client's own data.
    train_acc = model.train(train_loader, device)

    # Everything from here on is reported against the new round number.
    next_epoch = epoch + 1
    dt.produce_model(client, fedargs.topic, model, next_epoch)
    log.info("Epoch {} : Federated Training accuracy {}, Client {}".format(next_epoch, train_acc, client))

    # Evaluate, then record the accuracy in TensorBoard, in `wb` and in the log.
    test_acc = model.test(test_loader, device)
    fedargs.tb.add_scalar("Accuracy/" + client, test_acc, next_epoch)
    wb.log({client: {"epoch": next_epoch, "time": time.time(), "acc": test_acc}})
    log.info("Test accuracy {} after Epoch {} of {} for Client {}".format(
        test_acc, next_epoch, fedargs.epochs, client))

    return model

In [10]:
import time
start_time = time.time()

# Federated Training: every round starts one background task per client and
# blocks until all of them have finished.
for epoch in tqdm(range(fedargs.epochs)):
    log.info("Federated Training Epoch {} of {}".format(epoch, fedargs.epochs))

    # Clients: each process(...) call returns a future immediately.
    tasks = [process(client, epoch, dt, client_details[client]['model'],
                     client_details[client]['train_loader'],
                     client_details[client]['test_loader'],
                     fedargs, device) for client in clients]

    # Keep a handle on the aggregate future so it can be cancelled; the plain
    # `tasks` list has no cancel() or exception() methods.
    round_future = asyncio.gather(*tasks)
    try:
        updates = fedargs.loop.run_until_complete(round_future)
    except KeyboardInterrupt:
        log.error("Caught keyboard interrupt. Canceling tasks...")
        # Cancelling the gather future cancels every client future that has not finished.
        round_future.cancel()
        try:
            # Let the loop process the cancellation instead of run_forever(),
            # which would block because nothing ever stops the loop.
            fedargs.loop.run_until_complete(round_future)
        except asyncio.CancelledError:
            pass
        # NOTE(review): work already running in executor threads is not interrupted by
        # cancel(); it only stops being awaited. Stop scheduling further rounds.
        break

print(time.time() - start_time)

  0%|          | 0/10 [00:00<?, ?it/s]2021-11-08 14:41:30,072 - <ipython-input-10-cc316da67b86>::<module>(l:6) : Federated Training Epoch 0 of 10 [MainProcess : MainThread (INFO)]
2021-11-08 14:41:30,171 - <ipython-input-9-d25a439eb414>::process(l:9) : Epoch: 0, Processing Client bladecluster.iitp.org(1) [MainProcess : asyncio_0 (INFO)]
2021-11-08 14:41:30,177 - <ipython-input-9-d25a439eb414>::process(l:9) : Epoch: 0, Processing Client bladecluster.iitp.org(3) [MainProcess : asyncio_2 (INFO)]
2021-11-08 14:41:30,178 - <ipython-input-9-d25a439eb414>::process(l:9) : Epoch: 0, Processing Client bladecluster.iitp.org(4) [MainProcess : asyncio_3 (INFO)]
2021-11-08 14:41:30,179 - <ipython-input-9-d25a439eb414>::process(l:9) : Epoch: 0, Processing Client bladecluster.iitp.org(5) [MainProcess : asyncio_4 (INFO)]
2021-11-08 14:41:30,179 - <ipython-input-9-d25a439eb414>::process(l:9) : Epoch: 0, Processing Client bladecluster.iitp.org(2) [MainProcess : asyncio_1 (INFO)]
2021-11-08 14:41:40,211 -

2021-11-08 14:42:15,103 - <ipython-input-9-d25a439eb414>::process(l:9) : Epoch: 1, Processing Client bladecluster.iitp.org(5) [MainProcess : asyncio_2 (INFO)]
2021-11-08 14:42:25,099 - <ipython-input-9-d25a439eb414>::process(l:14) : Epoch: 1, Client bladecluster.iitp.org(1) received 5 model update(s) from ['bladecluster.iitp.org(2)', 'bladecluster.iitp.org(5)', 'bladecluster.iitp.org(1)', 'bladecluster.iitp.org(4)', 'bladecluster.iitp.org(3)'] [MainProcess : asyncio_1 (INFO)]
2021-11-08 14:42:25,172 - <ipython-input-9-d25a439eb414>::process(l:14) : Epoch: 1, Client bladecluster.iitp.org(4) received 5 model update(s) from ['bladecluster.iitp.org(2)', 'bladecluster.iitp.org(5)', 'bladecluster.iitp.org(1)', 'bladecluster.iitp.org(4)', 'bladecluster.iitp.org(3)'] [MainProcess : asyncio_3 (INFO)]
2021-11-08 14:42:25,234 - <ipython-input-9-d25a439eb414>::process(l:14) : Epoch: 1, Client bladecluster.iitp.org(5) received 5 model update(s) from ['bladecluster.iitp.org(2)', 'bladecluster.iitp.o

2021-11-08 14:43:11,679 - <ipython-input-9-d25a439eb414>::process(l:14) : Epoch: 2, Client bladecluster.iitp.org(2) received 5 model update(s) from ['bladecluster.iitp.org(1)', 'bladecluster.iitp.org(2)', 'bladecluster.iitp.org(3)', 'bladecluster.iitp.org(5)', 'bladecluster.iitp.org(4)'] [MainProcess : asyncio_4 (INFO)]
2021-11-08 14:43:11,686 - <ipython-input-9-d25a439eb414>::process(l:14) : Epoch: 2, Client bladecluster.iitp.org(3) received 5 model update(s) from ['bladecluster.iitp.org(1)', 'bladecluster.iitp.org(2)', 'bladecluster.iitp.org(3)', 'bladecluster.iitp.org(5)', 'bladecluster.iitp.org(4)'] [MainProcess : asyncio_0 (INFO)]
2021-11-08 14:43:11,686 - <ipython-input-9-d25a439eb414>::process(l:14) : Epoch: 2, Client bladecluster.iitp.org(4) received 5 model update(s) from ['bladecluster.iitp.org(1)', 'bladecluster.iitp.org(2)', 'bladecluster.iitp.org(3)', 'bladecluster.iitp.org(5)', 'bladecluster.iitp.org(4)'] [MainProcess : asyncio_2 (INFO)]
2021-11-08 14:43:40,510 - /home/ha

2021-11-08 14:44:37,041 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:56) : Producing user records to topic hdc. ^C to exit. [MainProcess : asyncio_4 (INFO)]
2021-11-08 14:44:37,780 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:66) : Flushing records... [MainProcess : asyncio_4 (INFO)]
2021-11-08 14:44:37,781 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:56) : Producing user records to topic hdc. ^C to exit. [MainProcess : asyncio_1 (INFO)]
2021-11-08 14:44:37,889 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::delivery_report(l:50) : User record b'bladecluster.iitp.org(1)' successfully produced to hdc [0] at offset 295 [MainProcess : asyncio_4 (INFO)]
2021-11-08 14:44:37,918 - <ipython-input-9-d25a439eb414>::process(l:28) : Epoch 4 : Federated Training accuracy 30.537500381469727, Client bladecluster.iitp.org(1) [MainProcess : asyncio_4 (INFO)]
2021-11-08 14:44:38,023 - /home/harsh_

2021-11-08 14:45:33,056 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:56) : Producing user records to topic hdc. ^C to exit. [MainProcess : asyncio_0 (INFO)]
2021-11-08 14:45:33,359 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:66) : Flushing records... [MainProcess : asyncio_0 (INFO)]
2021-11-08 14:45:34,054 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::delivery_report(l:50) : User record b'bladecluster.iitp.org(5)' successfully produced to hdc [0] at offset 301 [MainProcess : asyncio_0 (INFO)]
2021-11-08 14:45:34,074 - <ipython-input-9-d25a439eb414>::process(l:28) : Epoch 5 : Federated Training accuracy 29.92500114440918, Client bladecluster.iitp.org(5) [MainProcess : asyncio_0 (INFO)]
2021-11-08 14:45:36,231 - <ipython-input-9-d25a439eb414>::process(l:35) : Test accuracy 29.600000381469727 after Epoch 5 of 10 for Client bladecluster.iitp.org(1) [MainProcess : asyncio_4 (INFO)]
2021-11-08 14:45:39,773 - <ipy

2021-11-08 14:46:30,194 - <ipython-input-9-d25a439eb414>::process(l:28) : Epoch 6 : Federated Training accuracy 29.024999618530273, Client bladecluster.iitp.org(3) [MainProcess : asyncio_1 (INFO)]
2021-11-08 14:46:30,598 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::delivery_report(l:50) : User record b'bladecluster.iitp.org(1)' successfully produced to hdc [0] at offset 307 [MainProcess : asyncio_4 (INFO)]
2021-11-08 14:46:30,666 - <ipython-input-9-d25a439eb414>::process(l:28) : Epoch 6 : Federated Training accuracy 30.1875, Client bladecluster.iitp.org(1) [MainProcess : asyncio_4 (INFO)]
2021-11-08 14:46:32,409 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:56) : Producing user records to topic hdc. ^C to exit. [MainProcess : asyncio_2 (INFO)]
2021-11-08 14:46:32,537 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:56) : Producing user records to topic hdc. ^C to exit. [MainProcess : asyncio_0 (INFO)]
2021-11-08

2021-11-08 14:47:20,473 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::produce(l:66) : Flushing records... [MainProcess : asyncio_2 (INFO)]
2021-11-08 14:47:20,854 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::delivery_report(l:50) : User record b'bladecluster.iitp.org(4)' successfully produced to hdc [0] at offset 312 [MainProcess : asyncio_2 (INFO)]
2021-11-08 14:47:20,874 - <ipython-input-9-d25a439eb414>::process(l:28) : Epoch 7 : Federated Training accuracy 29.887500762939453, Client bladecluster.iitp.org(4) [MainProcess : asyncio_2 (INFO)]
2021-11-08 14:47:20,996 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::delivery_report(l:50) : User record b'bladecluster.iitp.org(5)' successfully produced to hdc [0] at offset 313 [MainProcess : asyncio_0 (INFO)]
2021-11-08 14:47:21,031 - <ipython-input-9-d25a439eb414>::process(l:28) : Epoch 7 : Federated Training accuracy 30.112499237060547, Client bladecluster.iitp.org(5) [MainProcess : 

2021-11-08 14:48:14,055 - <ipython-input-9-d25a439eb414>::process(l:35) : Test accuracy 29.049999237060547 after Epoch 8 of 10 for Client bladecluster.iitp.org(3) [MainProcess : asyncio_0 (INFO)]
2021-11-08 14:48:14,220 - /home/harsh_1921cs01/hub/AgroFed/fl/libs/protobuf_producer.py::delivery_report(l:50) : User record b'bladecluster.iitp.org(4)' successfully produced to hdc [0] at offset 319 [MainProcess : asyncio_2 (INFO)]
2021-11-08 14:48:14,236 - <ipython-input-9-d25a439eb414>::process(l:28) : Epoch 8 : Federated Training accuracy 29.887500762939453, Client bladecluster.iitp.org(4) [MainProcess : asyncio_2 (INFO)]
2021-11-08 14:48:15,882 - <ipython-input-9-d25a439eb414>::process(l:35) : Test accuracy 29.499998092651367 after Epoch 8 of 10 for Client bladecluster.iitp.org(2) [MainProcess : asyncio_1 (INFO)]
2021-11-08 14:48:16,893 - <ipython-input-9-d25a439eb414>::process(l:35) : Test accuracy 28.60000228881836 after Epoch 8 of 10 for Client bladecluster.iitp.org(5) [MainProcess : a

2021-11-08 14:49:08,091 - <ipython-input-9-d25a439eb414>::process(l:35) : Test accuracy 28.700000762939453 after Epoch 9 of 10 for Client bladecluster.iitp.org(5) [MainProcess : asyncio_2 (INFO)]
2021-11-08 14:49:09,545 - <ipython-input-9-d25a439eb414>::process(l:35) : Test accuracy 29.399999618530273 after Epoch 9 of 10 for Client bladecluster.iitp.org(2) [MainProcess : asyncio_1 (INFO)]
2021-11-08 14:49:09,779 - <ipython-input-9-d25a439eb414>::process(l:35) : Test accuracy 29.249998092651367 after Epoch 9 of 10 for Client bladecluster.iitp.org(4) [MainProcess : asyncio_3 (INFO)]
 90%|█████████ | 9/10 [07:40<00:51, 51.68s/it]2021-11-08 14:49:09,855 - <ipython-input-10-cc316da67b86>::<module>(l:6) : Federated Training Epoch 9 of 10 [MainProcess : MainThread (INFO)]
2021-11-08 14:49:09,931 - <ipython-input-9-d25a439eb414>::process(l:9) : Epoch: 9, Processing Client bladecluster.iitp.org(3) [MainProcess : asyncio_2 (INFO)]
2021-11-08 14:49:09,965 - <ipython-input-9-d25a439eb414>::process

513.5412526130676





<h1> End </h1>