__Purpose:__ Introduce Federated Learning, specifically by implementing FedAveraging on our dataset and moving on to more advanced methods.  Start by modifying the Simulations code, worry about (a)synchronicity later.
<br>
1. The dec matrix is the weights to pass back and forth (I think), although it comes out of SmoothBatch first
1. We are assuming we can test on the second half (updates 10-19ish) since learning should be complete by then!
1. Scipy.optimize.minimize() runs many iters to fully minimize its cost function.  You can change it to run as many iters as you'd like, although AFAIK you won't know how many it takes to converge.  But this is still a good set up for FL.
1. Hmm, minimize() is currently doing BFGS and not SGD... not sure if that really matters.  Could probably implement SGD on my own or find an existing implementation.  BFGS is a quasi-Newton (approximate 2nd-order) method, but I don't think we have a lot of parameters.  Plus we can solve (or already have solved?) analytically for the Hessian.

In [1]:
import pandas as pd
import os
import numpy as np
#from numpy.matlib import repmat
#from matplotlib import pyplot as plt
#from scipy.signal import detrend, firwin, freqz, lfilter
#from sklearn.model_selection import train_test_split, ShuffleSplit
from scipy.optimize import minimize, least_squares
import copy
from itertools import permutations

In [2]:
from experiment_params import *
from simulations import *
import time
# Do the below if you're in the pytch environment
#import pickle5 as pickle
import pickle

# Reminder of Conditions Order

NOTE: 

* **CONDITIONS** = array(['D_1', 'D_2', 'D_5', 'D_6', 'D_3', 'D_4', 'D_7','D_8'])
* **LEARNING RATES:** alpha = 0.25 and 0.75; alpha = 0.25 for D1, D2, D5, D6; alpha = 0.75 for D3, D4, D7, D8
* **SMOOTHBATCH:** W_next = alpha*W_old + ((1 - alpha) * W_calc)

* **DECODER INIT:** pos for D1 - D4, neg for D5 - D8

* **PENALTY TERM:** $\lambda_E$ = 1e-6 for all, $\lambda_F$ = 1e-7 for all, $\lambda_D$ = 1e-3 for 1, 3, 5, 7 and 1e-4 for 2, 4, 6, 8 


| DECODER | ALPHA | PENALTY | DEC INIT |
| --- | --- | --- | --- |
| 1 | 0.25 | 1e-3 | + |
| 2 | 0.25 | 1e-4 | + |
| 3 | 0.75 | 1e-3 | + |
| 4 | 0.75 | 1e-4 | + |
| 5 | 0.25 | 1e-3 | - |
| 6 | 0.25 | 1e-4 | - |
| 7 | 0.75 | 1e-3 | - |
| 8 | 0.75 | 1e-4 | - |


## Load Our Data In

In [3]:
# Time the load so slow disk reads show up in the notebook output
t0 = time.time()

# Raw string: '\c' is not a valid escape sequence, so the un-prefixed literal
# triggers a DeprecationWarning/SyntaxWarning on newer Pythons. The path value is unchanged.
with open(r'Data\continuous_full_data_block1.pickle', 'rb') as handle:
    # Full tuple layout, kept for reference:
    #refs_block1, poss_block1, dec_vels_block1, int_vel_block1, emgs_block1, Ws_block1, Hs_block1, alphas_block1, pDs_block1, times_block1, conditions_block1 = pickle.load(handle)
    # Only the reference signals, the EMG data and the decoder (W) history are kept
    refs_block1, _, _, _, emgs_block1, Ws_block1, _, _, _, _, _ = pickle.load(handle)

#with open(r'Data\continuous_full_data_block2.pickle', 'rb') as handle:
    #refs_block2, poss_block2, dec_vels_block2, int_vel_block2, emgs_block2, Ws_block2, Hs_block2, alphas_block2, pDs_block2, times_block2, conditions_block2 = pickle.load(handle)
    #refs_block2, _, _, _, emgs_block2, Ws_block2, _, _, _, _, _ = pickle.load(handle)

t1 = time.time()
total = t1-t0  
print(total)

10.269370555877686


In [4]:
# Shape of the EMG array stored under the first entry of `keys`.
# NOTE(review): `keys` is not defined in the visible cells; presumably it comes from the star imports above.
emgs_block1[keys[0]].shape

(8, 20770, 64)

In [5]:
# Shape of the reference-signal array stored under the first entry of `keys`.
# NOTE(review): `keys` is not defined in the visible cells; presumably it comes from the star imports above.
refs_block1[keys[0]].shape

(8, 20770, 2)

In [6]:
# For the first `list_length` entries of `keys`, keep only index 0 along the
# leading axis and store the EMG slice as 'training' and the reference slice as 'labels'.
list_length = 14
cond0_dict_list = []
for idx in range(list_length):
    cond0_dict_list.append({
        'training': emgs_block1[keys[idx]][0, :, :],
        'labels': refs_block1[keys[idx]][0, :, :],
    })

In [7]:
# Location of the cached per-user condition-0 dictionaries
path = r'C:\Users\kdmen\Desktop\Research\personalization-privacy-risk\Data'
filename = r'\cond0_dict_list.p'

# `filename` already starts with a path separator, so plain concatenation gives the full path
with open(f"{path}{filename}", 'wb') as out_file:
    pickle.dump(cond0_dict_list, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [8]:
# Decoder history stored under the first entry of `keys`.
# Axes: 8 conditions, 20770 data points, xy, channels.
# Only 19 of the 20770 decoder sets along the second axis are unique.
Ws_block1[keys[0]].shape

(8, 20770, 2, 64)

In [9]:
# Display the update-boundary indices used below to pick decoder snapshots.
# NOTE(review): `update_ix` is not defined in the visible cells; presumably it comes from the star imports above.
update_ix

array([    0,  1200,  2402,  3604,  4806,  6008,  7210,  8412,  9614,
       10816, 12018, 13220, 14422, 15624, 16826, 18028, 19230, 20432,
       20769])

In [10]:
# Decoder snapshots for keys[0], leading index 0, at sample 0 and at the
# second and third entries of update_ix
dec_cond0_user1_update0 = Ws_block1[keys[0]][0, 0, :, :]
dec_cond0_user1_update1 = Ws_block1[keys[0]][0, update_ix[1], :, :]
dec_cond0_user1_update2 = Ws_block1[keys[0]][0, update_ix[2], :, :]

# NOTE(review): these are sums of *signed* differences, so opposite-signed
# entries can cancel; np.abs(...).sum() or np.array_equal would be a stricter check.
delta_0_vs_1 = (dec_cond0_user1_update0 - dec_cond0_user1_update1).sum()
delta_0_vs_2 = (dec_cond0_user1_update0 - dec_cond0_user1_update2).sum()

print(f"Shape of decoder: {dec_cond0_user1_update0.shape}")
print()
print(f"Total difference between dec0 and dec1: {delta_0_vs_1}")
print("E.g., as previously shown, the first two decs are the same")
print()
print(f"Total difference between dec0 and dec2: {delta_0_vs_2}")

Shape of decoder: (2, 64)

Total difference between dec0 and dec1: 0.0
E.g., as previously shown, the first two decs are the same

Total difference between dec0 and dec2: 3.1981579823181594


# Create Federated Set Up

In [11]:
# python src/emg_fl_main.py --num_users=14, --model=___, --dataset=___, --num_classes=___, --iid=0)

def run_fl_sim(data_path,training_data,labels,epochs=10,num_users=14,C=0.1,local_epochs=10,local_batch_sz=10,lr=0.01,SGD_momentum=0.5,optimizer='sgd',iid=0,unequal=0,stopping_rounds=10,verbose=True,seed=1):
    """Placeholder entry point for a federated-learning simulation.

    Not implemented yet: every argument is accepted and ignored, and the
    function always returns None.
    """
    # Open design notes carried over from the argparse-based reference script:
    #   - Other possible parameters: num_channels=64, norm='batch_norm'
    #   - Which dataset to use... all EMG data?
    #   - num_classes does not obviously apply: we are doing regression
    #       parser.add_argument('num_classes', type=int, default=10
    #   - iid: our application is probably non-IID
    #       parser.add_argument('iid', type=int, default=1,help='Default set to IID. Set to 0 for non-IID.')
    #   - unequal: our splits are currently equal, but in practice they would not be
    #       parser.add_argument('unequal', type=int, default=0,
    #                           help='whether to use unequal data splits for  \
    #                           non-i.i.d setting (use 0 for equal splits)')
    #   - Probably need to also pass in alphaF/E/D, maybe D_0?
    return None


In [12]:
# Different training approaches
def train_1gradstep(w, eta, F, D, H, V, learning_batch, alphaF, alphaD):
    """Take a single gradient-descent step on the decoder weights.

    Returns ``w - eta * gradient_cost_l2(F, D, H, V, learning_batch, alphaF, alphaD)``.
    Note that the gradient is evaluated at ``D``, not at ``w``.
    """
    grad = gradient_cost_l2(F, D, H, V, learning_batch, alphaF, alphaD)
    return w - eta*grad
    
def train_1scipyminstep(w, eta, F, D, H, V, learning_batch, alphaF, alphaD, D0, display_info):
    """Run one BFGS iteration of scipy's ``minimize`` on ``cost_l2`` and return the weights.

    Parameters
    ----------
    w, eta, D : unused; kept so the signature parallels ``train_1gradstep``.
    F, H, V, learning_batch, alphaF, alphaD : forwarded unchanged to
        ``cost_l2`` and ``gradient_cost_l2``.
    D0 : starting point for the optimizer.
    display_info : forwarded as the optimizer's ``disp`` option.

    Returns
    -------
    The optimizer's solution, reshaped to the shape of ``D0``.
    """
    result = minimize(
        lambda D: cost_l2(F, D, H, V, learning_batch, alphaF, alphaD),
        # minimize() works on 1-D parameter vectors; newer SciPy rejects a 2-D x0
        np.ravel(D0),
        method='BFGS',
        jac=lambda D: np.ravel(gradient_cost_l2(F, D, H, V, learning_batch, alphaF, alphaD)),
        options={'disp': display_info, 'maxiter': 1},
    )
    # minimize() returns an OptimizeResult; the weights are in its .x attribute
    return np.reshape(result.x, np.shape(D0))

In [13]:
class model_base:
    """Minimal shared base for federated-learning participants.

    Holds an identifier and the linear-regression weights (the decoder).

    Parameters
    ----------
    ID : identifier shown by ``__repr__``.
    w : the weights (decoder); stored as-is, not copied.
    verbose : stored flag, not used by this class itself.
    """

    # Class-level defaults so __repr__ works on a bare instance. Subclasses or
    # instances are expected to override these with their own values.
    type = 'Base'
    current_round = 0
    method = None

    def __init__(self, ID, w, verbose=False):
        # Client ID number
        self.ID = ID
        # Linear regression weights AKA the decoder
        self.w = w
        self.verbose = verbose

    def get_weights(self):
        """Return the stored weights (the same object, not a copy)."""
        return self.w

    def __repr__(self):
        return f"{self.type} model: {self.ID}\n{self.type} Round: {self.current_round}\nTraining Method: {self.method}"

In [19]:
class server:
    """Coordinator for a simulated federated-learning run.

    Each round (``vanilla_FL_loop``) the server selects a fraction ``C`` of the
    currently available clients, hands them the global decoder, lets them
    train, and replaces the global decoder with an average of their weights,
    weighted by each client's normalized learning rate.

    Parameters
    ----------
    ID : identifier shown by ``__repr__``.
    all_clients : sequence of client objects. Each must provide
        ``get_availability``, ``reset_chosen``, ``you_have_been_chosen``,
        ``set_update``, ``execute_training_loop``, ``get_learning_rate`` and
        ``get_weights``.
    D0 : initial global decoder weights.
    method : label of the aggregation scheme (only shown by ``__repr__``).
    C : fraction of the available clients to select each round.
    current_round : starting value of the global round counter.
    lr, verbose : stored on the instance; not otherwise used by this class.
    """

    def __init__(self, ID, all_clients, D0, method='FedAvg', C=0.1, current_round=0, lr=0.25, verbose=False):
        # Not input
        self.type = 'Server'
        self.num_avail_clients = 0
        # Both lists hold client objects only (no placeholder entries)
        self.available_clients_list = []
        self.num_chosen_clients = 0
        self.chosen_clients_lst = []
        # Input
        self.ID = ID
        self.all_clients = all_clients
        self.w = D0
        self.method = method
        # Was previously dropped, which made choose_clients fail on self.C
        self.C = C
        self.current_round = current_round
        # ML Parameters / Conditions
        self.lr = lr
        self.verbose = verbose

    def __repr__(self):
        return f"{self.type} model: {self.ID}\n{self.type} Round: {self.current_round}\nTraining Method: {self.method}"

    def vanilla_FL_loop(self):
        """Run one synchronous round: select, broadcast, train, aggregate."""
        # Choose fraction C of available clients (this also refreshes availability)
        self.choose_clients()
        # Send those clients the current global model
        for my_client in self.chosen_clients_lst:
            my_client.set_update(self.w)
        # Let those clients train
        for my_client in self.chosen_clients_lst:
            # This really ought to be happening in parallel
            my_client.execute_training_loop()
        # Receive local models from those clients.
        # Right now lr and w are queried separately inside agg_local_weights(); probably not ideal.
        # AGGREGATION
        self.agg_local_weights()  # This func sets self.w, eg the new decoder

    def get_weights(self):
        """Return the current global decoder."""
        return self.w

    def get_num_available_clients(self):
        """Return how many clients were available at the last availability check."""
        return len(self.available_clients_list)

    def set_available_clients_list(self):
        """Refresh the list (and count) of clients reporting themselves available."""
        self.available_clients_list = [
            my_client for my_client in self.all_clients if my_client.get_availability()
        ]
        self.num_avail_clients = len(self.available_clients_list)

    def choose_clients(self):
        """Randomly pick ceil(C * available) clients (at least one) for this round."""
        # First reset all clients to be not chosen
        for my_client in self.all_clients:
            my_client.reset_chosen()

        # Then check what clients are available this round
        self.set_available_clients_list()

        if self.num_avail_clients > 0:
            # np.ceil returns a float, which cannot be used as a slice bound
            requested = int(np.ceil(self.num_avail_clients*self.C))
            self.num_chosen_clients = min(self.num_avail_clients, max(1, requested))
            print(f"Choosing {self.num_chosen_clients} clients for computation")
            # Select the real client objects (not copies) so that training and the
            # chosen flag affect the clients held in self.all_clients
            picked = np.random.permutation(self.num_avail_clients)[:self.num_chosen_clients]
            self.chosen_clients_lst = [self.available_clients_list[i] for i in picked]

            for my_client in self.chosen_clients_lst:
                my_client.you_have_been_chosen()
        else:
            # Do not carry over the previous round's selection
            self.num_chosen_clients = 0
            self.chosen_clients_lst = []
            print(f"ERROR: Number of available clients must be greater than 0: {self.num_avail_clients}")

    def agg_local_weights(self):
        """Replace the global decoder with the weighted average of the chosen clients' weights."""
        # Update global round number
        self.current_round += 1

        # With nobody to aggregate, keep the current global decoder instead of overwriting it with 0
        if not self.chosen_clients_lst:
            return

        # From McMahan 2017 (vanilla FL)
        # Aggregate learning rates from each local model
        # When aggregating irl it would be better to query each client for weights and lr at the same time
        summed_lr = sum(my_client.get_learning_rate() for my_client in self.chosen_clients_lst)
        # Aggregate local model weights, weighted by normalized local learning rate.
        # The coefficients sum to 1, so this is a weighted average rather than a sum
        # that grows with the number of clients.
        aggr_w = 0
        for my_client in self.chosen_clients_lst:
            aggr_w += (my_client.get_learning_rate()/summed_lr) * my_client.get_weights()
        # Loop is complete, new global decoder is self.w
        self.w = aggr_w
        # Open question: still not clear how the global decoder will be able to
        # adapt to different channels for different orientations
  

In [20]:
class client:
    """One simulated federated-learning participant with its own local dataset.

    Parameters
    ----------
    ID : client identifier shown by ``__repr__``.
    local_data : mapping with keys ``'training'`` and ``'labels'``; both are
        sliced as ``[lower:upper, :]`` and transposed in ``set_up_update``.
    method : training algorithm selector; 1 = one gradient step per local step
        (``train_1gradstep``), 2 = one scipy BFGS iteration (``train_1scipyminstep``).
    availability : truthy when the client can take part in a round.
    num_steps : number of local training steps per round.
    delay_scaling : scale applied to a random [0, 1) draw when ``random_delays`` is set.
    random_delays : if true, draw the download/upload delays randomly;
        otherwise use ``download_delay`` / ``upload_delay`` as given.
    download_delay, upload_delay : fixed delays (seconds) used by ``simulate_delay``.
    current_round : local round counter; also selects the data window in ``set_up_update``.
    lr : learning rate (step size for method 1; reported to the server for aggregation).
    alphaF, alphaD : penalty weights forwarded to the training functions.
    verbose : forwarded as the display flag of the scipy-based trainer.
    """

    # Hard coded attributes
    # THESE SHOULD NOT CHANGE, SHARED FOR THE ENTIRE CLASS
    num_updates = 19
    starting_update = 10
    # Copied in rather than loaded, because it is faster
    update_ix = (0,  1200,  2402,  3604,  4806,  6008,  7210,  8412,  9614, 10816, 12018, 13220, 14422, 15624, 16826, 18028, 19230, 20432, 20769)

    def __init__(self, ID, local_data, method, availability, num_steps=1, delay_scaling=5, random_delays=False, download_delay=1, upload_delay=1, current_round=0, lr=0.25, alphaF=1e-7, alphaD=1e-3, verbose=False):
        # NOT INPUT
        self.type = 'Client'
        self.chosen_status = 0
        # INPUT
        # Client ID number
        self.ID = ID
        # Local dataset
        self.training_data = local_data['training']
        self.labels = local_data['labels']
        # Linear regression weights AKA the decoder
        self.w = np.zeros((2,64))
        # Which training algorithm to use
        self.method = method
        # Availability for training
        self.availability = availability
        # Number of gradient steps to take when training (eg amount of local computation)
        self.num_steps = num_steps
        # Boolean setting whether or not up/download delays should be random or predefined
        self.random_delays = random_delays
        # Scaling from random [0,1] to number of seconds
        self.delay_scaling = delay_scaling
        # Set the delay times
        if self.random_delays:
            # np.random is used because a bare random() is not imported in this notebook's visible cells
            self.download_delay = np.random.random()*self.delay_scaling
            self.upload_delay = np.random.random()*self.delay_scaling
        else:
            self.download_delay = download_delay
            self.upload_delay = upload_delay
        # Local round number (for asynch FL)
        self.current_round = current_round
        # ML Parameters / Conditions
        self.lr = lr
        self.alphaF = alphaF
        self.alphaD = alphaD
        self.verbose = verbose

        self.local_update = self.starting_update

        # TODO: handle train-test split
        # Should probably create F, D, H, V, learning_batch, alphaF, alphaD
        # Note that these would need to be updated with each update

    def __repr__(self):
        return f"{self.type} model: {self.ID}\nCurrent Round: {self.current_round}\nTraining Method: {self.method}"

    def execute_training_loop(self):
        """Prepare the current round's data window, then train on it."""
        self.set_up_update()
        self.train_model()
        #self.send_update()  # Should this be contained in the loop or no

    def simulate_delay(self, incoming):
        '''
        Inputs:
            incoming: [0, 1] --> [upload, download]

        Purpose:
            Sleep for the configured download (incoming) or upload delay,
            plus a random extra of up to one second.
        '''
        base_delay = self.download_delay if incoming else self.upload_delay
        time.sleep(base_delay + np.random.random())

    def set_up_update(self):
        """Slice out the data for the current round and compute the training inputs.

        Sets ``learning_batch``, ``F``, ``V``, ``D`` and ``H`` on the instance.

        Raises
        ------
        IndexError
            If ``current_round`` does not select a valid window of ``update_ix``
            (round 0 would otherwise silently give an empty batch).
        """
        if not 1 <= self.current_round < len(self.update_ix):
            raise IndexError(
                f"current_round must be in [1, {len(self.update_ix) - 1}] to select a data window, got {self.current_round}"
            )
        lower_bound = self.update_ix[self.current_round-1]
        upper_bound = self.update_ix[self.current_round]
        self.learning_batch = upper_bound - lower_bound
        # The local dataset is stored as training_data / labels in __init__
        s = np.transpose(self.training_data[lower_bound:upper_bound,:])
        p_intended = np.transpose(self.labels[lower_bound:upper_bound,:])
        v_intended, _ = output_new_decoder(s, self.w, p_intended)

        self.F = s[:,:-1] # note: truncate F for estimate_decoder
        self.V = v_intended
        self.D = self.w
        self.H = np.zeros((2,2))

    def train_model(self):
        """Run ``num_steps`` local training steps with the configured method."""
        if self.method not in (1, 2):
            print("Unrecognized method")
            return
        for _ in range(self.num_steps):
            if self.method == 1:
                # The step size is the client's learning rate
                self.w = train_1gradstep(self.w, self.lr, self.F, self.D, self.H, self.V, self.learning_batch, self.alphaF, self.alphaD)
                # Keep D in sync so that the next step's gradient is taken at the updated weights
                self.D = self.w
            else:
                # NOTE(review): each step starts from a fresh random D0, so earlier
                # steps do not carry over for this method; confirm this is intended
                D0 = np.random.rand(2,64)
                result = train_1scipyminstep(self.w, self.lr, self.F, self.D, self.H, self.V, self.learning_batch, self.alphaF, self.alphaD, D0, self.verbose)
                # Accept either a scipy OptimizeResult (weights in .x) or a plain array
                self.w = np.reshape(getattr(result, 'x', result), D0.shape)

    def test_model(self):
        """Not implemented yet."""
        # Execute a training loop but don't update D?
        # Then report accuracy in terms of predicted vs ground truth
        pass

    def set_update(self, new_model):
        """Receive the global model from the server and start the next local round from it."""
        #simulate_delay(incoming=True)
        self.global_model = new_model
        # Train from a private copy so local training never mutates the server's array
        self.w = copy.deepcopy(new_model)
        # Update the local round number to reflect the new data
        # I don't think it matters if the update happens on up/download, as long as everyone is consistent
        self.current_round += 1

    def get_update(self):
        """Return the locally trained weights."""
        #simulate_delay(incoming=False)
        return self.w

    def get_weights(self):
        """Return the local decoder weights."""
        return self.w

    def you_have_been_chosen(self):
        self.chosen_status = 1

    def set_availability(self, input_avail):
        self.availability = input_avail

    def get_availability(self):
        return self.availability

    def get_chosen_status(self):
        return self.chosen_status

    def reset_chosen(self):
        self.chosen_status = 0

    def get_learning_rate(self):
        return self.lr

    #####################################################################################################################
    #####################################################################################################################
    #####################################################################################################################

    def do_train_test_split(self):
        """Not implemented yet."""
        pass

    def simulate_data_streams(self):
        """Not implemented yet."""
        pass


In [21]:
# Simulation settings (same values as the defaults of run_fl_sim)
epochs = 10
num_users = 14
C = 0.1
local_epochs = 10
local_batch_sz = 10
lr = 0.01
SGD_momentum = 0.5
optimizer = 'sgd'
iid = 0
unequal = 0
stopping_rounds = 10
verbose = True
seed = 1

# Reload the cached per-user condition-0 dictionaries
with open(f"{path}{filename}", 'rb') as fp:
    cond0_dict_list = pickle.load(fp)

# Three clients for now, all using method 2, all available, with delay_scaling=0
user0, user1, user2 = (
    client(user_id, cond0_dict_list[user_id], 2, 1, delay_scaling=0)
    for user_id in range(3)
)
user_database = [user0, user1, user2]
#user_database = [user0, user1, user2, user3, user4, user5, user6, user7, user8, user9, user10, user11, user12, user13]
#user_database = [user0, user1, user2, user3, user4, user5, user6, user7, user8, user9, user10, user11, user12, user13]

In [22]:
# Random 2x64 initial decoder for the server
D_0 = np.random.rand(2, 64)
# Server with ID 0 over the client list; all other settings keep their defaults
global_model = server(ID=0, all_clients=user_database, D0=D_0)

In [None]:
#import os
#import copy
#import time
#import pickle
#import numpy as np
#from tqdm import tqdm
#from ARJ_update import LocalUpdate, test_inference
#from ARJ_my_models import MLP, CNNMnist, CNNFashion_Mnist, CNNCifar
#from ARJ_utils import get_dataset, average_weights, exp_details

######################################################################

# define paths
#path_project = os.path.abspath('..')
#logger = SummaryWriter('../logs')

# load dataset and user groups
#train_dataset, test_dataset, user_groups = get_dataset(args)
# Change the above to use these:
#'data_path','training_data','labels'

# Set the model to train and send it to device.
# KAI: I think this actually sends it to the GPU, not a "client" in the FL sense
#global_model.to(device)
#global_model.train()
# NOTE(review): the rest of this cell appears to be adapted from a PyTorch
# federated-learning script and is not runnable as-is. It uses names that are
# not defined in the visible notebook (args, tqdm, LocalUpdate, average_weights,
# test_inference, train_dataset, test_dataset, user_groups, logger, start_time)
# and calls .train()/.eval()/.load_state_dict() on global_model.
D_0 = np.random.rand(2,64)  #Ws_block1[keys[0]][0,0,:,:]
#global_model_obj = global_model()
# NOTE(review): global_model_obj is never assigned (its creation is commented out
# just above); the server instance in this notebook is bound to `global_model` -
# confirm which name is intended.
print(global_model_obj)

# copy weights
# In this case, our "model" [linear regression] is represented by the matrix D
# And D is also just the weights
global_weights = global_model_obj.get_weights()

# Training
# Bookkeeping for the per-round metrics collected below
train_loss, train_accuracy = [], []
val_acc_list, net_list = [], []
cv_loss, cv_acc = [], []
print_every = 2
val_loss_pre, counter = 0, 0

# Recall, I am not working with epochs
# I don't think? Maybe replace epochs with iterations
for epoch in tqdm(range(args.epochs)):
    local_weights, local_losses = [], []
    print(f'\n | Global Training Round : {epoch+1} |\n')

    global_model.train()
    # Sample a fraction of the users for this round (at least one)
    m = max(int(args.frac * args.num_users), 1)
    idxs_users = np.random.choice(range(args.num_users), m, replace=False)

    # Each sampled user trains on a deep copy of the global model
    for idx in idxs_users:
        local_model = LocalUpdate(args=args, dataset=train_dataset,
                                  idxs=user_groups[idx], logger=logger)
        w, loss = local_model.update_weights(
            model=copy.deepcopy(global_model), global_round=epoch)
        local_weights.append(copy.deepcopy(w))
        local_losses.append(copy.deepcopy(loss))

    # update global weights
    global_weights = average_weights(local_weights)

    # load the averaged weights into the global model
    global_model.load_state_dict(global_weights)

    loss_avg = sum(local_losses) / len(local_losses)
    train_loss.append(loss_avg)

    # Calculate avg training accuracy over all users at every epoch
    list_acc, list_loss = [], []
    global_model.eval()
    for c in range(args.num_users):
        # NOTE(review): this indexes user_groups with `idx` (left over from the
        # training loop above) rather than the loop variable `c` - confirm.
        local_model = LocalUpdate(args=args, dataset=train_dataset,
                                  idxs=user_groups[idx], logger=logger)
        acc, loss = local_model.inference(model=global_model)
        list_acc.append(acc)
        list_loss.append(loss)
    train_accuracy.append(sum(list_acc)/len(list_acc))

    # print global training loss after every 'i' rounds
    if (epoch+1) % print_every == 0:
        print(f' \nAvg Training Stats after {epoch+1} global rounds:')
        print(f'Training Loss : {np.mean(np.array(train_loss))}')
        print('Train Accuracy: {:.2f}% \n'.format(100*train_accuracy[-1]))

# Test inference after completion of training
test_acc, test_loss = test_inference(args, global_model, test_dataset)

print(f' \n Results after {args.epochs} global rounds of training:')
print("|---- Avg Train Accuracy: {:.2f}%".format(100*train_accuracy[-1]))
print("|---- Test Accuracy: {:.2f}%".format(100*test_acc))

# Saving the objects train_loss and train_accuracy:
file_name = '../save/objects/{}_{}_{}_C[{}]_iid[{}]_E[{}]_B[{}].pkl'.\
    format(args.dataset, args.model, args.epochs, args.frac, args.iid,
           args.local_ep, args.local_bs)

with open(file_name, 'wb') as f:
    pickle.dump([train_loss, train_accuracy], f)

# NOTE(review): start_time is not assigned anywhere in the visible notebook.
print('\n Total Run Time: {0:0.4f}'.format(time.time()-start_time))

# PLOTTING (optional)
# import matplotlib
# import matplotlib.pyplot as plt
# matplotlib.use('Agg')

# Plot Loss curve
# plt.figure()
# plt.title('Training Loss vs Communication rounds')
# plt.plot(range(len(train_loss)), train_loss, color='r')
# plt.ylabel('Training loss')
# plt.xlabel('Communication Rounds')
# plt.savefig('../save/fed_{}_{}_{}_C[{}]_iid[{}]_E[{}]_B[{}]_loss.png'.
#             format(args.dataset, args.model, args.epochs, args.frac,
#                    args.iid, args.local_ep, args.local_bs))
#
# # Plot Average Accuracy vs Communication rounds
# plt.figure()
# plt.title('Average Accuracy vs Communication rounds')
# plt.plot(range(len(train_accuracy)), train_accuracy, color='k')
# plt.ylabel('Average Accuracy')
# plt.xlabel('Communication Rounds')
# plt.savefig('../save/fed_{}_{}_{}_C[{}]_iid[{}]_E[{}]_B[{}]_acc.png'.
#             format(args.dataset, args.model, args.epochs, args.frac,
#                    args.iid, args.local_ep, args.local_bs))