# Imports

In [1]:
import torchvision
import torchvision.transforms as transforms
import torch
from sklearn.model_selection import train_test_split
import numpy as np
from torch.utils.data import Dataset, DataLoader, random_split, Subset, WeightedRandomSampler
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import random
import math
import torch.nn as nn
from torch.utils.data import TensorDataset
from sklearn.metrics import accuracy_score
# Seed every RNG the notebook relies on, not only Python's `random`:
# NumPy, and torch (model initialisation and DataLoader shuffling), so that a
# fresh "Restart & Run All" reproduces the same run.
np.random.seed(7)
torch.manual_seed(7)
random.seed(7)

# Data preparation

In [3]:
from IPython.display import clear_output 


# Install gdown (imported by the next cell to download the dataset files),
# then wipe the pip log from the cell output.
! pip install gdown
clear_output()

In [4]:
import gdown
# Download the pickled image array to X_UCM.pickle in the working directory.
url = 'https://drive.google.com/uc?id=1MAakvkGyUNiqZNKnnpsreVGDCJnS5Czx' 
output_X = 'X_UCM.pickle'
gdown.download(url, output_X, quiet=False)

# Download the pickled label array to Y_UCM.pickle in the working directory.
url_y = 'https://drive.google.com/uc?id=1mSLimMpZXqYuUJgL4D4bhtnlwzhKTgOh' 
output_Y = 'Y_UCM.pickle'
gdown.download(url_y, output_Y, quiet=False)

In [5]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code stored in the file.
# Only load these two pickles if the download source is trusted.
# The context managers close each file handle as soon as it has been read
# (the handles were previously left open).
with open("./X_UCM.pickle", "rb") as pickle_in:
    X = pickle.load(pickle_in)

with open("./Y_UCM.pickle", "rb") as pickle_in:
    Y = pickle.load(pickle_in)

In [6]:
import cv2
from matplotlib import pyplot as plt
import random

# Preview randomly chosen images in a rows x columns grid, each titled with
# its label value from Y.
fig = plt.figure(figsize=(10, 7))

rows = 2
columns = 2


for i in range(rows * columns):
    fig.add_subplot(rows, columns, i+1)
    # Draw the index from the actual dataset length: the former fixed upper
    # bound of 2000 never showed images past that index and would raise
    # IndexError on a smaller X.
    rand = random.randrange(len(X))
    plt.imshow(X[rand])
    plt.axis('off')
    plt.title(Y[rand])

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the samples, stratified by label; the remaining 90% is the
# training set.
x_train, x_test, y_train, y_test = train_test_split(
X, Y, test_size=0.1, random_state=42,stratify=Y)

# Split the held-out part again (stratified): 80% stays in x_test / y_test and
# 20% goes to x_valid / y_valid.
# NOTE(review): x_valid / y_valid are not referenced again in the visible cells.
x_test, x_valid, y_test, y_valid = train_test_split(
x_test, y_test, test_size=0.2, random_state=42,stratify=y_test)

In [8]:
# Shape of the training images as returned by train_test_split (before the
# conversion to a tensor below).
x_train.shape

In [9]:
# Number of labels left in the test split after carving out the validation part.
y_test.shape

In [10]:
# Convert the training images to a tensor and move the channel axis to the
# front: (N, H, W, C) -> (N, C, H, W), the layout nn.Conv2d expects.
# permute() keeps each pixel's three channel values together, whereas a plain
# reshape to (N, 3, 256, 256) only reinterprets the flat buffer and scrambles
# every image. The images are channels-last: they are shown with plt.imshow
# in the preview cell, which requires (H, W, 3).
x_train = torch.tensor(x_train).permute(0, 3, 1, 2).contiguous()
# Fail loudly if the layout assumption is wrong (or the cell is run twice).
assert x_train.shape[1:] == (3, 256, 256), x_train.shape
y_train = torch.tensor(y_train)
y_test = torch.tensor(y_test)

In [11]:
# Convert the test images to a tensor and move the channel axis to the front:
# (N, H, W, C) -> (N, C, H, W). permute() is required here; a reshape to
# (N, 3, 256, 256) would reinterpret the flat buffer and scramble the pixels
# of these channels-last images. The sample count is taken from the data
# instead of being hard-coded.
x_test = torch.tensor(x_test).permute(0, 3, 1, 2).contiguous()
# Fail loudly if the layout assumption is wrong (or the cell is run twice).
assert x_test.shape[1:] == (3, 256, 256), x_test.shape

In [12]:
# Number of training labels (one per training image).
y_train.shape

# Analyze distributions

In [13]:
# Integer class label -> class name. The position of a name in the list is
# its label, so the order below must not be changed.
output_mapping = dict(enumerate([
    "agricultural",
    "airplane",
    "baseballdiamond",
    "beach",
    "buildings",
    "chaparral",
    "denseresidential",
    "forest",
    "freeway",
    "golfcourse",
    "harbor",
    "intersection",
    "mediumresidential",
    "mobilehomepark",
    "overpass",
    "parkinglot",
    "river",
    "runway",
    "sparseresidential",
    "storagetanks",
    "tenniscourt",
]))
    
 

In [14]:
# unique(return_counts=True) returns (sorted unique labels, count per label);
# index [1] below is therefore the per-class count.
train_counts=y_train.unique(return_counts=True)
test_counts=y_test.unique(return_counts=True)

# For every class, print which percentage of its train+test samples landed in
# each split.
# NOTE(review): indexing the counts by i assumes all 21 labels occur in both
# splits, so that position i corresponds to label i.
for i in range(21):
    t=(100*train_counts[1][i]/(train_counts[1][i] + test_counts[1][i])).item()
    v=(100*test_counts[1][i]/(train_counts[1][i] + test_counts[1][i])).item()
    print( (output_mapping[i] + ": Train= {t:.2f}%;  Test= {v:.2f}%").format(t=t, v=v))
    

# Prepare data for federated learning

In [15]:
def split_and_shuffle_labels(y_data, seed, amount, label_names=None):
    """Group sample indices by label and keep a shuffled subset of each group.

    Args:
        y_data: 1-D sequence of integer class labels (list, array or CPU tensor).
        seed: Seed of the shuffle. A fresh generator with this seed is used for
            every label, so the result is deterministic for a given input.
        amount: Maximum number of rows kept per label.
        label_names: Optional mapping ``label id -> label name``. Defaults to
            the notebook-level ``output_mapping``.

    Returns:
        dict mapping each label name to a DataFrame with the columns
        ``"labels"`` (the label id) and ``"i"`` (the row position of the
        sample in ``y_data``), holding at most ``amount`` shuffled rows.
    """
    if label_names is None:
        label_names = output_mapping

    y_data = pd.DataFrame(y_data, columns=["labels"])
    y_data["i"] = np.arange(len(y_data))

    label_dict = dict()
    for label_id, var_name in label_names.items():
        label_info = y_data[y_data["labels"] == label_id]
        # A private RandomState yields the same permutation as
        # np.random.seed(seed) + np.random.permutation(...), but leaves the
        # global NumPy RNG state untouched.
        rng = np.random.RandomState(seed)
        label_info = rng.permutation(label_info)
        label_info = label_info[0:amount]
        label_dict[var_name] = pd.DataFrame(label_info, columns=["labels", "i"])
    return label_dict

This function splits the data by label into 21 dataframes (each dataframe contains one unique label and the indices of the pictures annotated with that label).

They are stored in a dictionary where the key is the label name and the value is the corresponding dataframe.

In [16]:
def get_iid_subsamples_indices(label_dict, number_of_samples, amount):
    """Deal the per-label index frames out evenly to ``number_of_samples`` devices.

    Every device receives ``floor(amount / number_of_samples)`` consecutive rows
    of each label frame; rows beyond ``number_of_samples`` times that count are
    not assigned to any device.

    Args:
        label_dict: Mapping label name -> DataFrame with the columns "labels"
            and "i", as produced by ``split_and_shuffle_labels``. Every frame
            in the mapping contributes to every device, in mapping order.
        number_of_samples: Number of devices (sub-samples) to create.
        amount: Number of rows available per label.

    Returns:
        dict mapping "sample0", "sample1", ... to a DataFrame holding that
        device's rows, with a fresh 0..n-1 index.
    """
    batch_size = int(math.floor(amount / number_of_samples))
    print(batch_size)

    # Iterate over the frames that were passed in rather than over a
    # hard-coded range(21) resolved through the global output_mapping.
    label_frames = list(label_dict.values())

    sample_dict = dict()
    for i in range(number_of_samples):
        start, stop = i * batch_size, (i + 1) * batch_size
        parts = [frame.iloc[start:stop] for frame in label_frames]
        # Concatenate once per device: growing a frame by repeatedly
        # concatenating onto an empty DataFrame is quadratic and relies on
        # pandas' deprecated handling of empty entries to keep the "i" column
        # an integer column.
        dumb = pd.concat(parts, axis=0) if parts else pd.DataFrame()
        dumb = dumb.reset_index(drop=True)
        sample_dict["sample" + str(i)] = dumb
    return sample_dict

This function returns a dictionary with one key per device (8 in this notebook), named "sample" + i. Every device receives the same number of images per class, `floor(amount / number_of_samples)`; images left over after this even split are not assigned to any device.

In [17]:
# Training partition: keep up to 90 shuffled indices per class, then deal them
# out to 8 devices (floor(90 / 8) = 11 images per class and device).
label_dict_train=split_and_shuffle_labels(y_data=y_train, seed=1, amount=90)
sample_dict_train=get_iid_subsamples_indices(label_dict=label_dict_train, number_of_samples=8, amount=90)


# Test partition: keep up to 8 shuffled indices per class, i.e. one image per
# class and device.
label_dict_test=split_and_shuffle_labels(y_data=y_test, seed=1, amount=8)
sample_dict_test=get_iid_subsamples_indices(label_dict=label_dict_test, number_of_samples=8, amount=8)

In [18]:
# Labels assigned to the first device, to eyeball that every class is present.
sample_dict_train['sample0']['labels'] 

Each sample contains 231 images (11 per class × 21 classes). The 2 images per class that remain after this even split (90 − 8 × 11) are not assigned to any sample.

In [19]:
def create_iid_subsamples(sample_dict, x_data, y_data, x_name, y_name):
    """Materialise the per-device feature and label subsets.

    For every entry "sample<k>" of ``sample_dict`` the row indices in its "i"
    column are sorted ascending and used to index ``x_data`` and ``y_data``.

    Args:
        sample_dict: Mapping "sample0", "sample1", ... -> DataFrame with an
            "i" column of row indices.
        x_data: Indexable feature container (first axis = samples).
        y_data: Indexable label container aligned with ``x_data``.
        x_name: Key prefix for the feature dictionary.
        y_name: Key prefix for the label dictionary.

    Returns:
        Tuple ``(x_data_dict, y_data_dict)`` keyed by ``x_name + k`` and
        ``y_name + k`` respectively.
    """
    features_by_node, labels_by_node = {}, {}

    for node in range(len(sample_dict)):
        picked = sample_dict[f"sample{node}"]["i"]
        # Sorted so each subset keeps the original relative order of the rows.
        order = np.sort(np.array(picked))

        features_by_node[f"{x_name}{node}"] = x_data[order, :]
        labels_by_node[f"{y_name}{node}"] = y_data[order]

    return features_by_node, labels_by_node

This function extracts the x_train, y_train, x_test and y_test dictionaries from the index dictionary built in the previous step. Each element of a dictionary belongs to one device.

In [20]:
x_train_dict, y_train_dict = create_iid_subsamples(sample_dict=sample_dict_train, x_data=x_train, y_data=y_train, x_name="x_train", y_name="y_train")
x_test_dict, y_test_dict = create_iid_subsamples(sample_dict=sample_dict_test, x_data=x_test, y_data=y_test, x_name="x_test", y_name="y_test")

# Model

In [22]:
class cnn(nn.Module):
    """Small CNN: two conv/batch-norm/ReLU/max-pool blocks and three linear layers.

    The first linear layer is sized for 3-channel 256x256 inputs: each 5x5
    convolution with padding 1 shrinks a side by 2 and each pooling halves it
    (256 -> 254 -> 127 -> 125 -> 62), leaving 16 feature maps of 62x62.
    The output is a batch of 21 raw class scores (no softmax applied).
    """

    def __init__(self):
        super(cnn, self).__init__()

        self.layer1 = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=32, kernel_size=5, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        self.layer2 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=16, kernel_size=5, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )

        self.fc1 = nn.Linear(in_features=16*62*62, out_features=300)
        self.fc2 = nn.Linear(in_features=300, out_features=120)
        self.fc3 = nn.Linear(in_features=120, out_features=21)

    def forward(self, x):
        """Return the 21 class scores for a batch ``x`` of shape (N, 3, 256, 256).

        The input is cast to float, so integer image tensors are accepted.
        """
        # The per-call shape prints were debugging leftovers; they flooded the
        # output on every batch and have been removed.
        out = self.layer1(x.float())
        out = self.layer2(out)
        out = out.view(out.size(0), -1)
        # Without a non-linearity in between, the three linear layers collapse
        # into a single linear map; ReLU has no parameters, so the state_dict
        # layout is unchanged.
        out = torch.relu(self.fc1(out))
        out = torch.relu(self.fc2(out))
        return self.fc3(out)

# class cnn(nn.Module):
#     def __init__(self):
#         super(cnn, self).__init__()
#         self.conv1 = nn.Conv2d(3, 12, 5, 1,1)
#         self.conv2 = nn.Conv2d(12,20,5,1,1)
#         self.fc1 = nn.Linear(20*62*62, 120)
#         self.fc2 = nn.Linear(120, 84)
#         self.fc3 = nn.Linear(84, 21)

#     def forward(self, x):
#         x = F.relu(self.conv1(x))
#         x = F.max_pool2d(x, 2, 2)
#         x = F.relu(self.conv2(x))
#         x = F.max_pool2d(x, 2, 2)
#         x = x.view(-1, 20*62*62)
#         x = F.relu(self.fc1(x))
#         x = F.relu(self.fc2(x))
#         x = self.fc3(x)
#         return F.log_softmax(x, dim=1)

In [23]:
def create_model_optimizer_criterion_dict(number_of_samples):
    """Create one model, one SGD optimizer and one loss object per device.

    The optimizers use the notebook-level ``learning_rate`` and ``momentum``.
    The name of each model is printed as it is created.

    Args:
        number_of_samples: Number of devices to create objects for.

    Returns:
        Tuple ``(model_dict, optimizer_dict, criterion_dict)`` keyed by
        "model<k>", "optimizer<k>" and "criterion<k>".
    """
    models, optimizers, criteria = {}, {}, {}

    for node in range(number_of_samples):
        model_key = f"model{node}"
        print(model_key)
        net = cnn()
        models[model_key] = net

        # Each optimizer is bound to the parameters of its own local model.
        optimizers[f"optimizer{node}"] = torch.optim.SGD(
            net.parameters(), lr=learning_rate, momentum=momentum
        )

        criteria[f"criterion{node}"] = nn.CrossEntropyLoss()

    return models, optimizers, criteria

As with the dataset dictionaries, we create one local model per device and store each model under its own key.

In [24]:
# Training hyper-parameters, read as globals by the functions in this notebook.
number_of_samples=8  # number of local devices / local models
learning_rate = 0.03  # SGD learning rate of the local optimizers
numEpoch = 5  # local epochs each device trains per round
batch_size = 32  # DataLoader batch size for local training
momentum = 0.9  # SGD momentum of the local optimizers

In [25]:
def get_averaged_weights(model_dict, number_of_samples):
    """Average the convolution and linear parameters of the local models.

    Args:
        model_dict: Mapping model name -> model exposing ``fc1``, ``fc2``,
            ``fc3``, ``layer1[0]`` and ``layer2[0]``. The first
            ``number_of_samples`` entries (in insertion order) are averaged.
        number_of_samples: Number of local models to average; also the divisor.

    Returns:
        Tuple of ten tensors in this order: fc1 weight, fc1 bias, fc2 weight,
        fc2 bias, fc3 weight, fc3 bias, first conv weight, first conv bias,
        second conv weight, second conv bias.

    Raises:
        IndexError: if ``model_dict`` holds fewer than ``number_of_samples``
            models (or is empty).
    """
    # Resolve the models from the argument itself instead of the notebook-level
    # ``name_of_models`` list, which is defined in a later cell and may not
    # describe the dictionary that was passed in.
    models = list(model_dict.values())

    def _averaged_params(model):
        # Order defines the order of the returned tuple.
        # NOTE(review): the BatchNorm layers (layer1[1], layer2[1]) are not
        # part of the average, as before; confirm that this is intended.
        return (
            model.fc1.weight, model.fc1.bias,
            model.fc2.weight, model.fc2.bias,
            model.fc3.weight, model.fc3.bias,
            model.layer1[0].weight, model.layer1[0].bias,
            model.layer2[0].weight, model.layer2[0].bias,
        )

    with torch.no_grad():
        sums = [torch.zeros_like(param.data) for param in _averaged_params(models[0])]

        for i in range(number_of_samples):
            # Every layer is read from model i. The convolution layers used to
            # be read from model 0 on each pass, so their "average" was just a
            # copy of the first model's weights.
            for total, param in zip(sums, _averaged_params(models[i])):
                total += param.data

        means = tuple(total / number_of_samples for total in sums)

    return means


This function takes the average of the weights and biases of the individual nodes.
We start by initializing a zero tensor for the weights and for the biases of each layer, then add to it the current weights and biases of each node (i.e. sample).
Finally, we divide each sum by the number of nodes to obtain the average weights and biases over all samples.


In [26]:
def set_averaged_weights_as_main_model_weights_and_update_main_model(main_model,model_dict, number_of_samples):
    """Overwrite the main model's conv/linear parameters with the node average.

    Only ``fc1``, ``fc2``, ``fc3``, ``layer1[0]`` and ``layer2[0]`` are written;
    no other layer of ``main_model`` is touched by this function.

    Args:
        main_model: Model that receives the averaged parameters (modified in place).
        model_dict: Mapping model name -> local model, passed on to
            ``get_averaged_weights``.
        number_of_samples: Number of local models to average.

    Returns:
        The same ``main_model`` object.
    """
    (
        fc1_w, fc1_b,
        fc2_w, fc2_b,
        fc3_w, fc3_b,
        conv1_w, conv1_b,
        conv2_w, conv2_b,
    ) = get_averaged_weights(model_dict, number_of_samples=number_of_samples)

    # (destination layer, averaged weight, averaged bias)
    assignments = (
        (main_model.fc1, fc1_w, fc1_b),
        (main_model.fc2, fc2_w, fc2_b),
        (main_model.fc3, fc3_w, fc3_b),
        (main_model.layer1[0], conv1_w, conv1_b),
        (main_model.layer2[0], conv2_w, conv2_b),
    )

    with torch.no_grad():
        for layer, mean_weight, mean_bias in assignments:
            # clone() so the main model never shares storage with the averages.
            layer.weight.data = mean_weight.data.clone()
            layer.bias.data = mean_bias.data.clone()

    return main_model

This function sends the averaged weights of individual local devices to the main model and sets them as the new weights of the main model.

In [27]:
def send_main_model_to_nodes_and_update_model_dict(main_model, model_dict, number_of_samples):
    """Copy the main model's conv/linear parameters into the local models.

    Only ``fc1``, ``fc2``, ``fc3``, ``layer1[0]`` and ``layer2[0]`` are copied;
    no other layer of the local models is touched by this function.

    Args:
        main_model: Source of the parameters.
        model_dict: Mapping model name -> local model. The first
            ``number_of_samples`` entries (in insertion order) are updated in place.
        number_of_samples: Number of local models to update.

    Returns:
        The same ``model_dict`` object.

    Raises:
        IndexError: if ``model_dict`` holds fewer than ``number_of_samples`` models.
    """
    def _synced_layers(model):
        return (model.fc1, model.fc2, model.fc3, model.layer1[0], model.layer2[0])

    # Resolve the nodes from the argument itself instead of the notebook-level
    # ``name_of_models`` list, which is defined in a later cell and may not
    # describe the dictionary that was passed in.
    nodes = list(model_dict.values())

    with torch.no_grad():
        for i in range(number_of_samples):
            for source, target in zip(_synced_layers(main_model), _synced_layers(nodes[i])):
                # clone() gives every node its own copy, so later local
                # training cannot modify the main model.
                target.weight.data = source.weight.data.clone()
                target.bias.data = source.bias.data.clone()

    return model_dict

This function sends the parameters of the main model to the nodes. It returns an updated version of  model_dict.


In [28]:
def train(model, train_loader, criterion, optimizer):
    """Train ``model`` for one epoch over ``train_loader``.

    Args:
        model: Network to train; switched to training mode.
        train_loader: DataLoader yielding ``(data, target)`` batches.
        criterion: Loss taking ``(output, int64 target)``.
        optimizer: Optimizer bound to ``model``'s parameters.

    Returns:
        Tuple ``(mean loss per batch, fraction of correctly classified samples)``.
    """
    model.train()

    # Run everything on the device the model lives on. The previous
    # unconditional ``.cuda()`` calls crashed on machines without a GPU and,
    # with a CPU model, only moved the already computed outputs to the GPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    train_loss = 0.0
    correct = 0

    for data, target in train_loader:
        data = data.to(device)
        # CrossEntropyLoss expects int64 class indices.
        target = target.to(device).type(torch.int64)

        output = model(data)
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        prediction = output.argmax(dim=1, keepdim=True)
        correct += prediction.eq(target.view_as(prediction)).sum().item()

    return train_loss / len(train_loader), correct/len(train_loader.dataset)

This function is used to train a model on its data (train_loader), with a specified criterion and optimizer. 

In [29]:
def validation(model, test_loader, criterion):
    """Evaluate ``model`` on ``test_loader`` without updating it.

    Args:
        model: Network to evaluate; switched to evaluation mode.
        test_loader: DataLoader yielding ``(data, target)`` batches.
        criterion: Loss taking ``(output, int64 target)``.

    Returns:
        Tuple ``(mean loss per batch, fraction of correctly classified samples)``.
    """
    model.eval()

    # Run everything on the device the model lives on. The previous
    # unconditional ``.cuda()`` calls crashed on machines without a GPU and,
    # with a CPU model, only moved the already computed outputs to the GPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    test_loss = 0.0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data = data.to(device)
            # CrossEntropyLoss expects int64 class indices.
            target = target.to(device).type(torch.int64)

            output = model(data)
            test_loss += criterion(output, target).item()
            prediction = output.argmax(dim=1, keepdim=True)
            correct += prediction.eq(target.view_as(prediction)).sum().item()

    test_loss /= len(test_loader)
    correct /= len(test_loader.dataset)

    return (test_loss, correct)


This function evaluates the model on the test data

In [30]:
def start_train_end_node_process(number_of_samples):
    """Train every local model on its own data subset and print the progress.

    For each of the ``number_of_samples`` nodes, the node's train and test
    subsets are wrapped in DataLoaders and its model is trained for
    ``numEpoch`` epochs, printing train and validation accuracy per epoch.
    All data, models, optimizers and criteria are read from notebook globals.
    """
    for node in range(number_of_samples):
        local_train = TensorDataset(
            x_train_dict[name_of_x_train_sets[node]],
            y_train_dict[name_of_y_train_sets[node]],
        )
        train_dl = DataLoader(local_train, batch_size=batch_size, shuffle=True)

        local_test = TensorDataset(
            x_test_dict[name_of_x_test_sets[node]],
            y_test_dict[name_of_y_test_sets[node]],
        )
        # No gradients are kept during evaluation, so a larger batch is used.
        test_dl = DataLoader(local_test, batch_size=batch_size*2)

        model = model_dict[name_of_models[node]]
        criterion = criterion_dict[name_of_criterions[node]]
        optimizer = optimizer_dict[name_of_optimizers[node]]

        print("Subset" ,node)
        for epoch in range(numEpoch):
            train_loss, train_accuracy = train(model, train_dl, criterion, optimizer)
            val_loss, val_accuracy = validation(model, test_dl, criterion)

            progress = (
                "epoch: {:3.0f}".format(epoch+1)
                + " | train accuracy: {:7.5f}".format(train_accuracy)
                + " | validation accuracy: {:7.5f}".format(val_accuracy)
            )
            print(progress)

This function trains every node (local device) in turn, each for 5 epochs (`numEpoch`) on its own mini-dataset, and prints the accuracies after every epoch.

In [31]:
def start_train_end_node_process_without_print(number_of_samples):
    """Train every local model on its own data subset without printing.

    For each of the ``number_of_samples`` nodes, the node's train and test
    subsets are wrapped in DataLoaders and its model is trained and evaluated
    for ``numEpoch`` epochs. The per-epoch metrics are computed but discarded.
    All data, models, optimizers and criteria are read from notebook globals.
    """
    for node in range(number_of_samples):
        local_train = TensorDataset(
            x_train_dict[name_of_x_train_sets[node]],
            y_train_dict[name_of_y_train_sets[node]],
        )
        train_dl = DataLoader(local_train, batch_size=batch_size, shuffle=True)

        local_test = TensorDataset(
            x_test_dict[name_of_x_test_sets[node]],
            y_test_dict[name_of_y_test_sets[node]],
        )
        test_dl = DataLoader(local_test, batch_size=batch_size)

        model = model_dict[name_of_models[node]]
        criterion = criterion_dict[name_of_criterions[node]]
        optimizer = optimizer_dict[name_of_optimizers[node]]

        for epoch in range(numEpoch):
            train_loss, train_accuracy = train(model, train_dl, criterion, optimizer)
            val_loss, val_accuracy = validation(model, test_dl, criterion)


This is the same function as the previous one, except that it does not print the progress of each node.

In [32]:
# Central (server) model that receives the averaged weights of the nodes.
main_model = cnn()
# NOTE(review): main_optimizer is not used by any of the visible cells; the
# main model is only updated by weight averaging.
main_optimizer = torch.optim.SGD(main_model.parameters(), lr=0.01, momentum=0.9)
# Loss used when evaluating the main model with validation().
main_criterion = nn.CrossEntropyLoss()

The created model is a Convolutional network. We create an instance from the defined cnn class along with the SGD optimizer and CrossEntropyLoss to track the error.

In [33]:
# One model / optimizer / criterion per device, keyed "model<k>",
# "optimizer<k>" and "criterion<k>".
model_dict, optimizer_dict, criterion_dict = create_model_optimizer_criterion_dict(number_of_samples)
#fc1_mean_weight, fc1_mean_bias, fc2_mean_weight, fc2_mean_bias, fc3_mean_weight, fc3_mean_bias, conv2d1_mean_weight, conv2d1_mean_bias, conv2d2_mean_weight, conv2d2_mean_bias = get_averaged_weights(model_dict, number_of_samples)

In [34]:
# Key lists of the data partitions; position k holds the key of device k.
# The training and aggregation functions look devices up through these lists.
name_of_x_train_sets=list(x_train_dict.keys())
name_of_y_train_sets=list(y_train_dict.keys())
name_of_x_test_sets=list(x_test_dict.keys())
name_of_y_test_sets=list(y_test_dict.keys())

# Key lists of the models, optimizers and criteria, in the same device order.
name_of_models=list(model_dict.keys())
name_of_optimizers=list(optimizer_dict.keys())
name_of_criterions=list(criterion_dict.keys())


In [35]:
# Before synchronisation: the first five fc2 weights of the main model and of
# one local model, for comparison.
print(main_model.fc2.weight[0:1,0:5])
print(model_dict["model1"].fc2.weight[0:1,0:5])

In [36]:
# Give every local model the main model's conv/linear parameters so that all
# nodes start from the same weights.
model_dict=send_main_model_to_nodes_and_update_model_dict(main_model, model_dict, number_of_samples)

In [37]:
# After synchronisation: the same two weight slices should now be identical.
print(main_model.fc2.weight[0:1,0:5])
print(model_dict["model1"].fc2.weight[0:1,0:5])

We can see that the updates from the main model have been made successfully.

In [None]:
# First round of local training: every node trains on its own subset and the
# per-epoch accuracies are printed.
start_train_end_node_process(number_of_samples)

This training is only done for 1 iteration.
We can see that the accuracies are not that great.

It's normal because we haven't sent the local trained weights to the main model. Once we send the weights to the main model, they're averaged and sent back to the local nodes. Then, this process is repeated until we achieve 10 iterations.

In [None]:
# Loader over the complete test split, used to evaluate the main model.
test_ds = TensorDataset(x_test, y_test)
test_dl = DataLoader(test_ds, batch_size=batch_size * 2)

After training the local models for 1 iteration, and before their weights are averaged by the main model, the main model has very bad accuracy because it is still randomly initialized.
\
We must send the weights of the local nodes to the main model in order to average them and get a more reasonable accuracy.

After training local models for 1 iteration and after sending the weights and after averaging them by the main model we can see that the main model still has medium accuracy. 
\
We must train for more iterations in local nodes and repeat the same process.

In [43]:
# Ten federated rounds: broadcast the main weights, train locally, average the
# local weights into the main model, then evaluate it on the full test split.
for i in range(10):
    #send weights to local nodes
    model_dict=send_main_model_to_nodes_and_update_model_dict(main_model, model_dict, number_of_samples)
    #train local nodes
    start_train_end_node_process_without_print(number_of_samples)
    
    # average the local weights and load them into the main model
    main_model= set_averaged_weights_as_main_model_weights_and_update_main_model(main_model,model_dict, number_of_samples) 
    test_loss, test_accuracy = validation(main_model, test_dl, main_criterion)
    # Rounds are labelled from 2; presumably the printed local training run
    # earlier in the notebook counts as iteration 1.
    print("Iteration", str(i+2), ": main_model accuracy on all test data: {:7.4f}".format(test_accuracy))

The same iteration is repeated 10 times, and the accuracy improves gradually as the main model learns more from the local nodes in each iteration.
\
It is not a perfect result and may perform worse than normal training on one single device, but the results are quite interesting.