In [1]:
from PIL import Image
import numpy as np
import os
import cv2
import keras
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Conv2D,MaxPooling2D,Dense,Flatten,Dropout
import pandas as pd
import sys
%matplotlib inline
from scipy.spatial.distance import euclidean as euc
import matplotlib.pyplot as plt
import random
import plotly.express as px
import numpy
import tensorflow as tf
import requests

In [2]:
def readData(filepath, label):
    cells = []
    labels = []
    file = os.listdir(filepath)
    for img in file:
        try:
            image = cv2.imread(filepath + img)
            image_from_array = Image.fromarray(image, 'RGB')
            size_image = image_from_array.resize((50, 50))
            cells.append(np.array(size_image))
            labels.append(label)
        except AttributeError as e:
            print('Skipping file: ', img, e)
    print(len(cells), ' Data Points Read!')
    return np.array(cells), np.array(labels)

In [3]:
TestParasitizedCells, TestParasitizedLabels = readData('./input/fed/test/Parasitized/', 1)
TestUninfectedCells, TestUninfectedLabels  = readData('./input/fed/test/Uninfected/', 0)

2740  Data Points Read!
2783  Data Points Read!


In [4]:
def update(name, Cells, Labels, globalId):
    
    s = np.arange(Cells.shape[0])
    np.random.shuffle(s)
    Cells = Cells[s]
    Labels = Labels[s]
    
    num_classes=len(np.unique(Labels))
    len_data=len(Cells)
    print(len_data, ' Data Points')
    
    (x_train)=Cells
    (y_train)=Labels
    
    # Since we're working on image data, we normalize data by divinding 255.
    x_train = x_train.astype('float32')/255 
    train_len=len(x_train)
    
    #Doing One hot encoding as classifier has multiple classes
    y_train=keras.utils.to_categorical(y_train,num_classes)
    
    #creating sequential model
    model=Sequential()
    model.add(Conv2D(filters=16,kernel_size=2,padding="same",activation="relu",input_shape=(50,50,3)))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Conv2D(filters=32,kernel_size=2,padding="same",activation="relu"))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Conv2D(filters=64,kernel_size=2,padding="same",activation="relu"))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Dropout(0.2))
    model.add(Flatten())
    model.add(Dense(500,activation="relu"))
    model.add(Dropout(0.2))
    model.add(Dense(2,activation="softmax"))#2 represent output layer neurons 
    # model.summary()
    
    if globalId != 1:
        model.load_weights("./weights/global"+str(globalId)+".h5")

    # compile the model with loss as categorical_crossentropy and using adam optimizer
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    #Fit the model with min batch size as 50[can tune batch size to some factor of 2^power ] 
    model.fit(x_train, y_train, batch_size=10, epochs=10, verbose=1)
    print(model.summary())
    
    #Saving Model
    model.save("./weights/"+str(name)+".h5")
    return len_data, model

In [5]:
def unison_shuffled_copies(a, b):
    assert len(a) == len(b)
    p = numpy.random.permutation(len(a))
    return a[p], b[p]

In [6]:
print('Reading Training Data')
ParasitizedCells, ParasitizedLabels = readData('./input/cell_images/Parasitized/', 1)
UninfectedCells, UninfectedLabels  = readData('./input/cell_images/Uninfected/', 0)

Reading Training Data
Skipping file:  Thumbs.db 'NoneType' object has no attribute '__array_interface__'
13779  Data Points Read!
Skipping file:  Thumbs.db 'NoneType' object has no attribute '__array_interface__'
13779  Data Points Read!


In [7]:
Cells = np.concatenate((ParasitizedCells, UninfectedCells))
Labels = np.concatenate((ParasitizedLabels, UninfectedLabels))
Cells, Labels = unison_shuffled_copies(Cells, Labels)

In [8]:
def getDataLen(trainingDict):
    n = 0
    for w in trainingDict:
#         print(w)
        n += trainingDict[w]
    print('Total number of data points after this round: ', n)
    return n

def assignWeights(trainingDf, trainingDict):
    n = getDataLen(trainingDict)
    trainingDf['Weightage'] = trainingDf['DataSize'].apply(lambda x: x/n)
    return trainingDf, n
    
def scale(weight, scaler):
    scaledWeights = []
    for i in range(len(weight)):
        scaledWeights.append(scaler * weight[i])
    return scaledWeights

def getWeight(d):
    #creating sequential model
    model=Sequential()
    model.add(Conv2D(filters=16,kernel_size=2,padding="same",activation="relu",input_shape=(50,50,3)))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Conv2D(filters=32,kernel_size=2,padding="same",activation="relu"))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Conv2D(filters=64,kernel_size=2,padding="same",activation="relu"))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Dropout(0.2))
    model.add(Flatten())
    model.add(Dense(500,activation="relu"))
    model.add(Dropout(0.2))
    model.add(Dense(2,activation="softmax"))#2 represent output layer neurons 
    # model.summary()
    
    fpath = "./weights/"+d+".h5"
    model.load_weights(fpath)
    weight = model.get_weights()
    return weight

def getScaledWeight(d, scaler):
    #creating sequential model
    model=Sequential()
    model.add(Conv2D(filters=16,kernel_size=2,padding="same",activation="relu",input_shape=(50,50,3)))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Conv2D(filters=32,kernel_size=2,padding="same",activation="relu"))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Conv2D(filters=64,kernel_size=2,padding="same",activation="relu"))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Dropout(0.2))
    model.add(Flatten())
    model.add(Dense(500,activation="relu"))
    model.add(Dropout(0.2))
    model.add(Dense(2,activation="softmax"))#2 represent output layer neurons 
    # model.summary()
    
    fpath = "./weights/"+d+".h5"
    model.load_weights(fpath)
    weight = model.get_weights()
    return scale(weight, scaler)

def avgWeights(scaledWeights):
    avg = list()
    for weight_list_tuple in zip(*scaledWeights):
        layer_mean = tf.math.reduce_sum(weight_list_tuple, axis=0)
        avg.append(layer_mean)
    return avg

def FedAvg(trainingDict):
    trainingDf = pd.DataFrame.from_dict(trainingDict, orient='index', columns=['DataSize']) 
    models = list(trainingDict.keys())
    scaledWeights = []
    trainingDf, dataLen = assignWeights(trainingDf, trainingDict)
    for m in models:
        scaledWeights.append(getScaledWeight(m, trainingDf.loc[m]['Weightage']))
    fedAvgWeight = avgWeights(scaledWeights)
    return fedAvgWeight, dataLen


def saveModel(weight, n):
    
    print('Reading Testing Data')
    
    TestCells = np.concatenate((TestParasitizedCells, TestUninfectedCells))
    TestLabels = np.concatenate((TestParasitizedLabels, TestUninfectedLabels))
    
    sTest = np.arange(TestCells.shape[0])
    np.random.shuffle(sTest)
    TestCells = TestCells[sTest]
    TestLabels = TestLabels[sTest]
    
    num_classes=len(np.unique(TestLabels))
    
    (x_test) = TestCells
    (y_test) = TestLabels
    
    # Since we're working on image data, we normalize data by divinding 255.
    x_test = x_test.astype('float32')/255
    test_len=len(x_test)
    
    #Doing One hot encoding as classifier has multiple classes
    y_test=keras.utils.to_categorical(y_test,num_classes)

    #creating sequential model
    model=Sequential()
    model.add(Conv2D(filters=16,kernel_size=2,padding="same",activation="relu",input_shape=(50,50,3)))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Conv2D(filters=32,kernel_size=2,padding="same",activation="relu"))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Conv2D(filters=64,kernel_size=2,padding="same",activation="relu"))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Dropout(0.2))
    model.add(Flatten())
    model.add(Dense(500,activation="relu"))
    model.add(Dropout(0.2))
    model.add(Dense(2,activation="softmax"))#2 represent output layer neurons 
    # model.summary()
    
    model.set_weights(weight)

    # compile the model with loss as categorical_crossentropy and using adam optimizer
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    scores = model.evaluate(x_test, y_test)
    print("Loss: ", scores[0])        #Loss
    print("Accuracy: ", scores[1])    #Accuracy

    #Saving Model
    fpath = "./weights/global"+str(n)+".h5"
    model.save(fpath)
    return scores[0], scores[1]

def euclidean(m, n):
    distance = []
    for i in range(len(m)):
#         print(i)
        distance.append(euc(m[i].reshape(-1,1), n[i].reshape(-1,1)))
#     print(distance)
    distance = sum(distance)/len(m)
    return distance

def merge(trainingDict, b):
#     print(trainingDict)
    models = list(trainingDict.keys())
#     print(models)
    trainingDf = pd.DataFrame.from_dict(trainingDict, orient='index', columns=['DataSize'])
    l_weights = []
    g_weight = {}
#     print(models)
    for m in models:
#         print(m)
        if 'global' in m:
            g_weight['name'] = m
            g_weight['weight'] = getWeight(m)
        else:
            l_weights.append({
                'name': m,
                'weight': getWeight(m)
            })
#     print(g_weight)
    scores = {}
            
    for m in l_weights:
        scores[m['name']] = euclidean(m['weight'], g_weight['weight'])

    sortedScores = {k: v for k, v in sorted(scores.items(), key=lambda item: item[1])}
    
#     print(scores)
#     print(sortedScores)

    b = int(len(scores)*b)
    
    selected = []
    
    for i in range(b):
        selected.append((sortedScores.popitem())[0])

    newDict = {}
    for i in trainingDict.keys():
        if (i not in selected) and ('global' not in i):
            newDict[i] = trainingDict[i]

    print('Selections: ', newDict)
    
    NewGlobal, dataLen = FedAvg(newDict)
    
    return NewGlobal, dataLen
    

In [9]:
per_client_batch_size = 1000

In [10]:
curr_local = 0
curr_global = 0

In [None]:
local = {}
loss_array = []
acc_array = []
for i in range(0, len(Cells), per_client_batch_size):
    if int(curr_global) == 0:
        curr_global += 1
        name = 'global' + str(curr_global)
        l, m = update(name, Cells[i:i+per_client_batch_size], Labels[i:i+per_client_batch_size], curr_global)
        local[name] = l
    elif (curr_local != 0) and (int(curr_local)%5 == 0):
        curr_global += 1
        print('Current Global: ', curr_global)
        name = 'global' + str(curr_global)
        m, l = merge(local, 0.25)
        loss, acc = saveModel(m, curr_global)
        loss_array.append(loss)
        acc_array.append(acc)
        curr_local += 1
        local = {}
        local[name] = l
    else:
        print('Current Local: ', curr_local)
        name = str('local'+str(curr_local))
        curr_local += 1
        l, m = update(name, Cells[i:i+per_client_batch_size], Labels[i:i+per_client_batch_size], curr_global)
        local[name] = l

1000  Data Points
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 50, 50, 16)        208       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 25, 25, 16)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 25, 25, 32)        2080      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 12, 12, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 12, 12, 64)        8256      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 6, 6, 64)          0   

Current Local:  2
1000  Data Points
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_9 (Conv2D)            (None, 50, 50, 16)        208       
_________________________________________________________________
max_pooling2d_9 (MaxPooling2 (None, 25, 25, 16)        0         
_________________________________________________________________
conv2d_10 (Conv2D)           (None, 25, 25, 32)        2080      
_________________________________________________________________
max_pooling2d_10 (MaxPooling (None, 12, 12, 32)        0         
_________________________________________________________________
conv2d_11 (Conv2D)           (None, 12, 12, 64)        8256      
_________________________________________________________________
max_pooling2d_11 (MaxPooling (None, 6, 

In [20]:
#accuracy #per_client_batch_size: 2000
print(acc_array)
fig = px.line(y=acc_array)
fig.show()

[0.5038928389549255, 0.504254937171936]


In [24]:
#accuracy #per_client_batch_size: 1500
print(acc_array)
fig = px.line(y=acc_array)
fig.show()

[0.5038928389549255, 0.9167119264602661, 0.9427847266197205]


In [None]:
#accuracy #per_client_batch_size: 1000
print(acc_array)
fig = px.line(y=acc_array)
fig.show()

In [28]:
#accuracy #per_client_batch_size: 500
print(acc_array)
fig = px.line(y=acc_array)
fig.show()

[0.5038928389549255, 0.5066087245941162, 0.6737280488014221, 0.8229223489761353, 0.8978815674781799, 0.9264892339706421, 0.9460437893867493, 0.9514756202697754, 0.9429658055305481, 0.949846088886261]


In [32]:
#accuracy #per_client_batch_size: 100
print(acc_array)
fig = px.line(y=acc_array)
fig.show()

[0.5047981142997742, 0.49610719084739685, 0.49610719084739685, 0.5350353121757507, 0.49610719084739685, 0.49610719084739685, 0.49610719084739685, 0.49610719084739685, 0.6809704899787903, 0.7394531965255737, 0.6608726978302002, 0.7486873269081116, 0.7148289084434509, 0.7662502527236938, 0.7816404104232788, 0.7535759806632996, 0.752127468585968, 0.7787434458732605, 0.8361397981643677, 0.8064457774162292, 0.8658337593078613, 0.8920876383781433, 0.8960709571838379, 0.9042187333106995, 0.9016838669776917, 0.9027702212333679, 0.9071156978607178, 0.9076588749885559, 0.9013217687606812, 0.9125475287437439, 0.903313398361206, 0.9129096269607544, 0.9250407218933105, 0.9261270761489868, 0.9270324110984802, 0.927756667137146, 0.9188846349716187, 0.9279376864433289, 0.927756667137146, 0.938258171081543, 0.9299293756484985, 0.9435089826583862, 0.9407930374145508, 0.9409741163253784, 0.9409741163253784, 0.9404309391975403, 0.9330074191093445, 0.9422415494918823, 0.9378960728645325, 0.9455006122589111

In [None]:
#accuracy #per_client_batch_size: 100, 10 epochs
print(acc_array)
fig = px.line(y=acc_array)
fig.show()

In [36]:
#accuracy #per_client_batch_size: 50
print(acc_array)
fig = px.line(y=acc_array)
fig.show()

[0.4785442650318146, 0.49610719084739685, 0.6299113035202026, 0.49610719084739685, 0.5100488662719727, 0.5315951704978943, 0.6688393950462341, 0.6559840440750122, 0.5529603362083435, 0.6784356236457825, 0.6159695982933044, 0.6619590520858765, 0.6947311162948608, 0.6768060922622681, 0.6069165468215942, 0.7046895027160645, 0.6972659826278687, 0.6979902386665344, 0.7179069519042969, 0.683686375617981, 0.6940068602561951, 0.7249683141708374, 0.7213470935821533, 0.758826732635498, 0.766793429851532, 0.7729494571685791, 0.7227956056594849, 0.7805540561676025, 0.7838131189346313, 0.7743979692459106, 0.7410827279090881, 0.74053955078125, 0.795582115650177, 0.7856237292289734, 0.7899692058563232, 0.8332427740097046, 0.82256019115448, 0.8460981249809265, 0.85370272397995, 0.832337498664856, 0.8497193455696106, 0.8491761684417725, 0.8663769960403442, 0.8837588429450989, 0.8719898462295532, 0.8844830989837646, 0.867825448513031, 0.9009596109390259, 0.8944414258003235, 0.8935361504554749, 0.8656527

In [378]:
#loss
fig = px.line(y=loss_array)
fig.show()