In [1]:
# !pip install torch
# !pip install torchvision
# !pip install torchsummary
# !pip install pandas
# !pip install numpy
# !pip install matplotlib
# !pip install tqdm
# !pip install tensor
# !pip install tensorflow
# !pip install matplotlib
# !pip install scikit-learn

In [2]:
import os
import sys
import pickle
import json
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model, Sequential, clone_model, load_model
from tensorflow.keras.datasets import cifar10, cifar100, mnist, fashion_mnist
from tensorflow.keras.layers import Input, Dense, add, concatenate, Conv2D, Dropout, BatchNormalization, Flatten, MaxPooling2D, AveragePooling2D, Activation, Dropout, Reshape
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.keras.losses import SparseCategoricalCrossentropy,KLDivergence
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.model_selection import StratifiedShuffleSplit

In [3]:
class Distiller(Model):
    def __init__(self, student, teacher):
        super(Distiller, self).__init__()
        self.teacher = teacher
        self.student = student

    def compile(
        self,
        optimizer,
        metrics,
        student_loss_fn,
        distillation_loss_fn,
        alpha=0.1,
        temperature=3,
    ):
        """ Configure the distiller.

        Args:
            optimizer: Keras optimizer for the student weights
            metrics: Keras metrics for evaluation
            student_loss_fn: Loss function of difference between student
                predictions and ground-truth
            distillation_loss_fn: Loss function of difference between soft
                student predictions and soft teacher predictions
            alpha: weight to student_loss_fn and 1-alpha to distillation_loss_fn
            temperature: Temperature for softening probability distributions.
                Larger temperature gives softer distributions.
        """
        super(Distiller, self).compile(optimizer=optimizer, metrics=metrics)
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature

    def train_step(self, data):
        # Unpack data
        x, y = data

        # Forward pass of teacher
        teacher_predictions = self.teacher(x, training=False)

        with tf.GradientTape() as tape:
            # Forward pass of student
            student_predictions = self.student(x, training=True)

            # Compute losses
            student_loss = self.student_loss_fn(y, student_predictions)

            # Compute scaled distillation loss from https://arxiv.org/abs/1503.02531
            # The magnitudes of the gradients produced by the soft targets scale
            # as 1/T^2, multiply them by T^2 when using both hard and soft targets.
            distillation_loss = (
                self.distillation_loss_fn(
                    tf.nn.softmax(teacher_predictions / self.temperature, axis=1),
                    tf.nn.softmax(student_predictions / self.temperature, axis=1),
                )
                * self.temperature**2
            )

            loss = self.alpha * student_loss + (1 - self.alpha) * distillation_loss

        # Compute gradients
        trainable_vars = self.student.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Update the metrics configured in `compile()`.
        self.compiled_metrics.update_state(y, student_predictions)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update(
            {"student_loss": student_loss, "distillation_loss": distillation_loss}
        )
        return results

    def test_step(self, data):
        # Unpack the data
        x, y = data

        # Compute predictions
        y_prediction = self.student(x, training=False)

        # Calculate the loss
        student_loss = self.student_loss_fn(y, y_prediction)

        # Update the metrics.
        self.compiled_metrics.update_state(y, y_prediction)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update({"student_loss": student_loss})
        return results

In [4]:
def compile_model(cnn_model, lr):
    cnn_model.compile(optimizer=Adam(learning_rate=lr),
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=[SparseCategoricalAccuracy()])

    
def create_cnn_model(model_params, input_size, n_classes, lr=0.001):
    input_layer = Input(input_size) 
    reshape_layer = Reshape((input_size[0], input_size[1], 1)) if len(input_size) == 2 else Reshape(input_size)
  
    layers = []
    layers.append(input_layer)
    layers.append(reshape_layer)

    for i in range(0, len(model_params["conv_layers"]), 1):
        layers.append(Conv2D(model_params["conv_layers"][i], model_params["kernel_sizes"][i], strides=(2, 2), padding="same"))
        layers.append(Activation(model_params["activation"]))
        layers.append(MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding="same"))
        layers.append(Dropout(model_params["dropout"]))
    
    flatten_layer = Flatten()
    dense_layer_1 = Dense(120, kernel_regularizer=tf.keras.regularizers.l2(0.001))
    output_layer = Dense(n_classes, kernel_regularizer=tf.keras.regularizers.l2(0.001))
    output_activation = Activation("softmax")
    layers.append(flatten_layer)
    layers.append(dense_layer_1)
    layers.append(output_layer)
#     layers.append(output_activation)
    cnn_model = Sequential(
        layers,
        name=model_params["model_name"],
      )
    
    compile_model(cnn_model, lr)
#     print("------------------- {}: Model Summary -------------------".format(model_params["model_name"]))
#     print(cnn_model.summary())

    return cnn_model


def remove_last_layer(model, loss = "mean_absolute_error"):
    """
    Input: Keras model, a classification model whose last layer is a softmax activation
    Output: Keras model, the same model with the last softmax activation layer removed,
        while keeping the same parameters 
    """
    
    new_model = Model(inputs = model.inputs, outputs = model.layers[-2].output)
    new_model.set_weights(model.get_weights())
    new_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), 
                      loss=loss)
    
    return new_model

In [5]:
batch_size = 32
epochs = 3
is_shuffle = True
client_learning_rate = 0.001
server_learning_rate = 0.001

In [6]:
client_params = {
    "client_info": [ 
        {
        "conv_layers": [128, 128], "kernel_sizes": [(3, 3), (2, 2)],
        "dropout": 0.1, "activation": "relu"
        },
        {
        "conv_layers": [128, 256], "kernel_sizes": [(3, 3), (2, 2)],
        "dropout": 0.1, "activation": "relu"
        },
        {
        "conv_layers": [256, 256], "kernel_sizes": [(3, 3), (2, 2)],
        "dropout": 0.2, "activation": "relu"
        },
        {
        "conv_layers": [256, 384], "kernel_sizes": [(3, 3), (3, 3)],
        "dropout": 0.2, "activation": "relu"
        },
        {
        "conv_layers": [128, 384], "kernel_sizes": [(2, 2), (3, 3)],
        "dropout": 0.1, "activation": "relu"
        },
                {
        "conv_layers": [128, 128], "kernel_sizes": [(3, 3), (2, 2)],
        "dropout": 0.1, "activation": "relu"
        },
        {
        "conv_layers": [128, 192], "kernel_sizes": [(3, 3), (2, 2)],
        "dropout": 0.1, "activation": "relu"
        },
        {
        "conv_layers": [192, 256], "kernel_sizes": [(3, 3), (2, 2)],
        "dropout": 0.2, "activation": "relu"
        },
        {
        "conv_layers": [192, 384], "kernel_sizes": [(3, 3), (3, 3)],
        "dropout": 0.2, "activation": "relu"
        },
        {
        "conv_layers": [256, 512], "kernel_sizes": [(2, 2), (3, 3)],
        "dropout": 0.2, "activation": "relu"
        },
#         {
#         "conv_layers": [128, 128, 128], "kernel_sizes": [(3, 3), (3, 3), (2, 2)],
#         "dropout": 0.1, "activation": "relu"
#         },
#         {
#         "conv_layers": [256, 256, 256], "kernel_sizes": [(3, 3), (3, 3), (2, 2)],
#         "dropout": 0.15, "activation": "relu"
#         },
#         {
#         "conv_layers": [128, 384, 256], "kernel_sizes": [(3, 3), (3, 3), (3, 3)],
#         "dropout": 0.2, "activation": "relu"
#         },
#         {
#         "conv_layers": [128, 256, 256], "kernel_sizes": [(3, 3), (3, 3), (2, 2)],
#         "dropout": 0.2, "activation": "relu"
#         },
#         {
#         "conv_layers": [256, 384, 256], "kernel_sizes": [(3, 3), (3, 3), (3, 3)],
#         "dropout": 0.2, "activation": "relu"
#         }
    ],
    "pre_train_params": {
        "min_delta": 0.005, "patience": 3, "verbose": 1,
        "batch_size": batch_size, "epochs": epochs, "is_shuffle": is_shuffle, 
    }
}

In [7]:
global_model_params = {
    "conv_layers": [256, 256],
    "kernel_sizes": [(3, 3), (3, 3)],
    "dropout": 0.2,
    "activation": "relu",
    "pre_train_params": {"min_delta": 0.005, "patience": 3,
                         "batch_size": batch_size, "epochs": epochs, "is_shuffle": is_shuffle, 
                         "verbose": 1},
}

In [8]:
# Data Utilities
def normalize(x):
    x_shape = x.shape
    # print("Initial X Shape:", x_shape)
    x = x.astype("float32") / 255.0
    # x = np.reshape(x, (-1, x_shape[1], x_shape[2], 1))
    # print("Final X Shape:", x.shape)
    return x

def get_class_count(y):
    y = y.tolist()
    y = y if type(y[0]) == int else [ele[0] for ele in y]
    return len(set(y))

def load_dataset(dataset_name):
    dataset_name = dataset_name.replace("-", "").upper()
    if dataset_name == "MNIST":
        (x_train, y_train), (x_test, y_test) = mnist.load_data()
    elif dataset_name == "FASHIONMNIST":
        (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()
    elif dataset_name == "CIFAR10":
        (x_train, y_train), (x_test, y_test) = cifar10.load_data()
    elif dataset_name == "CIFAR100":
        (x_train, y_train), (x_test, y_test) = cifar100.load_data()
    else:
        print("Invalid Dataset Requested!")
        return None
    return normalize(x_train), y_train, normalize(x_test), y_test

In [9]:
num_clients = len(client_params["client_info"])
server_name_prefix, client_name_prefix = "Server_", "Client_"

# dataset_name, classes_per_client, training_samples_per_class = "mnist", 3, 1000
dataset_name, classes_per_client, training_samples_per_class = "fashionmnist", 3, 750
# dataset_name, classes_per_client, training_samples_per_class = "cifar10", 3, 1000
# dataset_name, classes_per_client, training_samples_per_class = "cifar100", 10, 100

dataset = load_dataset(dataset_name)

if dataset is None:
    raise Exception("Issue with Dataset Retrieval")
else: 
    x_train, y_train, x_test, y_test = dataset
    x_train_size, IMG_SIZE = x_train.shape[0], x_train.shape[1:]
    x_test_size = x_test.shape
    num_classes = get_class_count(y_train)
    print(type(x_train), type(y_train), type(x_test), type(y_test))
    print(type(x_train[0]), type(y_train[0]), type(x_test[0]), type(y_train[0]))
    print("Dataset:", dataset_name)
    print("Training Data Size:", x_train_size)
    print("Testing Data Size:", x_test_size)
    print("IMAGE SIZE:", IMG_SIZE)
    print("# of Classes:", num_classes)

<class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
<class 'numpy.ndarray'> <class 'numpy.uint8'> <class 'numpy.ndarray'> <class 'numpy.uint8'>
Dataset: fashionmnist
Training Data Size: 60000
Testing Data Size: (10000, 28, 28)
IMAGE SIZE: (28, 28)
# of Classes: 10


In [10]:
def clean_data(data):
    data = np.ndarray(data)
    print(data.shape)
    np.random.shuffle(data)
    return data
            
def partition_into_classes(x, y):
    print("Begin Dataset Class Partition")
    dataset_class_indices, dataset_partitions = {}, {}
    y_list = y.tolist()
    for i in range(x.shape[0]):
        val = y_list[i] if type(y_list[i]) in [int] else y_list[i][0]
        if val in dataset_class_indices:
            dataset_class_indices[val].append(i)
        else:
            dataset_class_indices[val] = [i]
    
#     unique_classes = list(dataset_class_indices.keys())
#     for val in unique_classes:
#         dataset_partitions[val] = {"x": x[dataset_class_indices[val]], "y": y[dataset_class_indices[val]]}
# #     for c in unique_classes:
# #         print(c, len(dataset_partitions[c]["x"]), len(dataset_partitions[c]["y"]))
#     print(type(dataset_partitions[1]["x"]), dataset_partitions[1]["x"].shape)
    print("Completed Dataset Class Partitioning!")
    return dataset_class_indices

def partition_dataset(dataset, dataset_name="mnist", num_clients=10, classes_per_client=3, training_samples_per_class=500):
    x_train, y_train, x_test, y_test = dataset
    train_dataset_class_indices = partition_into_classes(x_train, y_train)
    test_dataset_class_indices = partition_into_classes(x_test, y_test)
    train_classes, test_classes = list(train_dataset_class_indices.keys()), list(test_dataset_class_indices.keys())
    train_classes.sort()
    test_classes.sort()
    client_data = {}
    print("Distributing Data to Clients:")
    if train_classes != test_classes:
        print("Mismatch in Class Info during Partitioning!")
        client_data = None
    elif dataset == "cifar100":
        np.random.shuffle(train_classes)
        distribution_factor = len(train_classes) // num_clients
        for client in range(1, num_clients, 1):
            final_training_indices, final_testing_indices = [], []
            client_data[client] = {}
            for i in range((client - 1) * distribution_factor, client * distribution_factor, 1):
                final_training_indices += train_dataset_class_indices[val][0: training_samples_per_class]
                final_testing_indices += test_dataset_class_indices[val]
            
            client_data[client]["x_train"] = x_train[final_training_indices]
            client_data[client]["y_train"] = y_train[final_training_indices]
            client_data[client]["x_test"] = x_test[final_testing_indices]
            client_data[client]["y_test"] = y_test[final_testing_indices]
            print(client, client_data[client]["x_train"].shape, client_data[client]["y_train"].shape, client_data[client]["x_test"].shape, client_data[client]["y_test"].shape)
    
    else:
        client_class_distribution = {1: [0, 2, 6], 2: [3, 4, 7], 3: [1, 5, 9], 4: [0, 7, 8], 5: [1, 2, 7], 6: [4, 6, 8], 7: [2, 3, 5], 8: [4, 8, 9], 9: [0, 5, 9], 10: [1, 3, 6]}
        class_count = {val: 0 for val in train_classes}
        for client in client_class_distribution:
            final_training_indices, final_testing_indices = [], []
            client_data[client] = {}
            for val in client_class_distribution[client]:
                start_idx, end_idx = class_count[val] * training_samples_per_class, (class_count[val] + 1) * training_samples_per_class
                final_training_indices += train_dataset_class_indices[val][start_idx: end_idx]
                final_testing_indices += test_dataset_class_indices[val]
                class_count[val] += 1
                
#             print("CC:", class_count)
            client_data[client]["x_train"] = x_train[final_training_indices]
            client_data[client]["y_train"] = y_train[final_training_indices]
            client_data[client]["x_test"] = x_test[final_testing_indices]
            client_data[client]["y_test"] = y_test[final_testing_indices]
            print(client, client_data[client]["x_train"].shape, client_data[client]["y_train"].shape, client_data[client]["x_test"].shape, client_data[client]["y_test"].shape)
    
    print("Data Distribution to Clients Complete!")
    return client_data

In [11]:
def create_client_models(model_list, input_size, n_classes, lr=0.001, model_name_prefix="Client_"):
    models = {}
    for i, model_params in enumerate(model_list):
        model_name = model_name_prefix + "".join([str(cnt) + "_" for cnt in model_params["conv_layers"]]).strip("_")
        model_params["model_name"] = model_name
        models[model_name_prefix + str(i + 1)] = {
            "model_name": model_name,
            "model_params": model_params,
            "model": create_cnn_model(model_params, input_size, n_classes, lr),
        }
    return models

In [12]:
partitioned_client_data = partition_dataset(dataset, dataset_name, num_clients, classes_per_client, training_samples_per_class)

Begin Dataset Class Partition
Completed Dataset Class Partitioning!
Begin Dataset Class Partition
Completed Dataset Class Partitioning!
Distributing Data to Clients:
1 (2250, 28, 28) (2250,) (3000, 28, 28) (3000,)
2 (2250, 28, 28) (2250,) (3000, 28, 28) (3000,)
3 (2250, 28, 28) (2250,) (3000, 28, 28) (3000,)
4 (2250, 28, 28) (2250,) (3000, 28, 28) (3000,)
5 (2250, 28, 28) (2250,) (3000, 28, 28) (3000,)
6 (2250, 28, 28) (2250,) (3000, 28, 28) (3000,)
7 (2250, 28, 28) (2250,) (3000, 28, 28) (3000,)
8 (2250, 28, 28) (2250,) (3000, 28, 28) (3000,)
9 (2250, 28, 28) (2250,) (3000, 28, 28) (3000,)
10 (2250, 28, 28) (2250,) (3000, 28, 28) (3000,)
Data Distribution to Clients Complete!


In [13]:
clients = client_params["client_info"]
server = [global_model_params]
client_model_data = create_client_models(model_list=clients, input_size=IMG_SIZE, n_classes=num_classes, lr=client_learning_rate, model_name_prefix="Client_")
server_model_data = create_client_models(model_list=server, input_size=IMG_SIZE, n_classes=num_classes, lr=server_learning_rate, model_name_prefix="Server_") 

2022-12-04 02:48:15.775439: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-04 02:48:15.785448: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-04 02:48:15.786205: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-04 02:48:15.787631: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [14]:
for client in partitioned_client_data:
    client_id = client_name_prefix + str(client)
    client_model_data[client_id]["dataset"] = partitioned_client_data[client]

print("Client Models Data:", client_model_data["Client_1"].keys())
print("Server Models Data:", server_model_data)

Client Models Data: dict_keys(['model_name', 'model_params', 'model', 'dataset'])
Server Models Data: {'Server_1': {'model_name': 'Server_256_256', 'model_params': {'conv_layers': [256, 256], 'kernel_sizes': [(3, 3), (3, 3)], 'dropout': 0.2, 'activation': 'relu', 'pre_train_params': {'min_delta': 0.005, 'patience': 3, 'batch_size': 32, 'epochs': 3, 'is_shuffle': True, 'verbose': 1}, 'model_name': 'Server_256_256'}, 'model': <keras.engine.sequential.Sequential object at 0x7fee30056890>}}


In [15]:
def recompile_models(clients, lr):
    client_labels = list(clients.keys())
    for client in client_labels:
        cnn_model = clients[client]["model"]
        compile_model(cnn_model, lr=lr)
    print("Model Re-compilation Complete!")

def average_model_weights(weights):
    averages = [1 / len(weights) for _ in range(len(weights))]
    n_layers = len(weights[0])

    avg_model_weights = list()
    for layer in range(n_layers):
        layer_weights = np.array([weight[layer] for weight in weights])
        avg_layer_weights = np.average(layer_weights, axis=0, weights=averages)
        avg_model_weights.append(avg_layer_weights)
 
    return avg_model_weights
        
def train_client(client_model, server_model, dataset, dt, train_epochs=2):
    x_train, y_train, x_test, y_test = dataset["x_train"], dataset["y_train"], dataset["x_test"], dataset["y_test"]
    
    print("Client Fitting:")
    client_model.fit(x_train, y_train, epochs=train_epochs, batch_size=16)
    
    distiller = Distiller(student=server_model, teacher=client_model)
    distiller.compile(
    optimizer=Adam(),
    metrics=[SparseCategoricalAccuracy()],
    student_loss_fn=SparseCategoricalCrossentropy(from_logits=True),
    distillation_loss_fn=KLDivergence(),
    alpha=0.1,
    temperature=dt,
)
    print("Server Fitting:")
    distiller.fit(x_train, y_train, epochs=train_epochs, batch_size=16)
    return client_model, distiller.student


def federated_distillation(clients, server, comm_rounds, distillation_temperature, client_selection="all"):
    client_labels = list(clients.keys())
    for comm_round in range(1, comm_rounds + 1, 1):
        print("Communication Round: {}/{}".format(comm_round, comm_rounds))
        client_labels = client_labels if client_selection == "all" else []
        complete_server_model_weights = []
        for client in client_labels:
            print("Training:", client)
            _, server_model = train_client(clients[client]["model"], server["Server_1"]["model"], clients[client]["dataset"], distillation_temperature, train_epochs=1)
            complete_server_model_weights.append(server_model.get_weights())
        
        server["Server_1"]["model"].set_weights(average_model_weights(complete_server_model_weights))
        if comm_round % 5 == 0 or comm_round == comm_rounds or True:
            print("Validating for Communication Round: {}/{}".format(comm_round, comm_rounds))
            for client in client_labels:
                client_model, x_test, y_test = clients[client]["model"], clients[client]["dataset"]["x_test"], clients[client]["dataset"]["y_test"]
                server_model = server["Server_1"]["model"] 
                print(client, "Validation Metrics:", client_model.evaluate(x_test, y_test))
                print("Server Validation Metrics:", server_model.evaluate(x_test, y_test))

In [16]:
server_model = server_model_data["Server_1"]['model']
server_model.evaluate(x_test, y_test)

2022-12-04 02:48:17.584442: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2022-12-04 02:48:18.039149: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005




[2.554128885269165, 0.1274999976158142]

In [17]:
comm_rounds, distillation_temperature = 30, 3
recompile_models(client_model_data, client_learning_rate)
recompile_models(server_model_data, server_learning_rate)
federated_distillation(client_model_data, server_model_data, comm_rounds, distillation_temperature, client_selection="all")

Model Re-compilation Complete!
Model Re-compilation Complete!
Communication Round: 1/30
Training: Client_1
Client Fitting:
Server Fitting:
Training: Client_2
Client Fitting:
Server Fitting:
Training: Client_3
Client Fitting:
Server Fitting:
Training: Client_4
Client Fitting:
Server Fitting:
Training: Client_5
Client Fitting:
Server Fitting:
Training: Client_6
Client Fitting:
Server Fitting:
Training: Client_7
Client Fitting:
Server Fitting:
Training: Client_8
Client Fitting:
Server Fitting:
Training: Client_9
Client Fitting:
Server Fitting:
Training: Client_10
Client Fitting:
Server Fitting:
Validating for Communication Round: 1/30
Client_1 Validation Metrics: [0.7756354212760925, 0.706333339214325]
Server Validation Metrics: [2.6222739219665527, 0.08833333104848862]
Client_2 Validation Metrics: [0.24379299581050873, 0.9570000171661377]
Server Validation Metrics: [1.3799638748168945, 0.7726666927337646]
Client_3 Validation Metrics: [0.16354650259017944, 0.9833333492279053]
Server Valid

In [18]:
server_model = server_model_data["Server_1"]['model']
server_model.evaluate(x_test, y_test)



[11.546131134033203, 0.808899998664856]