# Turbofan POC: Training
CAA 05/08/2020

This notebook follows `distribute_dataset.ipynb`, which sets up the grid infrastructure and populates the nodes with data.

In this notebook, we will run the training. You should be able to run this notebook on any server that is running the PyGridNetwork, or a PyGridNode associated with that PyGridNetwork.

NOTE: This notebook requires that instances associated with nodes have been sent data using PySyft's .send() method. Additionally, at the time of running this notebook, we were running the following processes.
- PyGridNetwork: server Bob (http://localhost:5000)
- PyGridNode: server Bob (http://localhost:3000)
- PyGridNode: server Alice (http://18.218.13.132:3001)
- This Jupyter Notebook: server Bob (http://localhost:8000). As noted above, the notebook can be run on any server that is running the PyGridNetwork or a PyGridNode associated with it.

## Import dependencies

In [1]:
import syft as sy
from syft.grid.public_grid import PublicGridNetwork
import torch as torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from time import time
from statistics import mean
from pathlib import Path

from turbofanpoc.federated_trainer.helper.trainings_helper import data_result_size, start_federated_training, history
from turbofanpoc.federated_trainer.helper.trainings_helper import get_model_error
from models import *

Falling back to insecure randomness since the required custom op could not be found for the installed version of TensorFlow. Fix this by compiling custom ops. Missing file was '/home/ubuntu/anaconda3/envs/pysyft27/lib/python3.7/site-packages/tf_encrypted/operations/secure_random/secure_random_module_tf_1.15.3.so'



## Set up network & device

In [2]:
# Hook PySyft into torch so tensors gain the remote-execution methods used below.
hook = sy.TorchHook(torch)

# Prefer the first CUDA device when one is available, otherwise stay on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
if device.type == "cuda":
    # Make newly created tensors CUDA float tensors by default.
    torch.set_default_tensor_type(torch.cuda.FloatTensor)

## Set up train configs and helper functions

In [3]:
def save_model(model, training_rounds, id=""):
    """ Save a torch model's ``state_dict`` to disk under ``WEIGHTS_DIR``.

    The target directory (including missing parents) is created on demand.
    The file is written as ``<WEIGHTS_DIR>/turbofan_<training_rounds>_<id>.pt``.

    :param model: Model to save; only ``model.state_dict()`` is serialised
    :param training_rounds: Counter value embedded in the file name
    :param id: Optional suffix embedded in the file name
    """
    # exist_ok=True replaces the separate exists() check and its
    # check-then-create race.
    Path(WEIGHTS_DIR).mkdir(parents=True, exist_ok=True)
    torch.save(model.state_dict(), "{}/turbofan_{}_{}.pt".format(WEIGHTS_DIR, training_rounds, id))


def load_initial_model():
    """ Read the checkpoint written by the initial training run.

    :return: Whatever ``torch.load`` deserialises from
             ``<WEIGHTS_DIR>/turbofan_initial.pt``
    """
    initial_checkpoint = "{}/turbofan_initial.pt".format(WEIGHTS_DIR)
    return torch.load(initial_checkpoint)


def load_latest_model(training_rounds=None):
    """ Load the latest model created during federated learning from disk.

    The checkpoint read is ``<WEIGHTS_DIR>/turbofan_<training_rounds - 1>.pt``,
    or ``<WEIGHTS_DIR>/turbofan_initial.pt`` when ``training_rounds - 1`` is 0.

    NOTE(review): ``save_model`` in this notebook writes
    ``turbofan_<rounds>_<id>.pt``; the un-suffixed file name read here must be
    produced elsewhere -- confirm.

    :param training_rounds: Current training round. When omitted, a
        notebook-level global named ``training_rounds`` is used instead.
    :return: The latest model
    :raises NameError: If ``training_rounds`` is neither passed nor defined
        as a global
    """
    if training_rounds is None:
        # Keep the original calling convention (no arguments, global counter)
        # working, but fail with an explicit message when the global is absent.
        try:
            training_rounds = globals()["training_rounds"]
        except KeyError:
            raise NameError(
                "training_rounds is not defined; pass it to load_latest_model()"
            ) from None
    index = training_rounds - 1
    if index == 0:
        index = "initial"
    return torch.load("{}/turbofan_{}.pt".format(WEIGHTS_DIR, index))

In [4]:
# Grid config
TRAIN_COLS = 11
WINDOW_SIZE = 80
GRID_ADDRESS = "localhost"
GRID_PORT = "5000"

# Model training config
MODEL_NAME = "bnormfc"  # key into name2model below
DATA_TAGS = ("#X", "#turbofan", "#dataset")
LABEL_TAGS = ("#Y", "#turbofan", "#dataset")
# MODEL_ID = "turbofan"
LOAD_MODEL = ""  # {'', 'model_name'}
SAVE_MODEL = True
SAVE_INTERVAL = 50  # a checkpoint is written when epoch % SAVE_INTERVAL == 0
WEIGHTS_DIR = "./saved_weights"
MAX_EPOCHS = 100  # used by Turbofan demo authors
METRICS_INTERVAL = 10  # metrics are printed when epoch % METRICS_INTERVAL == 0
AGGREGATION = ""  # {'', 'weight', 'gradients'}

# Differential privacy config
# Listed options: {data, model, layer-wise, ''}
# NOTE(review): 'datadp' is not one of the listed options -- confirm.
DP_TYPE = "datadp"
WEIGHTS_SAVE_NAME = "_".join((MODEL_NAME, DP_TYPE, AGGREGATION))

# Every architecture is instantiated here; MODEL_NAME picks the one to train.
name2model = dict(
    bnormfc=BatchNormFCModel(WINDOW_SIZE, TRAIN_COLS),  # modified to use batchnorm for normalisation
    vanillalstm=VanillaLSTM(WINDOW_SIZE, TRAIN_COLS),
    vanillagru=VanillaGRU(WINDOW_SIZE, TRAIN_COLS),
)

## Search grid for data

In [5]:
# Connect to the PyGridNetwork gateway at the address and port configured above.
tfan_grid = PublicGridNetwork(hook, "http://{}:{}".format(GRID_ADDRESS, GRID_PORT))

In [6]:
# Search the grid once per tag set: first the inputs, then the labels.
data, target = (tfan_grid.search(*tags) for tags in (DATA_TAGS, LABEL_TAGS))

In [7]:
# Drop the keys of the search results and keep only the values as lists
# (presumably one entry per node -- confirm against the search() return type).
# Guarded so the cell is idempotent: after the first run `data` and `target`
# are already lists, which have no .values().
if hasattr(data, "values"):
    data = list(data.values())
if hasattr(target, "values"):
    target = list(target.values())

In [8]:
# Inspect the input pointers found on the grid (tags, shape, description).
print(data)

[[(Wrapper)>[PointerTensor | me:35916416408 -> bob:8684701164]
	Tags: #dataset #turbofan #datadp #X 
	Shape: torch.Size([1423, 4, 80, 11])
	Description: The input datapoints to the Turbofan dataset....], [(Wrapper)>[PointerTensor | me:74396203589 -> alice:89148676862]
	Tags: #dataset #turbofan #datadp #X 
	Shape: torch.Size([1423, 4, 80, 11])
	Description: The input datapoints to the Turbofan dataset....]]


In [9]:
# Inspect the label pointers found on the grid (tags, shape, description).
print(target)

[[(Wrapper)>[PointerTensor | me:94744671562 -> bob:52681149186]
	Tags: #dataset #turbofan #Y 
	Shape: torch.Size([1423, 4, 1])
	Description: The input labels to the Turbofan dataset....], [(Wrapper)>[PointerTensor | me:96620240690 -> alice:45053299857]
	Tags: #dataset #turbofan #Y 
	Shape: torch.Size([1423, 4, 1])
	Description: The input labels to the Turbofan dataset....]]


## Run training

In [10]:
# initialise model
model = name2model[MODEL_NAME]
criterion = nn.L1Loss() # used by Turbofan demo authors
# criterion = nn.CrossEntropyLoss()

_loaded_checkpoint = None
if LOAD_MODEL:
    print("Loading initial model...")
    try:
        _loaded_checkpoint = load_initial_model()
    except FileNotFoundError:
        # Only a missing checkpoint is tolerated; any other error propagates
        # instead of being silently swallowed.
        print("No initial model found")

if _loaded_checkpoint is None:
    # Nothing was requested or nothing was found: start from fresh weights.
    print("Initialising model...")
    model.apply(init_weights)
elif isinstance(_loaded_checkpoint, nn.Module):
    # The checkpoint holds a whole pickled model.
    model = _loaded_checkpoint
    print("Done.")
else:
    # The checkpoint holds a state_dict (the format save_model writes).
    model.load_state_dict(_loaded_checkpoint)
    print("Done.")

model.to(device)
# Build the optimizer last so it tracks the parameters of the model that will
# actually be trained, including a model replaced by the loaded one.
optimizer = optim.SGD(model.parameters(), lr=0.01)

Initialising model...


In [13]:
def train(max_epochs):
    """ Train the notebook-level ``model`` on every worker's remote data.

    In each epoch the model is sent to the worker holding each remote batch,
    updated there minibatch by minibatch, and fetched back before the next
    batch. Metrics are printed every ``METRICS_INTERVAL`` epochs and on the
    final epoch. When ``SAVE_MODEL`` is set, a checkpoint is written every
    ``SAVE_INTERVAL`` epochs and on the final epoch.

    Reads the notebook globals ``model``, ``optimizer``, ``criterion``,
    ``data``, ``target``, ``METRICS_INTERVAL``, ``SAVE_MODEL``,
    ``SAVE_INTERVAL``, ``WEIGHTS_DIR`` and ``WEIGHTS_SAVE_NAME``.

    :param max_epochs: Number of epochs to run
    """
    model.train()
    print("Beginning training...")
    # range(max_epochs) ends at max_epochs - 1, so that is the final epoch.
    final_epoch = max_epochs - 1
    for epoch in range(max_epochs):
        epoch_start_t = time()
        report_metrics = epoch % METRICS_INTERVAL == 0 or epoch == final_epoch
        # Minibatch wall-clock times, collected over the whole epoch.
        it_ts = []
        # loop over workers
        for i in range(len(data)):
            worker_start_t = time()
            worker = None
            loss = None
            # loop over batches
            for j in range(len(data[i])):
                n_minibatches = len(data[i][j])
                if n_minibatches == 0:
                    # Nothing to train on; skip the send/get round trip.
                    continue
                worker = data[i][j].location
                model.send(worker)
                # loop over minibatches
                for k in range(n_minibatches):
                    mb_start_t = time()
                    mb_data = data[i][j][k]
                    mb_target = target[i][j][k]
                    optimizer.zero_grad()
                    pred = model(mb_data)
                    loss = criterion(pred, mb_target)
                    loss.backward()
                    optimizer.step()
                    it_ts.append(time() - mb_start_t)
                # Fetch the last prediction as the original did; the value
                # itself is unused.
                pred.get()
                model.get()
                loss = loss.get()

            worker_t = time() - worker_start_t
            if report_metrics and loss is not None:
                print('Train epoch: {}\t| Worker: {}\t| [{}/{} ({:.0f}%)] | \tLoss: {:.6f}\t| Time: {:.6f}s'.format(epoch, worker.id, epoch, max_epochs, 100. * epoch / max_epochs, loss.item(), worker_t))
        if report_metrics:
            epoch_t = time() - epoch_start_t
            if it_ts:
                # Normalised by the length of the first minibatch
                # (presumably samples per minibatch -- confirm).
                mean_it_t = mean(it_ts) / len(data[0][0][0])
            else:
                mean_it_t = float("nan")
            print(f'Train epoch: {epoch}\t complete| Time: {epoch_t:.6f}s | Mean iteration time: {mean_it_t:.6f}')
        if SAVE_MODEL and (epoch % SAVE_INTERVAL == 0 or epoch == final_epoch):
            print(f"Saving model trained with {epoch} epochs at {WEIGHTS_DIR}...")
            save_model(model, epoch, WEIGHTS_SAVE_NAME)

In [14]:
# Run training for the configured number of epochs.
train(MAX_EPOCHS)

Beginning training...
Train epoch: 0	 complete| Time: 194.497076s | Mean iteration time: 0.016359
Saving model trained with 0 epochs at ./saved_weights...


KeyboardInterrupt: 