# Turbofan POC: Training
CAA 23/07/2020

This notebook follows Part 1 which set up the grid infrastructure, and populated the nodes with data.

In this notebook, we will run the training. You should be able to run this notebook on any server which is running a PyGridNetwork, or PyGridNode associated with the PyGridNetwork. 

NOTE: This notebook requires that instances associated with nodes have been sent data using PySyft's .send() method. Additionally, at the time of running this notebook, we were running the following processes.
- PyGridNetwork: server Bob (http://localhost:5000)
- PyGridNode: server Bob (http://localhost:3000)
- PyGridNode: server Alice (http://18.218.13.132:3001)
- This Jupyter Notebook: server Bob (http://localhost:8000)—you should be able to run this notebook on any server which is running a PyGridNetwork, or PyGridNode associated with the PyGridNetwork

## Import dependencies

In [21]:
import syft as sy
from syft.grid.public_grid import PublicGridNetwork
import torch as torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from time import time
from statistics import mean
from pathlib import Path

from turbofanpoc.federated_trainer.helper.trainings_helper import data_result_size, start_federated_training, history
from turbofanpoc.federated_trainer.helper.trainings_helper import get_model_error
from models import *

## Set up network & device

In [22]:
hook = sy.TorchHook(torch)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if(torch.cuda.is_available()):
    torch.set_default_tensor_type(torch.cuda.FloatTensor)



## Set up train configs and helper functions

In [23]:
# Grid config
MODEL_NAME = "fc"
GRID_ADDRESS = 'localhost'
GRID_PORT = '5000'
# Model training config
DATA_TAGS = ("#X", "#turbofan", "#dataset")
LABEL_TAGS = ("#Y", "#turbofan", "#dataset")
WEIGHTS_NAME = "BNFC_datanoise_0.2"
# MODEL_ID = "turbofan"
SAVE_MODEL = True
WEIGHTS_DIR = './saved_weights'
TRAIN_COLS = 11
WINDOW_SIZE = 80
MAX_EPOCHS = 100 # used by Turbofan demo authors
LOAD_MODEL = True
METRICS_INTERVAL = 10
AGGREGATION = 'weight' #{weight, gradients}
# Differential privacy config
DP_TYPE = 'local' #{local, global, layer-wise}

def save_model(model, training_rounds, id=""):
    if not Path(WEIGHTS_DIR).exists():
        Path(WEIGHTS_DIR).mkdir(parents=True)
    """ Save a torch model to disk.

    :param model: Model to save
    """
    torch.save(model.state_dict(), "{}/turbofan_{}_{}.pt".format(WEIGHTS_DIR, training_rounds, id))


def load_initial_model():
    """ Load the model from the initial training from disk.

    :return: The initial model
    """
    return torch.load("{}/turbofan_initial.pt".format(WEIGHTS_DIR))


def load_latest_model():
    """ Load the latest model created during federated learning from disk.

    :return: The latest model
    """
    index = training_rounds - 1
    if index == 0:
        index = "initial"
    return torch.load("{}/turbofan_{}.pt".format(WEIGHTS_DIR, index))

## Search grid for data

In [24]:
tfan_grid = PublicGridNetwork(hook,"http://" + GRID_ADDRESS + ":" + GRID_PORT)

In [25]:
if not DP_TYPE:
    data = tfan_grid.search(*DATA_TAGS)
# for data that has undergone local dp
elif DP_TYPE=='local':
    data = tfan_grid.search(*DATA_TAGS, "#localdp")
else: raise NotImplementedError
target = tfan_grid.search(*LABEL_TAGS)

In [26]:
data = list(data.values())
target = list(target.values())

In [27]:
print(data)

[[(Wrapper)>[PointerTensor | me:91448401662 -> bob:99366173661]
	Tags: #turbofan #dataset #localdp #X 
	Shape: torch.Size([155, 4, 80, 11])
	Description: The input datapoints to the Turbofan dataset....], [(Wrapper)>[PointerTensor | me:44250389676 -> alice:94445306550]
	Tags: #localdp #dataset #X #turbofan 
	Shape: torch.Size([155, 4, 80, 11])
	Description: The input datapoints to the Turbofan dataset....]]


In [28]:
print(target)

[[(Wrapper)>[PointerTensor | me:14654811455 -> bob:26279338334]
	Tags: #Y #dataset #turbofan 
	Shape: torch.Size([155, 4, 1])
	Description: The input labels to the Turbofan dataset....], [(Wrapper)>[PointerTensor | me:69085487007 -> alice:78518394709]
	Tags: #turbofan #dataset #Y 
	Shape: torch.Size([155, 4, 1])
	Description: The input labels to the Turbofan dataset....]]


## Run training

In [29]:
# initialise model
name2model = {
    "mbfc": BatchFCModel(), # modified to use mean of minibatch for normalisation
    "bnormfc": BatchNormFCModel(WINDOW_SIZE, TRAIN_COLS), # modified to use batchnorm for normalisation
}

model = name2model[MODEL_NAME]

model.to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01)
criterion = nn.L1Loss() # used by Turbofan demo authors
# criterion = nn.CrossEntropyLoss()                  
if LOAD_MODEL:
    try:
        print("Loading initial model...")
        model = load_initial_model()
        model.to_device
        print("Done.")
    except: "No initial model found"

def train(max_epochs):
    model.train()
    print("Beginning training...")
    for epoch in range(max_epochs):
        epoch_start_t = time()
        for i in range(len(data)):
            # loop over workers
            worker_start_t = time()
            for j in range(len(data[i])):
                # loop over batches
                worker = data[i][j].location
                model.send(worker)
                it_ts = []
                for k in range(len(data[i][j])):
                    mb_start_t = time()
                    # loop over minibatches
                    mb_data = data[i][j][k]
                    mb_target = target[i][j][k]
                    optimizer.zero_grad()
                    pred = model(mb_data)
                    loss = criterion(pred, mb_target)
                    loss.backward()
                    optimizer.step()
                    it_ts.append(time()-mb_start_t)
                model.get()
                loss = loss.get()

            worker_t = time()-worker_start_t
            if epoch % METRICS_INTERVAL==0 or epoch == MAX_EPOCHS:
                print('Train epoch: {}\t| Worker: {}\t| [{}/{} ({:.0f}%)] | \tLoss: {:.6f}\t| Time: {:.6f}s'.format(epoch, worker.id, epoch, MAX_EPOCHS, 100. *  epoch / MAX_EPOCHS, loss.item(), worker_t)) 
        mean_it_t = mean(it_ts)/len(data[0][0][0])
        if epoch % METRICS_INTERVAL==0:
            epoch_t = time()-epoch_start_t
            print(f'Train epoch: {epoch}\t complete| Time: {epoch_t:.6f}s | Mean iteration time: {mean_it_t:.6f}')
    
train(MAX_EPOCHS)

if SAVE_MODEL==True:
    print(f"Saving model trained with {MAX_EPOCHS} epochs at {WEIGHTS_DIR}...")
    save_model(model, MAX_EPOCHS, WEIGHTS_NAME)

Loading initial model...
Beginning training...
Train epoch: 0	 complete| Time: 20.584469s | Mean iteration time: 0.016175
Train epoch: 10	 complete| Time: 21.500218s | Mean iteration time: 0.016243
Train epoch: 20	 complete| Time: 21.528477s | Mean iteration time: 0.018213
Train epoch: 30	 complete| Time: 21.517339s | Mean iteration time: 0.016535
Train epoch: 40	 complete| Time: 20.706069s | Mean iteration time: 0.016808
Train epoch: 50	 complete| Time: 21.694795s | Mean iteration time: 0.016625
Train epoch: 60	 complete| Time: 20.376769s | Mean iteration time: 0.016508
Train epoch: 70	 complete| Time: 22.155184s | Mean iteration time: 0.018675
Train epoch: 80	 complete| Time: 20.311845s | Mean iteration time: 0.016280
Train epoch: 90	 complete| Time: 21.621624s | Mean iteration time: 0.017373
Saving model trained with 100 epochs at ./saved_weights...
