# Turbofan POC Part 2: Training
CAA 23/07/2020

This notebook follows Part 1 which set up the grid infrastructure, and populated the nodes with data.

In this notebook, we will run the training. You should be able to run this notebook on any server which is running a PyGridNetwork, or PyGridNode associated with the PyGridNetwork. 

NOTE: This notebook requires that instances associated with nodes have been sent data using PySyft's .send() method. Additionally, at the time of running this notebook, we were running the following processes.
- PyGridNetwork: server Bob (http://localhost:5000)
- PyGridNode: server Bob (http://localhost:3000)
- PyGridNode: server Alice (http://18.218.13.132:3001)
- This Jupyter Notebook: server Bob (http://localhost:8000)—you should be able to run this notebook on any server which is running a PyGridNetwork, or PyGridNode associated with the PyGridNetwork

## Import dependencies

In [14]:
import syft as sy
from syft.grid.public_grid import PublicGridNetwork
import torch as torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from time import time
from statistics import mean
from pathlib import Path

from turbofanpoc.federated_trainer.helper.trainings_helper import data_result_size, start_federated_training, history
from turbofanpoc.federated_trainer.helper.trainings_helper import get_model_error
from models import *

## Set up network & device

In [4]:
hook = sy.TorchHook(torch)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if(torch.cuda.is_available()):
    torch.set_default_tensor_type(torch.cuda.FloatTensor)

## Set up train configs and helper functions

In [None]:
TRAIN_ID = "BNFC"

# Grid config
GRID_ADDRESS = 'localhost'
GRID_PORT = '5000'
# Model training config
DATA_TAGS = ("#X", "#turbofan", "#dataset")
LABEL_TAGS = ("#Y", "#turbofan", "#dataset")
# MODEL_ID = "turbofan"
SAVE_MODEL = True
MODEL_DIR = './models'
SENSOR_MEAN = torch.tensor(5000) # for normalising data like Turbofan POC
SENSOR_STD = torch.tensor(50) # for normalising data like Turbofan POC
TRAIN_COLS = 11
WINDOW_SIZE = 80
MAX_EPOCHS = 100 # used by Turbofan demo authors
LOAD_MODEL = True
METRICS_INTERVAL = 5
AGGREGATION = 'weight' #{weight, gradients}
# Differential privacy config
DP_TYPE = 'local' #{local, global, layer-wise}

def save_model(model, training_rounds, id=""):
    if not Path(MODEL_DIR).exists():
        Path(MODEL_DIR).mkdir(parents=True)
    """ Save a torch model to disk.

    :param model: Model to save
    """
    torch.save(model.state_dict(), "{}/turbofan_{}_{}.pt".format(MODEL_DIR, training_rounds, id))


def load_initial_model():
    """ Load the model from the initial training from disk.

    :return: The initial model
    """
    return torch.load("{}/turbofan_initial.pt".format(MODEL_DIR))


def load_latest_model():
    """ Load the latest model created during federated learning from disk.

    :return: The latest model
    """
    index = training_rounds - 1
    if index == 0:
        index = "initial"
    return torch.load("{}/turbofan_{}.pt".format(MODEL_DIR, index))

## Search grid for data

In [5]:
tfan_grid = PublicGridNetwork(hook,"http://" + GRID_ADDRESS + ":" + GRID_PORT)

In [6]:
if not DP_TYPE:
    data = tfan_grid.search(*DATA_TAGS)
# for data that has undergone local dp
elif DP_TYPE=='local':
    data = tfan_grid.search(*DATA_TAGS)
else: raise NotImplementedError
target = tfan_grid.search(*LABEL_TAGS)

In [7]:
data = list(data.values())
target = list(target.values())

In [8]:
print(data)

[[(Wrapper)>[PointerTensor | me:74986335599 -> bob:42608560662]
	Tags: #localdp #dataset #X #turbofan 
	Shape: torch.Size([155, 4, 80, 11])
	Description: The input datapoints to the Turbofan dataset....], [(Wrapper)>[PointerTensor | me:34730639663 -> alice:7948937110]
	Tags: #localdp #dataset #X #turbofan 
	Shape: torch.Size([155, 4, 80, 11])
	Description: The input datapoints to the Turbofan dataset....]]


In [9]:
print(target)

[[(Wrapper)>[PointerTensor | me:429459714 -> bob:25910322289]
	Tags: #turbofan #dataset #Y 
	Shape: torch.Size([155, 4, 1])
	Description: The input labels to the Turbofan dataset....], [(Wrapper)>[PointerTensor | me:77557575669 -> alice:66802965416]
	Tags: #turbofan #dataset #Y 
	Shape: torch.Size([155, 4, 1])
	Description: The input labels to the Turbofan dataset....]]


## Run training

In [15]:
# model = TurbofanModel(SENSOR_MEAN, SENSOR_STD, TRAIN_COLS)
model = BatchNormFCModel(WINDOW_SIZE, TRAIN_COLS)
model.to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01)
criterion = nn.L1Loss() # used by Turbofan demo authors
# criterion = nn.CrossEntropyLoss()                  
if LOAD_MODEL:
    try:
        print("Loading initial model...")
        model = load_initial_model()
        model.to_device
        print("Done.")
    except: "No initial model found"

def train(max_epochs):
    model.train()
    print("Beginning training...")
    for epoch in range(max_epochs):
        epoch_start_t = time()
        for i in range(len(data)):
            # loop over workers
            worker_start_t = time()
            for j in range(len(data[i])):
                # loop over batches
                worker = data[i][j].location
                model.send(worker)
                it_ts = []
                for k in range(len(data[i][j])):
                    mb_start_t = time()
                    # loop over minibatches
                    mb_data = data[i][j][k]
                    mb_target = target[i][j][k]
                    optimizer.zero_grad()
                    pred = model(mb_data)
                    loss = criterion(pred, mb_target)
                    loss.backward()
                    optimizer.step()
                    it_ts.append(time()-mb_start_t)
                model.get()
                loss = loss.get()

            worker_t = time()-worker_start_t
            if epoch % METRICS_INTERVAL==0 or epoch == MAX_EPOCHS:
                print('Train epoch: {}\t| Worker: {}\t| [{}/{} ({:.0f}%)] | \tLoss: {:.6f}\t| Time: {:.6f}s'.format(epoch, worker.id, epoch, MAX_EPOCHS, 100. *  epoch / MAX_EPOCHS, loss.item(), worker_t)) 
        mean_it_t = mean(it_ts)/len(data[0][0][0])
        if epoch % METRICS_INTERVAL==0:
            epoch_t = time()-epoch_start_t
            print(f'Train epoch: {epoch}\t complete| Time: {epoch_t:.6f}s | Mean iteration time: {mean_it_t:.6f}')
    
train(MAX_EPOCHS)

if SAVE_MODEL==True:
    print(f"Saving model trained with {MAX_EPOCHS} epochs at {MODEL_DIR}...")
    save_model(model, MAX_EPOCHS, TRAIN_ID)

Loading initial model...
Beginning training...
Train epoch: 0	 complete| Time: 23.577961s | Mean iteration time: 0.017529
Train epoch: 5	 complete| Time: 20.610154s | Mean iteration time: 0.016597
Train epoch: 10	 complete| Time: 20.630351s | Mean iteration time: 0.016564
Train epoch: 15	 complete| Time: 22.252657s | Mean iteration time: 0.017162
Train epoch: 20	 complete| Time: 20.138241s | Mean iteration time: 0.016108
Train epoch: 25	 complete| Time: 20.099049s | Mean iteration time: 0.016281
Train epoch: 30	 complete| Time: 21.003189s | Mean iteration time: 0.016253
Train epoch: 35	 complete| Time: 21.968054s | Mean iteration time: 0.017521
Train epoch: 40	 complete| Time: 21.661668s | Mean iteration time: 0.017513
Train epoch: 45	 complete| Time: 20.210776s | Mean iteration time: 0.016148
Train epoch: 50	 complete| Time: 20.302402s | Mean iteration time: 0.016266
Train epoch: 55	 complete| Time: 20.719127s | Mean iteration time: 0.016430
Train epoch: 60	 complete| Time: 20.306869s