# Training instrumentation
This tutorial explains how to instrument a training loop and save checkpoints in the format required by our tool. We use a ResNet-18 trained on CIFAR-10 as an example.

## Import

In [None]:
import os, sys
import json
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

## Define summary writer

In [None]:
# >>>>>>>>>> Define summary writer
# Add the parent directory to the import path (relative to the current working
# directory), presumably so that the `writer` package below can be imported.
sys.path.append("..")
from writer.summary_writer import SummaryWriter
# Directory handed to SummaryWriter; config.json is also written into it later
# in this tutorial. Replace the placeholder with a real path.
log_dir = "path/to/content" # User define
writer = SummaryWriter(log_dir)
# <<<<<<<<<< Define summary writer

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Data
print('==> Preparing data..')

# Per-channel normalisation constants shared by the train and test pipelines.
norm_mean = (0.4914, 0.4822, 0.4465)
norm_std = (0.2023, 0.1994, 0.2010)

# Training pipeline: random crop and horizontal flip before tensor conversion.
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
])

# Test pipeline: tensor conversion and normalisation only (no random ops).
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
])


def _cifar10_split(train, transform, batch_size, shuffle):
    """Return a CIFAR10 dataset stored under ./data and a 2-worker DataLoader over it."""
    dataset = torchvision.datasets.CIFAR10(
        root='./data', train=train, download=True, transform=transform)
    loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle, num_workers=2)
    return dataset, loader


# Loader used for optimisation: augmented and shuffled.
trainset, trainloader = _cifar10_split(
    train=True, transform=transform_train, batch_size=128, shuffle=True)

# record train data using test transform to avoid randomness
record_trainset, record_trainloader = _cifar10_split(
    train=True, transform=transform_test, batch_size=128, shuffle=False)

testset, testloader = _cifar10_split(
    train=False, transform=transform_test, batch_size=100, shuffle=False)

classes = ('plane', 'car', 'bird', 'cat', 'deer',
           'dog', 'frog', 'horse', 'ship', 'truck')

## Record dataset

In [None]:
# >>>>>>>>>>Record Data
# Hand both loaders to the writer. The training loader passed here is the
# unshuffled one built with transform_test, so the recorded training samples
# carry no random augmentation.
writer.add_training_data(record_trainloader) # use test_transform
writer.add_testing_data(testloader)
# <<<<<<<<<<Record Data

## Define model
The model needs to meet the following requirements.

1. It must provide `feature` and `prediction` methods:
```python
net = ResNet18()
net.feature() # (N,...)->(N,M), output a 2 dimensional feature, N samples with feature length of M
net.prediction() # (N, M)->(N, C), C-class classification problem, output logits (the layer before softmax or log-softmax)
```
2. Put the model definition in "model.py" under the folder "CONTENT_PATH/Model".
3. The name of the model must be recorded in the config.
>For example, in our case, config["TRAINING"]["NET"] == "ResNet18"

In [None]:
# ==> Building model..
# NOTE(review): ResNet18 is neither imported nor defined in the cells shown
# here; import it from your model definition before running this cell.
net = ResNet18()    # choose your own model

In [None]:
# Move the model to the selected device, then build the loss and an SGD
# optimizer over the model's parameters.
net = net.to(device)
criterion = nn.CrossEntropyLoss()
# Fixed hyper-parameters: lr=0.01, momentum=0.9, weight_decay=5e-4.
optimizer = optim.SGD(net.parameters(), lr=0.01,
                      momentum=0.9, weight_decay=5e-4)

# Training
def train():
    """Run one pass over `trainloader`, updating `net` in place.

    Switches the network to training mode; then, for each mini-batch, moves
    the inputs and targets to `device`, evaluates `criterion` on the network
    output, back-propagates and takes one `optimizer` step. Returns nothing.
    """
    net.train()
    for batch_inputs, batch_targets in trainloader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)
        # Clear gradients left over from the previous step before the new backward pass.
        optimizer.zero_grad()
        batch_outputs = net(batch_inputs)
        batch_loss = criterion(batch_outputs, batch_targets)
        batch_loss.backward()
        optimizer.step()

## Record checkpoint

In [None]:
# Epoch id passed to the writer as the predecessor of the next checkpoint.
# Starts at 0 because nothing has been recorded yet.
prev_id = 0
# Indices of every sample in the training set; recorded with each checkpoint.
idxs = list(range(len(trainset)))
# Trains for epochs 1..199; checkpoints are written at 10, 20, ..., 190.
for epoch in range(1,200,1):
    train()
    if epoch % 10 == 0:
        # >>>>>>>>>>record checkpoint for every 10 epochs
        writer.add_checkpoint_data(net.state_dict(), idxs, epoch, prev_id)
        # <<<<<<<<<<record checkpoint for every 10 epochs
        # Advance prev_id only when a checkpoint was actually written, so the
        # next call refers to a recorded epoch (10, 20, ...) rather than to
        # epoch - 1, which was never saved.
        prev_id = epoch

## Record config

In [None]:
# >>>>>>>>>> Record Config
# Settings for the tool; written to config.json and passed to the
# visualization strategy further down in this tutorial.
config_dict = {
    "SETTING": "normal",
    "CLASSES": classes, 
    "GPU":"1",
    "DATASET": "cifar10",
    # The training loop in this tutorial records checkpoints at epochs
    # 10, 20, ..., 190, so the epoch range describes exactly those.
    # NOTE(review): assumes EPOCH_START/END/PERIOD enumerate the recorded
    # checkpoint epochs - confirm against the tool's loader.
    "EPOCH_START": 10,
    "EPOCH_END": 190,
    "EPOCH_PERIOD": 10,
    "TRAINING": {
        "NET": "ResNet18", # name it after your net
        "num_class": 10,
        # CIFAR-10 ships 50000 training and 10000 test images.
        "train_num": 50000,
        "test_num": 10000,
    },
    "VISUALIZATION":{
        "PREPROCESS":1,
        "BOUNDARY":{
            "B_N_EPOCHS": 0,
            "L_BOUND":0.5,
        },
        "INIT_NUM": 300,
        "ALPHA":1,
        "BETA":1,
        "MAX_HAUSDORFF":0.33,
        "LAMBDA": 1,
        "S_LAMBDA": 1,
        "ENCODER_DIMS":[512,256,256,256,2],
        "DECODER_DIMS":[2,256,256,256,512],
        "N_NEIGHBORS":15,
        "MAX_EPOCH": 20,
        "S_N_EPOCHS": 5,
        "T_N_EPOCHS": 20,
        "PATIENT": 3,
        "RESOLUTION":300,
        "VIS_MODEL_NAME": "DeepDebugger",
        "EVALUATION_NAME": "test_evaluation_DeepDebugger"
    }
}
# <<<<<<<<<< Record Config

In [None]:
# save config
# Nest the settings under the "DeepDebugger" key and serialise them to
# config.json inside log_dir.
config = {"DeepDebugger": config_dict}
config_path = os.path.join(log_dir, "config.json")
with open(config_path, "w") as f:
    json.dump(config, f)


## Visualize embedding

In [None]:
# >>>>>>>>>> Choose a visualization method to visualize embedding
# NOTE(review): `Strategy` must be importable from the current sys.path -
# confirm where this module lives relative to the notebook.
from Strategy import DeepDebugger
# The strategy receives the inner settings dict (config_dict), not the
# {"DeepDebugger": ...} wrapper that is written to config.json.
dd = DeepDebugger(config_dict)
dd.visualize_embedding()
# <<<<<<<<<< Choose a visualization method to visualize embedding

# Next Step
Start the server and frontend to interact with our tool. See tutorial/2-start-services.