In [1]:
##%matplotlib widget
## with %matplotlib notebook: seems to require ipympl as part of environment, either
## part of the conda environment or "pip install ipympl"
## otherwise, does not show ANY plots in the notebook, although plt.savefig() works
%matplotlib notebook  
##%matplotlib inline    ## --plt.savefig()  works, but re-sizing does NOT


This notebook is the second attempt to read in track information and use it to predict the KDE used as input to PvFinder. This time, we are reading in poca KDEs rather than the original KDEs. 

collectdata_kde_C.py uses poca_z, poca_x, poca_y, major_axis_x, major_axis_y, and major_axis_z as the six track parameters (for the moment)


Check the current GPU usage. Please try to be nice!

In [2]:
!nvidia-smi

Wed Feb  3 10:00:00 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 440.64.00    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  TITAN V             Off  | 00000000:03:00.0 Off |                  N/A |
| 28%   31C    P8    23W / 250W |      0MiB / 12066MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla P100-PCIE...  Off  | 00000000:83:00.0 Off |                    0 |
| N/A   32C    P0    33W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  TITAN V             Off  | 00000000:84:00.0 Off |                  N/A |
| 22%   

> **WARNING**: The card numbers here are *not* the same as in CUDA. You have been warned.

This notebook is a first attempt to read in track information and use it to predict the KDE used as input to PvFinder.


## Imports

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import time
import torch
import pandas as pd

# Python 3 standard library
from pathlib import Path

##from torchsummary import summary

### Set up local parameters

In [4]:
# Number of epochs handed to trainNet() in the training cell.
n_epochs = 122

# Size of batches
# (for reference: batch_size = 24 ---> 4763MiB / 12066MiB on Titan V)
batch_size = 64
# How fast to learn
learning_rate = 1.5e-5

# Tag for this run: used as the output sub-folder name and as the prefix of
# every checkpoint file written by the training cell.
name = '03Feb_DDplus_loss_Ba_iter25_floatAll_150epochs_1p5em5_4xwill'

# Make an output folder named "name" (change if you want)

## Special instructions for those working on goofy at UC
## Please be very careful to make sure that your folder
## does not live in a subdirectory of your home directory
## this disk has very little capacity. Instead, use
## a subdirectory in /share/lazy with a symbolic link to
## it in this (the notebooks) subdirectory
folder = f'ML_dir/{name}'
output = Path(folder)

Make the output directory if it does not exist:

In [5]:
# Create the run's output directory; exist_ok=True makes this a no-op when it is
# already there. `parents` is not set, so the enclosing 'ML_dir' must already exist.
output.mkdir(exist_ok=True)

## Get the helper functions

Add the directory with the model
definitions to the path so we can import from it:

> When you type `import X`,
Python searches `sys.path` for a python
file named `X.py` to import. So we need to add the model directory to the path.

In [6]:
from model.collectdata_kde_Ellipsoids import collect_t2kde_data

##  It takes 9 input features (poca centers + (A,B,C,D,E,F)) . 
from model.models_kde import TracksToKDE_Ellipsoids_DDplus as Model

# kde_loss_Ba is a Chi2 loss (go and look at it!)
from model.kde_loss_Ba import Loss

from model.training_kde import trainNet, select_gpu, Results
from model.plots import dual_train_plots, replace_in_ax

In [7]:
# This gets built up during the run - do not rerun this cell
# (re-running replaces `results` with an empty frame; the training cell passes
# len(results) as epoch_start and adds one row per result yielded by trainNet).
results = pd.DataFrame([], columns=Results._fields)

Set up Torch device configuration. All tensors and model parameters need to know where to be put.
This takes a BUS ID number: The BUS ID is the same as the listing at the top of this script.

In [8]:
# either 0 or 2 on goofy (i.e. TITAN V) [1 or 2 on sleepy RTX 2080TI]
# select_gpu is imported from model.training_kde; the value it returns is what the
# model is moved to with model.to(device) further down.
device = select_gpu(2)
##device = "cpu"

1 available GPUs (initially using device 0):
  0 TITAN V


## Loading data

Load the dataset, split into parts, then move to device (see `collectdata.py` in the `../model` directory)

# Preparing the model

Prepare a model, use multiple GPUs if they are VISIBLE, and move the model to the device.

In [9]:
##print("Let's use", torch.cuda.device_count(), "GPUs!")
##if torch.cuda.device_count() > 1:
##    model = torch.nn.DataParallel(model)

In [10]:
## A note in the PyTorch docs at https://pytorch.org/docs/stable/optim.html says:
##
##   If you need to move a model to GPU via .cuda(), please do so before constructing
##   optimizers for it. Parameters of a model after .cuda() will be different objects
##   with those before the call.
##
##   In general, you should make sure that optimized parameters live in consistent
##   locations when optimizers are constructed and used.
##
## so the model is built here, ahead of the optimizer (although we are using
## model.to(device) and not explicitly calling .cuda()).

# Eleven size arguments (all 50) plus the latent channel count, passed positionally
# to the model constructor.
nOut1 = nOut2 = nOut3 = nOut4 = nOut5 = nOut6 = 50
nOut7 = nOut8 = nOut9 = nOut10 = nOut11 = 50
latentChannels = 4
model = Model(
    nOut1, nOut2, nOut3, nOut4, nOut5, nOut6,
    nOut7, nOut8, nOut9, nOut10, nOut11,
    latentChannels,
)

##summary(model, input_size=(4, 4000))
##print(model.parameters)

## The loop below lets the user freeze some of the weights, e.g. those taken from an
## earlier model trained with the original target histograms. Presumably this leaves
## either the perturbative filter "fixed" and lets the learning focus on the
## non-perturbative features, so training gets started faster, or vice versa.
## Children whose position is below the threshold in the `if` are frozen; with the
## threshold at 0 nothing is frozen and every child is only printed.
children = list(model.children())
for ct, child in enumerate(children):
    print('ct, child = ',ct, "  ", child)
    if ct < 0:
        print("     About to set param.requires_grad=False for ct = ", ct, "params")
        # fixing parameters
        for frozen_param in child.parameters():
            frozen_param.requires_grad = False
# Leave `ct` equal to the number of children visited.
ct = len(children)

## Earlier loss settings, kept for reference:
##   mds 200121 loss = Loss(epsilon=1e-5,coefficient=1.0)
##   loss = Loss(epsilon=1e-5,coefficient=2.5)
##   loss = Loss(epsilon=3e-5, debug=False)
loss = Loss(epsilon=3e-5)

ct, child =  0    Linear(in_features=9, out_features=50, bias=True)
ct, child =  1    Linear(in_features=50, out_features=50, bias=True)
ct, child =  2    Linear(in_features=50, out_features=50, bias=True)
ct, child =  3    Linear(in_features=50, out_features=50, bias=True)
ct, child =  4    Linear(in_features=50, out_features=50, bias=True)
ct, child =  5    Linear(in_features=50, out_features=50, bias=True)
ct, child =  6    Linear(in_features=50, out_features=50, bias=True)
ct, child =  7    Linear(in_features=50, out_features=50, bias=True)
ct, child =  8    Linear(in_features=50, out_features=50, bias=True)
ct, child =  9    Linear(in_features=50, out_features=50, bias=True)
ct, child =  10    Linear(in_features=50, out_features=50, bias=True)
ct, child =  11    Linear(in_features=50, out_features=16000, bias=True)
ct, child =  12    Conv1d(4, 25, kernel_size=(25,), stride=(1,), padding=(12,))
ct, child =  13    Conv1d(25, 1, kernel_size=(5,), stride=(1,), padding=(2,))
ct, child 

Let's move the model's weight matrices to the GPU:

In [11]:
# Move the model to the device selected above; this happens before the optimizer is
# constructed in the next cell.
model = model.to(device)

In [12]:
# Adam over all model parameters, using learning_rate from the configuration cell.
# Constructed after model.to(device), so it refers to the parameters on the device.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
##optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [13]:
def _print_state_dict_keys(state_dict):
    """Print the position and name of every entry in a state dict (debugging aid)."""
    for index, k in enumerate(state_dict):
        print("index, k =  ",index,"  ",k)


print('output = ',output)
##  Re-use weights from a pre-existing model
##  see example at https://discuss.pytorch.org/t/how-to-load-part-of-pre-trained-model/1113

##   ML -> /share/lazy/sokoloff/ML
model_dict = model.state_dict()
## mds 190725 for debugging
print("for model_dict")
_print_state_dict_keys(model_dict)

## when starting "ab initio", reduce biases as the bias gets summed for each track
## contributing to the predicted KDE
## NOTE: if the checkpoint loaded further down contains these same bias keys (the
## saved output of this cell reports all keys matching), the scaled values set here
## are overwritten by the checkpoint values.
updated_dict = model_dict   # same object as model_dict, not a copy
for layer_number in range(1, 12):
    bias_key = f"layer{layer_number}.bias"
    updated_dict[bias_key] = 0.005*model_dict[bias_key]

model.load_state_dict(updated_dict,strict=False)

# Take a fresh view of the model state after the bias rescaling.
model_dict = model.state_dict()

## finished at training cost = 1.46, validation cost = 1.50
##d_folder = '25December__DDplus_loss_Ba_iter7_floatAll_800epochs_4em6'
d_folder = '03Feb_DDplus_loss_Ba_iter24_floatAll_150epochs_1p5em5_4xwill'
d_name = d_folder
##  for 24Jan_DDplus_loss_Ba_iter17_floatAll_150epochs_1p5em5_4xwill use suffix 87
##  as this is the last epoch that ran before I killed the job (don't ask)
suffix = '27'
dict_name = 'ML_dir/' + d_folder + '/' + d_name + '_'+ suffix + '.pyt'
print('dict_name = ',dict_name)
# map_location re-targets the stored tensors to the device selected for this run.
# Without it torch.load tries to restore every tensor onto the device it was saved
# from, which raises when that CUDA index is not visible in the current session.
# torch.load unpickles the file, so only load checkpoints from a trusted source.
pretrained_dict = torch.load(dict_name, map_location=device)

print(" ")
print("  for pretrained_dict")
_print_state_dict_keys(pretrained_dict)

# 1. filter out unnecessary keys
pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
print("pretrained_dict iterated")
# 2. overwrite entries in the existing state dict
model_dict.update(pretrained_dict)
# 3. load the new state dict
#   need to use strict=False as the two models state model attributes do not agree exactly
#   (e.g. when starting from a model with a fully connected last layer rather than a
#   convolutional layer)
#   see https://pytorch.org/docs/master/_modules/torch/nn/modules/module.html#Module.load_state_dict
#   Kept as the last expression so the notebook displays the missing/unexpected key report.
model.load_state_dict(pretrained_dict,strict=False)



output =  ML_dir/03Feb_DDplus_loss_Ba_iter25_floatAll_150epochs_1p5em5_4xwill
for model_dict
index, k =   0    layer1.weight
index, k =   1    layer1.bias
index, k =   2    layer2.weight
index, k =   3    layer2.bias
index, k =   4    layer3.weight
index, k =   5    layer3.bias
index, k =   6    layer4.weight
index, k =   7    layer4.bias
index, k =   8    layer5.weight
index, k =   9    layer5.bias
index, k =   10    layer6.weight
index, k =   11    layer6.bias
index, k =   12    layer7.weight
index, k =   13    layer7.bias
index, k =   14    layer8.weight
index, k =   15    layer8.bias
index, k =   16    layer9.weight
index, k =   17    layer9.bias
index, k =   18    layer10.weight
index, k =   19    layer10.bias
index, k =   20    layer11.weight
index, k =   21    layer11.bias
index, k =   22    layer12new.weight
index, k =   23    layer12new.bias
index, k =   24    conv1.weight
index, k =   25    conv1.bias
index, k =   26    conv2.weight
index, k =   27    conv2.bias
index, k =   

<All keys matched successfully>

In [14]:
# Default figure size for every plot created after this cell: 10 wide by 4 high
# (matplotlib's figure.figsize units).
fig_size = [10, 4]
plt.rcParams["figure.figsize"] = fig_size

In [15]:
## Training dataset. You can put as many files here as desired.
train_files = [
    ##'/share/lazy/pv-finder/20k_evts_for_KDE_learning_200716.h5',
    '/share/lazy/will/data/June30_2020_80k_1.h5',
    '/share/lazy/will/data/June30_2020_80k_2.h5',
    '/share/lazy/will/data/June30_2020_80k_3.h5',
    '/share/lazy/will/data/June30_2020_80k_4.h5',
    #'/share/lazy/will/data/June30_2020_80k_5.h5',
    #'/share/lazy/will/data/June30_2020_80k_6.h5',
    #'/share/lazy/will/data/June30_2020_80k_7.h5',
]

## if we are using a larger dataset (240K events, with the datasets above, and 11 GB  of GPU memory),
## the dataset will overflow the GPU memory; device=device will allow the data to move back
## and forth between the CPU and GPU memory. While this allows use of a larger dataset, it slows
## down performance by about 10%.  So leave it commented out when not needed.
train_loader = collect_t2kde_data(
    *train_files,
    batch_size=batch_size,
    # device=device,
    ## slice = slice(None,18000)
)

Loading data...
pocaMx.shape =  (80000,)
nEvts =  80000
len(pocaMx[0]) =  199
len(pocaMx[1]) =  25
len(pocaMx[2]) =  369
len(pocaMx[3]) =  143
len(pocaMx[4]) =  160
majorAxis.shape =  (80000, 3)
minorAxis_1.shape =  (80000, 3)
minorAxis_2.shape =  (80000, 3)
have entered six_ellipsoid_parameters
  
 
  nEvts =  80000
 iEvt, nTrks =  0 199
 iEvt, nTrks =  1 25
 iEvt, nTrks =  2 369
 iEvt, nTrks =  3 143
 iEvt, nTrks =  4 160
 iEvt, nTrks =  5 260
 iEvt, nTrks =  6 237
 iEvt, nTrks =  7 327
 iEvt, nTrks =  8 178
 iEvt, nTrks =  9 106
A.shape =  (80000,)
majorAxis[iTrk][0][0] =  0.00045611936
majorAxis[iTrk][1][0] =  -4.8292455e-05
majorAxis[iTrk][2][0] =  0.090019904
minorAxis_1[iTrk][0][0] =  -1.8602173
minorAxis_1[iTrk][1][0] =  -17.569641
minorAxis_1[iTrk][2][0] =  4.7891795e-08
minorAxis_2[iTrk][0][0] =  -17.569414
minorAxis_2[iTrk][1][0] =  1.8601931
minorAxis_2[iTrk][2][0] =  0.0900199
  
majorAxis[iTrk][0][0] =  0.002360258
majorAxis[iTrk][1][0] =  -0.007426616
majorAxis[iTrk][2][

nEvts =  80000
len(pocaMx[0]) =  196
len(pocaMx[1]) =  35
len(pocaMx[2]) =  265
len(pocaMx[3]) =  128
len(pocaMx[4]) =  124
majorAxis.shape =  (80000, 3)
minorAxis_1.shape =  (80000, 3)
minorAxis_2.shape =  (80000, 3)
have entered six_ellipsoid_parameters
  
 
  nEvts =  80000
 iEvt, nTrks =  0 196
 iEvt, nTrks =  1 35
 iEvt, nTrks =  2 265
 iEvt, nTrks =  3 128
 iEvt, nTrks =  4 124
 iEvt, nTrks =  5 122
 iEvt, nTrks =  6 300
 iEvt, nTrks =  7 179
 iEvt, nTrks =  8 243
 iEvt, nTrks =  9 112
A.shape =  (80000,)
majorAxis[iTrk][0][0] =  -2.440992e-05
majorAxis[iTrk][1][0] =  -0.00038087237
majorAxis[iTrk][2][0] =  0.08211532
minorAxis_1[iTrk][0][0] =  -17.631672
minorAxis_1[iTrk][1][0] =  1.130005
minorAxis_1[iTrk][2][0] =  4.7444193e-08
minorAxis_2[iTrk][0][0] =  1.1299927
minorAxis_2[iTrk][1][0] =  17.631481
minorAxis_2[iTrk][2][0] =  0.08211532
  
majorAxis[iTrk][0][0] =  -0.02277395
majorAxis[iTrk][1][0] =  0.0014473597
majorAxis[iTrk][2][0] =  0.6347583
minorAxis_1[iTrk][0][0] =  -

In [16]:
                            
# Validation dataset. You can slice to reduce the size.
## mds no separate validation set yet,
val_file = 'dataAA/20K_POCA_kernel_evts_200926.h5'

## For iter12, change slice(18000,None) to slice(10000,None)
## First, we'll see if this changes the validation cost significantly
##  Second, we will see if this reduces the validation cost fluctuations
## (presumably the slice selects which events of the file are used -- see
##  collect_t2kde_data to confirm)
val_events = slice(10000, None)

val_loader = collect_t2kde_data(
    val_file,
    batch_size=batch_size,
    ## device=device,
    slice=val_events,
)

Loading data...
pocaMx.shape =  (20000,)
nEvts =  20000
len(pocaMx[0]) =  211
len(pocaMx[1]) =  21
len(pocaMx[2]) =  20
len(pocaMx[3]) =  198
len(pocaMx[4]) =  233
majorAxis.shape =  (20000, 3)
minorAxis_1.shape =  (20000, 3)
minorAxis_2.shape =  (20000, 3)
have entered six_ellipsoid_parameters
  
 
  nEvts =  20000
 iEvt, nTrks =  0 211
 iEvt, nTrks =  1 21
 iEvt, nTrks =  2 20
 iEvt, nTrks =  3 198
 iEvt, nTrks =  4 233
 iEvt, nTrks =  5 85
 iEvt, nTrks =  6 223
 iEvt, nTrks =  7 425
 iEvt, nTrks =  8 252
 iEvt, nTrks =  9 169
A.shape =  (20000,)
majorAxis[iTrk][0][0] =  -0.00023452607
majorAxis[iTrk][1][0] =  -0.00047206535
majorAxis[iTrk][2][0] =  0.096502915
minorAxis_1[iTrk][0][0] =  -15.822749
minorAxis_1[iTrk][1][0] =  7.8608756
minorAxis_1[iTrk][2][0] =  -2.6228399e-08
minorAxis_2[iTrk][0][0] =  7.860759
minorAxis_2[iTrk][1][0] =  15.822513
minorAxis_2[iTrk][2][0] =  0.096502915
  
majorAxis[iTrk][0][0] =  -0.37655562
majorAxis[iTrk][1][0] =  0.2768704
majorAxis[iTrk][2][0] = 

In [17]:
# dual_train_plots (from model.plots) returns four objects. The training cell below
# updates lines['train'] and lines['val'] and rescales `ax`; `tax` and `lax` are not
# used in the cells visible here.
ax, tax, lax, lines = dual_train_plots()
# Keep the parent figure so the training loop can redraw it after every epoch.
fig = ax.figure
plt.tight_layout()

<IPython.core.display.Javascript object>

In [18]:
# Train for n_epochs. trainNet yields one result per epoch; each is recorded in
# `results`, the live plot is refreshed, and a checkpoint of the model is written.
# epoch_start=len(results) continues the epoch numbering if `results` already has rows.
for result in trainNet(model, optimizer, loss,
                        train_loader, val_loader,
                        n_epochs, epoch_start=len(results),
                        notebook=True):

    # DataFrame.append was removed in pandas 2.0, so the new row is added with
    # pd.concat. While `results` is still empty the one-row frame simply takes its
    # place, which avoids concatenating with an empty frame.
    new_row = pd.DataFrame([result._asdict()])
    results = new_row if results.empty else pd.concat([results, new_row], ignore_index=True)

    # Update the plot above
    lines['train'].set_data(results.index, results.cost)
    lines['val'].set_data(results.index, results.val)

    # filter first cost epoch (can be really large) when choosing the upper limit
    shown_cost = results.cost if len(results.cost) < 2 else results.cost.iloc[1:]
    max_cost = max(max(shown_cost), max(results.val))
    min_cost = min(min(results.cost), min(results.val))

    # The plot limits need updating too
    ax.set_ylim(min_cost*.9, max_cost*1.1)
    ax.set_xlim(-.5, len(results.cost) - .5)

    # Redraw the figure
    fig.canvas.draw()

    # Save each model state dictionary
    torch.save(model.state_dict(), output / f'{name}_{result.epoch}.pyt')

Number of batches: train = 5000, val = 157


HBox(children=(FloatProgress(value=0.0, description='Epochs', layout=Layout(flex='2'), max=122.0, style=Progre…

Number of batches: train = 5000, val = 157


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 0: train=0.5739, val=0.377646, took 587.89 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 1: train=0.57237, val=0.386246, took 584.92 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 2: train=0.574182, val=0.383462, took 588.3 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 3: train=0.572836, val=0.376427, took 590.49 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 4: train=0.571834, val=0.37371, took 589.64 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 5: train=0.57195, val=0.377025, took 584.26 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 6: train=0.573276, val=0.375288, took 585.09 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 7: train=0.571926, val=0.37564, took 584.19 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 8: train=0.572227, val=0.374846, took 583.5 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 9: train=0.571068, val=0.375269, took 583.36 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 10: train=0.570833, val=0.37818, took 581.85 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 11: train=0.572625, val=0.376369, took 586.25 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 12: train=0.571689, val=0.377972, took 580.94 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 13: train=0.571904, val=0.381363, took 580.11 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 14: train=0.571766, val=0.381715, took 579.02 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 15: train=0.570625, val=0.388296, took 578.65 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 16: train=0.571221, val=0.371114, took 577.33 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 17: train=0.569798, val=0.383431, took 577.49 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 18: train=0.570681, val=0.380477, took 576.79 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 19: train=0.569864, val=0.375454, took 575.56 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 20: train=0.569078, val=0.395357, took 575.15 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 21: train=0.570314, val=0.376807, took 575.63 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 22: train=0.571234, val=0.395515, took 575.41 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 23: train=0.569817, val=0.376967, took 575.95 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 24: train=0.5695, val=0.437767, took 576.0 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 25: train=0.569311, val=0.371086, took 575.76 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 26: train=0.571488, val=0.380088, took 575.58 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 27: train=0.570343, val=0.408946, took 575.92 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 28: train=0.571472, val=0.373849, took 575.82 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 29: train=0.570005, val=0.390363, took 576.02 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 30: train=0.568054, val=0.381006, took 575.85 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 31: train=0.569956, val=0.433496, took 575.87 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 32: train=0.569501, val=0.378646, took 575.73 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 33: train=0.567729, val=0.381779, took 575.64 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 34: train=0.56831, val=0.374606, took 575.67 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 35: train=0.567838, val=0.376399, took 575.7 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 36: train=0.570138, val=0.415356, took 575.43 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 37: train=0.568683, val=0.381577, took 575.82 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 38: train=0.569552, val=0.374522, took 575.29 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 39: train=0.567401, val=0.375444, took 575.37 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 40: train=0.569875, val=0.391437, took 575.59 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 41: train=0.568175, val=0.376666, took 575.59 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 42: train=0.568189, val=0.378732, took 575.31 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 43: train=0.565799, val=0.375086, took 575.41 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 44: train=0.567605, val=0.383811, took 575.72 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 45: train=0.56707, val=0.380292, took 575.59 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 46: train=0.568109, val=0.387784, took 575.45 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 47: train=0.567326, val=0.374611, took 610.31 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 48: train=0.568111, val=0.396628, took 581.5 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 49: train=0.569718, val=0.374327, took 575.59 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 50: train=0.567633, val=0.372454, took 575.71 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 51: train=0.569458, val=0.384011, took 575.66 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 52: train=0.56803, val=0.37736, took 576.96 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 53: train=0.565198, val=0.385195, took 575.98 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 54: train=0.566247, val=0.374039, took 575.83 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 55: train=0.567108, val=0.372364, took 575.16 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 56: train=0.566788, val=0.377277, took 575.87 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 57: train=0.569848, val=0.384694, took 575.84 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 58: train=0.565635, val=0.369129, took 575.31 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 59: train=0.566317, val=0.372379, took 575.9 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 60: train=0.566145, val=0.387998, took 577.0 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 61: train=0.567143, val=0.369811, took 575.34 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 62: train=0.564799, val=0.383745, took 575.51 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 63: train=0.566398, val=0.368856, took 574.96 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 64: train=0.567592, val=0.371389, took 576.27 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 65: train=0.564789, val=0.381882, took 576.24 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 66: train=0.566995, val=0.369313, took 575.24 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 67: train=0.566358, val=0.370081, took 575.93 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 68: train=0.565674, val=0.375789, took 575.83 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 69: train=0.56647, val=0.372264, took 577.92 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 70: train=0.563893, val=0.367522, took 578.52 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 71: train=0.565144, val=0.376148, took 576.94 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 72: train=0.563973, val=0.376759, took 578.48 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 73: train=0.565372, val=0.381448, took 577.16 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 74: train=0.565761, val=0.378457, took 577.12 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 75: train=0.566977, val=0.369698, took 576.77 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 76: train=0.565282, val=0.3714, took 576.93 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 77: train=0.564177, val=0.3842, took 576.79 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 78: train=0.564828, val=0.372288, took 576.6 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 79: train=0.564683, val=0.368265, took 575.76 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 80: train=0.564777, val=0.366936, took 576.53 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 81: train=0.564736, val=0.37259, took 576.01 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 82: train=0.566956, val=0.387898, took 575.71 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 83: train=0.563944, val=0.370193, took 576.68 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 84: train=0.563624, val=0.367628, took 576.01 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 85: train=0.565054, val=0.371785, took 576.34 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 86: train=0.562598, val=0.368572, took 576.73 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 87: train=0.562029, val=0.372985, took 576.25 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 88: train=0.56506, val=0.369607, took 576.76 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 89: train=0.563423, val=0.366996, took 576.52 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 90: train=0.563741, val=0.370627, took 576.54 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 91: train=0.562566, val=0.369136, took 575.65 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 92: train=0.563415, val=0.366772, took 576.19 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 93: train=0.562863, val=0.391674, took 575.9 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 94: train=0.562732, val=0.365546, took 575.43 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 95: train=0.56316, val=0.415436, took 575.86 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 96: train=0.56308, val=0.369468, took 575.58 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 97: train=0.560343, val=0.375722, took 575.76 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 98: train=0.561642, val=0.3675, took 575.81 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 99: train=0.562881, val=0.366869, took 575.93 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 100: train=0.560529, val=0.4133, took 575.86 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 101: train=0.562302, val=0.37439, took 575.42 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 102: train=0.563341, val=0.367884, took 575.97 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 103: train=0.562095, val=0.38707, took 575.06 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 104: train=0.562257, val=0.385527, took 574.7 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 105: train=0.562574, val=0.367081, took 575.64 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 106: train=0.562267, val=0.364919, took 576.09 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 107: train=0.560605, val=0.390393, took 575.49 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 108: train=0.562256, val=0.3732, took 575.62 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 109: train=0.562087, val=0.368877, took 574.89 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 110: train=0.563044, val=0.39098, took 575.45 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 111: train=0.563206, val=0.364142, took 576.02 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 112: train=0.561351, val=0.458562, took 575.39 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 113: train=0.560665, val=0.363895, took 573.43 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 114: train=0.560443, val=0.374306, took 575.66 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 115: train=0.560887, val=0.369774, took 575.61 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 116: train=0.559349, val=0.377988, took 575.73 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 117: train=0.560068, val=0.36714, took 575.72 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 118: train=0.560876, val=0.409762, took 576.28 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 119: train=0.56129, val=0.36651, took 576.1 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 120: train=0.560839, val=0.36777, took 576.67 s


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 121: train=0.559106, val=0.38583, took 576.34 s



Go ahead and save the final model (even though it was also saved above):

In [19]:
# Persist the model's state_dict (not the whole model object) as the
# "final" checkpoint file inside the output directory.
final_weights_path = output / f'{name}_final.pyt'
torch.save(model.state_dict(), final_weights_path)

Save the output results:

In [20]:
# Write the `results` table to an HDF5 file in the output directory, stored
# under the key 'results'.
# The key is passed by keyword: recent pandas releases deprecate positional
# arguments to `to_hdf` other than the path, and the keyword form works on
# older releases as well.
results.to_hdf(output / f'{name}_stats.hdf5', key='results')

Redraw the training plot shown above and save it to a PNG file:

In [21]:
# Redraw the training history from the `cost` and `val` columns of `results`.
# The same (cost, val) pair is supplied for both pairs of series arguments.
# NOTE(review): presumably each pair feeds one panel of the figure — confirm
# against the definition of dual_train_plots.
dual_train_plots(
    results.index,
    results.cost, results.val,
    results.cost, results.val,
)
plt.tight_layout()

# savefig receives a plain string rather than a Path object.
stats_plot_path = output / f'{name}_stats_a.png'
plt.savefig(str(stats_plot_path))

<IPython.core.display.Javascript object>

In [22]:
# End the session here. In a Jupyter/IPython kernel, quit() shuts the kernel
# down, so all in-memory state is discarded and nothing after this cell runs.
# NOTE(review): presumably intended to release the GPU once the model, stats
# and plot have been written to disk — confirm.
quit()