# Model Training
>  This notebook contains the training phase of the neural networks for all datasets. To run the training, we recommend using the Python script rather than the notebook, since a full run may take ~20 hours.
```
python training.py --log=(all|binet|pdc20|pdc21)
```

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#PDC-2020" data-toc-modified-id="PDC-2020-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>PDC 2020</a></span></li><li><span><a href="#PDC-2021" data-toc-modified-id="PDC-2021-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>PDC 2021</a></span></li><li><span><a href="#Binet-datasets" data-toc-modified-id="Binet-datasets-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Binet datasets</a></span></li><li><span><a href="#Shell-Util" data-toc-modified-id="Shell-Util-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Shell Util</a></span></li></ul></div>

In [None]:
#default_exp training

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#export

from dapnn.imports import *
from dapnn.data_processing import *
from dapnn.anomaly import *


In [None]:
#export
import warnings
# The 'once' action reports only the first occurrence of each matching
# warning, so repeated warnings do not flood the output.
warnings.filterwarnings(action='once')

In [None]:
# Convert this notebook via `notebook2script` (the cell output reports
# "Converted 04_training.ipynb."). Presumably this writes the `#export`
# cells to the module named by `#default_exp` -- confirm against nbdev.
notebook2script('04_training.ipynb')

Converted 04_training.ipynb.


## PDC 2020

In [None]:
#export
def train(fn,log_name,store_path='models',epoch=25,ws=5):
    """Train a `MultivariateModel` on one log and persist its vocabulary.

    The log is loaded with `import_log`, turned into dataloaders by
    `training_dl`, and the `categorify` object returned by `training_dl` is
    pickled to `<store_path>/<log_name>_vocab.p` before training starts.

    Args:
        fn: Path of the log file; passed to `get_attr` and `import_log`.
        log_name: Name used for the vocabulary pickle and forwarded to
            `train_validate` as `model_name`.
        store_path: Directory receiving the vocabulary pickle; also forwarded
            to `train_validate`. It is created if it does not exist yet.
        epoch: Forwarded to `train_validate` as `epoch`.
        ws: Forwarded to `training_dl` as `ws` (presumably the window size --
            confirm against `training_dl`).

    Returns:
        Whatever `train_validate` returns (previously this value was
        discarded and the function returned `None`).
    """
    import os

    cols = get_attr(attr_dict, fn)
    log = import_log(fn, cols)
    o, dls, categorify = training_dl(log, cols, ws=ws)

    vocab_path = f'{store_path}/{log_name}_vocab.p'
    # Opening the pickle for writing fails when the target directory is
    # missing (e.g. on a fresh checkout), so create it up front.
    os.makedirs(os.path.dirname(vocab_path), exist_ok=True)
    with open(vocab_path, "wb") as output_file:
        pickle.dump(categorify, output_file)

    emb_szs = get_emb_sz(o)
    m = MultivariateModel(emb_szs)
    loss = partial(multi_loss_sum, o)
    return train_validate(
        dls,
        m,
        loss=loss,
        metrics=get_metrics(o),
        epoch=epoch,
        show_plot=False,
        print_output=False,
        store_path=store_path,
        model_name=log_name,
    )
    

In [None]:
#export
def train_pdc20_logs():
    """Train one model per file found in `data/csv/PDC2020_training/`.

    The log name is the text after the last underscore of the path portion
    that precedes the first dot; models are stored under `models/pdc2020`.
    """
    target_dir = 'models/pdc2020'
    csv_files = glob.glob('data/csv/PDC2020_training/*')
    for csv_file in progress_bar(csv_files):
        # Everything before the first '.', then the piece after the last '_'.
        before_dot = csv_file.partition('.')[0]
        identifier = before_dot.rsplit('_', 1)[-1]
        train(csv_file, identifier, store_path=target_dir)

## PDC 2021

In [None]:
#export
def train_pdc21_logs():
    """Train one model per file found in `data/csv/PDC2021_training/`.

    The log name is the text after the last underscore of the path portion
    that precedes the first dot; models are stored under `models/pdc2021`.
    """
    target_dir = 'models/pdc2021'
    csv_files = glob.glob('data/csv/PDC2021_training/*')
    for csv_file in progress_bar(csv_files):
        # Everything before the first '.', then the piece after the last '_'.
        before_dot = csv_file.partition('.')[0]
        identifier = before_dot.rsplit('_', 1)[-1]
        train(csv_file, identifier, store_path=target_dir)

## Binet datasets

In [None]:
#export
def train_binet_logs():
    """Train one model per file found in `data/csv/binet_logs/`.

    The log name is the file's base name with its last seven characters
    removed (presumably a `.csv.gz` suffix -- confirm against the data
    files); models are stored under `models/binet_logs`.
    """
    import os

    store_path = 'models/binet_logs'
    for training_log in progress_bar(glob.glob('data/csv/binet_logs/*')):
        # Use basename rather than splitting on '/': glob joins the final
        # path component with the platform separator, so a hard-coded '/'
        # would leave a directory fragment in the name on Windows.
        log_name = os.path.basename(training_log)[:-7]
        train(training_log, log_name, store_path=store_path)


## Shell Util

In [None]:
#export
def run_training(log="binet"):
    """Run the training routine(s) selected by `log`.

    Args:
        log: One of 'binet', 'pdc20', 'pdc21' or 'all'. With 'all' the
            PDC 2020, PDC 2021 and Binet trainings run in that order.

    Raises:
        ValueError: If `log` is none of the supported values.
    """
    # Reject unknown selections before any training starts.
    if log not in ('binet', 'pdc20', 'pdc21', 'all'):
        raise ValueError(f'{log} is not a supported data set!')

    # Each stage runs when chosen directly or as part of 'all'; the
    # statement order fixes the execution order for 'all'.
    if log in ('pdc20', 'all'):
        train_pdc20_logs()
    if log in ('pdc21', 'all'):
        train_pdc21_logs()
    if log in ('binet', 'all'):
        train_binet_logs()