# Training Kraken Models

Requirements (use pip command listed or Anaconda Navigator to install):

- [Kraken](https://github.com/mittagessen/kraken): `pip install kraken`
- [TensorBoard](https://www.tensorflow.org/tensorboard): `pip install tensorboard`
- [Ray](https://docs.ray.io/en/latest/ray-overview/installation.html) `pip install ray`


### Load Modules

In [17]:
import os
from functools import partial
from kraken.lib import arrow_dataset
from kraken.lib.progress import KrakenProgressBar
from kraken.lib.default_specs import (SEGMENTATION_HYPER_PARAMS, SEGMENTATION_SPEC, 
                                      RECOGNITION_HYPER_PARAMS, RECOGNITION_SPEC)
from kraken.lib.train import SegmentationModel, RecognitionModel, KrakenTrainer
from kraken.lib.exceptions import KrakenInputException
import tensorboard
from ray import tune
from ray.tune.integration.pytorch_lightning import TuneReportCallback
from ray.tune.schedulers import ASHAScheduler
import pytorch_lightning as pl
import torch
import warnings

warnings.filterwarnings('ignore') # mute annoying warnings


In [18]:
# load Tensorboard Jupyter extension
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


### Define the Training Dataset

In [3]:
data = 'W/11.20' # folder with xml + images
xml = [f'{data}/{f.name}' for f in os.scandir(data) if f.name != ".DS_Store" and f.name.endswith('xml')]
print(len(xml))

22


### Train a Segmentation Model

In [19]:
# create a copy of the default hyper-parameters
seg_hyper_params = SEGMENTATION_HYPER_PARAMS.copy()
# and update them as necessary
seg_hyper_params.update({
    'line_width': 8, # height of baseline in target (after scaling)
    'padding': (0, 0), # padding around the page image (l/r, t/b)
    'freq': 1.0, # model saving/report generation frequency (in epochs)
    'quit': 'early', # stop condition for training, `early`|`dumb` (fixed number of epochs)
    'epochs': -1, # number of epochs to train for
    'min_epochs': 1, # minimum number of epochs to train for when using early stopping
    'lag': 10, # number of evaluations to wait before stopping training without improvement
    'min_delta': None, # minimum improv. before reset of early stopping, None=scale delta by best loss
    'optimizer': 'Adam', # optimizer: 'Adam'|'SGD'|'RMSprop'|'Lamb'
    'lrate': 2e-4, # learning rate
    'momentum': 0.9, # momentum
    'weight_decay': 1e-5, # weight decay
    'schedule': 'constant', # lr scheduler: 'constant'|'1cycle'|'exponential'|'cosine'|'step'|'reduceonplateau'
    'completed_epochs': 0,
    'augment': False, # enable image augmentation
    'step_size': 10, # step decay (validation runs between lr decay for exponential and step)
    'gamma': 0.1, # exp decay (factor for exponential, step, and reduceonplateau)
    'rop_factor': 0.1, # exp decay for reduce on plateau 
    'rop_patience': 5, # step decay for reduce on plateau
    'cos_t_max': 50, # epoch of minimal learning rate for cosine
    'warmup': 0, # number of steps to ramp up to `lrate` initial lr
    'pl_logger': 'tensorboard' # logger or None
})


In [20]:
# create the model
seg_model = SegmentationModel(
    hyper_params=seg_hyper_params, # hyperparameters
    output='WSeg/Weisser_4', # output model file
    spec=SEGMENTATION_SPEC, # VGSL spec of the baseline labeling network
    model='WSeg/weisser_3.mlmodel', # existing model file to load and continue training
    training_data=xml, # list of xml files for training
    evaluation_data=None, # list of xml files for evaluation
    partition=0.9, # ground truth partition train/validation ratio, set to 1 if evaluation_data is passed
    num_workers=0, # number of OpenMP threads when running on CPU, set to 0 to avoid ParallelNative.cpp warn.
    load_hyper_parameters=False, # retrieve hyper-parameters model when loading existing file
    force_binarization=False, # forces input images to be binary, otherwise auto determines format
    format_type='page', # `xml`, `alto`, `page`, `path`, or None
    suppress_regions=False, # disables region segmentation training
    suppress_baselines=False, # disables baseline segmentation training
    valid_regions=None, # valid region types in training data (can be used multiple times)
    valid_baselines=None, # valid baseline types in training data (can be used multiple times)
    merge_regions=None, # region merge mapping(s) as `$target:$src`
    merge_baselines=None, # baseline type merge mapping(s) as `$target:$src`
    bounding_regions=None, # regions treated as boundaries for polygonization (can be used multiple times)
    resize='both', # resize output layer: 'add' new classes, 'both' add and set to match, 'fail' abort if no match
    topline=False # baseline location: baseline=False, topline/hanging baseline=True, centerline=None
)

# do a couple of basic checks
print('Training line types:')
for k, v in seg_model.train_set.dataset.class_mapping['baselines'].items():
    print(f'{k}\t\t{v}\t{seg_model.train_set.dataset.class_stats["baselines"][k]}')
    
print('Training region types:')
for k, v in seg_model.train_set.dataset.class_mapping['regions'].items():
    print(f'{k}\t\t{v}\t{seg_model.train_set.dataset.class_stats["regions"][k]}')


FileNotFoundError: [Errno 2] No such file or directory: '/home/giod/Documents/Kraken Training/WSeg/weisser_3.mlmodel'

In [33]:
# define how often to assess values while training
val_check_interval = {
    'check_val_every_n_epoch': int(seg_hyper_params['freq'])
} if seg_hyper_params['freq'] > 1 else {
    'val_check_interval': seg_hyper_params['freq']
}

# define the trainer
seg_trainer = KrakenTrainer(
    accelerator='cuda', # use mps for gpu (several unsupported functions for M1)
    devices=1, # cpu cores when used with accelerator=cpu, 1 when used with mps
    precision=32, # numerical precision for training (default is 32-bit single-point)
    max_epochs=seg_hyper_params['epochs'] if seg_hyper_params['quit'] == 'dumb' else -1,
    min_epochs=seg_hyper_params['min_epochs'],
    enable_progress_bar=True, # show progress bar
    deterministic=False, # use deterministic training (seed default = 42)
    pl_logger='tensorboard', # logger
    log_dir='WSeg/logs', # directory for logger
    **val_check_interval
)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..


In [None]:
# there environment variables are probably only needed on M1 Macs
!export GLOO_SOCKET_IFNAME=localhost
!export OMP_NUM_THREADS=1

In [13]:
# load the TansorBoard dashboard
%tensorboard --logdir 'WSeg/logs'

Reusing TensorBoard on port 6006 (pid 196597), started 0:00:52 ago. (Use '!kill 196597' to kill it.)

In [34]:
# train the model
seg_trainer.fit(seg_model)


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

RuntimeError: Predictions and targets are expected to have the same shape, but got torch.Size([1, 7, 450, 348]) and torch.Size([1, 6, 450, 348]).

### Compile the Training Dataset into Binary Format

This is not required, but a binary dataset is substantially faster than a set of xml files for training. It can only be used to train a recognition model (i.e. not a segmentation one).

In [4]:
# compile training dataset into binary format (massive speed improvement over files)
with KrakenProgressBar() as progress:
    extract_task = progress.add_task('Extracting lines', total=0, start=False, visible=True)
    
    arrow_dataset.build_binary_dataset(
        files=xml,
        output_file='W/11.20/training_dataset.arrow',
        format_type='page', # `xml`, `alto`, `page`, `path`, or None
        num_workers=0, # parallelized extraction, `0`= no parallelism
        ignore_splits=False, # enable/disable serialization of train/validation/test splits in the source files
        random_split=None, # serialize random split into the dataset with (train, val, test)
        force_type='kraken_recognition_baseline', # force a dataset type, `kraken_recognition_baseline` / `kraken_recognition_bbox`
        recordbatch_size=100, # minimum number of records per RecordBatch written to output
        skip_empty_lines=True, # do not compile empty text lines into the dataset
        callback = lambda advance, total: progress.update(extract_task, total=total, advance=advance)
    )


Output()

### Train a Recognition Model

In [32]:
# copy default hyper-parameters
rec_hyper_params = RECOGNITION_HYPER_PARAMS.copy()
# and modify as necessary
rec_hyper_params.update({
    'pad': 16, # left/right padding around lines
    'freq': 1.0, # model saving/report generation frequency (in epochs)
    'batch_size': 1, # batch sample size
    'quit': 'early', # stop condition for training, `early` or `dumb` (fixed number of epochs)
    'epochs': -1, # number of epochs to train for
    'min_epochs': 10, # minimum number of epochs to train for when using early stopping
    'lag': 10, # number of evaluations to wait before stopping training without improvement
    'min_delta': None, # minimum improvement between epochs to reset early stopping, None=scale delta by best loss
    'optimizer': 'Adam', # optimizer: 'Adam', 'SGD', 'RMSprop', 'Lamb'
    #'lrate': 1e-3, # learning rate
    "lrate": 0.0003426459057850771,
    "mask_prob": 0.021118124693851062,
    "mask_width": 4,
    "num_negatives": 56,
    #"lrate": 1.4898468934950387e-05,
    #"mask_prob": 0.041236782381262986,
    #"mask_width": 4,
    #"num_negatives": 72,
    'momentum': 0.9, # momentum
    'weight_decay': 0.0, # weight decay
    'schedule': 'constant', # lr scheduler: 'constant', '1cycle', 'exponential', 'cosine', 'step', 'reduceonplateau'
    'normalization': None, # ground truth normalization
    'normalize_whitespace': True, # normalizes unicode whitespace'
    'completed_epochs': 0,
    'augment': False, # enable image augmentation
    'step_size': 10, # step decay (validation runs between learning rate decay for exponential and step schedules)
    'gamma': 0.1, # exp decay (factor for exponential, step, and reduceonplateau schedules)
    'rop_factor': 0.1, # exp decay for reduce on plateau 
    'rop_patience': 5, # step decay for reduce on plateau
    'cos_t_max': 50, # epoch of minimal learning rate for cosine scheduler
    'warmup': 0, # number of steps to ramp up to `lrate` initial lr
    'freeze_backbone': 0, # number of samples to keep everything but last layer frozen
    'pl_logger': 'tensorboard', # loggers
})


In [33]:
# create the model
rec_model = RecognitionModel(
    hyper_params=rec_hyper_params, # hyperparameters
    output='W/11.20/out_print/W_1_highLR', # output model file
    spec=RECOGNITION_SPEC, # VGSL spec of the network to train (CTC layer added automatically)
    append=None, # remove layers before argument and append spec (only for loading existing model)
    model='print_transcription_new.mlmodel', # existing model file to load and continue training
    reorder=True, # reorder code points to display order
    training_data=['W/11.20/training_dataset.arrow'], # list with path to binary or xml files
    evaluation_data=None, # path to binary or list of xml files
    partition=0.9, # ground truth partition train/validation ratio, set to 1 if evaluation_data is passed
    binary_dataset_split=False, # whether to honour fixed splits in binary datasets
    num_workers=0, # number of OpenMP threads when running on CPU
    load_hyper_parameters=True, # retrieve hyper-parameters model when loading existing file
    #repolygonize=False, # repolygonize line data in ALTO/PageXML files
    force_binarization=False, # forces input images to be binary, otherwise auto determines format
    format_type='binary', # `xml`, `alto`, `page`, `path`, or 'binary'
    codec=None, # load a codec JSON definition (invalid if loading existing model)
    resize='both' # resize output layer: 'add' new classes, 'both' add and set to match, 'fail' abort if no match
)


In [34]:
# define how often to assess values while training
val_check_interval = {
    'check_val_every_n_epoch': int(rec_hyper_params['freq'])
} if rec_hyper_params['freq'] > 1 else {
    'val_check_interval': rec_hyper_params['freq']
}

# define the trainer
rec_trainer = KrakenTrainer(
    accelerator='cuda', # use mps for gpu (several unsupported functions for M1)
    devices=1, # cpu cores when used with accelerator=cpu, 1 when used with mps
    precision=32, # numerical precision for training (default is 32-bit single-point),
    max_epochs=rec_hyper_params['epochs'] if rec_hyper_params['quit'] == 'dumb' else -1,
    min_epochs=rec_hyper_params['min_epochs'],
    freeze_backbone=rec_hyper_params['freeze_backbone'],
    enable_progress_bar=True, # show progress bar
    deterministic=False, # use deterministic training (seed default = 42)
    pl_logger='tensorboard', # logger
    log_dir='W/11.20/out_print/logs', # directory for logger
    **val_check_interval
)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..


In [35]:
# load TensorBoard dashboard - this won't work if there's a version already running
%tensorboard --logdir 'W/11.20/out_print/logs'

Launching TensorBoard...

In [36]:
# train the model
try:
    torch.set_float32_matmul_precision('high')
    rec_trainer.fit(rec_model)

except KrakenInputException as e:
    if e.args[0].startswith('Training data and model codec alphabets mismatch') and resize == 'fail':
        print('Mismatched training data for loaded model. Set option `--resize` to `add` or `both`')
    else:
        print(e)


Neural network has been trained on mode 1 images, training set contains mode L data. Consider setting `force_binarization`
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

### Optimize Hyper-Parameters

This is how you determine the best set of parameters to train your model.

In [37]:
# define the training function
def optimize(config, training_data=None, epochs=100):
    hyper_params = RECOGNITION_HYPER_PARAMS.copy()
    hyper_params.update(config)

    model = RecognitionModel(
        hyper_params=hyper_params,
        output='model',
        spec=RECOGNITION_SPEC,
        training_data=training_data,
        format_type='binary',
        model='555/to 10/555_ver 1_best.mlmodel',
        resize='add',
    )

    #callback = TuneReportCallback({'loss': 'train_loss'}, on='validation_end')
    callback = TuneReportCallback({'acc': 'val_accuracy'}, on='validation_end')
    
    trainer = pl.Trainer(
        max_epochs=epochs,
        gpus=1,
        callbacks=[callback],
        enable_progress_bar=False
    )
    
    trainer.fit(model)

# decide what parameters to optimize and what value ranges to use 
config = {
    'lrate': tune.loguniform(1e-8, 1e-1),
    'num_negatives': tune.qrandint(2, 100, 8),
    'mask_prob': tune.loguniform(0.01, 0.2),
    'mask_width': tune.qrandint(2, 8, 2)
}

# resources to assign to each test
resources = {
    "cpu": 8, 
    "gpu": 1
}

# and our training dataset - might be a good idea to thin it down for this!
data = ['555/12-31/training_dataset.arrow']


In [38]:
# load the TansorBoard dashboard
%tensorboard --logdir '/home/giod/Downloads/opt_logs'

In [39]:
# and run the analysis
analysis = tune.run(
    partial(optimize, training_data=data), 
    storage_path='/home/giod/Downloads/opt_logs', 
    num_samples=100, 
    resources_per_trial=resources, 
    config=config
)

print("Best hyperparameters found were: ", analysis.get_best_config(metric='accuracy', mode='max'))


TypeError: run() got an unexpected keyword argument 'storage_path'