# Imports


In [2]:
%load_ext autoreload

In [4]:
%autoreload 2

from dataclasses import dataclass, asdict, field
from enum import Enum
import functools
import operator
from functools import partial
import logging
import pathlib
from pathlib import Path
from pprint import pprint
import sys
from typing import *
import time
import yaml
from yaml import YAMLObject
import socket

import humanize
from matplotlib import pyplot as plt, cm
import numpy as np
import pandas as pd
from pymicro.file import file_utils
import tensorflow as tf
from numpy.random import RandomState

from tensorflow import keras
from tensorflow.keras import utils
from tensorflow.keras import optimizers
from tensorflow.keras import callbacks as keras_callbacks
from tensorflow.keras import losses
from tensorflow.keras import metrics as keras_metrics

from tomo2seg import slack
from tomo2seg import modular_unet
from tomo2seg.logger import logger, dict2str, add_file_handler
from tomo2seg import data as tomo2seg_data
from tomo2seg import viz
from tomo2seg.data import Volume
from tomo2seg.metadata import Metadata
from tomo2seg.volume_sequence import (
    MetaCrop3DGenerator, VolumeCropSequence,
    UniformGridPosition, SequentialGridPosition,
    ET3DUniformCuboidAlmostEverywhere, ET3DConstantEverywhere, 
    GTUniformEverywhere, GTConstantEverywhere, 
    VSConstantEverywhere, VSUniformEverywhere
)
from tomo2seg import volume_sequence
from tomo2seg.model import Model as Tomo2SegModel
from tomo2seg import callbacks as tomo2seg_callbacks
from tomo2seg import losses as tomo2seg_losses
from tomo2seg import schedule as tomo2seg_schedule
from tomo2seg import utils as tomo2seg_utils
from tomo2seg import slackme

In [5]:
# this registers a custom exception handler for the whole current notebook:
# any `Exception` raised by a cell is handed to `slackme.custom_exc`
# (presumably it forwards the error to slack -- TODO confirm in `tomo2seg.slackme`)
get_ipython().set_custom_exc((Exception,), slackme.custom_exc)

In [6]:
# Rough upper bound on the number of voxels the GPU can hold internally.
# Every candidate below is a configuration that was seen to fit in memory:
#   batch_size * internal_multiplier_factor * (crop_nvoxels) * gpu_factor=1
# The largest one is kept and then scaled by 5/8 (a smaller gpu on other pcs...).
MAX_INTERNAL_NVOXELS = int(
    max(
        [
            4 * (8 * 6) * (96**3),
            8 * (16 * 6) * (320**2),
            3 * (16 * 6) * (800 * 928),
            15 * 23 * (208**2 * 5) * (8 / 5),
        ]
    )
    * (5 / 8)
)

logger.info(f"{MAX_INTERNAL_NVOXELS=} ({humanize.intcomma(MAX_INTERNAL_NVOXELS)})")

INFO::tomo2seg::{<ipython-input-6-78d1635f7d9a>:<module>:014}::[2020-12-14::10:51:32.448]
MAX_INTERNAL_NVOXELS=133632000 (133,632,000)



# Args

In [7]:
# [manual-input]

@dataclass
class Args:
    """Manual configuration of a training run.

    The three mode enums select how the run behaves (early stopping, batch
    size search, fresh training vs. continuation); the remaining fields
    identify the volume/labels and the run itself.

    Raises (in `__post_init__`):
        AssertionError: if a continuation mode is requested without `runid`,
            or if `batch_size_per_gpu` is given but invalid.
        ValueError: if `batch_size_per_gpu` is given and the notebook-level
            variable `MAX_INTERNAL_NVOXELS` has not been defined.
    """

    class EarlyStopMode(Enum):
        no_early_stop = 0

    class BatchSizeMode(Enum):
        try_max_and_fail = 0
        try_max_and_reduce = 1

    class TrainMode(Enum):
        from_scratch = 0
        continuation_from_autosaved_model = 1
        continuation_from_autosaved2_best_model = 2
        continuation_from_latest_model = 3

        @property
        def is_continuation(self) -> bool:
            """True for every mode that resumes from a previously saved model."""
            return self in (
                Args.TrainMode.continuation_from_autosaved_model,
                Args.TrainMode.continuation_from_autosaved2_best_model,
                Args.TrainMode.continuation_from_latest_model,
            )

    early_stop_mode: EarlyStopMode
    batch_size_mode: BatchSizeMode
    train_mode: TrainMode

    volume_name: str
    volume_version: str
    labels_version: str

    # override the auto-sized value
    # this allows to reproduce the same conditions across experiments
    batch_size_per_gpu: Optional[int] = None

    random_state_seed: int = 42

    # identifier of the run; mandatory for continuations, otherwise it
    # defaults to the current unix timestamp (see `__post_init__`)
    runid: Optional[int] = None

    def __post_init__(self):

        if self.train_mode.is_continuation:
            # the message used to reference `self.self.train_mode`, which turned
            # a failed check into an AttributeError instead of an AssertionError
            assert self.runid is not None, f"Incompatible args {self.runid=} {self.train_mode=}"

        if self.runid is None:
            self.runid = int(time.time())

        if self.batch_size_per_gpu is not None:
            assert self.batch_size_per_gpu > 0, f"{self.batch_size_per_gpu=}"

            ngpus = len(tf.config.list_physical_devices('GPU'))

            if ngpus > 0:
                # NOTE(review): this requires the *per gpu* batch size to be a
                # multiple of the number of gpus -- confirm this is intended
                assert self.batch_size_per_gpu % ngpus == 0, f"{self.batch_size_per_gpu=} {ngpus=}"

            try:
                MAX_INTERNAL_NVOXELS

            except NameError as ex:
                # the exception used to be built but never raised
                raise ValueError("Please define the variable `MAX_INTERNAL_NVOXELS`") from ex

                
from tomo2seg.datasets import (
    VOLUME_COMPOSITE_V1 as VOLUME_NAME_VERSION,
#     VOLUME_COMPOSITE_V1_REDUCED as VOLUME_NAME_VERSION,
    VOLUME_COMPOSITE_V1_LABELS_REFINED3 as LABELS_VERSION,
#     VOLUME_FRACTURE00_SEGMENTED00 as VOLUME_NAME_VERSION,
#     VOLUME_FRACTURE00_SEGMENTED00_LABELS_REFINED3 as LABELS_VERSION,
)

# Build the run configuration of this notebook.
# `VOLUME_NAME_VERSION` is indexed as (name, version) and `LABELS_VERSION` is passed as is.
# `runid` is not given here, so `Args.__post_init__` sets it to `int(time.time())`;
# uncomment `runid` below (and pick a continuation `train_mode`) to resume a run.
args = Args(
    early_stop_mode = Args.EarlyStopMode.no_early_stop,
    batch_size_mode = Args.BatchSizeMode.try_max_and_reduce,
    train_mode=Args.TrainMode.from_scratch,
    
    volume_name=VOLUME_NAME_VERSION[0],
    volume_version=VOLUME_NAME_VERSION[1],
    labels_version=LABELS_VERSION,
    
#     random_state_seed=30,  # I'll change it so we don't repeat the same crops from the beginning
#     runid = 1607698009,
)

# log the full configuration (dataclass converted to a dict)
logger.info(f"args={dict2str(asdict(args))}")

INFO::tomo2seg::{<ipython-input-7-806f1f39c55a>:<module>:091}::[2020-12-14::10:51:33.460]
args={   'batch_size_mode': <BatchSizeMode.try_max_and_reduce: 1>,
    'batch_size_per_gpu': None,
    'early_stop_mode': <EarlyStopMode.no_early_stop: 0>,
    'labels_version': 'refined3',
    'random_state_seed': 42,
    'runid': 1607939493,
    'train_mode': <TrainMode.from_scratch: 0>,
    'volume_name': 'PA66GF30',
    'volume_version': 'v1'}




# Setup


In [8]:
# Global setup: logging level, seeded random state, GPU inventory and the
# distribution strategy used to build/compile the model.
logger.setLevel(logging.DEBUG)
random_state = RandomState(args.random_state_seed)

physical_gpus = tf.config.list_physical_devices('GPU')
n_gpus = len(physical_gpus)

tf_version = tf.__version__
logger.info(f"{tf_version=}")

hostname = socket.gethostname()
expected_gpus_per_host = ('2 on R790-TOMO', '1 on akela', '1 on hathi', '1 on krilin')
logger.info(
    f"Hostname: {hostname}\n"
    f"Num GPUs Available: {n_gpus}\n"
    "This should be:\n\t" + '\n\t'.join(expected_gpus_per_host)
)

physical_gpus_str = "\n\t".join(str(device) for device in physical_gpus)
logical_gpus_str = "\n\t".join(str(device) for device in tf.config.list_logical_devices('GPU'))
logger.debug(
    f"physical GPU devices:\n\t{physical_gpus_str}\n"
    f"logical GPU devices:\n\t{logical_gpus_str}"
)

# xla auto-clustering optimization (see: https://www.tensorflow.org/xla#auto-clustering)
# this seems to break the training, so it is explicitly disabled
tf.config.optimizer.set_jit(False)

# get a distribution strategy to use both gpus (see https://www.tensorflow.org/guide/distributed_training)
gpu_strategy = tf.distribute.MirroredStrategy()
logger.debug(f"{gpu_strategy=}")

INFO::tomo2seg::{<ipython-input-8-050df0cd2614>:<module>:007}::[2020-12-14::10:51:37.820]
tf_version='2.2.0'

INFO::tomo2seg::{<ipython-input-8-050df0cd2614>:<module>:010}::[2020-12-14::10:51:37.821]
Hostname: akela.materiaux.ensmp.fr
Num GPUs Available: 1
This should be:
	2 on R790-TOMO
	1 on akela
	1 on hathi
	1 on krilin

DEBUG::tomo2seg::{<ipython-input-8-050df0cd2614>:<module>:014}::[2020-12-14::10:51:37.900]
physical GPU devices:
	PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
logical GPU devices:
	LogicalDevice(name='/device:GPU:0', device_type='GPU')

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
DEBUG::tomo2seg::{<ipython-input-8-050df0cd2614>:<module>:025}::[2020-12-14::10:51:37.904]
gpu_strategy=<tensorflow.python.distribute.mirrored_strategy.MirroredStrategy object at 0x7f2ac8558490>



# Model

In [9]:
# Drop a `tomo2seg_model` left over from a previous execution so that the
# model cell creates a fresh one; otherwise just report there is nothing to do.
if "tomo2seg_model" in globals():
    del tomo2seg_model
else:
    print("already deleted (:")

already deleted (:


In [12]:
# [manual-input]
# x and y are multiples of 16 (requirement of a 4-level u-net); the model is
# built with `conv2d` layers and `input_shape=crop_shape`, so the 5 z-slices
# end up on the last axis of the input
crop_shape = (208, 208, 5)
model_nclasses = 3

model_master_name = "unet2halfd"
model_version = "II-enc-c208-f08"

model_is_2halfd = True
model_is_2d = False

# the factory is called later as `model_factory_function(name=..., **model_factory_kwargs)`
model_factory_function = modular_unet.u_net2halfd_IIenc
model_factory_kwargs = {
    **modular_unet.kwargs_IIenc03,
    **dict(
        convlayer=modular_unet.ConvLayer.conv2d,
        input_shape = crop_shape,
        output_channels = model_nclasses,
#         nb_filters_0 = 2,
#         nb_filters_0 = 4,
        nb_filters_0 = 8,
#         nb_filters_0 = 12,
#         nb_filters_0 = 16,
#         nb_filters_0 = 32,
    ),
}

# Only create the model descriptor when it does not exist yet in the kernel;
# re-running the cell keeps the existing object (delete it to start over).
try:
    tomo2seg_model

except NameError:

    logger.info("Creating a Tomo2SegModel.")

    tomo2seg_model = Tomo2SegModel(
        model_master_name,
        model_version,
        runid=args.runid,
        factory_function=model_factory_function,
        factory_kwargs=model_factory_kwargs,
    )

else:
    logger.warning("The model is already defined. To create a new one: `del tomo2seg_model`")

# Logged after (not in a `finally` of) the block above: if the constructor
# fails, its own exception must surface instead of a NameError raised while
# trying to log a `tomo2seg_model` that was never created.
logger.info(f"tomo2seg_model\n{dict2str(asdict(tomo2seg_model))}")
logger.info(f"{tomo2seg_model.name=}")

The model is already defined. To create a new one: `del tomo2seg_model`

INFO::tomo2seg::{<ipython-input-12-084be105897a>:<module>:046}::[2020-12-14::10:53:06.714]
tomo2seg_model
{   'factory_function': 'tomo2seg.modular_unet.u_net2halfd_IIenc',
    'factory_kwargs': {   'convlayer': <ConvLayer.conv2d: 0>,
                          'depth': 4,
                          'input_shape': (208, 208, 5),
                          'nb_filters_0': 8,
                          'output_channels': 3,
                          'sigma_noise': 0,
                          'unet_block_kwargs': {   'batch_norm': True,
                                                   'dropout': 0,
                                                   'kernel_size': 3,
                                                   'res': True},
                          'unet_down_kwargs': {'batchnorm': True},
                          'unet_up_kwargs': {'batchnorm': True},
                          'updown_conv_sampling': True},
  

In [11]:
logger.info("Creating the Keras model.")

# Step 1: decide where the model comes from, based on `args.train_mode`.
# For continuations, `load_model_path` points at the saved model to resume from.
if args.train_mode.is_continuation:
    logger.warning("Training continuation: a model will be loaded.")

    if args.train_mode == Args.TrainMode.continuation_from_latest_model:
        logger.info("Using the LATEST model to continue the training.")
        load_model_path = tomo2seg_model.model_path

    elif args.train_mode == Args.TrainMode.continuation_from_autosaved_model:
        logger.info("Using the AUTOSAVED model to continue the training.")
        load_model_path = tomo2seg_model.autosaved_model_path

    elif args.train_mode == Args.TrainMode.continuation_from_autosaved2_best_model:
        logger.info("Using the (best) AUTOSAVED2 model to continue the training.")
        load_model_path = tomo2seg_model.autosaved2_best_model_path

    else:
        raise ValueError(f"{args.train_mode=}")

# not a continuation: refuse to overwrite a model that is already on disk
elif (
    tomo2seg_model.model_path.exists() or
    tomo2seg_model.autosaved_model_path.exists()
    # todo uncomment me when implemented
#             or tomo2seg_model.autosaved2_best_model_path.exists()
):
    logger.error("The model seems to already exist but this is not a continuation. Please, make sure the arguments are correct.")
    raise ValueError(f"{args.train_mode=} ==> {args.train_mode.is_continuation=} {tomo2seg_model.name=}")

elif args.train_mode == Args.TrainMode.from_scratch:
    logger.info("A new model will be instantiated!")

else:
    raise NotImplementedError(f"{args.train_mode=}")


# Step 2: load or build the model, then compile it, all inside the
# distribution strategy scope.
with gpu_strategy.scope():

    if args.train_mode.is_continuation:

        assert load_model_path.exists(), f"Inconsistent arguments {args.train_mode.is_continuation=} {load_model_path=} {load_model_path.exists()=}."

        logger.info(f"Loading model {load_model_path.name}")

        # loaded uncompiled: it is (re)compiled below with the loss/optimizer of this notebook
        model = keras.models.load_model(str(load_model_path), compile=False)

        assert model.name == tomo2seg_model.name, f"{model.name=} {tomo2seg_model.name=}"

    else:

        logger.info(f"Instantiating a new model with model_factory_function={model_factory_function.__name__}.")

        model = model_factory_function(
            name=tomo2seg_model.name,
            **model_factory_kwargs
        )

    logger.info("Compiling the model.")

    # [manual-input]
    # using the avg jaccard is dangerous if one of the classes is too
    # underrepresented because its jaccard will be unstable
    # to be verified!
    loss = tomo2seg_losses.jaccard2_flat
    # `learning_rate` is the documented argument name; `lr` is only kept by
    # Keras as a deprecated alias
    optimizer = optimizers.Adam(learning_rate=.003)
    metrics = []

    logger.debug(f"{loss=}")
    logger.debug(f"{optimizer=}")
    logger.debug(f"{metrics=}")

    model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

INFO::tomo2seg::{<ipython-input-11-841396a8de5d>:<module>:001}::[2020-12-14::10:51:55.711]
Creating the Keras model.

INFO::tomo2seg::{<ipython-input-11-841396a8de5d>:<module>:031}::[2020-12-14::10:51:55.712]
A new model will be instantiated!

INFO::tomo2seg::{<ipython-input-11-841396a8de5d>:<module>:051}::[2020-12-14::10:51:55.714]
Instantiating a new model with model_factory_function=u_net2halfd_IIenc.

DEBUG::tomo2seg::{modular_unet.py:u_net2halfd_IIenc:488}::[2020-12-14::10:51:55.714]
dict2str(unet_block_kwargs)="{   'batch_norm': True,\n    'convlayer': <ConvLayer.conv2d: 0>,\n    'dropout': 0,\n    'kernel_size': 3,\n    'res': True,\n    'return_layers': True}"

DEBUG::tomo2seg::{modular_unet.py:u_net2halfd_IIenc:504}::[2020-12-14::10:51:55.715]
dict2str(unet_down_kwargs)="{   'batchnorm': True,\n    'conv_sampling': True,\n    'convlayer': <ConvLayer.conv2d: 0>,\n    'return_layers': True}"

DEBUG::tomo2seg::{modular_unet.py:u_net2halfd_IIenc:519}::[2020-12-14::10:51:55.715]
di

In [14]:
# A freshly instantiated model is written to disk together with its text
# summary and an image of its architecture; continuations skip this part.
if not args.train_mode.is_continuation:

    logger.info(f"Saving the model at {tomo2seg_model.model_path=}.")
    model.save(tomo2seg_model.model_path)

    logger.info(f"Writing the model summary at {tomo2seg_model.summary_path=}.")
    with tomo2seg_model.summary_path.open("w") as summary_file:
        # each summary line is written followed by a newline
        model.summary(
            print_fn=lambda line: summary_file.write(line + "\n"),
            line_length=140,
        )

    logger.info(f"Printing an image of the architecture at {tomo2seg_model.architecture_plot_path=}.")
    utils.plot_model(
        model,
        to_file=tomo2seg_model.architecture_plot_path,
        show_shapes=True,
    )

# from here on the logs also go to the model's training log file
add_file_handler(logger, tomo2seg_model.train_log_path)

# repeat it so that the log file saves this
logger.info(f"args\n{dict2str(asdict(args))}")
logger.info(f"{tomo2seg_model.name=}")
logger.info(f"tomo2seg_model\n{dict2str(asdict(tomo2seg_model))}")

INFO::tomo2seg::{<ipython-input-14-e7ea4213221d>:<module>:003}::[2020-12-14::10:53:40.690]
Saving the model at tomo2seg_model.model_path=PosixPath('/home/users/jcasagrande/projects/tomo2seg/data/models/unet2halfd/unet2halfd.II-enc-c208-f08.fold000.1607-939-493').

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: /home/users/jcasagrande/projects/tomo2seg/data/models/unet2halfd/unet2halfd.II-enc-c208-f08.fold000.1607-939-493/assets
INFO::tomo2seg::{<ipython-input-14-e7ea4213221d>:<module>:007}::[2020-12-14::10:53:56.813]
Writing the model summary at tomo2seg_model.summary_path=PosixPath('/home/users/jcasagrande/projects/tomo2seg/data/models/unet2halfd/unet2halfd.II-enc-c208-f08.fold000.1607-939-493/summary.txt').

INFO::tomo2seg::{<ipython-input-14-e7ea4213221d>:<module>:014}::[2020-12-14::10:53:57.252]
Printing an image of the architecture at tomo2seg_model.architecture_plot_path=PosixPath('/home/users/jcasagrande/projec

# Data

In [16]:
# Metadata/paths objects

## Volume
volume = Volume.with_check(name=args.volume_name, version=args.volume_version)
logger.info(f"volume\n{dict2str(asdict(volume))}")

assert volume.nclasses

logger.info("Loading data from disk.")

# reader options shared by the data and the labels files
read_hst_volume = partial(
    file_utils.HST_read,
    autoparse_filename=False,  # the file names are not properly formatted
    dims=volume.metadata.dimensions,
    verbose=True,
)

## Data
# the path is converted with `str` because the reader doesn't accept paths...
voldata = read_hst_volume(str(volume.data_path), data_type=volume.metadata.dtype)
voldata = voldata / volume.normalization_factor
logger.debug(f"{voldata.shape=}")

voldata_train = volume.train_partition.get_volume_partition(voldata)
voldata_val = volume.val_partition.get_volume_partition(voldata)
logger.debug(f"{voldata_train.shape=}")
logger.debug(f"{voldata_val.shape=}")

# only the partitions are needed from here on
del voldata

## Labels
vollabels = read_hst_volume(
    str(volume.versioned_labels_path(args.labels_version)),
    data_type="uint8",
)
logger.debug(f"{vollabels.shape=}")

vollabels_train = volume.train_partition.get_volume_partition(vollabels)
vollabels_val = volume.val_partition.get_volume_partition(vollabels)
logger.debug(f"{vollabels_train.shape=}")
logger.debug(f"{vollabels_val.shape=}")

del vollabels

DEBUG::tomo2seg::{data.py:with_check:264}::[2020-12-14::10:54:48.187]
vol=Volume(name='PA66GF30', version='v1', _metadata=None)

DEBUG::tomo2seg::{data.py:metadata:201}::[2020-12-14::10:54:48.189]
Loading metadata from `/home/users/jcasagrande/projects/tomo2seg/data/PA66GF30.v1/PA66GF30.v1.metadata.yml`.

INFO::tomo2seg::{<ipython-input-16-3be1b813cb29>:<module>:008}::[2020-12-14::10:54:48.193]
volume
{   '_metadata': {   'dimensions': [1300, 1040, 1900],
                     'dtype': 'uint8',
                     'labels': [0, 1, 2],
                     'labels_names': {0: 'matrix', 1: 'fiber', 2: 'porosity'},
                     'set_partitions': {   'test': {   'alias': 'test',
                                                       'x_range': [0, 1300],
                                                       'y_range': [0, 1040],
                                                       'z_range': [1300, 1600]},
                                           'train': {   'alias': 'train',

# Data crop sequences

## Batch size

In [17]:
# Size the batch from the memory budget: the budget (MAX_INTERNAL_NVOXELS) is
# divided by the model's internal voxel factor, then by the voxels in one crop.
model_internal_nvoxel_factor = tomo2seg_utils.get_model_internal_nvoxel_factor(model)
logger.debug(f"{model_internal_nvoxel_factor=}")

max_batch_nvoxels = int(
    np.floor(MAX_INTERNAL_NVOXELS / model_internal_nvoxel_factor)
)
logger.debug(f"{max_batch_nvoxels=} ({humanize.intcomma(max_batch_nvoxels)})")

crop_nvoxels = functools.reduce(lambda acc, side: acc * side, crop_shape)
logger.debug(f"{crop_shape=} ==> {crop_nvoxels=}")

# at least one crop per gpu, whatever the estimate says
max_batch_size_per_gpu = max(1, int(np.floor(max_batch_nvoxels / crop_nvoxels)))
batch_size_per_gpu = max_batch_size_per_gpu
logger.info(f"{batch_size_per_gpu=}")

# an explicit value in the args wins over the estimate
if args.batch_size_per_gpu is not None:
    logger.warning(f"{args.batch_size_per_gpu=} given ==> replacing {batch_size_per_gpu=}")
    batch_size_per_gpu = args.batch_size_per_gpu

logger.info(f"{n_gpus=}")

# a machine without gpus counts as a single device
batch_size = max(1, n_gpus) * batch_size_per_gpu
logger.info(f"{batch_size=}")

DEBUG::tomo2seg::{utils.py:get_model_internal_nvoxel_factor:023}::[2020-12-14::10:56:15.224]
input_layer=<tensorflow.python.keras.engine.input_layer.InputLayer object at 0x7f2a4306fd60>

DEBUG::tomo2seg::{utils.py:get_model_internal_nvoxel_factor:029}::[2020-12-14::10:56:15.226]
input_nvoxels=216320

DEBUG::tomo2seg::{utils.py:get_model_internal_nvoxel_factor:041}::[2020-12-14::10:56:15.232]
max_internal_nvoxels=4845568 (4,845,568)

DEBUG::tomo2seg::{<ipython-input-17-ce5d5f99df49>:<module>:003}::[2020-12-14::10:56:15.233]
model_internal_nvoxel_factor=23

DEBUG::tomo2seg::{<ipython-input-17-ce5d5f99df49>:<module>:007}::[2020-12-14::10:56:15.235]
max_batch_nvoxels=5810086 (5,810,086)

DEBUG::tomo2seg::{<ipython-input-17-ce5d5f99df49>:<module>:011}::[2020-12-14::10:56:15.236]
crop_shape=(208, 208, 5) ==> crop_nvoxels=216320

INFO::tomo2seg::{<ipython-input-17-ce5d5f99df49>:<module>:015}::[2020-12-14::10:56:15.237]
batch_size_per_gpu=26

INFO::tomo2seg::{<ipython-input-17-ce5d5f99df49>:<m

## Common kwargs

In [18]:
# Keyword arguments shared by the train and validation meta crop generators.
metacrop_gen_common_kwargs = {
    "crop_shape": crop_shape,
    "common_random_state_seed": args.random_state_seed,
    "is_2halfd": model_is_2halfd,
    # 2d and 2.5d models both use the 2d geometric transformations
    "gt_type": (
        volume_sequence.GT2D
        if (model_is_2d or model_is_2halfd)
        else volume_sequence.GT3D
    ),
}
logger.debug(f"{metacrop_gen_common_kwargs=}")

# Keyword arguments shared by the train and validation crop sequences.
vol_crop_seq_common_kwargs = {
    "output_as_2d": model_is_2d,
    "output_as_2halfd": model_is_2halfd,
    "labels": volume.metadata.labels,
    # [manual-input]
    "debug__no_data_check": True,
}
logger.debug(f"{vol_crop_seq_common_kwargs=}")

DEBUG::tomo2seg::{<ipython-input-18-d49b358ca222>:<module>:008}::[2020-12-14::10:56:16.471]
metacrop_gen_common_kwargs={'crop_shape': (208, 208, 5), 'common_random_state_seed': 42, 'is_2halfd': True, 'gt_type': <enum 'GT2D'>}

DEBUG::tomo2seg::{<ipython-input-18-d49b358ca222>:<module>:019}::[2020-12-14::10:56:16.472]
vol_crop_seq_common_kwargs={'output_as_2d': False, 'output_as_2halfd': True, 'labels': [0, 1, 2], 'debug__no_data_check': True}



## Train

In [19]:
# Training crop sequence, built from the train partition of the volume.
data, labels = voldata_train, vollabels_train
volume_shape = data.shape

train_meta_crop_generator = MetaCrop3DGenerator.build_setup_train01(
    volume_shape=volume_shape,
    data_original_dtype=volume.metadata.dtype,
    # [manual-input]
    gt_no_transpose_rot=False,
    **metacrop_gen_common_kwargs,
)

crop_seq_train = VolumeCropSequence(
    data_volume=data,
    labels_volume=labels,
    batch_size=batch_size,
    meta_crop_generator=train_meta_crop_generator,
    # this volume cropper only returns random crops,
    # so the number of crops per epoch/batch is w/e i want
    epoch_size=10,
    **vol_crop_seq_common_kwargs,
)

INFO::tomo2seg::{volume_sequence.py:build_from_volume_crop_shapes:450}::[2020-12-14::10:56:18.378]
Built UniformGridPosition from volume_shape=(1300, 1040, 1300) and crop_shape=(208, 208, 5) ==> {'x_range': (0, 1093), 'y_range': (0, 833), 'z_range': (0, 1296)}

DEBUG::tomo2seg::{volume_sequence.py:__post_init__:412}::[2020-12-14::10:56:18.380]
UniformGridPosition ==> npositions=1179967824 (1,179,967,824)

Initializing ET3DConstantEverywhere with a UniformGridPosition.
The {x, y, z}_range values will be overwritten.

Initializing GTUniformEverywhere2 with a UniformGridPosition.
The {x, y, z}_range values will be overwritten.

DEBUG::tomo2seg::{volume_sequence.py:__post_init__:807}::[2020-12-14::10:56:18.382]
len(self.gt_name_list)=8

Initializing VSUniformEverywhere with a UniformGridPosition.
The {x, y, z}_range values will be overwritten.

self.output_as_2halfd=True only xy layers is available for 2.5d now!

DEBUG::tomo2seg::{volume_sequence.py:__post_init__:1476}::[2020-12-14::10:56:

## Val

In [20]:
# Validation crop sequence, built from the validation partition of the volume.
data = voldata_val
labels = vollabels_val

volume_shape = data.shape

# the validation has no reproducibility issues
# so let's push the GPUs (:
# `max(1, n_gpus)` is the same guard used for the training batch size: without
# it a machine with no gpu (n_gpus == 0) would get a validation batch size of 0
val_batch_size = max_batch_size_per_gpu * max(1, n_gpus)

logger.debug(f"{val_batch_size=}")

# fixed grid of crop positions covering the validation volume
grid_pos_gen = SequentialGridPosition.build_min_overlap(
    volume_shape=volume_shape,
    crop_shape=crop_shape,
    # [manual-input]
    # reduce the total number of crops
#     n_steps_x=11,
#     n_steps_y=11,
    n_steps_z=30,
)

crop_seq_val = VolumeCropSequence(
    data_volume=data,
    labels_volume=labels,

    batch_size=val_batch_size,

    # go through all the crops in validation
    epoch_size=len(grid_pos_gen),

    # data augmentation
    meta_crop_generator=MetaCrop3DGenerator.build_setup_val00(
        volume_shape=volume_shape,
        grid_pos_gen=grid_pos_gen,
        **metacrop_gen_common_kwargs,
    ),
#     debug__no_data_check = True,

    **vol_crop_seq_common_kwargs,
)

DEBUG::tomo2seg::{<ipython-input-20-4565690e19ff>:<module>:010}::[2020-12-14::10:56:19.606]
val_batch_size=26

INFO::tomo2seg::{volume_sequence.py:build_min_overlap:518}::[2020-12-14::10:56:19.607]
Building SequentialGridPosition with minimal overlap (smallest n_steps in each directions) n_steps={'n_steps_x': 7, 'n_steps_y': 5, 'n_steps_z': 60}.

n_steps_kwargs={'n_steps_z': 30} was given --> effective n_steps={'n_steps_x': 7, 'n_steps_y': 5, 'n_steps_z': 30}

INFO::tomo2seg::{volume_sequence.py:build_from_volume_crop_shapes:450}::[2020-12-14::10:56:19.609]
Built SequentialGridPosition from volume_shape=(1300, 1040, 300) and crop_shape=(208, 208, 5) ==> {'x_range': (0, 1093), 'y_range': (0, 833), 'z_range': (0, 296)}

INFO::tomo2seg::{volume_sequence.py:__post_init__:498}::[2020-12-14::10:56:19.612]
The SequentialGridPosition has len(self.positions)=1050 different positions (therefore crops).

Initializing ET3DConstantEverywhere with a SequentialGridPosition.
The {x, y, z}_range values

# Callbacks

In [21]:
# Checkpoint callback: writes the model to the "autosaved2" path each time the
# validation loss reaches a new minimum.
autosave_cb = keras_callbacks.ModelCheckpoint(
    filepath=tomo2seg_model.autosaved2_model_path_str,
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    verbose=1,
)
logger.debug(f"{autosave_cb=}")

DEBUG::tomo2seg::{<ipython-input-21-be8d862a0547>:<module>:009}::[2020-12-14::10:56:21.227]
autosave_cb=<tensorflow.python.keras.callbacks.ModelCheckpoint object at 0x7f2909aadfd0>



In [22]:
# this is important because sometimes i update things in the notebook
# so i need to make sure that the objects in the history cb are updated
try:
    history_cb

except NameError:
    logger.info("Creating a new history callback.")

    history_cb = tomo2seg_callbacks.History(
        optimizer=model.optimizer,
        crop_seq_train=crop_seq_train,
        crop_seq_val=crop_seq_val,
        backup=1,
        csv_path=tomo2seg_model.history_path,
    )

else:
    logger.warning("The history callback already exists!")

    # NOTE(review): `history_df` is computed here but nothing below uses it;
    # confirm whether it was meant to be written back into `history_cb`
    history_df = history_cb.dataframe

    try:
        history_df_temp = pd.read_csv(tomo2seg_model.history_path)

    except (FileNotFoundError, pd.errors.EmptyDataError):
        # missing file and empty file both mean that nothing was written yet
        logger.info("History hasn't been saved yet.")

    else:
        # keep the longest one (the in-memory history wins ties)
        if history_df_temp.shape[0] > history_df.shape[0]:
            history_df = history_df_temp
        del history_df_temp

# make sure the correct objects are linked
# (done after the block above instead of in a `finally`: if the callback could
# not be created, its own exception must surface rather than a NameError)
history_cb.optimizer = model.optimizer
history_cb.crop_seq_train = crop_seq_train
history_cb.crop_seq_val = crop_seq_val

logger.debug(f"{history_cb=}")
logger.debug(f"{history_cb.dataframe.index.size=}")
logger.debug(f"{history_cb.last_epoch=}")

INFO::tomo2seg::{<ipython-input-22-9e74ae8e55e2>:<module>:007}::[2020-12-14::10:56:21.691]
Creating a new history callback.

INFO::tomo2seg::{callbacks.py:__init__:051}::[2020-12-14::10:56:21.706]
Loading history from csv self.csv_path=PosixPath('/home/users/jcasagrande/projects/tomo2seg/data/models/unet2halfd/unet2halfd.II-enc-c208-f08.fold000.1607-939-493/history.csv').

DEBUG::tomo2seg::{callbacks.py:__init__:071}::[2020-12-14::10:56:21.708]
History hasn't been saved yet.

DEBUG::tomo2seg::{<ipython-input-22-9e74ae8e55e2>:<module>:040}::[2020-12-14::10:56:21.709]
history_cb=<tomo2seg.callbacks.History object at 0x7f29208c9df0>

DEBUG::tomo2seg::{<ipython-input-22-9e74ae8e55e2>:<module>:041}::[2020-12-14::10:56:21.710]
history_cb.dataframe.index.size=0

DEBUG::tomo2seg::{<ipython-input-22-9e74ae8e55e2>:<module>:042}::[2020-12-14::10:56:21.711]
history_cb.last_epoch=0



In [23]:
# Plot callback configured with the history callback created above and the
# model's "work in progress" training history plot path
# (presumably the figure is refreshed during training -- TODO confirm in `tomo2seg.callbacks`).
history_plot_cb = tomo2seg_callbacks.HistoryPlot(
    history_callback=history_cb,
    save_path=tomo2seg_model.train_history_plot_wip_path
)
logger.debug(f"{history_plot_cb=}")

DEBUG::tomo2seg::{<ipython-input-23-6edf00a82883>:<module>:005}::[2020-12-14::10:56:21.893]
history_plot_cb=HistoryPlot(history_callback=<tomo2seg.callbacks.History object at 0x7f29208c9df0>, save_path=PosixPath('/home/users/jcasagrande/projects/tomo2seg/data/models/unet2halfd/unet2halfd.II-enc-c208-f08.fold000.1607-939-493/train-hist-plot-wip.png'))



In [24]:
logger.info(f"Setting up early stop with {args.early_stop_mode=}")

# Only "no early stop" is supported for now: any other mode is rejected, and
# no `early_stop_cb` is created.
if args.early_stop_mode != Args.EarlyStopMode.no_early_stop:
    raise NotImplementedError(f"{args.early_stop_mode=}")

# sketch of the callback to be used once early stopping is implemented:
#     # todo modify the early stopping to take more conditions (don't stop too early before it doesnt break the jaccard2=.32)
#     early_stop_cb = keras_callbacks.EarlyStopping(
#         monitor='val_loss',
#         min_delta=.1 / 100,
#         patience=50,
#         verbose=2,
#         mode='auto',
#         baseline=.71,  # 0th-order classifier
#         restore_best_weights=False,
#     )

INFO::tomo2seg::{<ipython-input-24-1b2efaf987e7>:<module>:001}::[2020-12-14::10:56:22.370]
Setting up early stop with args.early_stop_mode=<EarlyStopMode.no_early_stop: 0>



# Summary before training

stuff that i use after the training but i want it to appear in the 


mode

## Metadata

todo put this back to work

## Volume slices

todo do this in a notebook

## Generator samples

todo do this in a notebook


# Training


## Teeth log lr schedule

In [25]:
# [manual-input]
# learning rate schedule (kept as the notebook-level name `schedule`)
schedule = tomo2seg_schedule.get_schedule00()
# alternative:
# schedule = tomo2seg_schedule.LogSpaceSchedule(
#     offset_epoch=0, wait=0, start=-3, stop=-5, n_between_scales=100
# )

lr_schedule_cb = keras_callbacks.LearningRateScheduler(schedule=schedule, verbose=2)

logger.info(f"{lr_schedule_cb.schedule.range=}")

INFO::tomo2seg::{schedule.py:__post_init__:071}::[2020-12-14::10:56:25.519]
LogSpaceSchedule ==> self.n=10

INFO::tomo2seg::{schedule.py:__post_init__:071}::[2020-12-14::10:56:25.520]
LogSpaceSchedule ==> self.n=30

INFO::tomo2seg::{schedule.py:__post_init__:071}::[2020-12-14::10:56:25.521]
LogSpaceSchedule ==> self.n=20

INFO::tomo2seg::{schedule.py:__post_init__:071}::[2020-12-14::10:56:25.522]
LogSpaceSchedule ==> self.n=40

INFO::tomo2seg::{schedule.py:__post_init__:071}::[2020-12-14::10:56:25.523]
LogSpaceSchedule ==> self.n=20

INFO::tomo2seg::{schedule.py:__post_init__:071}::[2020-12-14::10:56:25.524]
LogSpaceSchedule ==> self.n=40

INFO::tomo2seg::{schedule.py:__post_init__:071}::[2020-12-14::10:56:25.525]
LogSpaceSchedule ==> self.n=100

INFO::tomo2seg::{schedule.py:__post_init__:107}::[2020-12-14::10:56:25.526]
ComposedSchedule ==> self.n=260

INFO::tomo2seg::{<ipython-input-25-4abf876edaa4>:<module>:012}::[2020-12-14::10:56:25.527]
lr_schedule_cb.schedule.range=(0, 260)



In [26]:
# Callbacks handed to `model.fit`.
callbacks = [
    keras_callbacks.TerminateOnNaN(),
    autosave_cb,
    history_cb,
    history_plot_cb,
    lr_schedule_cb,
]

# the early stop callback only exists when an early stop mode created it
if "early_stop_cb" in globals():
    callbacks.append(early_stop_cb)

for cb in callbacks:
    logger.debug(f"using callback {type(cb).__name__}")

DEBUG::tomo2seg::{<ipython-input-26-c16af9d5fdc3>:<module>:019}::[2020-12-14::10:56:28.565]
using callback TerminateOnNaN

DEBUG::tomo2seg::{<ipython-input-26-c16af9d5fdc3>:<module>:019}::[2020-12-14::10:56:28.566]
using callback ModelCheckpoint

DEBUG::tomo2seg::{<ipython-input-26-c16af9d5fdc3>:<module>:019}::[2020-12-14::10:56:28.567]
using callback History

DEBUG::tomo2seg::{<ipython-input-26-c16af9d5fdc3>:<module>:019}::[2020-12-14::10:56:28.567]
using callback HistoryPlot

DEBUG::tomo2seg::{<ipython-input-26-c16af9d5fdc3>:<module>:019}::[2020-12-14::10:56:28.568]
using callback LearningRateScheduler



In [None]:
# [manual-input]
# number of epochs requested from `model.fit`
n_epochs = 400


class TrainingFinished(Exception):
    """Raised by `fit()` once `model.fit` has returned."""


class FailedToFindBatchSize(Exception):
    """Raised when the batch size cannot be reduced any further."""


def fit():
    """Run `model.fit` on the train/validation crop sequences.

    Reads the notebook-level `model`, `crop_seq_train`, `crop_seq_val`,
    `n_epochs` and `callbacks`.

    Raises:
        TrainingFinished: always, once `model.fit` has returned; this is how
            the caller is told that the training ran to completion.
    """
    fit_kwargs = dict(
        # data sequences
        x=crop_seq_train,
        validation_data=crop_seq_val,
        # [manual-input]
        # epochs
        initial_epoch=0,
        epochs=n_epochs,
        # alternative, to resume after the last recorded epoch:
        #     initial_epoch=history_cb.last_epoch + 1,  # for some reason it is 0-starting and others 1-starting...
        #     epochs=history_cb.last_epoch + 1 + n_epochs,
        # others
        callbacks=callbacks,
        verbose=2,
        # todo change the volume sequence to dynamically load the volume
        # because it would allow me to pass just a path string therefore
        # making it serializable ==> i will be able to multithread (:
        use_multiprocessing=False,
    )
    model.fit(**fit_kwargs)
    raise TrainingFinished()


# Train, and on failure retry with a smaller batch size (unless the batch
# size mode asks to fail right away).
while True:

    try:
        fit()

    except TrainingFinished:
        slack.notify_finished()
        # leave the loop: without this the training would start over forever
        break

    except Exception as ex:
        # NOTE(review): every exception is treated as "the batch does not fit";
        # consider restricting the retry to out-of-memory errors

        logger.exception(ex)

        if args.batch_size_mode == Args.BatchSizeMode.try_max_and_fail:
            # bare `raise` keeps the original traceback
            raise

        # remove one crop per gpu; a machine without gpus (n_gpus == 0) still
        # has to make progress, otherwise the loop would never end
        batch_size_step = max(1, n_gpus)
        batch_size -= batch_size_step
        logger.warning(f"reduced {batch_size=}")

        if batch_size < batch_size_step:
            raise FailedToFindBatchSize(f"{batch_size=}") from ex

        crop_seq_train.batch_size = batch_size
        crop_seq_val.batch_size = batch_size


Epoch 00001: LearningRateScheduler reducing learning rate to 0.0001.
Epoch 1/400
ERROR::tomo2seg::{<ipython-input-27-1e0a9057069d>:<module>:048}::[2020-12-14::10:59:00.993]
2 root error(s) found.
  (0) Resource exhausted:  OOM when allocating tensor with shape[26,224,104,104] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node unet2halfd.II-enc-c208-f08.fold000.1607-939-493/concat-1/concat (defined at /threading.py:932) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[div_no_nan/ReadVariableOp_1/_468]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

  (1) Resource exhausted:  OOM when allocating tensor with shape[26,224,104,104] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node unet2h

# History

In [None]:
# Figure with the training history: 2 rows x 1 column, both with a log y-axis.
nrows, ncols, sz = 2, 1, 5
fig, axs = plt.subplots(nrows, ncols, figsize=(2.5 * sz, nrows * sz), dpi=100)
fig.set_tight_layout(True)

history_display = viz.TrainingHistoryDisplay(
    history_cb.history,
    model_name=tomo2seg_model.name,
    loss_name=model.loss.__name__,
    x_axis_mode=("epoch", "batch", "crop", "voxel", "time"),
)
hist_display = history_display.plot(axs, with_lr=True, metrics=("loss",))

for ax in (axs[0], axs[-1]):
    ax.set_yscale("log")

# annotate the minimum of the train and of the validation loss curves
metrics_ax = hist_display.axs_metrics_[0]
viz.mark_min_values(metrics_ax, hist_display.plots_["loss"][0])
viz.mark_min_values(metrics_ax, hist_display.plots_["val_loss"][0], txt_kwargs=dict(rotation=0))

# the figure is saved inside the model's directory, named after its title
figure_path = tomo2seg_model.model_path / (hist_display.title + ".png")
hist_display.fig_.savefig(figure_path, format='png')
# plt.close()
# plt.close()

In [None]:
# write the training history dataframe (index included) to the callback's csv path
history_cb.dataframe.to_csv(history_cb.csv_path, index=True)

In [None]:
# save the model at the model path (the same location used for the save made before the training)
model.save(tomo2seg_model.model_path)

In [None]:
# Export this notebook as html next to the saved model.
import os
import subprocess

# [manual-input] the name must match the file name of this notebook
this_nb_name = "train-06-akela90.ipynb"
this_dir = os.getcwd()
logger.warning(f"{this_nb_name=} {this_dir=}")

# an argument list (instead of one interpolated shell string) keeps paths with
# spaces or shell metacharacters intact
nbconvert_cmd = [
    "jupyter", "nbconvert", os.path.join(this_dir, this_nb_name),
    "--output-dir", str(tomo2seg_model.model_path),
    "--to", "html",
]
nbconvert_result = subprocess.run(nbconvert_cmd)

# the export is best effort: a failure is reported but does not raise
if nbconvert_result.returncode != 0:
    logger.error(f"nbconvert failed with return code {nbconvert_result.returncode}")