In [1]:
# !pip install --upgrade tensorflow --quiet
# !pip install keras_tuner --quiet
# !pip install tensorflow-io --quiet
# # Google colab modules
# from google.colab import drive
import sys, importlib

# # Mount drive
# drive.mount('/content/gdrive', force_remount=True)
# Notebook root; later cells build the model output paths relative to it.
ROOT_PATH = './'
# sys.path.append(ROOT_PATH)

# Project helper module. Later cells reach TensorFlow, os, json and re through
# it (core.tf, core.os, core.json, core.re).
import coremlv2 as core
core._init_ml()
# core._init_models()
# Expose only the CUDA device with index 1 to this process.
# NOTE(review): this is written after core._init_ml(); if that call already
# initialises the GPU runtime the setting has no effect - confirm the order.
core.os.environ['CUDA_VISIBLE_DEVICES'] = '1'

# Reload coreml
# NOTE(review): presumably here to pick up edits to coremlv2 without restarting
# the kernel; whether state set up by _init_ml() survives the reload is not
# visible from this notebook - confirm.
importlib.reload(core)
import keras_tuner as kt

#### Use multiple GPUs

In [2]:
# Split the physical GPUs into several logical (virtual) GPUs.
#
# Memory budget per logical GPU, passed to LogicalDeviceConfiguration(memory_limit=...).
# Values tried earlier (layout = logical GPUs on first + second physical GPU):
#   2816 (2 + 4), 2300 (3 + 5), 3500 (2 + 3), 1700 (4 + 6),
#   1850 (4 + 6, display moved to titan X)
memory_limit = 1750 # 4 + 6, display moved to titan X, increase gpu free memory
# Number of logical GPUs to create on each physical GPU, in device order.
vgpus_per_gpu = [4, 6]
gpus = core.tf.config.list_physical_devices('GPU')
if gpus:
    # Create virtual GPUs
    try:
        # zip() stops at the shorter sequence, so a machine (or a
        # CUDA_VISIBLE_DEVICES setting) exposing a single GPU no longer fails
        # with IndexError on gpus[1]; extra GPUs keep their default setup.
        for gpu, n_virtual in zip(gpus, vgpus_per_gpu):
            core.tf.config.set_logical_device_configuration(
                gpu,
                [core.tf.config.LogicalDeviceConfiguration(memory_limit=memory_limit)
                 for _ in range(n_virtual)])
        logical_gpus = core.tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs, ", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Virtual devices must be set before GPUs have been initialized
        print(e)
    

2 Physical GPUs,  10 Logical GPUs


In [2]:
# Split the first physical GPU into two logical GPUs with equal memory budgets.
# Budgets tried before: 1750 (6 vgpu), 3500 (3 vgpu), 3000 (2 vgpu, disabled).
memory_limit = 5250 # 2 vgpu
gpus = core.tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Build the per-device configuration list, then apply it to GPU 0.
        virtual_devices = [
            core.tf.config.LogicalDeviceConfiguration(memory_limit=memory_limit)
            for _ in range(2)
        ]
        core.tf.config.set_logical_device_configuration(gpus[0], virtual_devices)
        logical_gpus = core.tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs, ", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Raised when the GPUs were already initialised: virtual devices
        # have to be configured before that point.
        print(e)

1 Physical GPUs,  2 Logical GPUs


In [2]:
# Enable on-demand GPU memory growth on every visible physical GPU.
gpus = core.tf.config.list_physical_devices('GPU')
if len(gpus) > 0:
    try:
        # The setting has to be identical on all GPUs, so apply it to each one.
        for gpu in gpus:
            core.tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = core.tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs, ", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Raised when the GPUs are already initialised; memory growth must be
        # configured before that.
        print(e)

1 Physical GPUs,  1 Logical GPUs


### Manually retrain model

In [3]:
# Run configuration for retraining model 327; the training cell that follows
# reads these names.
model_base_id = '327'
version = '3'
model_no = model_base_id
constituent_limits = 0.025
id_constituent = 1
min_vid_constituents = 0.5
epochs = 5
# Run label: the training cell uses it as the output folder name under
# models/preloaded/.
iter_id = f'model-{model_no}-{version}_constituent_limits-{constituent_limits}_id_constituent-{id_constituent}_min_vid_constituents-{min_vid_constituents}_epochs-{epochs}'
ROOT_PATH = './'
# Raw string: '\#' and '\i' are not valid escape sequences, so the plain
# literal only worked by accident and triggers an invalid-escape warning.
DB_ROOT_PATH = r'J:\#PROJECT\idx'
db_ver = '8'
batch_size = 256
shuffle_buffer_size = 1
generator = True

In [4]:
# Arguments shared by the training and the validation loader.
# NOTE(review): `min_vid_constituents` is recorded in `iter_id` but is not
# forwarded to the loader here - confirm the loader default matches the label.
loader_kwargs = dict(
    ROOT_PATH=DB_ROOT_PATH,
    db_ver=db_ver,
    id_constituent=id_constituent,
    batch_size=batch_size,
    shuffle_buffer_size=shuffle_buffer_size,
    seed=0,
    generator=generator,
    model_no=model_no,
)

# Training split: slice_from_beginning=True, with the configured constituent_limits.
train_gen = core.load_dataset_wsd(slice_from_beginning=True, constituent_limits=constituent_limits, **loader_kwargs)
# Validation split: slice_from_beginning=False; constituent_limits is always 0
# here (focus on idx performance progression only).
validation_gen = core.load_dataset_wsd(slice_from_beginning=False, constituent_limits=0, **loader_kwargs)

# Load the model backbone and fit it.
model = core.model_switcher_preloaded(model_no, version=version)
history = model.fit(train_gen, validation_data=validation_gen, epochs=epochs, verbose=1)

# Persist weights and the full model under the run folder.
save_path = core.os.path.join(ROOT_PATH, f'models/preloaded/{iter_id}/')
model.save_weights(f'{save_path}/weights/checkpoint')
model.save(f'{save_path}/model')

# Record the data-pipeline settings next to the metrics before dumping them.
history.history.update(batch_size=batch_size, shuffle_buffer_size=shuffle_buffer_size)

with open(f'{save_path}/history.json', 'w') as history_file:
    core.json.dump(history.history, history_file)

Total constituents: 1149
Total constituents: 422
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




In [4]:
# Run configuration for retraining model 327; the training cell that follows
# reads these names.
model_base_id = '327'
version = '3'
model_no = model_base_id
constituent_limits = 0.025
id_constituent = 1
min_vid_constituents = 0.5
epochs = 10
# Run label: the training cell uses it as the output folder name under
# models/preloaded/.
iter_id = f'model-{model_no}-{version}_constituent_limits-{constituent_limits}_id_constituent-{id_constituent}_min_vid_constituents-{min_vid_constituents}_epochs-{epochs}'
ROOT_PATH = './'
# Raw string: '\#' and '\i' are not valid escape sequences, so the plain
# literal only worked by accident and triggers an invalid-escape warning.
DB_ROOT_PATH = r'J:\#PROJECT\idx'
db_ver = '8'
# NOTE: the run recorded below with this batch size aborted in epoch 1 with a
# cuDNN InternalError (ThenRnnForward).
batch_size = 4096
shuffle_buffer_size = 1
generator = True

In [5]:
# Arguments shared by the training and the validation loader.
# NOTE(review): `min_vid_constituents` is recorded in `iter_id` but is not
# forwarded to the loader here - confirm the loader default matches the label.
loader_kwargs = dict(
    ROOT_PATH=DB_ROOT_PATH,
    db_ver=db_ver,
    id_constituent=id_constituent,
    batch_size=batch_size,
    shuffle_buffer_size=shuffle_buffer_size,
    seed=0,
    generator=generator,
    model_no=model_no,
)

# Training split: slice_from_beginning=True, with the configured constituent_limits.
train_gen = core.load_dataset_wsd(slice_from_beginning=True, constituent_limits=constituent_limits, **loader_kwargs)
# Validation split: slice_from_beginning=False; constituent_limits is always 0
# here (focus on idx performance progression only).
validation_gen = core.load_dataset_wsd(slice_from_beginning=False, constituent_limits=0, **loader_kwargs)

# Load the model backbone and fit it.
model = core.model_switcher_preloaded(model_no, version=version)
history = model.fit(train_gen, validation_data=validation_gen, epochs=epochs, verbose=1)

# Persist weights and the full model under the run folder.
save_path = core.os.path.join(ROOT_PATH, f'models/preloaded/{iter_id}/')
model.save_weights(f'{save_path}/weights/checkpoint')
model.save(f'{save_path}/model')

# Record the data-pipeline settings next to the metrics before dumping them.
history.history.update(batch_size=batch_size, shuffle_buffer_size=shuffle_buffer_size)

with open(f'{save_path}/history.json', 'w') as history_file:
    core.json.dump(history.history, history_file)

Total constituents: 1149
Total constituents: 422
Epoch 1/10


InternalError:    Failed to call ThenRnnForward with model config: [rnn_mode, rnn_input_mode, rnn_direction_mode]: 2, 0, 0 , [num_layers, input_size, num_units, dir_count, max_seq_length, batch_size, cell_num_units]: [1, 256, 256, 1, 120, 4096, 256] 
	 [[{{node CudnnRNN}}]]
	 [[model/lstm_1/PartitionedCall]] [Op:__inference_train_function_431164]

Function call stack:
train_function -> train_function -> train_function


In [3]:
# Run configuration for retraining model 327; the training cell that follows
# reads these names.
model_base_id = '327'
version = '4'
model_no = model_base_id
constituent_limits = 0.025
id_constituent = 1
min_vid_constituents = 0.5
epochs = 10
batch_size = 1024
# Run label: the training cell uses it as the output folder name under
# models/preloaded/.
iter_id = f'model-{model_no}-{version}_constituentlimits-{constituent_limits}_idconstituent-{id_constituent}_minvidconstituents-{min_vid_constituents}_epochs-{epochs}_batchsize-{batch_size}'
ROOT_PATH = './'
# Raw string: '\#' and '\i' are not valid escape sequences, so the plain
# literal only worked by accident and triggers an invalid-escape warning.
DB_ROOT_PATH = r'J:\#PROJECT\idx'
db_ver = '8'

shuffle_buffer_size = 1
generator = True

In [4]:
# Arguments shared by the training and the validation loader.
# NOTE(review): `min_vid_constituents` is recorded in `iter_id` but is not
# forwarded to the loader here - confirm the loader default matches the label.
loader_kwargs = dict(
    ROOT_PATH=DB_ROOT_PATH,
    db_ver=db_ver,
    id_constituent=id_constituent,
    batch_size=batch_size,
    shuffle_buffer_size=shuffle_buffer_size,
    seed=0,
    generator=generator,
    model_no=model_no,
)

# Training split: slice_from_beginning=True, with the configured constituent_limits.
train_gen = core.load_dataset_wsd(slice_from_beginning=True, constituent_limits=constituent_limits, **loader_kwargs)
# Validation split: slice_from_beginning=False; constituent_limits is always 0
# here (focus on idx performance progression only).
validation_gen = core.load_dataset_wsd(slice_from_beginning=False, constituent_limits=0, **loader_kwargs)

# Load the model backbone and fit it.
model = core.model_switcher_preloaded(model_no, version=version)
history = model.fit(train_gen, validation_data=validation_gen, epochs=epochs, verbose=1)

# Persist weights, the full model and the metric history under the run folder.
save_path = core.os.path.join(ROOT_PATH, f'models/preloaded/{iter_id}/')
model.save_weights(f'{save_path}/weights/checkpoint')
model.save(f'{save_path}/model')

with open(f'{save_path}/history.json', 'w') as history_file:
    core.json.dump(history.history, history_file)

Total constituents: 1149
Total constituents: 422
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




In [3]:
# Run configuration for retraining model 327; the training cell that follows
# reads these names.
model_base_id = '327'
version = '3'
model_no = model_base_id
constituent_limits = 0
id_constituent = 1
min_vid_constituents = 0.5
epochs = 10
batch_size = 1024
# Run label: the training cell uses it as the output folder name under
# models/preloaded/.
iter_id = f'model-{model_no}-{version}_constituentlimits-{constituent_limits}_idconstituent-{id_constituent}_minvidconstituents-{min_vid_constituents}_epochs-{epochs}_batchsize-{batch_size}'
ROOT_PATH = './'
# Raw string: '\#' and '\i' are not valid escape sequences, so the plain
# literal only worked by accident and triggers an invalid-escape warning.
DB_ROOT_PATH = r'J:\#PROJECT\idx'
db_ver = '8'

shuffle_buffer_size = 1
generator = True

In [4]:
# Arguments shared by the training and the validation loader.
# NOTE(review): `min_vid_constituents` is recorded in `iter_id` but is not
# forwarded to the loader here - confirm the loader default matches the label.
loader_kwargs = dict(
    ROOT_PATH=DB_ROOT_PATH,
    db_ver=db_ver,
    id_constituent=id_constituent,
    batch_size=batch_size,
    shuffle_buffer_size=shuffle_buffer_size,
    seed=0,
    generator=generator,
    model_no=model_no,
)

# Training split: slice_from_beginning=True, with the configured constituent_limits.
train_gen = core.load_dataset_wsd(slice_from_beginning=True, constituent_limits=constituent_limits, **loader_kwargs)
# Validation split: slice_from_beginning=False; constituent_limits is always 0
# here (focus on idx performance progression only).
validation_gen = core.load_dataset_wsd(slice_from_beginning=False, constituent_limits=0, **loader_kwargs)

# Load the model backbone and fit it.
model = core.model_switcher_preloaded(model_no, version=version)
history = model.fit(train_gen, validation_data=validation_gen, epochs=epochs, verbose=1)

# Persist weights, the full model and the metric history under the run folder.
save_path = core.os.path.join(ROOT_PATH, f'models/preloaded/{iter_id}/')
model.save_weights(f'{save_path}/weights/checkpoint')
model.save(f'{save_path}/model')

with open(f'{save_path}/history.json', 'w') as history_file:
    core.json.dump(history.history, history_file)

Total constituents: 422
Total constituents: 422
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




In [3]:
# Run configuration for retraining model 327; the training cell that follows
# reads these names.
model_base_id = '327'
version = '3'
model_no = model_base_id
constituent_limits = 0.05
id_constituent = 1
min_vid_constituents = 0.3
epochs = 10
batch_size = 512
# Run label: the training cell uses it as the output folder name under
# models/preloaded/.
iter_id = f'model-{model_no}-{version}_constituentlimits-{constituent_limits}_idconstituent-{id_constituent}_minvidconstituents-{min_vid_constituents}_epochs-{epochs}_batchsize-{batch_size}'
ROOT_PATH = './'
# Raw string: '\#' and '\i' are not valid escape sequences, so the plain
# literal only worked by accident and triggers an invalid-escape warning.
DB_ROOT_PATH = r'J:\#PROJECT\idx'
db_ver = '8'

shuffle_buffer_size = 1
generator = True

In [4]:
# Arguments shared by the training and the validation loader.
loader_kwargs = dict(
    ROOT_PATH=DB_ROOT_PATH,
    db_ver=db_ver,
    id_constituent=id_constituent,
    batch_size=batch_size,
    shuffle_buffer_size=shuffle_buffer_size,
    seed=0,
    generator=generator,
    model_no=model_no,
    min_vid_constituents=min_vid_constituents,
)

# Training split: slice_from_beginning=True, with the configured constituent_limits.
train_gen = core.load_dataset_wsd(slice_from_beginning=True, constituent_limits=constituent_limits, **loader_kwargs)
# Validation split: slice_from_beginning=False; constituent_limits is always 0
# here (focus on idx performance progression only).
validation_gen = core.load_dataset_wsd(slice_from_beginning=False, constituent_limits=0, **loader_kwargs)

# Load the model backbone and fit it.
model = core.model_switcher_preloaded(model_no, version=version)
history = model.fit(train_gen, validation_data=validation_gen, epochs=epochs, verbose=1)

# Persist weights, the full model and the metric history under the run folder.
save_path = core.os.path.join(ROOT_PATH, f'models/preloaded/{iter_id}/')
model.save_weights(f'{save_path}/weights/checkpoint')
model.save(f'{save_path}/model')

with open(f'{save_path}/history.json', 'w') as history_file:
    core.json.dump(history.history, history_file)

Total constituents: 2039
Total constituents: 422
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




In [3]:
# Run configuration for retraining model 327; the training cell that follows
# reads these names.
model_base_id = '327'
version = '3'
model_no = model_base_id
constituent_limits = 0.025
id_constituent = 1
min_vid_constituents = 0.6
epochs = 10
batch_size = 512
# Run label: the training cell uses it as the output folder name under
# models/preloaded/.
iter_id = f'model-{model_no}-{version}_constituentlimits-{constituent_limits}_idconstituent-{id_constituent}_minvidconstituents-{min_vid_constituents}_epochs-{epochs}_batchsize-{batch_size}_2'
ROOT_PATH = './'
# Raw string: '\#' and '\i' are not valid escape sequences, so the plain
# literal only worked by accident and triggers an invalid-escape warning.
DB_ROOT_PATH = r'J:\#PROJECT\idx'
db_ver = '8'

shuffle_buffer_size = 1
generator = True

In [4]:
# Arguments shared by the training and the validation loader.
loader_kwargs = dict(
    ROOT_PATH=DB_ROOT_PATH,
    db_ver=db_ver,
    id_constituent=id_constituent,
    batch_size=batch_size,
    shuffle_buffer_size=shuffle_buffer_size,
    seed=0,
    generator=generator,
    model_no=model_no,
    min_vid_constituents=min_vid_constituents,
)

# Training split: slice_from_beginning=True, with the configured constituent_limits.
train_gen = core.load_dataset_wsd(slice_from_beginning=True, constituent_limits=constituent_limits, **loader_kwargs)
# Validation split: slice_from_beginning=False; constituent_limits is always 0
# here (focus on idx performance progression only).
validation_gen = core.load_dataset_wsd(slice_from_beginning=False, constituent_limits=0, **loader_kwargs)

# Load the model backbone and fit it.
model = core.model_switcher_preloaded(model_no, version=version)
history = model.fit(train_gen, validation_data=validation_gen, epochs=epochs, verbose=1)

# Persist weights, the full model and the metric history under the run folder.
save_path = core.os.path.join(ROOT_PATH, f'models/preloaded/{iter_id}/')
model.save_weights(f'{save_path}/weights/checkpoint')
model.save(f'{save_path}/model')

with open(f'{save_path}/history.json', 'w') as history_file:
    core.json.dump(history.history, history_file)

Total constituents: 1416
Total constituents: 422
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




In [3]:
# Run configuration for retraining model 327; the training cell that follows
# reads these names.
model_base_id = '327'
version = '5'
model_no = model_base_id
constituent_limits = 0.2
id_constituent = 1
min_vid_constituents = 0.2
epochs = 1
batch_size = 512
# Run label: the training cell uses it as the output folder name under
# models/preloaded/.
iter_id = f'model-{model_no}-{version}_constituentlimits-{constituent_limits}_idconstituent-{id_constituent}_minvidconstituents-{min_vid_constituents}_epochs-{epochs}_batchsize-{batch_size}_2'
ROOT_PATH = './'
# Raw string: '\#' and '\i' are not valid escape sequences, so the plain
# literal only worked by accident and triggers an invalid-escape warning.
DB_ROOT_PATH = r'J:\#PROJECT\idx'
db_ver = '8'

shuffle_buffer_size = 1
generator = True

In [None]:
# NOTE(review): `min_vid_constituents` is recorded in `iter_id` but is not
# forwarded to the loader here - confirm the loader default matches the label.
# Train: `slice_from_beginning`=True
train_gen = core.load_dataset_wsd(slice_from_beginning=True, ROOT_PATH=DB_ROOT_PATH, db_ver=db_ver, constituent_limits=constituent_limits, id_constituent=id_constituent, batch_size=batch_size, shuffle_buffer_size=shuffle_buffer_size, seed=0, generator=generator, model_no=model_no)
# Validation: `slice_from_beginning`=False. constituent_limits in validation is always 0 (focus on idx performance progression only)
validation_gen = core.load_dataset_wsd(slice_from_beginning=False, ROOT_PATH=DB_ROOT_PATH, db_ver=db_ver, constituent_limits=0, id_constituent=id_constituent, batch_size=batch_size, shuffle_buffer_size=shuffle_buffer_size, seed=0, generator=generator, model_no=model_no)

# Load model backbone
model = core.model_switcher_preloaded(model_no, version=version)

# Metric name -> values of every fit call, in training order. Each fit() call
# returns a fresh History, so without merging only the last chunk would reach
# history.json.
history_log = {}
if isinstance(train_gen, list):
    # The loader returned several dataset chunks: one pass over all chunks is
    # one epoch, and each chunk is fitted as a sub-epoch.
    for epoch in range(epochs):
        for i, tg in enumerate(train_gen):
            print(f'Epoch {epoch+1}-{i+1}/{epochs}')
            history = model.fit(tg, validation_data=validation_gen, epochs=1, verbose=1)
            for metric, values in history.history.items():
                history_log.setdefault(metric, []).extend(values)
else:
    history = model.fit(train_gen, validation_data=validation_gen, epochs=epochs, verbose=1)
    history_log = history.history

save_path = core.os.path.join(ROOT_PATH, f'models/preloaded/{iter_id}/')
model.save_weights(f'{save_path}/weights/checkpoint')
model.save(f'{save_path}/model')

with open(f'{save_path}/history.json', 'w') as f:
    core.json.dump(history_log, f)

Total constituents: 7216
Total constituents: 422
Epoch 1-1/1
Epoch 1-2/1
Epoch 1-3/1
Epoch 1-4/1
Epoch 1-5/1
Epoch 1-6/1
   1096/Unknown - 444s 404ms/step - loss: 0.6629 - accuracy: 0.5985

In [3]:
# Run configuration for retraining model 327; the training cell that follows
# reads these names.
model_base_id = '327'
version = '5'
model_no = model_base_id
constituent_limits = 0.2
id_constituent = 1
min_vid_constituents = 0.2
epochs = 5
batch_size = 1024
# Run label: the training cell uses it as the output folder name under
# models/preloaded/.
iter_id = f'model-{model_no}-{version}_constituentlimits-{constituent_limits}_idconstituent-{id_constituent}_minvidconstituents-{min_vid_constituents}_epochs-{epochs}_batchsize-{batch_size}_2'
ROOT_PATH = './'
# Raw string: '\#' and '\i' are not valid escape sequences, so the plain
# literal only worked by accident and triggers an invalid-escape warning.
DB_ROOT_PATH = r'J:\#PROJECT\idx'
db_ver = '8'

shuffle_buffer_size = 1
generator = True

In [4]:
# NOTE(review): `min_vid_constituents` is recorded in `iter_id` but is not
# forwarded to the loader here - confirm the loader default matches the label.
# Train: `slice_from_beginning`=True
train_gen = core.load_dataset_wsd(slice_from_beginning=True, ROOT_PATH=DB_ROOT_PATH, db_ver=db_ver, constituent_limits=constituent_limits, id_constituent=id_constituent, batch_size=batch_size, shuffle_buffer_size=shuffle_buffer_size, seed=0, generator=generator, model_no=model_no)
# Validation: `slice_from_beginning`=False. constituent_limits in validation is always 0 (focus on idx performance progression only)
validation_gen = core.load_dataset_wsd(slice_from_beginning=False, ROOT_PATH=DB_ROOT_PATH, db_ver=db_ver, constituent_limits=0, id_constituent=id_constituent, batch_size=batch_size, shuffle_buffer_size=shuffle_buffer_size, seed=0, generator=generator, model_no=model_no)

# Load model backbone
model = core.model_switcher_preloaded(model_no, version=version)

# Metric name -> values of every fit call, in training order. Each fit() call
# returns a fresh History, so without merging only the last chunk would reach
# history.json.
history_log = {}
if isinstance(train_gen, list):
    # The loader returned several dataset chunks: one pass over all chunks is
    # one epoch, and each chunk is fitted as a sub-epoch.
    for epoch in range(epochs):
        for i, tg in enumerate(train_gen):
            print(f'Epoch {epoch+1}-{i+1}/{epochs}')
            history = model.fit(tg, validation_data=validation_gen, epochs=1, verbose=1)
            for metric, values in history.history.items():
                history_log.setdefault(metric, []).extend(values)
else:
    history = model.fit(train_gen, validation_data=validation_gen, epochs=epochs, verbose=1)
    history_log = history.history

save_path = core.os.path.join(ROOT_PATH, f'models/preloaded/{iter_id}/')
model.save_weights(f'{save_path}/weights/checkpoint')
model.save(f'{save_path}/model')

with open(f'{save_path}/history.json', 'w') as f:
    core.json.dump(history_log, f)

Total constituents: 7216
Total constituents: 422
Epoch 1-1/5
Epoch 1-2/5
Epoch 1-3/5
Epoch 1-4/5
Epoch 1-5/5
Epoch 1-6/5
Epoch 1-7/5
Epoch 1-8/5
Epoch 2-1/5
Epoch 2-2/5
Epoch 2-3/5
Epoch 2-4/5
Epoch 2-5/5
Epoch 2-6/5
Epoch 2-7/5
Epoch 2-8/5
Epoch 3-1/5
Epoch 3-2/5
Epoch 3-3/5
Epoch 3-4/5
Epoch 3-5/5
Epoch 3-6/5
Epoch 3-7/5
Epoch 3-8/5
Epoch 4-1/5
Epoch 4-2/5
Epoch 4-3/5
Epoch 4-4/5
Epoch 4-5/5
Epoch 4-6/5
Epoch 4-7/5
Epoch 4-8/5
Epoch 5-1/5
Epoch 5-2/5
Epoch 5-3/5
Epoch 5-4/5
Epoch 5-5/5
Epoch 5-6/5
Epoch 5-7/5
Epoch 5-8/5




In [3]:
# Run configuration for retraining model 327; the training cell that follows
# reads these names.
model_base_id = '327'
version = '4'
model_no = model_base_id
constituent_limits = 0.5
id_constituent = 1
min_vid_constituents = 0.1
epochs = 10
batch_size = 256
# Run label: the training cell uses it as the output folder name under
# models/preloaded/.
iter_id = f'model-{model_no}-{version}_constituentlimits-{constituent_limits}_idconstituent-{id_constituent}_minvidconstituents-{min_vid_constituents}_epochs-{epochs}_batchsize-{batch_size}_2'
ROOT_PATH = './'
# Raw string: '\#' and '\i' are not valid escape sequences, so the plain
# literal only worked by accident and triggers an invalid-escape warning.
DB_ROOT_PATH = r'J:\#PROJECT\idx'
db_ver = '8'

shuffle_buffer_size = 1
generator = True

In [None]:
# Train: `slice_from_beginning`=True
train_gen = core.load_dataset_wsd(slice_from_beginning=True, ROOT_PATH=DB_ROOT_PATH, db_ver=db_ver, constituent_limits=constituent_limits, id_constituent=id_constituent, batch_size=batch_size, shuffle_buffer_size=shuffle_buffer_size, seed=0, generator=generator, model_no=model_no, min_vid_constituents=min_vid_constituents)
# Validation: `slice_from_beginning`=False. constituent_limits in validation is always 0 (focus on idx performance progression only)
validation_gen = core.load_dataset_wsd(slice_from_beginning=False, ROOT_PATH=DB_ROOT_PATH, db_ver=db_ver, constituent_limits=0, id_constituent=id_constituent, batch_size=batch_size, shuffle_buffer_size=shuffle_buffer_size, seed=0, generator=generator, model_no=model_no, min_vid_constituents=min_vid_constituents)

# Load model backbone
model = core.model_switcher_preloaded(model_no, version=version)

# Metric name -> values of every fit call, in training order. Each fit() call
# returns a fresh History, so without merging only the last chunk would reach
# history.json.
history_log = {}
if isinstance(train_gen, list):
    # The loader returned several dataset chunks: one pass over all chunks is
    # one epoch, and each chunk is fitted as a sub-epoch.
    for epoch in range(epochs):
        for i, tg in enumerate(train_gen):
            print(f'Epoch {epoch+1}-{i+1}/{epochs}')
            history = model.fit(tg, validation_data=validation_gen, epochs=1, verbose=1)
            for metric, values in history.history.items():
                history_log.setdefault(metric, []).extend(values)
else:
    history = model.fit(train_gen, validation_data=validation_gen, epochs=epochs, verbose=1)
    history_log = history.history

save_path = core.os.path.join(ROOT_PATH, f'models/preloaded/{iter_id}/')
model.save_weights(f'{save_path}/weights/checkpoint')
model.save(f'{save_path}/model')

with open(f'{save_path}/history.json', 'w') as f:
    core.json.dump(history_log, f)

Total constituents: 16419
Total constituents: 422
Epoch 1-1/10
Epoch 1-2/10
Epoch 1-3/10
Epoch 1-4/10
Epoch 1-5/10
   1899/Unknown - 566s 298ms/step - loss: 0.6314 - accuracy: 0.6360

In [3]:
# Run configuration for retraining model 327; the training cell that follows
# reads these names.
model_base_id = '327'
version = '3'
model_no = model_base_id
constituent_limits = 0.15
id_constituent = 1
min_vid_constituents = 0.2
epochs = 10
batch_size = 2048
# Run label: the training cell uses it as the output folder name under
# models/preloaded/.
iter_id = f'model-{model_no}-{version}_constituentlimits-{constituent_limits}_idconstituent-{id_constituent}_minvidconstituents-{min_vid_constituents}_epochs-{epochs}_batchsize-{batch_size}_2'
ROOT_PATH = './'
# Raw string: '\#' and '\i' are not valid escape sequences, so the plain
# literal only worked by accident and triggers an invalid-escape warning.
DB_ROOT_PATH = r'J:\#PROJECT\idx'
db_ver = '8'

shuffle_buffer_size = 1
generator = True

In [4]:
# Train: `slice_from_beginning`=True
train_gen = core.load_dataset_wsd(slice_from_beginning=True, ROOT_PATH=DB_ROOT_PATH, db_ver=db_ver, constituent_limits=constituent_limits, id_constituent=id_constituent, batch_size=batch_size, shuffle_buffer_size=shuffle_buffer_size, seed=0, generator=generator, model_no=model_no, min_vid_constituents=min_vid_constituents)
# Validation: `slice_from_beginning`=False. constituent_limits in validation is always 0 (focus on idx performance progression only)
validation_gen = core.load_dataset_wsd(slice_from_beginning=False, ROOT_PATH=DB_ROOT_PATH, db_ver=db_ver, constituent_limits=0, id_constituent=id_constituent, batch_size=batch_size, shuffle_buffer_size=shuffle_buffer_size, seed=0, generator=generator, model_no=model_no, min_vid_constituents=min_vid_constituents)

# Load model backbone
model = core.model_switcher_preloaded(model_no, version=version)

# Metric name -> values of every fit call, in training order. Each fit() call
# returns a fresh History, so without merging only the last chunk would reach
# history.json.
history_log = {}
if isinstance(train_gen, list):
    # The loader returned several dataset chunks: one pass over all chunks is
    # one epoch, and each chunk is fitted as a sub-epoch.
    for epoch in range(epochs):
        for i, tg in enumerate(train_gen):
            print(f'Epoch {epoch+1}-{i+1}/{epochs}')
            history = model.fit(tg, validation_data=validation_gen, epochs=1, verbose=1)
            for metric, values in history.history.items():
                history_log.setdefault(metric, []).extend(values)
else:
    history = model.fit(train_gen, validation_data=validation_gen, epochs=epochs, verbose=1)
    history_log = history.history

save_path = core.os.path.join(ROOT_PATH, f'models/preloaded/{iter_id}/')
model.save_weights(f'{save_path}/weights/checkpoint')
model.save(f'{save_path}/model')

with open(f'{save_path}/history.json', 'w') as f:
    core.json.dump(history_log, f)

Total constituents: 5428
Total constituents: 422
Epoch 1-1/10
Epoch 1-2/10
Epoch 1-3/10
Epoch 1-4/10
Epoch 1-5/10
Epoch 1-6/10
Epoch 2-1/10
Epoch 2-2/10
Epoch 2-3/10
Epoch 2-4/10
Epoch 2-5/10
Epoch 2-6/10
Epoch 3-1/10
Epoch 3-2/10
Epoch 3-3/10
Epoch 3-4/10
Epoch 3-5/10
Epoch 3-6/10
Epoch 4-1/10
Epoch 4-2/10
Epoch 4-3/10
Epoch 4-4/10
Epoch 4-5/10
Epoch 4-6/10
Epoch 5-1/10
Epoch 5-2/10
Epoch 5-3/10
Epoch 5-4/10
Epoch 5-5/10
Epoch 5-6/10
Epoch 6-1/10
Epoch 6-2/10
Epoch 6-3/10
Epoch 6-4/10
Epoch 6-5/10
Epoch 6-6/10
Epoch 7-1/10
Epoch 7-2/10
Epoch 7-3/10
Epoch 7-4/10
Epoch 7-5/10
Epoch 7-6/10
Epoch 8-1/10
Epoch 8-2/10
Epoch 8-3/10
Epoch 8-4/10
Epoch 8-5/10
Epoch 8-6/10
Epoch 9-1/10
Epoch 9-2/10
Epoch 9-3/10
Epoch 9-4/10
Epoch 9-5/10
Epoch 9-6/10
Epoch 10-1/10
Epoch 10-2/10
Epoch 10-3/10
Epoch 10-4/10
Epoch 10-5/10
Epoch 10-6/10




In [3]:
# Run configuration for the resumable retraining of model 327; the cells that
# follow read these names.
model_base_id = '327'
version = '3'
model_no = model_base_id
constituent_limits = 0.5
id_constituent = 1
min_vid_constituents = 0.4
epochs = 5
batch_size = 1024
# Loader variant registry: key -> [short id used in the run label, description].
load_dataset_wsd_ver_switcher = {'1':['1','world data load from test slice'],
                                 '2':['2','world data load from train slice'],}
load_dataset_wsd_ver = '2'
shuffle_buffer_size = 1
generator = True

ROOT_PATH = './'
# Raw string: '\#' and '\i' are not valid escape sequences, so the plain
# literal only worked by accident and triggers an invalid-escape warning.
DB_ROOT_PATH = r'E:\#PROJECT\idx'
db_ver = '8'

In [None]:
def epochs_subepochs_progress(folder_list, search_condition):
    '''Collect the (epoch, sub-epoch) checkpoints already saved for one run.

    A checkpoint folder is named ``<iter_id>_<epoch>-<sub_epoch>``, i.e. the
    run label followed by the epochs-subepochs progress as its last part.

    Args:
        folder_list: iterable of folder names (not full paths) to inspect.
        search_condition: the run label (``iter_id``). It is matched
            literally, so characters such as ``.`` in ``cl-0.5`` are not
            interpreted as regex wildcards.

    Returns:
        dict mapping each epoch (int) to the list of its sub-epochs (int), in
        the order they were encountered and without duplicates. Folders that
        do not follow the naming scheme are ignored.
    '''
    import re  # local import keeps the helper self-contained

    # Anchor on the full name so only '<iter_id>_<int>-<int>' is accepted;
    # anything else (other runs, stray files, malformed suffixes) is skipped
    # instead of raising while unpacking or converting the suffix.
    pattern = re.compile(re.escape(search_condition) + r'_(\d+)-(\d+)')
    search_results = {}
    for folder_name in folder_list:
        match = pattern.fullmatch(folder_name)
        if match is None:
            continue
        epoch, sub_epoch = int(match.group(1)), int(match.group(2))
        finished = search_results.setdefault(epoch, [])
        if sub_epoch not in finished:
            finished.append(sub_epoch)
    return search_results

# Resume from file, save every sub-epochs
# Arguments shared by the training and the validation loader.
loader_kwargs = dict(
    ROOT_PATH=DB_ROOT_PATH,
    db_ver=db_ver,
    id_constituent=id_constituent,
    batch_size=batch_size,
    shuffle_buffer_size=shuffle_buffer_size,
    seed=0,
    generator=generator,
    model_no=model_no,
    min_vid_constituents=min_vid_constituents,
)
# Training split: slice_from_beginning=True, with the configured constituent_limits.
train_gen = core.load_dataset_wsd(slice_from_beginning=True, constituent_limits=constituent_limits, **loader_kwargs)
# Validation split: slice_from_beginning=False; constituent_limits is always 0
# here (focus on idx performance progression only).
validation_gen = core.load_dataset_wsd(slice_from_beginning=False, constituent_limits=0, **loader_kwargs)

In [None]:
# Resumable training: every finished sub-epoch is saved to its own folder
# '<iter_id>_<epoch>-<sub_epoch>', and sub-epochs that already have a folder
# are loaded from disk instead of being retrained.

# Define suffix-file-name (as complete as possible, contains all required params to reproduce)
iter_id = f'{model_no}-{version}_cl-{constituent_limits}_idc-{id_constituent}_mid-{min_vid_constituents}_ep-{epochs}_bz-{batch_size}_ldtv-{load_dataset_wsd_ver_switcher[load_dataset_wsd_ver][0]}_sbuffs-{shuffle_buffer_size}_gen-{1 if generator else 0}'

# Search for eligibility in selected folder
preloaded_folder = core.os.path.join(ROOT_PATH, 'models/preloaded')
save_path = core.os.path.join(ROOT_PATH, f'models/preloaded/{iter_id}/')
# On a first run the folder does not exist yet and listdir() would raise
# FileNotFoundError, so create it up front.
core.os.makedirs(preloaded_folder, exist_ok=True)
folder_list = core.os.listdir(preloaded_folder)

# split, select last entry, split again into epochs-subepochs -> dictionary entry
search_results = epochs_subepochs_progress(folder_list, iter_id)

# Start from the default backbone; it is replaced below by the latest saved
# checkpoint when earlier progress exists.
model = core.model_switcher_preloaded(model_no, version=version)

if isinstance(train_gen, list):
    for epoch in range(epochs):
        for i, tg in enumerate(train_gen):
            complete_iter_id = f'{iter_id}_{epoch}-{i}'
            save_path = core.os.path.join(ROOT_PATH, f'models/preloaded/{complete_iter_id}')
            # Already trained and saved in a previous session: load that
            # checkpoint and move on, so training continues from the latest
            # saved state once an unfinished sub-epoch is reached.
            if i in search_results.get(epoch, ()):
                print(f'Epoch {epoch}-{i}/{epochs-1}: load existing model and continue...')
                model = core.tf.keras.models.load_model(f'{save_path}/model')
                continue
            print(f'Epoch {epoch}-{i}/{epochs-1}: processing...')

            # Train this sub-epoch on top of the current model.
            history = model.fit(tg, validation_data=validation_gen, epochs=1, verbose=1)

            # Save only after an actual training step.
            model.save_weights(f'{save_path}/weights/checkpoint')
            model.save(f'{save_path}/model')

            with open(f'{save_path}/history.json', 'w') as f:
                core.json.dump(history.history, f)
else:
    history = model.fit(train_gen, validation_data=validation_gen, epochs=epochs, verbose=1)
    # Nothing created the run folder on this path and the trained model was
    # never written, so the history dump failed and the training result was
    # lost. Save it the same way the chunked branch does.
    core.os.makedirs(save_path, exist_ok=True)
    model.save_weights(f'{save_path}/weights/checkpoint')
    model.save(f'{save_path}/model')
    with open(f'{save_path}/history.json', 'w') as f:
        core.json.dump(history.history, f)

# Finish.

In [3]:
# Run configuration for the resumable retraining of model 327; the cells that
# follow read these names.
model_base_id = '327'
version = '9'
model_no = model_base_id
constituent_limits = 0.35
id_constituent = 1
min_vid_constituents = 0.8
epochs = 40
batch_size = 2048
# Loader variant registry: key -> [short id used in the run label, description].
load_dataset_wsd_ver_switcher = {'1':['1','world data load from test slice'],
                                 '2':['2','world data load from train slice'],
                                 '3':['3','validation data from test set'],}
load_dataset_wsd_ver = '3'
shuffle_buffer_size = 1
generator = True
hotstart = True
# Checkpoint folder to warm-start from. The assignments below are a log of the
# candidates tried; only the last uncommented one takes effect.
# NOTE(review): the trailing number pairs are the author's recorded scores,
# presumably loss / accuracy - confirm.
# hotstart_from = '327-7_cl-0.005_idc-1_mid-0.3_ep-40_bz-1792_ldtv-3_sbuffs-1_gen-1_3-0'
hotstart_from = '327-9_cl-0.001_idc-0.25_mid-0.9_ep-40_bz-512_ldtv-3_sbuffs-1_gen-1_15-0' # 0.6690 / 0.5891
hotstart_from = '327-9_cl-0.005_idc-1_mid-0.9_ep-40_bz-2048_ldtv-3_sbuffs-1_gen-1_28-0' # 0.6459 / 0.6173
hotstart_from = '327-9_cl-0.1_idc-1_mid-0.9_ep-40_bz-2048_ldtv-3_sbuffs-1_gen-1_7-5' # 0.6371 / 0.6281
hotstart_from = '327-9_cl-0.15_idc-1_mid-0.9_ep-40_bz-2048_ldtv-3_sbuffs-1_gen-1_7-1' # 0.6298 / 0.6349
hotstart_from = '327-9_cl-0.2_idc-1_mid-0.9_ep-40_bz-2048_ldtv-3_sbuffs-1_gen-1_6-3' # 0.6232 / 0.6408
hotstart_from = '327-9_cl-0.25_idc-1_mid-0.8_ep-40_bz-2048_ldtv-3_sbuffs-1_gen-1_3-5' # 0.6212 / 0.6433
hotstart_from = '327-9_cl-0.3_idc-1_mid-0.8_ep-40_bz-2048_ldtv-3_sbuffs-1_gen-1_3-4' # 0.6185 / 0.6465

# hotstart_from = '327-10_cl-0.001_idc-0.25_mid-0.8_ep-40_bz-512_ldtv-3_sbuffs-1_gen-1_10-0' # 0.6703 / 0.5885
# hotstart_from = '327-10_cl-0.005_idc-1_mid-0.8_ep-40_bz-512_ldtv-3_sbuffs-1_gen-1_7-0' # 0.6574 / 0.6065
# hotstart_from = '327-10_cl-0.1_idc-1_mid-0.5_ep-40_bz-512_ldtv-3_sbuffs-1_gen-1_1-4' # 0.6535 / 0.6123
distributed = True

ROOT_PATH = './'
# Raw string: '\#' and '\i' are not valid escape sequences, so the plain
# literal only worked by accident and triggers an invalid-escape warning.
DB_ROOT_PATH = r'E:\#PROJECT\idx'
db_ver = '8'

In [4]:
def epochs_subepochs_progress(folder_list, search_condition):
    '''Collect the epoch/sub-epoch checkpoints already present for a run.

    Checkpoint folders are expected to be named ``<iter_id>_<epoch>-<sub_epoch>``,
    where ``iter_id`` has the form

    f'{model_no}-{version}_cl-{constituent_limits}_idc-{id_constituent}_mid-{min_vid_constituents}_ep-{epochs}_bz-{batch_size}_ldtv-{...}_sbuffs-{shuffle_buffer_size}_gen-{1 if generator else 0}'

    and the last ``_``-separated part is the epochs-subepochs progress.

    Args:
        folder_list: iterable of folder names (plain strings).
        search_condition: regular expression applied with ``re.search`` to
            each folder name; only matching names are considered.

    Returns:
        dict mapping each epoch (int) to the list of its sub-epochs (int), in
        the order they were first encountered and without duplicates.

    Matching names whose trailing part is not exactly ``<int>-<int>`` (for
    example a folder named after the bare ``iter_id``) are skipped instead of
    raising ``ValueError``.
    '''
    import re  # local import keeps the helper self-contained

    search_results = {}
    for folder_name in folder_list:
        if not re.search(search_condition, folder_name):
            continue

        progress = folder_name.split('_')[-1]
        try:
            # Both a wrong number of '-' separated parts and a non-integer
            # part raise ValueError here.
            epoch_text, sub_epoch_text = progress.split('-')
            epoch, sub_epoch = int(epoch_text), int(sub_epoch_text)
        except ValueError:
            continue

        recorded_sub_epochs = search_results.setdefault(epoch, [])
        if sub_epoch not in recorded_sub_epochs:
            recorded_sub_epochs.append(sub_epoch)
    return search_results

# Datasets for the resumable training run (a model is saved after every sub-epoch).

# Training set: `slice_from_beginning`=True, limited by `constituent_limits`.
train_gen = core.load_dataset_wsd(
    model_no=model_no,
    ROOT_PATH=DB_ROOT_PATH,
    db_ver=db_ver,
    slice_from_beginning=True,
    constituent_limits=constituent_limits,
    id_constituent=id_constituent,
    min_vid_constituents=min_vid_constituents,
    batch_size=batch_size,
    shuffle_buffer_size=shuffle_buffer_size,
    seed=0,
    generator=generator,
)

# Validation set: `slice_from_beginning`=False. constituent_limits is always 0
# here (focus on idx performance progression only).
validation_gen = core.load_dataset_wsd(
    model_no=model_no,
    ROOT_PATH=DB_ROOT_PATH,
    db_ver=db_ver,
    slice_from_beginning=False,
    constituent_limits=0,
    id_constituent=id_constituent,
    min_vid_constituents=min_vid_constituents,
    batch_size=batch_size,
    shuffle_buffer_size=shuffle_buffer_size,
    seed=0,
    generator=generator,
)

Total constituents: 19079
Total constituents: 422


In [None]:
# Run identifier (as complete as possible: contains all params required to
# reproduce the run). It is the prefix of every checkpoint folder name.
# Long form kept for reference:
# iter_id = f'model-{model_no}-{version}_constituentlimits-{constituent_limits}_idconstituent-{id_constituent}_minvidconstituents-{min_vid_constituents}_epochs-{epochs}_batchsize-{batch_size}_loaddatasetwsdver-{load_dataset_wsd_ver_switcher[load_dataset_wsd_ver][0]}_shufflebuffersize-{shuffle_buffer_size}_generator-{1 if generator else 0}'
iter_id = f'{model_no}-{version}_cl-{constituent_limits}_idc-{id_constituent}_mid-{min_vid_constituents}_ep-{epochs}_bz-{batch_size}_ldtv-{load_dataset_wsd_ver_switcher[load_dataset_wsd_ver][0]}_sbuffs-{shuffle_buffer_size}_gen-{1 if generator else 0}'

# Folder holding all checkpoints, and the default output folder of this run.
preloaded_folder = core.os.path.join(ROOT_PATH, 'models/preloaded')
save_path = core.os.path.join(ROOT_PATH, f'models/preloaded/{iter_id}/')
# Make sure the checkpoint root exists so listing it cannot fail on a first run.
core.os.makedirs(preloaded_folder, exist_ok=True)
folder_list = core.os.listdir(preloaded_folder)

# Find the <epoch>-<sub_epoch> checkpoints that already exist for this run.
# iter_id contains regex metacharacters ('.'), so it is escaped, and the
# pattern is anchored so that only '<iter_id>_<int>-<int>' folders count.
progress_pattern = rf'^{core.re.escape(iter_id)}_\d+-\d+$'
search_results = epochs_subepochs_progress(folder_list, progress_pattern)

if distributed:
    core.tf.debugging.set_log_device_placement(True)
    gpus = core.tf.config.list_logical_devices('GPU')
    strategy = core.tf.distribute.MirroredStrategy(gpus)
    with strategy.scope():
        # Build the default model backbone; it is used as-is when neither a
        # hot start nor an existing checkpoint replaces it.
        model = core.model_switcher_preloaded(model_no, version=version)

        if hotstart:
            # Start from the weights of a previously trained run.
            hotstart_save_path = core.os.path.join(ROOT_PATH, f'models/preloaded/{hotstart_from}')
            model = core.tf.keras.models.load_model(f'{hotstart_save_path}/model')

        if isinstance(train_gen, list):
            # Each list element is one sub-epoch; a checkpoint is written after
            # every sub-epoch so an interrupted run can be resumed.
            # Path of the most recent already-completed checkpoint that has not
            # been loaded yet. Loading is deferred until the model is actually
            # needed, so skipping N completed steps costs one load, not N.
            resume_model_path = None

            for epoch in range(epochs):
                for i, tg in enumerate(train_gen):
                    complete_iter_id = f'{iter_id}_{epoch}-{i}'
                    save_path = core.os.path.join(ROOT_PATH, f'models/preloaded/{complete_iter_id}')

                    # Step already trained in a previous session: remember its
                    # model and move on to the next step.
                    if i in search_results.get(epoch, []):
                        print(f'Epoch {epoch}-{i}/{epochs-1}: load existing model and continue...')
                        resume_model_path = f'{save_path}/model'
                        continue

                    print(f'Epoch {epoch}-{i}/{epochs-1}: processing...')

                    # Continue from the latest completed checkpoint, if any
                    # steps were skipped since the last training call.
                    if resume_model_path is not None:
                        model = core.tf.keras.models.load_model(resume_model_path)
                        resume_model_path = None

                    history = model.fit(tg, validation_data=validation_gen, epochs=1, verbose=1)

                    # Save only after a successful training step. The folder is
                    # deliberately not created before `fit`: an existing folder
                    # is what marks a step as completed on the next run.
                    model.save_weights(f'{save_path}/weights/checkpoint')
                    model.save(f'{save_path}/model')

                    # Save the training history of this sub-epoch.
                    with open(f'{save_path}/history.json', 'w') as f:
                        core.json.dump(history.history, f)

            # Every remaining step was already completed: make `model` the last
            # completed checkpoint, as if it had been loaded step by step.
            if resume_model_path is not None:
                model = core.tf.keras.models.load_model(resume_model_path)
        else:
            # Single dataset: train for all epochs in one call.
            # Create the output folder up front; nothing else creates it and
            # writing history.json would otherwise fail after training.
            # NOTE(review): the trained model itself is not saved in this branch.
            core.os.makedirs(save_path, exist_ok=True)
            history = model.fit(train_gen, validation_data=validation_gen, epochs=epochs, verbose=1)
            with open(f'{save_path}/history.json', 'w') as f:
                core.json.dump(history.history, f)

else:
    print('Please modify some code to run in not distributed mode.')

# Finish.

Epoch 0-0/39: processing...




Epoch 0-1/39: processing...




Epoch 0-2/39: processing...




Epoch 0-3/39: processing...




Epoch 0-4/39: processing...




Epoch 0-5/39: processing...




Epoch 0-6/39: processing...




Epoch 0-7/39: processing...




Epoch 0-8/39: processing...




Epoch 0-9/39: processing...




Epoch 0-10/39: processing...




Epoch 0-11/39: processing...




Epoch 0-12/39: processing...




Epoch 0-13/39: processing...




Epoch 0-14/39: processing...
    173/Unknown - 255s 1s/step - loss: 0.6345 - accuracy: 0.6279