In [1]:
import sys
sys.path.append('..')

from scripts.global_funcs import load_data_config, load_model_config, get_num_of_classes
from glob import glob
import os



import nvtabular as nvt
import dask_cudf

from nvtabular.utils import device_mem_size

from dask_cuda import LocalCUDACluster
from dask.distributed import Client

import shutil

# import rmm
import pathlib

In [2]:
# Load the model/training hyperparameters through the project helper and
# display the resulting dict as this cell's output.
model_config = load_model_config()
model_config

{'preblock_filters': 64,
 'preblock_kernel_sizes': [3],
 'preblock_pool_size': 2,
 'idblock_kernel_sizes': [3],
 'idblock_filters': [64, 128, 256, 512],
 'idblock_activation': 'gelu',
 'idblock_avg_pool_size': 2,
 'last_activation': 'softmax',
 'batch_size': 512,
 'learn_rate': 0.0001,
 'epochs': 53,
 'patience': 10,
 'num_warmup_epochs': 3,
 'warmup_lr_multiplier': 0.01,
 'from_logits': False,
 'TF_MEMORY_ALLOCATION': '0.7',
 'TF_VISIBLE_DEVICE': '0',
 'TF_FORCE_GPU_ALLOW_GROWTH': 'true',
 'device_spill_frac': 0.2,
 'protocol': 'tcp',
 'visible_devices': '0',
 'enable_tcp_over_ucx': False,
 'enable_nvlink': False,
 'enable_infiniband': False,
 'rmm_pool_size': '1GB',
 'clear_models_dirs': True,
 'clear_tensorboard': True}

In [3]:
# Unpack the model configuration into notebook-level names, grouped by purpose.

# --- network architecture (forwarded to COSMELayer) ---
preblock_filters = model_config['preblock_filters']
preblock_kernel_sizes = model_config['preblock_kernel_sizes']
preblock_pool_size = model_config['preblock_pool_size']
idblock_filters = model_config['idblock_filters']
idblock_kernel_sizes = model_config['idblock_kernel_sizes']
idblock_activation = model_config['idblock_activation']
idblock_avg_pool_size = model_config['idblock_avg_pool_size']
last_activation = model_config['last_activation']

# --- optimisation and training schedule ---
batch_size = model_config['batch_size']
epochs = model_config['epochs']
patience = model_config['patience']
learn_rate = model_config['learn_rate']
num_warmup_epochs = model_config['num_warmup_epochs']
warmup_lr_multiplier = model_config['warmup_lr_multiplier']
from_logits = model_config['from_logits']

# --- housekeeping flags for output directories ---
clear_models_dirs = model_config['clear_models_dirs']
clear_tensorboard = model_config['clear_tensorboard']

In [4]:
# Export the TensorFlow-related settings from the model config as environment
# variables. This cell runs before the cell that imports tensorflow.
# TF_MEMORY_ALLOCATION is a fraction of free memory.
os.environ.update({
    _name: model_config[_name]
    for _name in ("TF_MEMORY_ALLOCATION", "TF_VISIBLE_DEVICE", "TF_FORCE_GPU_ALLOW_GROWTH")
})

In [5]:
import tensorflow as tf
from scripts.cosme_model import COSMELayer
from nvtabular.loader.tensorflow import KerasSequenceLoader, KerasSequenceValidater

2022-05-24 00:30:06.208445: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-24 00:30:06.209219: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-24 00:30:06.209315: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


In [6]:
# Load the data/project configuration (paths, column names, sequence length,
# split fractions) through the project helper and display it.
config = load_data_config()
config

{'clean_fasta_file': '/media/jcosme/Data/MarRef_parquet_10_cats',
 'output_dir': '/media/jcosme/Data',
 'project_name': 'MarRef_parquet_10_cats',
 'base_col_names': ['seq', 'label'],
 'label_col_name': 'label',
 'input_col_name': 'seq',
 'label_regex': '(?:[^a-zA-Z0-9]+)([a-zA-Z]+[0-9]+)(?:[^a-zA-Z0-9]+)',
 'k_mer': 1,
 'possible_gene_values': ['A', 'C', 'G', 'T'],
 'max_seq_len': 150,
 'data_splits': {'train': 0.9, 'val': 0.05, 'test': 0.05},
 'random_seed': 42,
 'fasta_sep': '>',
 'unq_labs_dir': '/media/jcosme/Data/MarRef_parquet_10_cats/data/unq_labels',
 'unq_labs_dir_csv': '/media/jcosme/Data/MarRef_parquet_10_cats/data/unq_labels.csv',
 'data_dir': '/media/jcosme/Data/MarRef_parquet_10_cats/data/MarRef_parquet_10_cats',
 'nvtab_dir': '/media/jcosme/Data/MarRef_parquet_10_cats/nvtab',
 'dask_dir': '/media/jcosme/Data/MarRef_parquet_10_cats/dask',
 'tensorboard_dir': '/media/jcosme/Data/MarRef_parquet_10_cats/tensorboard',
 'model_checkpoints_dir': '/media/jcosme/Data/MarRef_parqu

In [7]:
# Unpack the data configuration into notebook-level names, grouped by purpose.

# --- input/output locations ---
nvtab_dir = config['nvtab_dir']
dask_dir = config['dask_dir']
tensorboard_dir = config['tensorboard_dir']
model_weights_dir = config['model_weights_dir']
model_checkpoints_dir = config['model_checkpoints_dir']
model_checkpoints_parent_dir = config['model_checkpoints_parent_dir']

# --- dataset schema ---
input_col_name = config['input_col_name']
label_col_name = config['label_col_name']
max_seq_len = config['max_seq_len']
possible_gene_values = config['possible_gene_values']

# --- fraction of the data held out for validation ---
val_split = config['data_splits']['val']

In [8]:
def _remove_dir_if_present(path):
    """Delete the directory tree at ``path`` on a best-effort basis.

    A missing directory is silently accepted (there is nothing to clear).
    Any other OS-level failure (permissions, path is a regular file, ...) is
    reported but does not abort the notebook, so the cleanup stays best-effort
    without hiding unrelated errors the way a bare ``except`` would.
    """
    try:
        shutil.rmtree(path)
    except FileNotFoundError:
        # First run, or already removed together with a parent directory.
        pass
    except OSError as err:
        print(f"WARNING: could not remove {path!r}: {err}")


if clear_models_dirs:
    # Start training from scratch: drop checkpoints, saved weights and the
    # checkpoints parent directory (same order as before).
    for _stale_dir in (model_checkpoints_dir, model_weights_dir, model_checkpoints_parent_dir):
        _remove_dir_if_present(_stale_dir)

if clear_tensorboard:
    # Discard TensorBoard logs from previous runs.
    _remove_dir_if_present(tensorboard_dir)


In [9]:
# Number of target classes as reported by the project helper; it is passed to
# COSMELayer as `n_classes` further down.
n_classes = get_num_of_classes()

In [10]:
# define some information about where to get our data
dask_workdir = pathlib.Path(nvtab_dir, "dask", "workdir")
stats_path = pathlib.Path(nvtab_dir, "dask", "stats")

# Make sure we have a clean worker space for Dask
if pathlib.Path.is_dir(dask_workdir):
    shutil.rmtree(dask_workdir)
dask_workdir.mkdir(parents=True)

# Make sure we have a clean stats space for Dask
if pathlib.Path.is_dir(stats_path):
    shutil.rmtree(stats_path)
stats_path.mkdir(parents=True)

cluster = LocalCUDACluster(
    protocol=model_config['protocol'],
    CUDA_VISIBLE_DEVICES=model_config['visible_devices'],
    device_memory_limit=device_mem_size(kind="total") * model_config['device_spill_frac'],
    enable_tcp_over_ucx=model_config['enable_tcp_over_ucx'],
    enable_nvlink=model_config['enable_nvlink'],
    enable_infiniband=model_config['enable_infiniband'],
    # rmm_pool_size=model_config['rmm_pool_size'],
    local_directory=dask_workdir,
)

client = Client(cluster)

# def _rmm_pool():
#     rmm.reinitialize(
#         pool_allocator=True,
#         initial_pool_size=None,  # Use default size
#     )
    
# client.run(_rmm_pool)
client

distributed.preloading - INFO - Import preload module: dask_cuda.initialize


0,1
Connection method: Cluster object,Cluster type: LocalCUDACluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Status: running,Using processes: True
Dashboard: http://127.0.0.1:8787/status,Workers: 1
Total threads:  1,Total memory:  31.21 GiB

0,1
Comm: tcp://127.0.0.1:34595,Workers: 1
Dashboard: http://127.0.0.1:8787/status,Total threads:  1
Started:  Just now,Total memory:  31.21 GiB

0,1
Comm: tcp://127.0.0.1:42421,Total threads: 1
Dashboard: http://127.0.0.1:35165/status,Memory: 31.21 GiB
Nanny: tcp://127.0.0.1:35779,
Local directory: /media/jcosme/Data/MarRef_parquet_10_cats/nvtab/dask/workdir/dask-worker-space/worker-p2c9n3m2,Local directory: /media/jcosme/Data/MarRef_parquet_10_cats/nvtab/dask/workdir/dask-worker-space/worker-p2c9n3m2
GPU: NVIDIA GeForce RTX 3080 Laptop GPU,GPU memory: 16.00 GiB


In [11]:
# Collect the parquet files of each split from the NVTabular output directory.
# Sorting makes the file order deterministic across runs.
TRAIN_PATHS = sorted(glob(f"{nvtab_dir}/train/*.parquet"))
VAL_PATHS = sorted(glob(f"{nvtab_dir}/val/*.parquet"))

In [12]:
# feed them to our datasets
train_dataset = KerasSequenceLoader(
    nvt.Dataset(TRAIN_PATHS, part_size="10MB"), # you could also use a glob pattern
    batch_size=batch_size,
    label_names=[label_col_name],
    cat_names=[input_col_name],
    shuffle=True,
    buffer_size=0.001,  # amount of data, as a fraction of GPU memory, to load at once,
    device=0,
    parts_per_chunk=1,
    engine="parquet",
)


  f"Row group memory size ({rg_byte_size_0}) (bytes) of parquet file is bigger"


In [13]:
# Validation batch size is the training batch size scaled by the validation
# split fraction. int() truncates, so clamp to 1 to avoid a zero batch size
# when batch_size * val_split < 1.
# NOTE(review): scaling the evaluation batch size by the split fraction is
# unusual — confirm this is intended rather than reusing `batch_size`.
_valid_batch_size = max(1, int(batch_size*val_split))

# Unshuffled batch loader over the validation parquet files.
valid_dataset = KerasSequenceLoader(
    nvt.Dataset(VAL_PATHS, part_size="10MB"),   # you could also use a glob pattern
    batch_size=_valid_batch_size,
    label_names=[label_col_name],
    cat_names=[input_col_name],
    shuffle=False,
    buffer_size=0.001,  # amount of data, as a fraction of GPU memory, to load at once,
    device=0,
    parts_per_chunk=1,
    engine="parquet",
)

In [14]:
# batch = next(iter(train_dataset))

In [15]:
# batch[0]

In [16]:
# Keras inputs for the sequence column: a (values, nnzs) pair of tensors named
# "<column>__values" and "<column>__nnzs".
# NOTE(review): presumably the flattened list-column representation produced
# by the NVTabular loader — confirm against the loader output.
inputs = {
    input_col_name: (
        tf.keras.Input(name=f"{input_col_name}__values", dtype=tf.float32, shape=(1,)),
        tf.keras.Input(name=f"{input_col_name}__nnzs", dtype=tf.int64, shape=(1,)),
    ),
}

In [None]:
# Split the (values, nnzs) pair of the sequence column. The key comes from
# `input_col_name`, matching how `inputs` was built, instead of a hard-coded
# 'seq' that would break as soon as the configured column name changes.
seq_values = tf.keras.layers.Lambda(lambda x: x[input_col_name][0])(inputs)
seq_nnzs = tf.keras.layers.Lambda(lambda x: x[input_col_name][1])(inputs)

# The nnzs tensor is only used for its leading dimension, i.e. the batch size.
batch_dim = tf.shape(seq_nnzs)[0]

# Lay the values out as (batch, 1, max_seq_len) in float32. A single reshape
# produces the same tensor as the former reshape -> expand_dims -> reshape chain.
# NOTE(review): assumes every sequence holds exactly max_seq_len values — confirm.
preproc = tf.cast(seq_values, tf.float32)
preproc = tf.reshape(preproc, [batch_dim, 1, max_seq_len])

# Scale the encoded values by 1 / (number of possible gene values).
preproc = tf.math.multiply(preproc, 1/len(possible_gene_values))

In [None]:
# Instantiate the project's COSME layer from the unpacked model configuration.
cosme_layer = COSMELayer(
    n_classes=n_classes,
    last_activation=last_activation,
    # pre-block settings
    preblock_filters=preblock_filters,
    preblock_kernel_sizes=preblock_kernel_sizes,
    preblock_pool_size=preblock_pool_size,
    # identity-block settings
    idblock_filters=idblock_filters,
    idblock_kernel_sizes=idblock_kernel_sizes,
    idblock_activation=idblock_activation,
    idblock_avg_pool_size=idblock_avg_pool_size,
)

# Apply the layer to the preprocessed sequence tensor to get the model output.
output_layer = cosme_layer(preproc)

In [None]:
# Functional Keras model mapping the `inputs` dict to the COSME layer output.
cosme_model = tf.keras.Model(inputs=inputs, outputs=output_layer)

In [None]:
# cosme_model.layers[-1]._name = 'COSMELayer'

In [None]:
# Adam with the AMSGrad variant enabled, starting from the configured base
# learning rate.
optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate, amsgrad=True)

In [None]:
# Sparse categorical cross-entropy; `from_logits` comes from the model config.
_loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=from_logits)

cosme_model.compile(
    optimizer=optimizer,
    loss=_loss_fn,
    metrics=['sparse_categorical_accuracy'],
)

In [None]:
# Validation callback wrapping the validation loader.
# NOTE(review): the active `fit` call passes `validation_data` directly and does
# not include this callback; only the commented-out `fit` variant references it.
validation_callback = KerasSequenceValidater(valid_dataset)

In [None]:
# Stop once val_loss has not improved for `patience` epochs and roll the model
# back to the best weights seen.
earlystopping_cb = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=patience,
    restore_best_weights=True,
)

# Save the weights to the checkpoint path during training. The model is compiled
# with 'sparse_categorical_accuracy', so the validation metric is logged as
# 'val_sparse_categorical_accuracy' ('val_accuracy' is never produced). The
# monitor only takes effect if save_best_only is enabled.
checkpoints_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath=model_checkpoints_dir,
    save_weights_only=True,
    monitor='val_sparse_categorical_accuracy',
)

# TensorBoard logging, with weight histograms every epoch.
tensorboard_cb = tf.keras.callbacks.TensorBoard(
    log_dir=tensorboard_dir,
    histogram_freq=1,
)

# Fault-tolerance backups get their own directory: Keras requires that the
# BackupAndRestore directory is not shared with other callbacks such as
# ModelCheckpoint. It sits under the checkpoints parent directory, so it is
# cleared together with the other model outputs.
restore_cb = tf.keras.callbacks.BackupAndRestore(
    backup_dir=os.path.join(model_checkpoints_parent_dir, 'backup_and_restore'),
)

def lr_scheduler(epoch, lr):
    """Return the learning rate to use for ``epoch`` (constant warm-up schedule).

    During the first ``num_warmup_epochs`` epochs the base ``learn_rate`` is
    scaled by ``warmup_lr_multiplier``; afterwards the base rate is returned.
    The ``lr`` argument (the current rate handed in by the caller) is ignored.
    """
    in_warmup = epoch < num_warmup_epochs
    return learn_rate * warmup_lr_multiplier if in_warmup else learn_rate
    
# Keras callback that uses `lr_scheduler` as its learning-rate schedule.
scheduler_cb = tf.keras.callbacks.LearningRateScheduler(lr_scheduler)

In [None]:
# history = cosme_model.fit(
#     train_dataset,
#     epochs=epochs,
#     callbacks=[validation_callback]
# )

In [None]:
# Callbacks for the run, in the order they are handed to Keras: early stopping,
# weight checkpoints, TensorBoard logging, backup/restore, LR schedule.
_fit_callbacks = [earlystopping_cb, checkpoints_cb, tensorboard_cb, restore_cb, scheduler_cb]

# Train on the training loader and validate on the validation loader.
history = cosme_model.fit(
    train_dataset,
    validation_data=valid_dataset,
    epochs=epochs,
    callbacks=_fit_callbacks,
)

In [None]:
# Persist the trained weights to the configured weights path.
cosme_model.save_weights(model_weights_dir)

In [None]:
# Loss and sparse categorical accuracy on the validation loader.
cosme_model.evaluate(valid_dataset)

In [None]:
# Loss and sparse categorical accuracy on the training loader, for comparison
# with the validation figures.
cosme_model.evaluate(train_dataset)

In [None]:
# cosme_model.predict(valid_dataset).argmax(1)