# Train model
___

In [1]:
# Default python libraries
import os
import json
import datetime
from pathlib import Path

# Third party
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.mixed_precision import experimental as mixed_precision

# Local imports

from src import (
    DataGenerator, 
    ModelLoader, 
    EncoderOutput,
    DecoderOutput,
    HotEncoder, # for each nucleotide basis
    HotEncoderKmer, # for kmers
    Word2Vec, # for kmers represented vector embedding
)



In [2]:
# Fix seeds so that runs are reproducible.
seed = 7
# Start from a clean Keras state so re-running this cell does not pile up
# layers/graphs from a previous execution.
tf.keras.backend.clear_session()
# Seed NumPy's global RNG.
np.random.seed(seed)
# Seed TensorFlow's global RNG as well: seeding NumPy alone leaves weight
# initialisation and dropout non-deterministic between runs.
tf.random.set_seed(seed)

## Basic config

In [3]:
# Output directory for the training artefacts (architecture JSON, weight checkpoints).
PATH_TRAIN_RESULTS = Path("train_results")
# Create it up front, including missing parents; an existing directory is fine.
PATH_TRAIN_RESULTS.mkdir(exist_ok=True, parents=True)

In [4]:
# Parent of the current working directory, resolved to an absolute path.
# Used to reach the sibling 01-... and 02-... folders.
BASEPATH = Path.cwd().resolve().parent

In [5]:
## Shared helper objects
# Loader for the architectures defined at ./src/models.
model_loader = ModelLoader()

# Input encoder: k-mers represented as embedding vectors.
# Alternative encoder for k-mers: HotEncoderKmer(k=3).
encoder_input = Word2Vec(k=3, s=1, size_emb=20)

# Label encoder.
encoder_output = EncoderOutput(order_output_model=[0, 1])

# Decoder that translates the model output back into label names.
decoder = DecoderOutput(
    order_output_model=["No-Splice-Junction", "Splice-Junction"],
    argmax=True,
)

# Training sizes: samples per batch and maximum number of epochs.
batch_size = 32
epochs = 100

### Datasets
___

In [6]:
# Load datasets info
with open(
    BASEPATH.joinpath("01-data-preparation/data/datasets.json"), encoding="utf-8"
) as fp:
    datasets = json.load(fp)

# Load sequences data.
# The data directory can be overridden with the SPLICE_DATA_DIR environment
# variable so the notebook is runnable on other machines; when it is not set,
# the original hard-coded location is used.
PATH_DATA = Path(
    os.environ.get(
        "SPLICE_DATA_DIR",
        '/home/jorge/AlgoLab/Tezi-Marzi/Tezi-Documentation/Master Thesis-20210607T163207Z-001/Master Thesis/Archive',
    )
)
data = pd.read_csv(PATH_DATA.joinpath('Sequences_chr1_unique.csv'))

# Map every dataframe row index to its sequence for O(1) lookup by id.
sequences_by_id = dict(zip(data.index, data["Sequences"]))

In [7]:
# Per-split entries of the datasets file: row ids in the dataframe and labels.
id_labels, labels = datasets["id_labels"], datasets["labels"]

In [8]:
def _make_generator(split):
    """Build the DataGenerator for one dataset split.

    Args:
        split: Name of the split as used in `id_labels` and `labels`
            (e.g. "train" or "val").

    Returns:
        A DataGenerator configured with the shared encoders and `batch_size`.

    Raises:
        KeyError: If the split is not defined, or if any of its ids has no
            entry in `sequences_by_id`.
    """
    split_ids = id_labels[split]
    # Fail fast: with dict.get() a missing id silently became a None sequence
    # that would only surface later, inside the generator.
    missing = [ID for ID in split_ids if ID not in sequences_by_id]
    if missing:
        raise KeyError(
            f"{len(missing)} id(s) of split '{split}' have no sequence, e.g. {missing[:5]}"
        )
    return DataGenerator(
        sequences=[sequences_by_id[ID] for ID in split_ids],
        labels=labels[split],
        encoder_input=encoder_input,
        encoder_output=encoder_output,
        batch_size=batch_size,
    )


train_generator = _make_generator("train")
val_generator = _make_generator("val")

### Model config
___

In [9]:
# Model to use
model_name   = 'conv_3-mer_60-seq_20-emb' # same architecture name that the model-loading cell uses
weights_path = None # None means that random weights will be used to initialize the net
output_layer = 'softmax' # activation function of last layer, 'sigmoid' or 'softmax'
n_output     = 2 # neurons in last layer. None -> default to len(class_names)

In [10]:
# Fetch the batch at index 1 and display the shapes of its inputs and targets.
X, y = train_generator[1]
X.shape, y.shape

((32, 58, 20), (32, 2))

In [11]:
# Optimizer
optimizer = tf.keras.optimizers.Nadam(
    learning_rate=0.003, beta_1=0.9, beta_2=0.999, epsilon=1e-07, name="Nadam"
)
name_reference_optimizer = "nadam"

# Loss
loss = "binary_crossentropy"
# Name to store in the training_config dictionary (needed when using a custom loss).
name_reference_loss = "binary_crossentropy"
# True: train with class weights (see the training cell).
bool_weighted_loss = False

# Metrics
metrics = ["accuracy"]
name_reference_metrics = ["accuracy"]

In [16]:
# Load the architecture by name; weights_path=None keeps random initial weights.
model_name = "conv_3-mer_60-seq_20-emb"
model_loader = ModelLoader()
model = model_loader(
    model_name=model_name,
    n_output=n_output,
    output_layer=output_layer,
    weights_path=weights_path,
)

# Compile with the optimizer, loss and metrics configured earlier.
model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

Modelo 'conv_3-mer_60-seq_20-emb' cargado correctamente


In [17]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the model.
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 58, 20)]          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 58, 128)           18048     
_________________________________________________________________
leaky_re_lu_4 (LeakyReLU)    (None, 58, 128)           0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 58, 128)           0         
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 29, 128)           0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 29, 64)            57408     
_________________________________________________________________
leaky_re_lu_5 (LeakyReLU)    (None, 29, 64)            0   

In [18]:
# Persist the architecture as JSON inside the training-results directory.
architecture = f"architecture-{model_name}-{output_layer}.json"
model_json = model.to_json()
with (PATH_TRAIN_RESULTS / architecture).open("w") as json_file:
    json_file.write(model_json)

In [19]:
## Callbacks
# ModelCheckpoint target. The epoch / val_accuracy placeholders are filled in
# by Keras at save time, so they live in a plain (non-f) string literal.
weights_file = (
    f"{PATH_TRAIN_RESULTS.as_posix()}/weights-{model_name}-{output_layer}-"
    "epoch{epoch:03d}-val_acc{val_accuracy:.3f}.hdf5"
)

# Tensorboard: one timestamped log directory per run.
now = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = Path(f"logs/{model_name}-{now}")
log_dir.mkdir(parents=True, exist_ok=True)

# Keep only improving checkpoints, and only their weights.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath=weights_file, save_best_only=True, save_weights_only=True
)
# Halve the learning rate after 10 epochs without val_loss improvement (floor 1e-4).
reduce_lr_cb = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=10, min_lr=0.0001
)
# Stop after 50 epochs without improvement of the default monitored quantity.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=50)
tensorboard_cb = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir.as_posix(), histogram_freq=1
)
callbacks = [checkpoint_cb, reduce_lr_cb, early_stopping_cb, tensorboard_cb]

# NOTE(review): the weights are hard-coded here although compute_class_weight is
# imported at the top of the notebook - confirm whether they should be derived
# from the training labels instead.
class_weight = {0: 1, 1: 3} if bool_weighted_loss else None
if class_weight is None:
    print("\n** class_weights None\n")

# Train model on dataset.
# For a quick smoke test without the generators, fit for a couple of epochs on a
# single batch taken from train_generator and reuse it as validation data.
history_train = model.fit(
    x=train_generator,
    validation_data=val_generator,
    epochs=epochs,
    class_weight=class_weight,
    callbacks=callbacks,
)


** class_weights None

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100

KeyboardInterrupt: 