I took things out of `training.py` script so I can easily modify them here.

# Setup

In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras as K
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from sklearn.metrics import roc_curve, roc_auc_score, auc
from sklearn.metrics import precision_recall_curve
from tensorflow.keras.layers import (
                                BatchNormalization, LeakyReLU,
                                Input, Dense, Conv2D,
                                MaxPooling2D, Flatten, Dropout)
from tensorflow.keras.optimizers import Adam

# Data preprocessing

In [2]:
def one_hot_encoding(df, tensor_dim=(50, 20, 1)):
    """
    fun transform input database to numpy array.
    
    parameters:
    df = Pandas df with col names "gene", "label", "miRNA"
    tensor_dim= 2d matrix shape
    
    output:
    2d dot matrix, labels as np array
    """
    df.reset_index(inplace=True, drop=True)

    # alphabet for watson-crick interactions.
    alphabet = {"AT": 1., "TA": 1., "GC": 1., "CG": 1.}

    # labels to one hot encoding
    label = df["label"].to_numpy()

    # create empty main 2d matrix array
    N = df.shape[0]  # number of samples in df
    shape_matrix_2d = (N, *tensor_dim)  # 2d matrix shape
    # initialize dot matrix with zeros
    ohe_matrix_2d = np.zeros(shape_matrix_2d, dtype="float32")

    # compile matrix with watson-crick interactions.
    for index, row in df.iterrows():
        for bind_index, bind_nt in enumerate(row.gene.upper()):

            for mirna_index, mirna_nt in enumerate(row.miRNA.upper()):

                base_pairs = bind_nt + mirna_nt
                ohe_matrix_2d[index, bind_index, mirna_index, 0] = alphabet.get(base_pairs, 0)

    return ohe_matrix_2d, label

In [3]:
# set random state for reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [15]:
!wget https://raw.githubusercontent.com/ML-Bioinfo-CEITEC/miRBind/tuning/Datasets/train_set_1_1_CLASH2013_paper.tsv
!wget https://raw.githubusercontent.com/ML-Bioinfo-CEITEC/miRBind/tuning/Datasets/evaluation_set_1_1_CLASH2013_paper.tsv

--2022-02-23 13:57:31--  https://raw.githubusercontent.com/ML-Bioinfo-CEITEC/miRBind/tuning/Datasets/train_set_1_1_CLASH2013_paper.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2276853 (2.2M) [text/plain]
Saving to: ‘train_set_1_1_CLASH2013_paper.tsv.1’


2022-02-23 13:57:32 (5.83 MB/s) - ‘train_set_1_1_CLASH2013_paper.tsv.1’ saved [2276853/2276853]

--2022-02-23 13:57:32--  https://raw.githubusercontent.com/ML-Bioinfo-CEITEC/miRBind/tuning/Datasets/evaluation_set_1_1_CLASH2013_paper.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 

In [5]:
TRAINING_RATIO = 1
train_df = pd.read_csv("train_set_1_" + str(TRAINING_RATIO) + "_CLASH2013_paper.tsv", sep='\t')
train_df = train_df.sample(frac=1, random_state=RANDOM_STATE)
train_df.head()

Unnamed: 0,miRNA,gene,label
15287,TCTGGCTCCGTGTCTTCACT,GGTGAGGGAGACGGAGGCCGTCATCCACAAGCACCGCTCGGCCACC...,1
27358,TGAGGTAGTAGTTTGTGCTG,GGACAGGCACAGAGACTTGGAAGAGAGAAATAGACGCTCTAGTGGG...,0
9838,TACCCTGTAGATCCGAATTT,ACTTCTTGGACTACATGGGGATCAAAGGCCCCAGGATGCCTCTGGG...,1
896,CGTCAACACTTGCTGGTTTC,GTGTCTCAAAGCAAAGGAAACCTCCACAAGTGCTGCAACAGTGCAT...,1
19287,TTCACCACCTTCTCCACCCA,CTTTGACACTACACAATTTTCTAATATGTGTTAATGCTATGTGACA...,0


In [16]:
val_df = pd.read_csv("evaluation_set_1_" + str(TRAINING_RATIO) + "_CLASH2013_paper.tsv", sep='\t')
val_df = val_df.sample(frac=1, random_state=RANDOM_STATE)
val_df.head()

Unnamed: 0,miRNA,gene,label
1860,TAGCTTATCAGACTGATGTT,TCATCAGCCGGGAGAGCAGCCTCATTCTGGCTGTCACGCCCGCCAA...,0
353,TCCGAGCCTGGGTCTCCCTC,AGGCCTGGGCCTGGTTCGGGGTCTGTTTTATGCTCTTCGGTCCCTC...,1
1333,TTGGGGAAACGGCCGCTGAG,TGCACAGGGGGCAGACAGCGTGGAGCCTATGTTCCGGCATCTCAAG...,0
905,ACTGCTGAGCTAGCACTTCC,AATGGGGAAGTGAGTGCTTGGCAACTGATGTTCCACTTGACACACT...,1
1289,ACTGCCCTAAGTGCTCCTTC,ACCCAGATCCTCTCGGCTCCTCACCTTTAATCCCACATACTGTGCC...,0


In [22]:
train_ohe_data = one_hot_encoding(train_df)
train_ohe, train_labels = train_ohe_data
print("Number of training samples: ", train_df.shape[0])

Number of training samples:  30784


In [17]:
val_ohe_data = one_hot_encoding(val_df)
val_ohe, val_labels = val_ohe_data
print("Number of evaluation samples: ", val_df.shape[0])

Number of training samples:  2000


# Model + Keras Tuner setup

Following this tutorial https://blog.tensorflow.org/2020/01/hyperparameter-tuning-with-keras-tuner.html to setup hyperparameter tuning.

In [18]:
def make_architecture(hp):
    """
    build model architecture

    return a model object
    """
    main_input = Input(shape=(50, 20, 1),
                       dtype='float32', name='main_input'
                       )

    cnn_num = hp.Int('conv_blocks', 2, 6, default=3)
    kernel_size = hp.Int('kernel_size', 3, 6, default=3)
    pool_size = hp.Int('pool_size', 2, 4, default=2)
    dropout_rate = hp.Float('dropout', 0, 0.6, step=0.05, default=0.25)
    # max the same number of dense layers as is the number of cnn layers
    dense_num = hp.Int('dense_blocks', 2, cnn_num, default=3)


    for cnn_i in range(cnn_num):

        x = Conv2D(
            # we increase number of filters by 32 in each layer
            filters=32*(cnn_i + 1),
            kernel_size=(kernel_size, kernel_size),
            padding="same",
            data_format="channels_last",
            name="conv_" + str(cnn_i + 1))(main_input)
        x = LeakyReLU()(x)
        x = BatchNormalization()(x)
        x = MaxPooling2D(pool_size=(pool_size, pool_size), name='Max_' + str(cnn_i + 1))(x)
        x = Dropout(rate=dropout_rate)(x)

    conv_flat = Flatten(name='2d_matrix')(x)

    for dense_i in range(dense_num):

        neurons = 32 * (cnn_num - dense_i)
        x = Dense(neurons)(conv_flat)
        x = LeakyReLU()(x)
        x = BatchNormalization()(x)
        x = Dropout(rate=dropout_rate)(x)

    main_output = Dense(1, activation='sigmoid', name='main_output')(x)

    m = K.Model(inputs=[main_input], outputs=[main_output], name='arch_00')

    return m

In [32]:
!pip install tensorflow_addons

Collecting tensorflow_addons
  Downloading tensorflow_addons-0.16.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[?25l[K     |▎                               | 10 kB 29.4 MB/s eta 0:00:01[K     |▋                               | 20 kB 36.7 MB/s eta 0:00:01[K     |▉                               | 30 kB 15.0 MB/s eta 0:00:01[K     |█▏                              | 40 kB 4.8 MB/s eta 0:00:01[K     |█▌                              | 51 kB 4.6 MB/s eta 0:00:01[K     |█▊                              | 61 kB 5.4 MB/s eta 0:00:01[K     |██                              | 71 kB 5.9 MB/s eta 0:00:01[K     |██▍                             | 81 kB 5.8 MB/s eta 0:00:01[K     |██▋                             | 92 kB 6.4 MB/s eta 0:00:01[K     |███                             | 102 kB 5.3 MB/s eta 0:00:01[K     |███▏                            | 112 kB 5.3 MB/s eta 0:00:01[K     |███▌                            | 122 kB 5.3 MB/s eta 0:00:01[K     |███

In [33]:
import tensorflow_addons as tfa
binary_f1_score = tfa.metrics.F1Score(num_classes=1, threshold=0.5, average="micro")

def compile_model(hp):
    K.backend.clear_session()
    m = make_architecture(hp)

    opt = Adam(
        learning_rate=hp.Float('learning_rate', 1e-4, 1e-2, sampling='log'),
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-07,
        amsgrad=False,
        name="Adam")

    m.compile(
        optimizer=opt,
        loss='binary_crossentropy',
        metrics=['accuracy', binary_f1_score]
        )
    return m

In [11]:
!pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.1.0-py3-none-any.whl (98 kB)
[?25l[K     |███▍                            | 10 kB 22.5 MB/s eta 0:00:01[K     |██████▊                         | 20 kB 8.7 MB/s eta 0:00:01[K     |██████████                      | 30 kB 7.2 MB/s eta 0:00:01[K     |█████████████▍                  | 40 kB 6.8 MB/s eta 0:00:01[K     |████████████████▊               | 51 kB 4.1 MB/s eta 0:00:01[K     |████████████████████            | 61 kB 4.9 MB/s eta 0:00:01[K     |███████████████████████▍        | 71 kB 3.2 MB/s eta 0:00:01[K     |██████████████████████████▊     | 81 kB 3.6 MB/s eta 0:00:01[K     |██████████████████████████████  | 92 kB 4.0 MB/s eta 0:00:01[K     |████████████████████████████████| 98 kB 3.1 MB/s 
[?25hCollecting kt-legacy
  Downloading kt_legacy-1.0.4-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.1.0 kt-legacy-1.0.4


In [38]:
import keras_tuner as kt

tuner = kt.Hyperband(
    compile_model,
    objective=kt.Objective("val_f1_score", direction="max"),
    max_epochs=30,
    hyperband_iterations=2,
    project_name='miRBind')

In [39]:
tuner.search(train_ohe, train_labels,
             validation_data=(val_ohe, val_labels),
             epochs=10,
             callbacks=[tf.keras.callbacks.EarlyStopping(patience=3)],
             
)

Trial 44 Complete [00h 00m 21s]
val_f1_score: 0.8192433714866638

Best val_f1_score So Far: 0.8377823829650879
Total elapsed time: 00h 09m 41s

Search: Running Trial #45

Hyperparameter    |Value             |Best Value So Far 
conv_blocks       |4                 |5                 
kernel_size       |6                 |5                 
pool_size         |2                 |3                 
dropout           |0.55              |0.25              
dense_blocks      |2                 |3                 
learning_rate     |0.0001167         |0.0016344         
tuner/epochs      |4                 |4                 
tuner/initial_e...|2                 |2                 
tuner/bracket     |3                 |3                 
tuner/round       |1                 |1                 
tuner/trial_id    |a6a0cd25d96aec3...|f20d4efa2b87162...

Epoch 3/4
Epoch 4/4


KeyboardInterrupt: ignored