In [1]:
!pip install keras-tuner scikit-learn
!pip install icd10-cm bgen_reader pydantic
from google.colab import drive
drive.mount('/content/gdrive')
import os
import shutil
import string
import sys

import kerastuner as kt
import sklearn
import sklearn.model_selection
import tensorflow as tf
sys.path.append(os.path.abspath('/content/gdrive/My Drive/6.874 project/Colab'))
import input_processing

Collecting keras-tuner
[?25l  Downloading https://files.pythonhosted.org/packages/20/ec/1ef246787174b1e2bb591c95f29d3c1310070cad877824f907faba3dade9/keras-tuner-1.0.2.tar.gz (62kB)
[K     |█████▏                          | 10kB 16.2MB/s eta 0:00:01[K     |██████████▍                     | 20kB 12.2MB/s eta 0:00:01[K     |███████████████▋                | 30kB 10.1MB/s eta 0:00:01[K     |████████████████████▉           | 40kB 9.5MB/s eta 0:00:01[K     |██████████████████████████      | 51kB 5.6MB/s eta 0:00:01[K     |███████████████████████████████▎| 61kB 6.2MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 3.9MB/s 
Collecting terminaltables
  Downloading https://files.pythonhosted.org/packages/9b/c4/4a21174f32f8a7e1104798c445dacdc1d4df86f2f26722767034e4de4bff/terminaltables-3.1.0.tar.gz
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.w

Mapping genotypes: 100%|██████████| 1261158/1261158 [01:14<00:00, 16927.43it/s]


reading -- time=0:00:01.49, thread 1 of 2, part 5 of 5


## Useful Constants

* A = number of different genotypic alleles
* P = number of different phenotypes
* D = dropout rate
* l2 = L2 regularization rate

In [7]:
# Number of genotype positions per sample; first dimension of the network input, (A, 3).
A = 1000
# Number of phenotypes predicted; width of the network's output layer.
P = 1

## Network Architecture
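Output shapes (N = batch size):

1. Input: N x A x 3
1. Conv1D w/ ReLU (kernel size `c1_shape`, `c1_out` filters): N x (A - c1_shape + 1) x c1_out
1. MaxPool1D (`mp_size`, `mp_stride`): N x L x c1_out, where L = (A - c1_shape + 1 - mp_size) // mp_stride + 1
1. Flatten: N x (L * c1_out)
1. Dropout(D): N x (L * c1_out)
1. Dense Layer w/ Sigmoid: N x P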

In [8]:
def make_CNN_model(c1_shape, c1_out, mp_size, mp_stride, dropout, l2):
    """Build the (uncompiled) 1-D convolutional classifier.

    Layer stack: input of shape (A, 3) -> Conv1D + ReLU -> MaxPool1D -> Flatten
    -> Dropout -> Dense(P) + sigmoid. ``A`` and ``P`` are module-level constants.

    Args:
        c1_shape: Kernel size of the convolution.
        c1_out: Number of convolution filters.
        mp_size: Max-pooling window size.
        mp_stride: Max-pooling stride.
        dropout: Dropout rate applied to the flattened features.
        l2: L2 penalty factor; applied only to the output Dense layer's kernel.

    Returns:
        A ``tf.keras.Sequential`` named ``'cnn_model'``.
    """
    stack = [
        tf.keras.layers.InputLayer(input_shape=(A, 3)),
        tf.keras.layers.Conv1D(filters=c1_out, kernel_size=c1_shape, activation=tf.nn.relu),
        tf.keras.layers.MaxPool1D(pool_size=mp_size, strides=mp_stride),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dropout(rate=dropout),
        tf.keras.layers.Dense(
            units=P,
            activation=tf.nn.sigmoid,
            kernel_regularizer=tf.keras.regularizers.L2(l2),
        ),
    ]
    return tf.keras.Sequential(layers=stack, name='cnn_model')

## Model Training Parameters
#### Loss Function
Binary Cross-Entropy + L2 Regularization
#### Optimizer
Adam
#### Metric
Binary Accuracy and AUC (validation AUC is the hyperparameter-tuning objective)

In [5]:
def compile_model(model, optimizer):
    """Compile ``model`` in place for binary classification and print its summary.

    Uses binary cross-entropy as the loss and tracks accuracy and AUC.

    Args:
        model: Keras model to compile; modified in place.
        optimizer: Optimizer instance handed to ``model.compile``.

    Returns:
        None.
    """
    model.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.BinaryCrossentropy(),
        # Pin the metric name: an unnamed AUC gets an auto-generated name that can
        # pick up a numeric suffix ('auc_1', ...) when several are created in one
        # session, which would no longer match the tuner's 'val_auc' objective.
        metrics=['accuracy', tf.metrics.AUC(name='auc')],
    )
    model.summary()

## Hyperparameter Tuning

In [10]:
# Mini-batch size for the final training run (passed to model.fit).
batch_size = 100
# Epoch budget: used as Hyperband's max_epochs, by tuner.search, and by the final model.fit.
epochs = 20
# Forwarded to model.fit / model.evaluate; also selects the worker count there.
use_multiprocessing=False
# Fraction of the training data held out for validation in the final training split.
validation_split = 0.2

In [15]:
class IntRange:
    """Integer interval with a step, used to declare a tunable hyperparameter.

    The constructor arguments are stored verbatim as ``min``, ``max`` and
    ``step``; no validation is performed.
    """

    def __init__(self, min, max, step):
        # `min`/`max` shadow the builtins inside this method, but they are part
        # of the signature callers may use by keyword, so the names stay.
        self.min, self.max, self.step = min, max, step

    def to_Int(self, hp, name):
        """Declare this range on ``hp`` under ``name``; return what ``hp.Int`` returns.

        Args:
            hp: Object exposing ``Int(name, min_value=..., max_value=..., step=...)``.
            name: Identifier of the hyperparameter.
        """
        bounds = dict(min_value=self.min, max_value=self.max, step=self.step)
        return hp.Int(name, **bounds)

def get_tunable_CNN_model(hp, c1, kcount, mp_size, mp_stride):
    """Build and compile a CNN whose hyperparameters are drawn from ``hp``.

    Args:
        hp: Hyperparameter container; must provide ``Int`` and ``Choice``.
        c1: ``IntRange`` for the convolution kernel size ('c1_shape').
        kcount: Number of convolution filters (not tuned).
        mp_size: ``IntRange`` for the max-pool window size ('max_pool_size').
        mp_stride: ``IntRange`` for the max-pool stride ('max_pool_stride').

    Returns:
        The compiled model produced by ``make_CNN_model`` / ``compile_model``.
    """
    # Hyperparameters are declared in a fixed order: ranges first, then choices.
    kernel_size = c1.to_Int(hp, 'c1_shape')
    pool_size = mp_size.to_Int(hp, 'max_pool_size')
    pool_stride = mp_stride.to_Int(hp, 'max_pool_stride')
    dropout_rate = hp.Choice('dropout', values=[0.2, 0.5, 0.8])
    l2_factor = hp.Choice('l2', values=[0.01, 0.005, 0.001])
    learning_rate = hp.Choice('lr', values=[0.01, 0.005, 0.001])

    cnn = make_CNN_model(kernel_size, kcount, pool_size, pool_stride, dropout_rate, l2_factor)
    compile_model(cnn, tf.keras.optimizers.Adam(learning_rate))
    return cnn

def get_tuner(model_builder):
    """Create a fresh Hyperband tuner that maximizes validation AUC.

    Any existing 'hp_tuning' directory is removed first, so results from an
    earlier search are discarded.

    Args:
        model_builder: Callable taking a hyperparameter container and returning
            a compiled model.

    Returns:
        A ``kt.Hyperband`` instance writing to 'hp_tuning/initial_model'.
    """
    workdir = 'hp_tuning'
    try:
        shutil.rmtree(workdir)
    except FileNotFoundError:
        # Nothing left over from a previous search.
        pass

    target = kt.Objective('val_auc', direction='max')
    return kt.Hyperband(
        model_builder,
        objective=target,
        max_epochs=epochs,
        executions_per_trial=3,
        directory=workdir,
        project_name='initial_model',
    )

## Search Space

In [17]:
# Range for the 'c1_shape' hyperparameter (convolution kernel size): 3 .. P * 27, step 2.
c1_search = IntRange(3, P * 27, 2)
# Fixed number of convolution filters (passed as c1_out to make_CNN_model).
kcount = 32
# Ranges for the max-pool window size and stride: 2 .. A // 4, step 2.
mp_size = IntRange(2, A // 4, 2)
mp_step = IntRange(2, A // 4, 2)

## Get Training Data

In [12]:
# Disabled synthetic-data stub; the cells below call input_processing.get_data instead.
# def get_data(N, A, P):
#     genotypes = tf.stack([tf.constant([[1, 0, 0]] * A, dtype=tf.float32)] * N)
#     phenotypes = tf.stack([tf.constant([1] * P, dtype=tf.float32)] * N)
#     return genotypes, phenotypes

## Hyperparameter Tuning Call

In [13]:
# Data used only for the hyperparameter search.
# NOTE(review): the meaning of get_data's second argument (1000) is not visible here — presumably a sample count; confirm.
search_data = input_processing.get_data(A, 1000)
# Stratified 80/20 split with a fixed seed; search_data is unpacked as (features, labels).
search_train_x, search_validation_x, search_train_y, search_validation_y = sklearn.model_selection.train_test_split(*search_data, test_size=0.2, random_state=0, stratify=search_data[1])




In [18]:
# The lambda binds the search ranges so the builder handed to the tuner takes only `hp`.
tuner = get_tuner(lambda hp: get_tunable_CNN_model(hp, c1_search, kcount, mp_size, mp_step))
tuner.search_space_summary()
# Trials are validated on the held-out search split; the tuner objective is val_auc.
tuner.search(search_train_x, search_train_y, epochs=epochs, validation_data=(search_validation_x, search_validation_y))
tuner.results_summary()

Trial 30 Complete [00h 01m 01s]
val_auc: 0.6984615723292033

Best val_auc So Far: 0.7394871910413107
Total elapsed time: 00h 19m 41s
INFO:tensorflow:Oracle triggered exit
Results summary
Results in hp_tuning/initial_model
Showing 10 best trials
Objective(name='val_auc', direction='max')
Trial summary
Hyperparameters:
c1_shape: 21
max_pool_size: 166
max_pool_stride: 158
dropout: 0.8
l2: 0.005
lr: 0.005
tuner/epochs: 20
tuner/initial_epoch: 7
tuner/bracket: 2
tuner/round: 2
tuner/trial_id: 2aea7c78d7a4b884aa434ec86365da1b
Score: 0.7394871910413107
Trial summary
Hyperparameters:
c1_shape: 23
max_pool_size: 74
max_pool_stride: 60
dropout: 0.5
l2: 0.005
lr: 0.001
tuner/epochs: 20
tuner/initial_epoch: 7
tuner/bracket: 1
tuner/round: 1
tuner/trial_id: 32a5e89a55b8300b16db15a58cff02ca
Score: 0.7317948937416077
Trial summary
Hyperparameters:
c1_shape: 23
max_pool_size: 78
max_pool_stride: 204
dropout: 0.5
l2: 0.01
lr: 0.01
tuner/epochs: 20
tuner/initial_epoch: 0
tuner/bracket: 0
tuner/round: 0


## Optimal Model

In [23]:
# Best model(s) found by the search, as a list (iterated and indexed in later cells);
# called with the tuner's default number of models.
models = tuner.get_best_models()

Model: "cnn_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 969, 21)           2037      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 6, 21)             0         
_________________________________________________________________
flatten (Flatten)            (None, 126)               0         
_________________________________________________________________
dropout (Dropout)            (None, 126)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 127       
Total params: 2,164
Trainable params: 2,164
Non-trainable params: 0
_________________________________________________________________


## Train Model

In [21]:
# Larger dataset for the final training run.
x, y = input_processing.get_data(A, 100000)
# Stratified split, seeded like the hyperparameter-search split so the
# train/validation partition is reproducible across kernel restarts.
train_x, validation_x, train_y, validation_y = sklearn.model_selection.train_test_split(x, y, test_size=validation_split, random_state=0, stratify=y)




In [24]:
# Continue training every model returned by the tuner on the larger dataset; one fit result per model.
# `workers` evaluates to 1 while use_multiprocessing is False, otherwise to (CPU count - 1).
histories = [model.fit(x=train_x, y=train_y, batch_size=batch_size, epochs=epochs, validation_data=(validation_x, validation_y), use_multiprocessing=use_multiprocessing, workers=os.cpu_count() - 1 if use_multiprocessing else 1) for model in models]

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## Test Model

In [26]:
# Data for the final evaluation.
# NOTE(review): this is the same call as the hyperparameter-search data (get_data(A, 1000));
# whether it returns samples disjoint from the search/training data is not visible here — confirm.
test_x, test_y = input_processing.get_data(A, 1000)




In [27]:
# Evaluate every model on the test data; one evaluation result per model.
# NOTE(review): this rebinds `histories`, replacing the fit results stored by the training cell.
histories = [model.evaluate(x=test_x, y=test_y, use_multiprocessing=use_multiprocessing, workers=os.cpu_count() - 1 if use_multiprocessing else 1) for model in models]



In [29]:
# Keep the first model of the list and persist it to the mounted Google Drive folder.
model = models[0]
model.save('/content/gdrive/My Drive/6.874 project/Colab/models/cnn')

INFO:tensorflow:Assets written to: /content/gdrive/My Drive/6.874 project/Colab/models/cnn/assets
