In [2]:
!pip install keras-tuner scikit-learn
!pip install icd10-cm bgen_reader pydantic
from google.colab import drive
drive.mount('/content/gdrive')
import os
import shutil
import string
import sys

import kerastuner as kt
import sklearn
import sklearn.model_selection  # `import sklearn` alone is not guaranteed to load this submodule
import tensorflow as tf

# Make the project's helper modules on Google Drive importable.
sys.path.append(os.path.abspath('/content/gdrive/My Drive/6.874 project/Colab'))
import input_processing

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Sample IDs are read from /content/gdrive/MyDrive/6.874 project/Data/ukb45624_imp_chr21_v3_s487275.sample.


Mapping genotypes: 100%|██████████| 1261158/1261158 [01:22<00:00, 15228.43it/s]


reading -- time=0:00:02.96, thread 1 of 2, part 5 of 5


## Useful Constants

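* A = number of different genotypic alleles (input positions per sample)
* P = number of different phenotypes (model outputs per sample)
* D = dropout rate (not fixed here; chosen during hyperparameter tuning)
* l2 = L2 regularization rate (not fixed here; chosen during hyperparameter tuning)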

In [3]:
# Number of different genotypic alleles per sample; sets the (A, 3) model input shape.
A = 1000
# Number of different phenotypes; sets the width of the model's sigmoid output layer.
P = 1

## Network Architecture
Layer output shapes (N = batch size, d1 = tuned number of hidden units):

1. Input: N x A x 3
1. Flatten: N x (A * 3)
1. Dense Layer w/ ReLU: N x d1
1. Dropout(D): N x d1
1. Dense Layer w/ Sigmoid: N x P

In [4]:
def make_DNN_model(d1, dropout, l2):
    """Build the uncompiled feed-forward classifier.

    The network flattens an ``(A, 3)`` input, applies one ReLU hidden layer
    followed by dropout, and finishes with a sigmoid layer of ``P`` units.
    ``A`` and ``P`` are read from the notebook's module-level constants.

    Args:
        d1: Number of units in the hidden dense layer.
        dropout: Dropout rate applied after the hidden layer.
        l2: L2 regularization factor applied to the kernels of both dense layers.

    Returns:
        A ``tf.keras.Sequential`` model named ``'nn_model'``.
    """
    # One regularizer instance is shared by both dense layers.
    kernel_penalty = tf.keras.regularizers.L2(l2)

    network = tf.keras.Sequential(name='nn_model')
    network.add(tf.keras.layers.InputLayer(input_shape=(A, 3)))
    network.add(tf.keras.layers.Flatten())
    network.add(tf.keras.layers.Dense(
        d1, activation=tf.nn.relu, kernel_regularizer=kernel_penalty))
    network.add(tf.keras.layers.Dropout(dropout))
    network.add(tf.keras.layers.Dense(
        P, activation=tf.nn.sigmoid, kernel_regularizer=kernel_penalty))
    return network

## Model Training Parameters
#### Loss Function
Binary Cross-Entropy + L2 Regularization
#### Optimizer
Adam
#### Metric
Binary Accuracy and AUC (validation AUC is the objective maximized during tuning)

In [5]:
def compile_model(model, optimizer):
    """Compile ``model`` in place for binary classification and print its summary.

    The loss is binary cross-entropy; accuracy and AUC are tracked as metrics.

    Args:
        model: Keras model to compile (modified in place).
        optimizer: Optimizer passed straight through to ``model.compile``.
    """
    model.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.BinaryCrossentropy(),
        # Name the AUC metric explicitly: an unnamed metric may be auto-numbered
        # (auc_1, auc_2, ...) when several models are built in one session, which
        # would break anything that looks up the fixed keys 'auc' / 'val_auc'.
        metrics=['accuracy', tf.metrics.AUC(name='auc')],
    )
    model.summary()

## Hyperparameter Tuning

In [6]:
# Training configuration shared by the tuning and training cells.
batch_size = 100             # mini-batch size passed to model.fit
epochs = 20                  # epoch budget for the tuner and for the final fit
use_multiprocessing = False  # forwarded to model.fit / model.evaluate
validation_split = 0.2       # fraction of the training data held out for validation

def get_tunable_DNN_model(hp, d1_min, d1_max, d_step):
    """Draw hyperparameters from ``hp`` and return the resulting compiled model.

    Args:
        hp: Hyperparameter object providing ``Int`` and ``Choice``.
        d1_min: Smallest hidden-layer width to consider.
        d1_max: Largest hidden-layer width to consider.
        d_step: Step between candidate hidden-layer widths.

    Returns:
        The model built by ``make_DNN_model`` and compiled with an Adam optimizer
        at the sampled learning rate.
    """
    hidden_units = hp.Int('d1_units', min_value=d1_min, max_value=d1_max, step=d_step)
    dropout_rate = hp.Choice('dropout', values=[0.2, 0.5, 0.8])
    l2_factor = hp.Choice('l2', values=[0.01, 0.005, 0.001])
    learning_rate = hp.Choice('lr', values=[0.01, 0.005, 0.001])

    network = make_DNN_model(hidden_units, dropout_rate, l2_factor)
    adam = tf.keras.optimizers.Adam(learning_rate)
    compile_model(network, adam)
    return network

def get_tuner(model_builder):
    """Create a fresh Hyperband tuner that maximizes validation AUC.

    Any existing ``hp_tuning`` directory is deleted first, so results from a
    previous search are discarded.

    Args:
        model_builder: Callable taking a hyperparameter object and returning a
            compiled model.

    Returns:
        A ``kt.Hyperband`` tuner writing to ``hp_tuning/initial_model``.
    """
    tuning_dir = 'hp_tuning'
    try:
        shutil.rmtree(tuning_dir)
    except FileNotFoundError:
        # Nothing to clean up when no earlier search has been run.
        pass

    auc_objective = kt.Objective('val_auc', direction='max')
    return kt.Hyperband(
        model_builder,
        objective=auc_objective,
        max_epochs=epochs,
        executions_per_trial=3,
        directory=tuning_dir,
        project_name='initial_model',
    )

## Search Space

In [7]:
# Search range for the hidden-layer width, scaled by the number of phenotypes P.
d1_min = max(P // 4, 1)  # clamp to at least one unit, since P // 4 is 0 when P < 4
d1_max = P * 64
d_step = 16  # spacing between candidate widths

## Get Training Data

In [8]:
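# Synthetic placeholder data generator, kept for reference only; the cells below
# load data through input_processing.get_data instead.
# def get_data(N, A, P):
#     genotypes = tf.stack([tf.constant([[1, 0, 0]] * A, dtype=tf.float32)] * N)
#     phenotypes = tf.stack([tf.constant([1] * P, dtype=tf.float32)] * N)
#     return genotypes, phenotypes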

## Hyperparameter Tuning Call

In [9]:
# Data used only for the hyperparameter search.
# (The second get_data argument is presumably the sample count — confirm in input_processing.)
search_data = input_processing.get_data(A, 1000)
# Stratified, seeded split; the hold-out fraction comes from the shared
# validation_split constant instead of a duplicated literal.
search_train_x, search_validation_x, search_train_y, search_validation_y = (
    sklearn.model_selection.train_test_split(
        *search_data,
        test_size=validation_split,
        random_state=0,
        stratify=search_data[1],
    )
)




In [10]:
# Build the tuner over the hidden-width range given by d1_min, d1_max and d_step.
tuner = get_tuner(lambda hp: get_tunable_DNN_model(hp, d1_min, d1_max, d_step))
tuner.search_space_summary()
# Search with the same mini-batch size as the final training run, so the tuned
# learning rate is selected under the conditions it will actually be used in.
tuner.search(
    search_train_x,
    search_train_y,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(search_validation_x, search_validation_y),
)
tuner.results_summary()

Trial 30 Complete [00h 00m 27s]
val_auc: 0.5

Best val_auc So Far: 0.6916239460309347
Total elapsed time: 00h 12m 09s
INFO:tensorflow:Oracle triggered exit
Results summary
Results in hp_tuning/initial_model
Showing 10 best trials
Objective(name='val_auc', direction='max')
Trial summary
Hyperparameters:
d1_units: 17
dropout: 0.8
l2: 0.01
lr: 0.001
tuner/epochs: 7
tuner/initial_epoch: 3
tuner/bracket: 2
tuner/round: 1
tuner/trial_id: 79441ef7e29abc405f2c4ae70ecb4879
Score: 0.6916239460309347
Trial summary
Hyperparameters:
d1_units: 33
dropout: 0.8
l2: 0.01
lr: 0.01
tuner/epochs: 20
tuner/initial_epoch: 0
tuner/bracket: 0
tuner/round: 0
Score: 0.6673504114151001
Trial summary
Hyperparameters:
d1_units: 33
dropout: 0.8
l2: 0.001
lr: 0.01
tuner/epochs: 20
tuner/initial_epoch: 7
tuner/bracket: 1
tuner/round: 1
tuner/trial_id: fa07dc63f0baff6b91b3a6cdea6841ea
Score: 0.6620513002077738
Trial summary
Hyperparameters:
d1_units: 17
dropout: 0.8
l2: 0.005
lr: 0.001
tuner/epochs: 7
tuner/initial_ep

## Optimal Model

In [11]:
# Retrieve the top-scoring model from the search (a list of length one).
models = tuner.get_best_models(num_models=1)

Model: "nn_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 3000)              0         
_________________________________________________________________
dense (Dense)                (None, 17)                51017     
_________________________________________________________________
dropout (Dropout)            (None, 17)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 18        
Total params: 51,035
Trainable params: 51,035
Non-trainable params: 0
_________________________________________________________________


## Train Model

In [12]:
# Full data set for the final training run.
# (The second get_data argument is presumably the sample count — confirm in input_processing.)
x, y = input_processing.get_data(A, 100000)
# Stratified split with a fixed seed so the train/validation partition is
# reproducible across kernel restarts, matching the seeded search split.
train_x, validation_x, train_y, validation_y = sklearn.model_selection.train_test_split(
    x,
    y,
    test_size=validation_split,
    random_state=0,
    stratify=y,
)




In [13]:
# Keyword arguments shared by every fit call below.
fit_options = dict(
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(validation_x, validation_y),
    use_multiprocessing=use_multiprocessing,
    # Leave one core free when multiprocessing is on; otherwise use a single worker.
    workers=(os.cpu_count() - 1) if use_multiprocessing else 1,
)
# Continue training each model returned by the tuner on the full training split.
histories = [model.fit(x=train_x, y=train_y, **fit_options) for model in models]

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## Test Model

In [15]:
# NOTE(review): this is the same get_data(A, 1000) call used for the search data, and
# the training set also comes from get_data(A, ...). If get_data can return overlapping
# samples across calls, the test metrics include data seen during tuning/training —
# confirm that input_processing yields a disjoint hold-out set here.
test_x, test_y = input_processing.get_data(A, 1000)




NameError: ignored

In [16]:
# Evaluate the first model returned by the tuner on the test data.
model = models[0]
# Despite its name, `history` holds the value(s) returned by model.evaluate.
history = model.evaluate(
    x=test_x,
    y=test_y,
    use_multiprocessing=use_multiprocessing,
    workers=(os.cpu_count() - 1) if use_multiprocessing else 1,
)



In [17]:
# Persist the trained model under the mounted Google Drive project folder.
MODEL_SAVE_PATH = '/content/gdrive/My Drive/6.874 project/Colab/models/dnn'
model.save(MODEL_SAVE_PATH)

INFO:tensorflow:Assets written to: /content/gdrive/My Drive/6.874 project/Colab/models/dnn/assets
