# Model Development

### Load in the data

In [1]:
import pandas as pd
import numpy as np

# Kinases modelled in this notebook; each one has its own train/test CSV pair.
kins = ['JAK1', 'JAK2', 'JAK3', 'TYK2']

# CSV path for every kinase, keyed by kinase name.
train_file_names = {name: "data/train_" + name + ".csv" for name in kins}
test_file_names = {name: "data/test_" + name + ".csv" for name in kins}

# Loaded DataFrames, keyed by kinase name.
train_dfs = {}
test_dfs = {}

for kin in kins:
    # The 'Unnamed: 0' column is dropped from every frame right after loading.
    train_dfs[kin] = pd.read_csv(train_file_names[kin]).drop(columns=['Unnamed: 0'])
    test_dfs[kin] = pd.read_csv(test_file_names[kin]).drop(columns=['Unnamed: 0'])

# Show one training frame as a sanity check of the loaded columns.
display(train_dfs['JAK1'])

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC25,PC26,PC27,PC28,PC29,PC30,PC31,PC32,PC33,target
0,0.210370,-3.783125,-3.109984,-2.330564,0.194412,-0.106196,1.359164,-0.800872,-0.018861,-0.106735,...,-0.448110,-0.216595,0.238602,-0.064950,0.374568,0.152447,0.038673,-0.005687,0.284065,9.11
1,-4.652259,-1.481696,-1.125819,2.124545,-0.359850,-0.727299,-0.421893,-0.488968,-0.470321,0.748475,...,-0.183500,0.387231,-0.063486,0.376771,-0.169494,-0.115262,-0.166771,-0.088883,-0.336320,9.04
2,0.866014,-3.058621,-2.573576,-1.358566,0.107867,1.033776,0.713390,-0.566665,-0.056746,0.425253,...,0.078281,0.297077,0.336427,0.075464,0.003606,-0.049016,0.092363,-0.015393,-0.100124,8.94
3,-3.691269,-0.402958,0.439577,-0.756925,-0.220367,0.326274,-0.581471,-1.383755,0.853735,-0.420171,...,-0.036835,0.182699,-0.114823,0.318219,-0.046057,-0.087436,-0.472599,0.064285,-0.018419,9.15
4,-2.843302,-0.335543,-0.126733,-1.664559,0.824550,-0.191094,-0.474833,-0.662604,-0.009798,-0.112947,...,-0.208560,-0.025368,0.172369,-0.155116,0.340742,0.103013,-0.019645,-0.131348,-0.153660,9.62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
543,-1.228603,-0.642169,-0.511690,2.357222,0.040193,2.118894,1.133076,-1.983986,1.294645,-0.747971,...,-0.222962,0.718315,0.090781,0.129622,0.087587,0.148446,0.129731,-0.326210,0.141956,8.91
544,5.776524,-0.193315,1.423830,-0.006551,-0.832585,0.248954,-1.536620,-0.245970,-0.007010,-1.118094,...,-0.317216,0.178732,-0.037934,-0.227529,0.203615,-0.227141,-0.661057,0.350377,0.033586,9.79
545,6.397876,5.206743,-1.936534,-0.469356,-0.005460,-0.688453,-0.433237,0.020591,0.173213,0.567808,...,0.127434,-0.763843,-0.001832,0.110674,0.044971,0.151868,0.395583,0.054981,-0.038778,10.61
546,2.305427,7.295340,-0.521168,0.069176,0.365588,-1.432064,0.930092,-1.407111,-0.348041,-0.465773,...,0.134273,-0.065842,-0.189253,-0.713009,0.120962,-0.210204,-0.054649,-0.295065,-0.132540,10.93


### Build MLPs for each kinase and optimize hyperparameters
Here we find the optimal hyperparameters for each of the networks. The hyperparameters we tune are the number of hidden layers, the number of neurons per layer, and the learning rate. A fixed dropout rate (0.2) is applied after every hidden layer to reduce overfitting, since the search range for the number of hidden layers extends all the way up to 16.

In [None]:
import tensorflow as tf
from tensorflow import keras
import keras_tuner as kt
from sklearn import ensemble
from sklearn import datasets
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection

def model_builder(hp):
    """Build and compile an MLP regressor from KerasTuner hyperparameters.

    Hyperparameters sampled from ``hp``:
      * ``neurons``       -- units in every hidden layer (64, 128, 256 or 512)
      * ``layers``        -- number of hidden layers (3 through 16)
      * ``learning_rate`` -- Adam learning rate (1e-2, 1e-3 or 1e-4)

    Args:
        hp: the KerasTuner hyperparameter container supplied by the tuner.

    Returns:
        A compiled ``keras.Sequential`` model with a single linear output,
        trained on MSE and reporting ``mean_squared_error`` as a metric.
    """
    network = keras.Sequential()

    # First layer: 33 units with no activation. Note this is a regular Dense
    # layer rather than a keras.Input, so it applies a linear transform.
    network.add(keras.layers.Dense(33))

    # Width and depth of the hidden stack are both searched.
    width = hp.Choice('neurons', values=[64, 128, 256, 512])
    depth = hp.Choice('layers', values = range(3, 17))

    # Every hidden layer is a ReLU Dense layer followed by fixed 20% dropout.
    for _ in range(depth):
        network.add(keras.layers.Dense(width, activation='relu', kernel_initializer='he_uniform'))
        network.add(keras.layers.Dropout(0.2))

    # Single linear unit for the regression target.
    network.add(keras.layers.Dense(1, activation='linear'))

    # The optimizer's learning rate is the third searched hyperparameter.
    step_size = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    optimizer = keras.optimizers.Adam(learning_rate=step_size)

    # The metric name must stay 'mean_squared_error': the tuner objective in
    # this notebook monitors 'val_mean_squared_error'.
    network.compile(optimizer=optimizer, loss='mse', metrics=['mean_squared_error'])

    return network

def get_best_hyperparameters(kin, train_data, epochs=50, max_trials=10, patience=10):
    """Run a Bayesian hyperparameter search for one kinase's MLP.

    The search uses ``model_builder`` as the hypermodel and minimises the
    validation MSE measured on a 20% validation split of ``train_data``.

    Args:
        kin: kinase name. Not currently used by the body; kept so existing
            callers keep working.
        train_data: DataFrame holding the feature columns plus a ``'target'``
            column.
        epochs: maximum number of training epochs per trial.
        max_trials: number of hyperparameter combinations the tuner tries.
        patience: number of epochs without improvement in
            ``val_mean_squared_error`` after which a trial is stopped early.

    Returns:
        dict with keys ``'neurons'``, ``'layers'`` and ``'learning'`` holding
        the best trial's hidden-layer width, hidden-layer count and learning
        rate.
    """
    x_train = train_data.drop(columns=['target']).to_numpy(dtype=float)
    y_train = train_data['target'].to_numpy(dtype=float)
    print(x_train.shape)

    tuner = kt.BayesianOptimization(model_builder,
                                    objective=kt.Objective("val_mean_squared_error", "min"),
                                    overwrite=True,
                                    max_trials=max_trials
                                   )

    # Stop a trial once the validation error stops improving. The callback
    # used to be created but never passed to the search, and its patience was
    # larger than the epoch budget, so it could never trigger. `mode` is set
    # explicitly because the monitored quantity is an error to be minimised.
    stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_mean_squared_error',
                                                  mode='min',
                                                  patience=patience)

    tuner.search(x_train,
                 y_train,
                 epochs=epochs,
                 validation_split=0.2,
                 callbacks=[stop_early])

    best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
    # Note the output key is 'learning', not 'learning_rate'; downstream code
    # in this notebook reads it under that name.
    hp_dict = {
        'neurons': best_hps.get('neurons'),
        'layers': best_hps.get('layers'),
        'learning': best_hps.get('learning_rate')
    }
    return hp_dict

# Tune every kinase's network independently. Results are stored as they
# arrive, so an interrupted run still keeps the kinases already finished.
optimal_hps = {}
for kin in kins:
    optimal_hps[kin] = get_best_hyperparameters(kin, train_dfs[kin])

# Report the selected hyperparameters, one kinase at a time.
for kinase, best_settings in optimal_hps.items():
    print(kinase)
    print(best_settings)

Trial 7 Complete [00h 00m 02s]
val_mean_squared_error: 1.2417775392532349

Best val_mean_squared_error So Far: 0.4018065631389618
Total elapsed time: 00h 00m 31s

Search: Running Trial #8

Value             |Best Value So Far |Hyperparameter
128               |64                |neurons
3                 |5                 |layers
0.01              |0.01              |learning_rate

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
 1/17 [>.............................] - ETA: 0s - loss: 4.9516 - mean_squared_error: 4.9516

### Evaluate model performance on test data
We use mean squared error as the loss function because it penalizes large errors in the predicted pKi most heavily. Large errors could cause a strong binder to be missed entirely, or cause a compound to be assumed to bind strongly when in reality it binds weakly or not at all.

In [None]:
def build_model(num_neurons, num_layers, learning_rate):
    """Assemble and compile the MLP regressor for a fixed set of hyperparameters.

    Args:
        num_neurons: units in every hidden layer.
        num_layers: number of hidden layers (integer count).
        learning_rate: learning rate passed to the Adam optimizer.

    Returns:
        A compiled ``keras.Sequential`` model using mean squared error as its
        loss (no additional metrics).
    """
    # First layer: 33 units with no activation (a regular Dense layer).
    stack = [keras.layers.Dense(33)]

    # Each hidden layer is a ReLU Dense layer followed by fixed 20% dropout.
    for _ in range(num_layers):
        stack.extend((
            keras.layers.Dense(num_neurons, activation='relu', kernel_initializer='he_uniform'),
            keras.layers.Dropout(0.2),
        ))

    # Single linear unit for the regression target.
    stack.append(keras.layers.Dense(1, activation='linear'))

    network = keras.Sequential(stack)
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    network.compile(optimizer=optimizer, loss='mean_squared_error')

    return network



def evaluate_model(kin, train_data, test_data, optimal_hps):
    """Train one kinase's MLP with its tuned hyperparameters and score it on test data.

    Args:
        kin: kinase name; selects the entry of ``optimal_hps`` and labels the
            printed report.
        train_data: DataFrame with feature columns plus a ``'target'`` column.
        test_data: DataFrame with the same layout, used only for evaluation.
        optimal_hps: mapping from kinase name to a dict with the keys
            ``'neurons'``, ``'layers'`` and ``'learning'``.

    Returns:
        Whatever ``model.evaluate`` returns for the test set. ``build_model``
        compiles with a loss and no metrics, so this is the test MSE.
    """
    def split_features_target(frame):
        # Everything except 'target' is a feature; both come back as float arrays.
        features = frame.drop(columns=['target']).to_numpy(dtype=float)
        target = frame['target'].to_numpy(dtype=float)
        return features, target

    x_train, y_train = split_features_target(train_data)
    x_test, y_test = split_features_target(test_data)

    tuned = optimal_hps[kin]
    model = build_model(tuned['neurons'], tuned['layers'], tuned['learning'])

    # Halt after 1000 epochs without validation-loss improvement and roll the
    # weights back to the best epoch seen.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=1000,
        verbose=0,
        mode='auto',
        restore_best_weights=True,
    )

    # 20% of the training rows are held out as the validation set that the
    # early-stopping callback monitors.
    model.fit(
        x_train,
        y_train,
        epochs=10000,
        verbose=0,
        validation_split=0.2,
        callbacks=[early_stopping],
    )

    print("Evaluate on test data for " + kin)
    results = model.evaluate(x_test, y_test, batch_size=128)
    print("Test MSE:", results)
    return results