The intention of this notebook is to check what kind of performance can be achieved with a CNN-only network for transfer learning, using pretrained weights and an additional hidden layer.

The new network would be the old one with all weights of the pretrained CNN fixed and its last layer thrown away. This is applied twice (once for the wildtype and once for the mutant sequence); the outputs of the three different layers (01, 02, 03 as in the other notebooks) are concatenated, and a hidden layer and an output neuron are appended.

This is equivalent to defining a fully connected NN with one hidden layer and one output neuron, whose input is the representation extracted from the transfer learning CNN model (since its weights would be fixed). Since setting this up and training it is easier, this is what I will do.

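Additionally, the difference of the mutant and wildtype representations is available in the data (the `diff_rep` columns). Note that the data preparation below filters these columns out, so in this notebook the network is only fed the wildtype and mutant representations.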

## Notebook Setup

In [0]:
#Imports:
import os
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import preprocessing
import matplotlib.pyplot as plt
import pickle
from sklearn.model_selection import RepeatedKFold
import time
import datetime

In [0]:
# Connect to Google Drive: mount it inside the Colab runtime so the
# data folders used below can be reached through the file system.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Set up file paths.
# Folders (relative to the working directory) holding the CNN
# representations, the baseline targets and the result tables.
data_folder_loc = "gdrive/My Drive/iGEM/Databases/Merck&Co/Data/T1626/cnn_rep_sampler_T1626/"
base_data_folder_loc = "gdrive/My Drive/iGEM/Databases/Merck&Co/Data/T1626/own_baseline_xgboost_T1626/"
table_loc = "gdrive/My Drive/iGEM/Databases/Merck&Co/Models/CNN_Features_Pretrain_Cont/"

# Target labels and the three CNN representation tables (01, 02, 03).
y_target_loc = base_data_folder_loc + "y_label_own_baseline_xgboost_T1626.pkl"
X_seq_all_01_loc, X_seq_all_02_loc, X_seq_all_03_loc = (
  data_folder_loc + csv_name
  for csv_name in (
    "sequence_cnn_rep_all_default_01.csv",
    "sequence_cnn_rep_all_default_02.csv",
    "sequence_cnn_rep_all_default_03.csv",
  )
)

# Sanity check: report whether every folder and file actually exists.
print("Folders: ")
for folder_loc in (data_folder_loc, base_data_folder_loc, table_loc):
  print("\t", os.path.isdir(folder_loc))
print("Files: ")
for file_loc in (y_target_loc, X_seq_all_01_loc, X_seq_all_02_loc, X_seq_all_03_loc):
  print("\t", os.path.isfile(file_loc))

Folders: 
	 True
	 True
	 True
Files: 
	 True
	 True
	 True
	 True


In [0]:
#XGBoost
# Install a pinned xgboost release through the shell; the trailing shell
# comment records why this version was chosen (working GPU support).
!pip install xgboost==0.82 #working GPU support
# NOTE(review): `xgb` is not referenced in the visible cells of this notebook.
import xgboost as xgb



In [0]:
#BayesianOptimization
# NOTE(review): the package version is not pinned here, so a fresh runtime
# may install a different release than the one originally used.
!pip install bayesian-optimization
# NOTE(review): `BayesianOptimization` is not referenced in the visible cells
# of this notebook.
from bayes_opt import BayesianOptimization



## Prepare Data

In [0]:
# Load the target labels and the three CNN representation tables.
# Note: unpickling can execute arbitrary code, so only load trusted files.
y_label = pd.read_pickle(y_target_loc)

# Each CSV uses its first column as the row index.
X_seq_all_01, X_seq_all_02, X_seq_all_03 = (
  pd.read_csv(csv_loc, index_col=0)
  for csv_loc in (X_seq_all_01_loc, X_seq_all_02_loc, X_seq_all_03_loc)
)

In [0]:
def _without_diff_rep(frame):
  """Return `frame` restricted to the columns whose name lacks "diff_rep"."""
  kept_columns = [column for column in frame.columns if "diff_rep" not in column]
  return frame[kept_columns]

# Keep only the columns that are not difference representations.
X_seq_wt_mut_01 = _without_diff_rep(X_seq_all_01)
X_seq_wt_mut_02 = _without_diff_rep(X_seq_all_02)
X_seq_wt_mut_03 = _without_diff_rep(X_seq_all_03)

In [0]:
# Place the filtered 01, 02 and 03 representations side by side (column-wise).
X_seq_wt_mut_01_02_03 = pd.concat(
  [
    X_seq_wt_mut_01,
    X_seq_wt_mut_02,
    X_seq_wt_mut_03,
  ],
  axis="columns",
)

## N-Trial nested K-Fold Crossvalidation NN Pretrained Function

In [0]:
def n_trial_nested_k_fold_crossvalidation_NN(data_x, data_y, num_trials=7, eval_metric="mae", outer_fold=5, repeated_k_fold_seed=42375, inner_test_split=0.2):
  """Evaluate a one-hidden-layer network with repeated K-fold cross-validation.

  For every outer fold a fresh model (Dense(512) -> BatchNormalization ->
  ReLU -> Dense(1, linear)) is trained with the MAE loss on an inner split of
  the training fold; the held-out part of that inner split drives
  checkpointing and early stopping. The checkpointed weights are then scored
  on the outer test fold.

  Args:
    data_x: Feature matrix, one row per sample. Converted with `np.asarray`.
    data_y: Targets aligned row-by-row with `data_x`. Converted with
      `np.asarray` so that it is always indexed by position, like `data_x`.
    num_trials: Number of repetitions of the outer K-fold split.
    eval_metric: Only printed in the header; it does not influence training
      or scoring (MAE and RMSE are always reported).
    outer_fold: Number of folds of the outer cross-validation.
    repeated_k_fold_seed: Seed for the repeated K-fold split and for the inner
      train/validation split.
    inner_test_split: Fraction of each training fold held out for validation.

  Returns:
    pandas.DataFrame with one row per outer fold (over all trials) and the
    columns "scores_mae" and "scores_rmse".
  """
  print("Trials: ", num_trials)
  print("Evaluation metric: ", eval_metric)
  print("Outer folds: ", outer_fold)

  # The fold indices produced below are positions. Converting both inputs to
  # arrays guarantees positional indexing; a pandas Series would otherwise be
  # indexed by label and could silently misalign with the feature rows.
  data_x = np.asarray(data_x)
  data_y = np.asarray(data_y)

  # One score per outer fold, accumulated over all trials.
  scores_mae = []
  scores_rmse = []

  # Temporary file holding the best weights of the fold currently in training.
  weight_loc = "model_temp.hdf5"

  total_fold_counter = 0
  rkf = RepeatedKFold(n_splits=outer_fold, n_repeats=num_trials, random_state=repeated_k_fold_seed)
  # Main loop over all trials and outer folds
  for train_index, test_index in rkf.split(data_x):
    # Measure runtime
    start_time = time.time()

    # Print current status
    print("\nTrial number: ", (total_fold_counter//outer_fold)+1)
    print("Fold number: ", (total_fold_counter%outer_fold)+1)
    total_fold_counter += 1

    # Drop the Keras state of the previous fold so that the models built in
    # this loop do not pile up in memory.
    tf.keras.backend.clear_session()

    # Remove the checkpoint of the previous fold: if no checkpoint is written
    # for this fold, stale weights must not be evaluated instead.
    if os.path.isfile(weight_loc):
      os.remove(weight_loc)

    # Select current train and test data
    X_train, X_test = data_x[train_index], data_x[test_index]
    y_train, y_test = data_y[train_index], data_y[test_index]

    # Split the training fold into an inner train/validation part
    # (validation is used for checkpointing and early stopping)
    X_train_inner, X_test_inner, y_train_inner, y_test_inner = train_test_split(X_train, y_train, test_size=inner_test_split, random_state=repeated_k_fold_seed)

    # Define model
    inputs = tf.keras.layers.Input(shape=(X_train_inner.shape[-1],))

    x = tf.keras.layers.Dense(512)(inputs)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Activation("relu")(x)
    x = tf.keras.layers.Dense(1, activation="linear")(x)

    model = tf.keras.Model(inputs=inputs, outputs=x)

    # Compile the model
    model.compile(
      optimizer=tf.train.AdamOptimizer(),
      loss=tf.keras.losses.mean_absolute_error,
      metrics=['mae']
    )

    # Keep only the weights with the best validation loss and stop once the
    # validation loss has not improved for 4 consecutive epochs.
    callbacks_list = [
        tf.keras.callbacks.ModelCheckpoint(
            filepath=weight_loc,
            save_weights_only=True,
            monitor='val_loss',
            save_best_only=True,
            mode='auto',
            period=1),
        tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                         patience=4,
                                         mode='auto')
    ]

    model.fit(X_train_inner,
              y_train_inner,
              validation_data=(X_test_inner,y_test_inner),
              epochs=25,
              batch_size=8,
              callbacks=callbacks_list,
              verbose=1)

    # Load the best temporary model; if no checkpoint was written the model
    # keeps the weights of its last training epoch.
    if os.path.isfile(weight_loc):
      model.load_weights(weight_loc)

    # Calculate predictions on the outer test fold
    y_pred = model.predict(X_test, batch_size=1024, verbose=1)

    # Report test scores
    scores_mae.append(mean_absolute_error(y_test, y_pred))
    scores_rmse.append(np.sqrt(mean_squared_error(y_test, y_pred)))
    print("MAE: ", scores_mae[-1], " RMSE: ",scores_rmse[-1], " Runtime: ", str(datetime.timedelta(seconds=(time.time()-start_time))))

  print()
  results = pd.DataFrame({"scores_mae": scores_mae, "scores_rmse": scores_rmse})
  return results

## Evaluate model

### Model fully_connected_512

In [0]:
# Alias for the concatenated wildtype/mutant representations.
# NOTE(review): `data` is not referenced by the evaluation call in the visible
# cells, which passes X_seq_wt_mut_01_02_03 directly.
data = X_seq_wt_mut_01_02_03

In [0]:
#define model:



```
Layer (type)                 Output Shape              Param #   
=================================================================
input_3 (InputLayer)         (None, 3584)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 512)               1835520   
_________________________________________________________________
batch_normalization_v1_1 (Ba (None, 512)               2048      
_________________________________________________________________
activation_1 (Activation)    (None, 512)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 513       
=================================================================
Total params: 1,838,081
Trainable params: 1,837,057
Non-trainable params: 1,024
```



In [0]:
# Run the repeated K-fold evaluation with the default settings on the
# concatenated representations (passed as a plain array) and the targets.
results = n_trial_nested_k_fold_crossvalidation_NN(
  data_x=X_seq_wt_mut_01_02_03.values,
  data_y=y_label,
)

Trials:  7
Evaluation metric:  mae
Outer folds:  5

Trial number:  1
Fold number:  1
Train on 1040 samples, validate on 260 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
MAE:  3.5592830141949134  RMSE:  4.901196341564822  Runtime:  0:00:18.491743

Trial number:  1
Fold number:  2
Train on 1040 samples, validate on 261 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
MAE:  3.5179724080860613  RMSE:  4.645746954628608  Runtime:  0:00:17.947835

Trial number:  1
Fold number:  3
Train on 1040 samples, validate on 261 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
MAE:  3.9464627994674903  RMSE:  5.283763665676268  Runtime:  0:00:15.181728

Trial number:  1
Fold number:  4
Train on 1040 samples, validate on 261 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
MAE:  3.838093765909855  RMSE:  5.29214

In [0]:
# Persist the per-fold scores as a CSV table inside the results folder.
results_table_loc = table_loc + "pretrain_cont_results_table.csv"
results.to_csv(results_table_loc)

In [0]:
# Summary statistics (count, mean, std, quartiles) of the per-fold scores.
results.describe()

Unnamed: 0,scores_mae,scores_rmse
count,35.0,35.0
mean,3.705597,4.96856
std,0.170001,0.245597
min,3.399547,4.529309
25%,3.569561,4.777363
50%,3.713882,4.961233
75%,3.840561,5.176313
max,3.97577,5.345837


## Results


*  Model fully_connected_512: 3.705597 MAE, 4.968560 RMSE (mean over the 35 outer folds, see the summary table above)