# 4-fold CNN Cross Validation

In [1]:
import tensorflow as tf
import os
# Temporarily switch into ../../src/ so that `dataloader` can be imported from
# there, then return to the directory the notebook was started in. Every
# relative path used later in this notebook is resolved from that directory.
cwd = os.getcwd()
os.chdir('../../src/')
try:
    from dataloader import OneHotEncoder, load_fasta_data, RNASeqDataGenerator
finally:
    # Restore the saved directory rather than a hard-coded relative path, and
    # do it even when the import fails so the kernel is never left in src/.
    os.chdir(cwd)

In [2]:
# CUDA / TensorFlow environment settings: enumerate devices by PCI bus id,
# expose only device "0", and set the TensorFlow C++ log level to '2'.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
    'TF_CPP_MIN_LOG_LEVEL': '2',
})

# Session configuration: log device placement and let GPU memory grow on
# demand instead of being reserved up front.
config = tf.ConfigProto(
    log_device_placement=True,
    gpu_options=tf.GPUOptions(allow_growth=True),
)
# NOTE(review): `sess` is not referenced again in the visible cells.
sess = tf.Session(config=config)

Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device
/job:localhost/replica:0/task:0/device:XLA_GPU:0 -> device: XLA_GPU device
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla K80, pci bus id: 0000:00:04.0, compute capability: 3.7



In [38]:
from tqdm import tqdm
# Experiment identifiers: `be_type` selects the raw-data sub-directory, while
# `sample` and `be_class` are used to name the saved models and results.
be_type = 'CBE'
sample = '162F'
be_class = 'A3A'

data_dir = "../../data/raw/{0}/{0}-sequence/".format(be_type)

# Chromosome labels "1" .. "22" followed by "X".
train_chr = [str(x) for x in range(1, 23)] + ['X']

# Load each chromosome once and keep the per-chromosome lists. Prepending to a
# growing list inside the loop would copy everything loaded so far on every
# iteration; collecting chunks and flattening once avoids that.
seq_chunks, er_chunks = [], []
for chrom in tqdm(train_chr):
    seq, er = load_fasta_data(sample, chrom, data_dir)
    seq_chunks.append(seq)
    er_chunks.append(er)

# Flatten in reverse load order (last chromosome first). The fold boundaries
# are plain index slices of these lists, so the element order must stay
# exactly as it was when the fold models were trained.
train_seqs = [item for chunk in reversed(seq_chunks) for item in chunk]
train_ers = [item for chunk in reversed(er_chunks) for item in chunk]

100%|██████████| 23/23 [00:08<00:00,  2.59it/s]


In [39]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, BatchNormalization, MaxPooling1D, Dropout

def build_model():
    """Build and compile a fresh, untrained CNN for one cross-validation fold.

    The network takes inputs of shape (101, 4), applies three Conv1D +
    BatchNormalization stages (with one max-pooling step after the second),
    and ends in a single sigmoid unit, so every prediction lies in (0, 1).
    It is compiled with RMSprop and an MSE loss, tracking MAE and MSE.

    Returns:
        A compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Conv1D(32, kernel_size=32, activation='relu', input_shape=(101,4)))
    model.add(BatchNormalization())
    model.add(Conv1D(32, kernel_size=32, activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling1D())
    model.add(Conv1D(32, kernel_size=16, activation='relu'))
    model.add(BatchNormalization())
    model.add(Flatten())
    model.add(Dense(16, activation = 'relu'))
    model.add(Dense(1, activation = 'sigmoid'))
    model.compile(optimizer="RMSprop", loss="mse", metrics=["mae", 'mse'])
    return model


n_folds = 4
# Fold size is loop-invariant, so compute it once. With integer division the
# trailing len(train_seqs) % n_folds samples are never held out; they remain
# in the training portion of every fold.
splits = len(train_seqs) // n_folds

# model.save() does not create missing directories; make sure the output
# directory exists before spending time on training.
os.makedirs(be_class, exist_ok=True)

for fold in range(1, n_folds + 1):
    # Drop the Keras graph/session state left over from the previous fold so
    # memory use does not grow with every model built in this loop.
    tf.keras.backend.clear_session()

    # Fold k holds out the k-th contiguous slice and trains on the rest.
    test_start, test_stop = splits * (fold - 1), splits * fold
    fold_test_seqs = train_seqs[test_start:test_stop]
    fold_test_ers = train_ers[test_start:test_stop]
    fold_train_seqs = train_seqs[:test_start] + train_seqs[test_stop:]
    fold_train_ers = train_ers[:test_start] + train_ers[test_stop:]
    traingen = RNASeqDataGenerator(fold_train_seqs, fold_train_ers, batch_size=1024, logits=False)
    testgen = RNASeqDataGenerator(fold_test_seqs, fold_test_ers, logits=False)

    model = build_model()
    print('Model built!')
    # summary() prints the table itself and returns None, so it is not
    # wrapped in print() (which would emit a stray "None" line).
    model.summary()
    print('Training model')
    history = model.fit(traingen, validation_data=testgen, workers=10, use_multiprocessing=True, epochs=3)
    print('Model trained!')
    model.save("{}/{}Fold-{}.h5".format(be_class, sample, fold))

Model built!
Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_12 (Conv1D)           (None, 70, 32)            4128      
_________________________________________________________________
batch_normalization_12 (Batc (None, 70, 32)            128       
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 39, 32)            32800     
_________________________________________________________________
batch_normalization_13 (Batc (None, 39, 32)            128       
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 19, 32)            0         
_________________________________________________________________
conv1d_14 (Conv1D)           (None, 4, 32)             16416     
_________________________________________________________________
batch_normalization_14 (Batc (None, 4, 32

Epoch 1/3
Epoch 2/3
Epoch 3/3
Model trained!
Model built!
Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_21 (Conv1D)           (None, 70, 32)            4128      
_________________________________________________________________
batch_normalization_21 (Batc (None, 70, 32)            128       
_________________________________________________________________
conv1d_22 (Conv1D)           (None, 39, 32)            32800     
_________________________________________________________________
batch_normalization_22 (Batc (None, 39, 32)            128       
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 19, 32)            0         
_________________________________________________________________
conv1d_23 (Conv1D)           (None, 4, 32)             16416     
______________________________________________________________

# Model Performance Metrics

In [40]:
# Experiment identifiers, set to the same values as in the training cell so
# that this cell reads the same data and loads the models saved there.
be_type = 'CBE'   # selects the raw-data sub-directory
sample = '162F'   # sample id used in data loading and file names
be_class = 'A3A'  # directory holding the saved fold models
from scipy import stats
from tqdm import tqdm
import numpy as np
import tensorflow as tf

def rmse(x, y):
    """Return the root-mean-square error between ``x`` and ``y``.

    Args:
        x: Array-like of numbers (NumPy array, list, tuple, ...).
        y: Array-like of numbers with a shape compatible with ``x``.

    Returns:
        ``sqrt(mean((x - y) ** 2))`` as a NumPy floating-point scalar.
    """
    # Coerce to arrays so plain Python sequences work as well as ndarrays;
    # ndarray inputs are passed through unchanged.
    x = np.asarray(x)
    y = np.asarray(y)
    return np.sqrt(np.mean((x - y) ** 2))

def spearman(x, y):
    """Return the Spearman rank correlation coefficient of ``x`` and ``y``.

    Only the correlation statistic from ``scipy.stats.spearmanr`` is
    returned; the accompanying p-value is discarded.
    """
    rho, _pvalue = stats.spearmanr(x, y)
    return rho
def pearson(x, y):
    """Return the Pearson correlation coefficient of ``x`` and ``y``.

    Only the correlation statistic from ``scipy.stats.pearsonr`` is
    returned; the accompanying p-value is discarded.
    """
    coefficient, _pvalue = stats.pearsonr(x, y)
    return coefficient
    
data_dir = "../../data/raw/{0}/{0}-sequence/".format(be_type)

# Chromosome labels "1" .. "22" followed by "X".
train_chr = [str(x) for x in range(1, 23)] + ['X']

# Load each chromosome once and keep the per-chromosome lists. Prepending to a
# growing list inside the loop would copy everything loaded so far on every
# iteration; collecting chunks and flattening once avoids that.
seq_chunks, er_chunks = [], []
for chrom in tqdm(train_chr):
    seq, er = load_fasta_data(sample, chrom, data_dir)
    seq_chunks.append(seq)
    er_chunks.append(er)

# Flatten in reverse load order (last chromosome first). The held-out slices
# below are taken by index, so the order must match the one used when the
# fold models were trained; otherwise a model would be scored on samples it
# was trained on.
train_seqs = [item for chunk in reversed(seq_chunks) for item in chunk]
train_ers = [item for chunk in reversed(er_chunks) for item in chunk]
        
# Size of each held-out slice; must match the value used during training.
splits = len(train_seqs)//4

# Per-fold metrics, appended in fold order (1..4).
r = []  # RMSE
s = []  # Spearman correlation
p = []  # Pearson correlation

for fold in range(1, 5):
    # Release the Keras state of the previously loaded model so memory does
    # not accumulate while the four fold models are loaded in turn.
    tf.keras.backend.clear_session()
    model = tf.keras.models.load_model("{}/{}Fold-{}.h5".format(be_class, sample, fold))

    # Rebuild this fold's held-out slice with the same indices as training.
    test_seqs2 = train_seqs[splits*(fold-1):splits*fold]
    test_ers = train_ers[splits*(fold-1):splits*fold]
    testgen = RNASeqDataGenerator(test_seqs2, test_ers, logits=False)

    # The model emits one value per sample in a trailing axis of length 1.
    preds = np.squeeze(model.predict(testgen), axis=1)

    # Collect the targets exactly as the generator yields them.
    # NOTE(review): predictions and targets come from two separate passes
    # over `testgen`; this assumes the generator yields batches in the same
    # order on both passes -- confirm against RNASeqDataGenerator.
    test_er = np.hstack([y for _, y in tqdm(testgen)])

    # Fail loudly instead of letting NumPy broadcast mismatched arrays into a
    # meaningless metric.
    if preds.shape != test_er.shape:
        raise ValueError(
            "Fold {}: predictions have shape {} but targets have shape {}".format(
                fold, preds.shape, test_er.shape))

    r.append(rmse(preds, test_er))
    s.append(spearman(preds, test_er))
    p.append(pearson(preds, test_er))

100%|██████████| 23/23 [00:08<00:00,  2.81it/s]
100%|██████████| 2772/2772 [00:20<00:00, 137.61it/s]
100%|██████████| 2772/2772 [00:20<00:00, 136.79it/s]
100%|██████████| 2772/2772 [00:19<00:00, 138.62it/s]
100%|██████████| 2772/2772 [00:20<00:00, 138.13it/s]


In [41]:
# Per-fold RMSE between predictions and held-out targets (folds 1-4, in order).
r

[0.026893737244911335,
 0.027385156519877184,
 0.027246271941422186,
 0.027692689921831843]

In [42]:
# Per-fold Pearson correlation between predictions and held-out targets.
p

[0.609665201774, 0.6083757474965663, 0.6109869328891024, 0.6020256483641462]

In [43]:
# Per-fold Spearman correlation between predictions and held-out targets.
s

[0.42772313070099693,
 0.4430799636111852,
 0.44675761853479046,
 0.4575794858024004]

In [44]:
# All three metric lists together: (RMSE, Spearman, Pearson), one entry per fold.
r, s, p

([0.026893737244911335,
  0.027385156519877184,
  0.027246271941422186,
  0.027692689921831843],
 [0.42772313070099693,
  0.4430799636111852,
  0.44675761853479046,
  0.4575794858024004],
 [0.609665201774, 0.6083757474965663, 0.6109869328891024, 0.6020256483641462])

In [45]:
import pandas as pd
# One row per fold: the fold number followed by its three metrics.
df = pd.DataFrame({
    'Fold': range(1,5),
    'RMSE': r,
    'SpearmanR': s,
    'PearsonR': p,
})

In [46]:
# Preview the per-fold results table (four rows, so head() shows all of it).
df.head()

Unnamed: 0,Fold,RMSE,SpearmanR,PearsonR
0,1,0.026894,0.427723,0.609665
1,2,0.027385,0.44308,0.608376
2,3,0.027246,0.446758,0.610987
3,4,0.027693,0.457579,0.602026


In [47]:
# Write the per-fold metrics next to the saved fold models, without the
# DataFrame index column.
results_path = "{}/{}results.csv".format(be_class, sample)
df.to_csv(results_path, index=False)