# 4-fold CNN Cross Validation

In [1]:
import tensorflow as tf
import os
# The `dataloader` module lives in ../../src/ relative to this notebook, so the
# working directory is switched there just long enough for the import to resolve.
cwd = os.getcwd()
os.chdir('../../src/')
try:
    from dataloader import OneHotEncoder, load_fasta_data, RNASeqDataGenerator
finally:
    # Go back to the directory we started from (saved in `cwd`) even if the
    # import raises, so the relative data/model paths used later in the notebook
    # are never resolved against src/.
    os.chdir(cwd)

In [6]:
# CUDA / TensorFlow environment switches, applied in one go:
# device ordering by PCI bus id, only device "0" visible, TF C++ log level '2'.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
    'TF_CPP_MIN_LOG_LEVEL': '2',
})

# Session configuration: log where each op is placed and let the GPU memory
# allocation grow on demand.
config = tf.ConfigProto(log_device_placement=True)
config.gpu_options.allow_growth = True

sess = tf.Session(config=config)

Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device
/job:localhost/replica:0/task:0/device:XLA_GPU:0 -> device: XLA_GPU device
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla K80, pci bus id: 0000:00:04.0, compute capability: 3.7



In [17]:
from tqdm import tqdm
# Dataset selection used for the training run below.
be_type, sample, be_class = 'ABE', '158B', 'ABEmax'

data_dir = "../../data/raw/{0}/{0}-sequence/".format(be_type)

# Chromosome labels '1' .. '22' followed by 'X'.
train_chr = list(map(str, range(1, 23))) + ['X']

# Every chromosome's records are *prepended*, so the final lists run from the
# last chromosome loaded back to the first. The fold slicing further down
# depends on this ordering.
train_seqs = []
train_ers = []
for chrom in tqdm(train_chr):
    seq, er = load_fasta_data(sample, chrom, data_dir)
    train_seqs = seq + train_seqs
    train_ers = er + train_ers

100%|██████████| 23/23 [00:05<00:00,  4.36it/s]


In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, BatchNormalization, MaxPooling1D, Dropout

def build_cnn(input_shape=(101, 4)):
    """Build and compile the 1-D CNN regressor trained on every fold.

    Architecture: two Conv1D(32, kernel 32) layers, max pooling, one
    Conv1D(32, kernel 16) layer (each convolution followed by batch
    normalisation), then Flatten -> Dense(16, relu) -> Dense(1, sigmoid).
    Compiled with RMSprop and an MSE loss; MAE and MSE are tracked as metrics.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample as (steps, channels). The default (101, 4)
        presumably corresponds to a one-hot encoded 101-nt window -- confirm
        against RNASeqDataGenerator.

    Returns
    -------
    A compiled ``Sequential`` model with freshly initialised weights.
    """
    model = Sequential([
        Conv1D(32, kernel_size=32, activation='relu', input_shape=input_shape),
        BatchNormalization(),
        Conv1D(32, kernel_size=32, activation='relu'),
        BatchNormalization(),
        MaxPooling1D(),
        Conv1D(32, kernel_size=16, activation='relu'),
        BatchNormalization(),
        Flatten(),
        Dense(16, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer="RMSprop", loss="mse", metrics=["mae", 'mse'])
    return model


n_folds = 4

# The fold size does not change between iterations, so compute it once.
# Because of the floor division, the trailing len(train_seqs) % n_folds samples
# never land in a held-out slice; they are part of the training set of every fold.
splits = len(train_seqs) // n_folds

# Make sure the directory the fold models are written to exists before training,
# so a missing folder cannot throw away a finished fit at save time.
os.makedirs(be_class, exist_ok=True)

for fold in range(1, n_folds + 1):
    # Contiguous slice [lo, hi) is held out; everything else is used for training.
    lo, hi = splits * (fold - 1), splits * fold
    fold_test_seqs = train_seqs[lo:hi]
    fold_test_ers = train_ers[lo:hi]
    fold_train_seqs = train_seqs[:lo] + train_seqs[hi:]
    fold_train_ers = train_ers[:lo] + train_ers[hi:]

    traingen = RNASeqDataGenerator(fold_train_seqs, fold_train_ers, batch_size=1024, logits=False)
    testgen = RNASeqDataGenerator(fold_test_seqs, fold_test_ers, logits=False)

    # A fresh model per fold so that no weights carry over between folds.
    model = build_cnn()
    print('Model built!')
    # summary() prints the table itself and returns None; wrapping it in print()
    # only appended a stray "None" line to the output.
    model.summary()

    print('Training model')
    history = model.fit(traingen, validation_data=testgen, workers=10, use_multiprocessing=True, epochs=3)
    print('Model trained!')
    model.save("{}/{}Fold-{}.h5".format(be_class, sample, fold))

Model built!
Model: "sequential_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_51 (Conv1D)           (None, 70, 32)            4128      
_________________________________________________________________
batch_normalization_51 (Batc (None, 70, 32)            128       
_________________________________________________________________
conv1d_52 (Conv1D)           (None, 39, 32)            32800     
_________________________________________________________________
batch_normalization_52 (Batc (None, 39, 32)            128       
_________________________________________________________________
max_pooling1d_17 (MaxPooling (None, 19, 32)            0         
_________________________________________________________________
conv1d_53 (Conv1D)           (None, 4, 32)             16416     
_________________________________________________________________
batch_normalization_53 (Batc (None, 4, 3

Epoch 1/3
Epoch 2/3
Epoch 3/3
Model trained!
Model built!
Model: "sequential_20"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_60 (Conv1D)           (None, 70, 32)            4128      
_________________________________________________________________
batch_normalization_60 (Batc (None, 70, 32)            128       
_________________________________________________________________
conv1d_61 (Conv1D)           (None, 39, 32)            32800     
_________________________________________________________________
batch_normalization_61 (Batc (None, 39, 32)            128       
_________________________________________________________________
max_pooling1d_20 (MaxPooling (None, 19, 32)            0         
_________________________________________________________________
conv1d_62 (Conv1D)           (None, 4, 32)             16416     
_____________________________________________________________

# Model Performance Metrics

In [37]:
# Dataset selection for the evaluation below.
# NOTE(review): the training cell earlier in this notebook is set to
# be_type='ABE', sample='158B', be_class='ABEmax'. The fold models loaded in this
# cell are looked up under the values assigned here, so they must already exist
# on disk for this combination -- confirm they were trained beforehand.
be_type = 'CBE'
be_class = 'A3A'
sample = '160F'
from scipy import stats
from tqdm import tqdm
import numpy as np
import tensorflow as tf

def rmse(x, y):
    """Return the root-mean-square error between `x` and `y`.

    Both arguments may be any array-like (NumPy arrays, lists, tuples or
    scalars). They are passed through ``np.asarray`` first, so plain Python
    sequences work as well as arrays; array inputs are used unchanged.
    Mismatched shapes follow the usual NumPy broadcasting rules.
    """
    diff = np.asarray(x) - np.asarray(y)
    return np.sqrt(np.mean(diff ** 2))

def spearman(x, y):
    """Return the Spearman rank correlation coefficient of `x` and `y`.

    Only the coefficient is returned; the p-value that
    ``scipy.stats.spearmanr`` reports alongside it is discarded.
    """
    rho, _pvalue = stats.spearmanr(x, y)
    return rho
def pearson(x, y):
    """Return the Pearson correlation coefficient of `x` and `y`.

    Only the coefficient is returned; the p-value that
    ``scipy.stats.pearsonr`` reports alongside it is discarded.
    """
    coeff, _pvalue = stats.pearsonr(x, y)
    return coeff
    
data_dir = "../../data/raw/{0}/{0}-sequence/".format(be_type)

# Chromosome labels '1' .. '22' followed by 'X'.
train_chr = list(map(str, range(1, 23))) + ['X']

# Records of each chromosome are *prepended*, so the final lists run from the
# last chromosome loaded back to the first. The fold slices taken below rely
# on this ordering.
train_seqs = []
train_ers = []
for chrom in tqdm(train_chr):
    seq, er = load_fasta_data(sample, chrom, data_dir)
    train_seqs = seq + train_seqs
    train_ers = er + train_ers

# Number of samples in one of the four folds (floor division).
splits = len(train_seqs) // 4

# Per-fold scores, appended in fold order: r = RMSE, s = Spearman, p = Pearson.
r, s, p = [], [], []

for fold in range(1, 5):
    # Reload the model that was saved for this fold.
    model_path = "{}/{}Fold-{}.h5".format(be_class, sample, fold)
    model = tf.keras.models.load_model(model_path)

    # Slice [lo, hi) is the contiguous chunk assigned to this fold.
    lo = splits * (fold - 1)
    hi = splits * fold
    test_seqs2 = train_seqs[lo:hi]
    test_ers = train_ers[lo:hi]
    testgen = RNASeqDataGenerator(test_seqs2, test_ers, logits=False)

    # Predict, then drop axis 1 of the prediction array.
    preds = np.squeeze(model.predict(testgen), axis=1)

    # Walk the generator a second time to collect the targets batch by batch.
    # NOTE(review): this assumes the generator yields batches in the same order
    # as during predict() -- confirm RNASeqDataGenerator does not reshuffle.
    test_er = np.hstack([targets for _, targets in tqdm(testgen)])

    r.append(rmse(preds, test_er))
    s.append(spearman(preds, test_er))
    p.append(pearson(preds, test_er))

100%|██████████| 23/23 [00:05<00:00,  4.00it/s]
100%|██████████| 1879/1879 [00:13<00:00, 135.80it/s]
100%|██████████| 1879/1879 [00:13<00:00, 135.34it/s]
100%|██████████| 1879/1879 [00:13<00:00, 134.73it/s]
100%|██████████| 1879/1879 [00:13<00:00, 135.68it/s]


In [38]:
# Per-fold RMSE between predictions and targets (one entry per fold, in fold order).
r

[0.02601557493983662,
 0.02629827702666776,
 0.026410987101021598,
 0.026564142092027662]

In [39]:
# Per-fold Pearson correlation between predictions and targets (in fold order).
p

[0.5800935183019567,
 0.5883328203890918,
 0.5828996595483633,
 0.5846704188515293]

In [40]:
# Per-fold Spearman rank correlation between predictions and targets (in fold order).
s

[0.4040750836036608,
 0.43109797593980853,
 0.4299068990295353,
 0.44019899793777456]

In [41]:
# All three per-fold score lists side by side: (RMSE, Spearman, Pearson).
r, s, p

([0.02601557493983662,
  0.02629827702666776,
  0.026410987101021598,
  0.026564142092027662],
 [0.4040750836036608,
  0.43109797593980853,
  0.4299068990295353,
  0.44019899793777456],
 [0.5800935183019567,
  0.5883328203890918,
  0.5828996595483633,
  0.5846704188515293])

In [42]:
import pandas as pd
# One row per fold; columns keep the order Fold, RMSE, SpearmanR, PearsonR.
fold_metrics = {
    'Fold': range(1, 5),
    'RMSE': r,
    'SpearmanR': s,
    'PearsonR': p,
}
df = pd.DataFrame(fold_metrics)

In [43]:
# Preview the per-fold metrics table built from the r / s / p score lists.
df.head()

Unnamed: 0,Fold,RMSE,SpearmanR,PearsonR
0,1,0.026016,0.404075,0.580094
1,2,0.026298,0.431098,0.588333
2,3,0.026411,0.429907,0.5829
3,4,0.026564,0.440199,0.58467


In [44]:
# Derive the output path from the active configuration instead of hard-coding
# it, so the results always land next to the fold models they were computed
# from. For be_class='A3A' and sample='160F' this is "A3A/160Fresults.csv".
df.to_csv("{}/{}results.csv".format(be_class, sample))