Imports:

In [23]:
import sys
import os
import numpy as np
import tensorflow 
import keras
from keras.models import model_from_json, load_model
import pandas as pd

Using GPUs

In [24]:
# Index of the physical GPU to run on; any value below 0 selects the CPU.
# (Original author's note: don't set this to 0.)
gpu = 1

if gpu > -1:
    # Expose only the selected physical GPU to CUDA. This previously hard-coded
    # str(1), so changing `gpu` had no effect on which card was used.
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu)
    # CUDA renumbers the visible devices from 0, so the single exposed GPU is
    # addressed as device 0 regardless of its physical index.
    device = '/gpu:0'
else:
    device = '/cpu:0'

# Reduce TensorFlow's C++ log output.
# NOTE(review): tensorflow is imported before this cell runs; confirm the
# setting still takes effect at this point.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'


Loading the models 

In [35]:
# One compiled model file per partition (0, 1, 2); the three paths differ
# only in the partition number.
model_file_0, model_file_1, model_file_2 = [
    "/isdata/alab/people/masa/deep_learning_21_student/GM12878_rDHS_finetuned/partition_%d/model_compiled.h5" % partition
    for partition in range(3)
]

In [36]:
# Load the three models in partition order. compile=False skips restoring the
# training configuration; the models are only used for prediction below.
model_0, model_1, model_2 = [
    load_model(model_path, compile=False)
    for model_path in (model_file_0, model_file_1, model_file_2)
]

# Print the layer overview of the first model.
model_0.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None, 4)           0         
_________________________________________________________________
model_1 (Model)              (None, 64)                554144    
_________________________________________________________________
dense_1 (Dense)              (None, 1024)              66560     
_________________________________________________________________
batch_normalization_1 (Batch (None, 1024)              4096      
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                32800     
_________________________________________________________________
batch_normalization_2 (Batch (None, 32)                128 

Loading the dataframe. The sequences are converted to upper case. The `cell_line` column contains the positives (labelled GM12878) and the negatives (labelled GM12878_NEG or Negative). `Fold` indicates the data partition (0, 1 or 2) that each sequence was assigned to for cross-validation (CV).

In [25]:
# Despite the .csv extension, the file is read as tab-separated.
df = pd.read_csv(
    '/home/isacor/deep_learning_21_student/sequences_rDHS/training_dataset_GM12878_600.csv',
    delimiter='\t',
)

In [74]:
# Normalise the sequences to upper case so they match the upper-case keys of
# the one-hot lookup table.
df['Sequence'] = df['Sequence'].str.upper()
# Number of rows per cell_line label.
print(df['cell_line'].value_counts())
df

GM12878        91902
GM12878_NEG    77189
Negative       18428
Name: cell_line, dtype: int64


Unnamed: 0,Region,Sequence,zscore,cell_line,Chr,Fold,Len
0,chr1:112466510-112467110,GCGGAACGAAGCCTAGACTGCGTAAAAAAAAACAGAGAACAGAAAC...,3.78050,GM12878,chr1,0,600
1,chr1:156122722-156123322,CAGAACTGTTACCTTAGAGCTGGCCAGGATTAGAGAACAGTGCCTG...,2.39238,GM12878,chr1,0,600
2,chr14:20663708-20664308,GCTGCAGGGTCTGCTGAAAGTCCTTAAGCAGCCCTTTCCTAAAACT...,1.97273,GM12878,chr14,0,600
3,chr19:18410476-18411076,AAAATATGGACAAACAAGAAAAAAAGATGCGCAGGGAGGAGTTCAA...,2.24890,GM12878,chr19,0,600
4,chr1:778437-779037,CGGCCGCGCCAGACATAGTTTTCTATTTTTGACCAACATAAACACT...,3.73188,GM12878,chr1,0,600
...,...,...,...,...,...,...,...
187514,chr13:51807675-51808275,GAGAAAGTGATTTTCCACAAACGGGACTCGTGTTCGTCTTAAACTC...,0.00000,Negative,chr13,2,600
187515,chr13:18351290-18351890,CTTGGAAGGTTTTCCTTGTGACTTACCACTTGTAAACACTGAGAAA...,0.00000,Negative,chr13,2,600
187516,chr13:32386394-32386994,GAGACTGCATCTCAAAATAAAGAAAAAATGATAATCATCTTTTCTG...,0.00000,Negative,chr13,2,600
187517,chr13:46311381-46311981,GTACAGGGAGGCTGCAAGTACCCTTCACCCAGTTTCCCACAATGGT...,0.00000,Negative,chr13,2,600


One-hot encoding of the sequences

In [27]:
# 4x4 identity matrix: row i is the one-hot vector of the i-th letter of 'ACGT'.
encoder = np.eye(4)
# Lookup table from nucleotide letter to its one-hot row.
encoder_dict = dict(zip('ACGT', encoder))

def one_hot_encoder(seq, encoder_dict):
    """One-hot encode a sequence using a letter -> vector lookup table.

    Parameters
    ----------
    seq : iterable of symbols with a length (e.g. a string)
        Sequence to encode, one symbol per output row.
    encoder_dict : dict
        Maps each known symbol to its encoding vector. The number of entries
        determines the number of output columns.

    Returns
    -------
    numpy.ndarray
        Float array of shape (len(seq), len(encoder_dict)). Symbols that are
        not keys of `encoder_dict` (for example the ambiguous base 'N') are
        encoded as all-zero rows instead of raising a KeyError.
    """
    encoded_seq = np.zeros((len(seq), len(encoder_dict)))
    for i, letter in enumerate(seq):
        row = encoder_dict.get(letter)
        # Unknown symbols keep the all-zero row the array was initialised with.
        if row is not None:
            encoded_seq[i] = row
    return encoded_seq

In [9]:
# One-hot encode every sequence in the dataframe, then stack the per-sequence
# matrices into a single array.
seq_list = [one_hot_encoder(sequence, encoder_dict) for sequence in df['Sequence']]
input_seqs = np.array(seq_list)

# Sanity check: overall shape, then the first raw sequence next to its encoding.
print(input_seqs.shape)
print(df['Sequence'][0])
input_seqs[0]

(187519, 600, 4)
GCGGAACGAAGCCTAGACTGCGTAAAAAAAAACAGAGAACAGAAACCACCTCACAGCATTGTTGTGAGCCTTAAATGAGCAAATTTAAGACTTAAAACTGGTCATTGACATAAGCCTTCAAAAAGTGATAGCTATTAGTATTATGTAGCTCCTAGAACAGTGGCTGACATATAGTAGATGCTCATAACTGTTAAAAAGGGGCCAGGAGGATTCTATCTATTTATGCATTCATCCATTTAATTATGCTTCAGAAATGTCTTGCTCTTTTTATACTGTCGATGCTGTAAAATACAAAACCCATAATAAGCCAAGTAAGCAAGCAAGAGCCAGCACTCTGGGAGGAAAACAAAGAGATTTTCTTCATTGAAATATCCATGCCACAGTTCCTACTAGGTGCCAGGCACTGTGATAGGCACTAGGATAGAGAAATAAGTTACAGGCCTTTAGGGATTTTAGGTGCTGGTGGGAGAGACAAACATGAAACGCAGTGTTATGTGGATTATAATAAATAGAGCAACAAGCTGTGGCTGCACAGTTAACCCCCAGTGGAGTCAGGAAAGGTCTCACAGAGGAAGGGACACTTGTCTTGGTCTTGAAGGA


array([[0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       ...,
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.]])

For each model, I mask the data so that it only receives input from the partitions it was not trained on.

In [45]:
def encoded_partitions(sequences, mask):
    """One-hot encode the entries of `sequences` selected by `mask`.

    `sequences` is indexed with `.loc[mask]`, and each selected entry is
    encoded with the module-level `one_hot_encoder` and `encoder_dict`.
    Returns the encodings collected into a single numpy array.
    """
    selected = sequences.loc[mask]
    return np.array([one_hot_encoder(entry, encoder_dict) for entry in selected])
    
# Each array holds the encoded sequences of every fold except one; the
# variable name lists the folds that are kept.
part_1_2 = encoded_partitions(df['Sequence'], df['Fold'] != 0)
part_0_2 = encoded_partitions(df['Sequence'], df['Fold'] != 1)
part_0_1 = encoded_partitions(df['Sequence'], df['Fold'] != 2)

In [38]:
# Model 0 predicts on the sequences from folds 1 and 2.
pred_0 = model_0.predict(x=part_1_2, batch_size=128)

In [46]:
# Model 1 predicts on the sequences from folds 0 and 2.
pred_1 = model_1.predict(x=part_0_2, batch_size=128)

In [48]:
# Model 2 predicts on the sequences from folds 0 and 1.
pred_2 = model_2.predict(x=part_0_1, batch_size=128)

After getting the predictions, I transform the continuous output of each model into binary class labels by rounding to the nearest integer.

In [54]:
# Round each continuous prediction to the nearest integer to obtain the
# predicted class label.
bin_pred_0, bin_pred_1, bin_pred_2 = [
    np.round(scores, 0) for scores in (pred_0, pred_1, pred_2)
]

In [75]:
# True classes of everything except partition 0, 1 and 2 respectively:
# rows labelled 'GM12878' become 1, every other cell_line label becomes 0.
true_label_0, true_label_1, true_label_2 = [
    np.where(df['cell_line'].loc[df['Fold'] != fold].to_numpy() == 'GM12878', 1, 0)
    for fold in range(3)
]


Some summary statistics of the models' performance:

In [76]:
from tensorflow import math

In [77]:
# Build one confusion-matrix tensor per model (true labels vs. predicted labels).
confusion_matrix_0 = math.confusion_matrix(labels=true_label_0, predictions=bin_pred_0)
confusion_matrix_1 = math.confusion_matrix(labels=true_label_1, predictions=bin_pred_1)
confusion_matrix_2 = math.confusion_matrix(labels=true_label_2, predictions=bin_pred_2)

# The tensors only hold values once evaluated inside a session.
with tensorflow.Session() as sess:
    evaluated = sess.run([confusion_matrix_0, confusion_matrix_1, confusion_matrix_2])
    print(evaluated[0], '\n', evaluated[1], '\n', evaluated[2])


[[50595 12079]
 [10117 44145]] 
 [[51499 13161]
 [12490 54708]] 
 [[49458 14442]
 [11050 51294]]


In [78]:
from sklearn.metrics import classification_report

In [79]:
# Print one report per model, in partition order, pairing each set of true
# labels with the matching binarised predictions.
for partition, (labels_true, labels_pred) in enumerate([
    (true_label_0, bin_pred_0),
    (true_label_1, bin_pred_1),
    (true_label_2, bin_pred_2),
]):
    print("Classification report - model trained on partition %d:\n" % partition,
          classification_report(labels_true, labels_pred))


Classification report - model trained on partition 0:
               precision    recall  f1-score   support

           0       0.83      0.81      0.82     62674
           1       0.79      0.81      0.80     54262

    accuracy                           0.81    116936
   macro avg       0.81      0.81      0.81    116936
weighted avg       0.81      0.81      0.81    116936

Classification report - model trained on partition 1:
               precision    recall  f1-score   support

           0       0.80      0.80      0.80     64660
           1       0.81      0.81      0.81     67198

    accuracy                           0.81    131858
   macro avg       0.81      0.81      0.81    131858
weighted avg       0.81      0.81      0.81    131858

Classification report - model trained on partition 2:
               precision    recall  f1-score   support

           0       0.82      0.77      0.80     63900
           1       0.78      0.82      0.80     62344

    accuracy     