# ::::: promid benchmark ::::: ####

In [9]:
import os
import math
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False # Supress tensorflow warning
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Supress tensorflow warning

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, matthews_corrcoef, average_precision_score

In [3]:
dir_path = os.path.dirname(os.path.realpath('benchmark_promid.ipynb'))

In [4]:
tata_scan_dev_dir = dir_path + "/../data/promoter/human_epdnew_hg38_TATA_scan_test.csv"
notata_scan_dev_dir = dir_path + "/../data/promoter/human_epdnew_hg38_noTATA_scan_test.csv"

In [5]:
tata_scan_dev = pd.read_csv(tata_scan_dev_dir, index_col=0)
notata_scan_dev = pd.read_csv(notata_scan_dev_dir, index_col=0)

In [6]:
def encode(seq, strand):
    enc_mat = np.append(np.eye(4), [[0,0,0,0]], axis=0)
    enc_mat = enc_mat.astype(np.bool)
    mapping_pos = dict(zip("ACGTN", range(5)))
    mapping_neg = dict(zip("TGCAN", range(5)))
    
    if(strand == "+"):
        seq2 = [mapping_pos[i] for i in seq]
    else:
        seq = seq[::-1]
        seq2 = [mapping_neg[i] for i in seq]
    return enc_mat[seq2]

In [7]:
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

In [19]:
#############  predict TATA  ##############
# # multiprocessing
# print("Number of processors: ", mp.cpu_count())
# pool = mp.Pool(mp.cpu_count())

# globals
batch_size = 128
cutoff = 0.5
probs_tata = []


new_graph = tf.Graph()
print("\t - Starting Tensorflow...", end = " ")
with tf.Session(graph=new_graph) as sess:
    print("loading saved model...", end = " ")
    tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING], "/projects/b1017/Jerry/PromID/promid/models/model_scan")
    saver = tf.train.Saver()
    saver.restore(sess, "/projects/b1017/Jerry/PromID/promid/models/model_scan/variables/variables")
    input_x = tf.get_default_graph().get_tensor_by_name("input_prom:0")
    y = tf.get_default_graph().get_tensor_by_name("output_prom:0")
    kr = tf.get_default_graph().get_tensor_by_name("kr:0")
    in_training_mode = tf.get_default_graph().get_tensor_by_name("in_training_mode:0")  
    print("loaded!")

    # predict
    print("\t - One-hot encoding scanned subsequences...", end = " ")
    encoded = [encode(row['seq'], row['strand']) for index, row in tata_scan_dev.iterrows()]
    print("Done!")
#             test_res = pool.starmap(predict, [(batch, sess) for batch in chunks(encoded, batch_size)])
    print("\t - Begin prediction")
    i = 1
    for batch in chunks(encoded, batch_size):
        if i % 100 == 0:
            print("\t\tbatch: ", i, "out of ", math.ceil(len(encoded)/batch_size))
        pred = sess.run(y, feed_dict={input_x: batch, kr: 1.0, in_training_mode: False})
        probs_tata.extend([prob[0] for prob in pred])
        i += 1
    print("\t Prediction finished!")
                

# binary prediction
tata_scan_dev['pred'] = [1 if prob > cutoff else 0 for prob in probs_tata]

# pool.close()    

	 - Starting Tensorflow... loading saved model... loaded!
	 - One-hot encoding scanned subsequences... Done!
	 - Begin prediction
		batch:  100 out of  423
		batch:  200 out of  423
		batch:  300 out of  423
		batch:  400 out of  423
	 Prediction finished!


In [29]:
## metrics
# TP = len(tata_scan_dev[(tata_scan_dev['label'] == 1) & (tata_scan_dev['pred'] == 1)])
# FP = len(tata_scan_dev[(tata_scan_dev['label'] == 0) & (tata_scan_dev['pred'] == 1)])
# FN = len(tata_scan_dev[(tata_scan_dev['label'] == 1) & (tata_scan_dev['pred'] == 0)])
# TN = len(tata_scan_dev[(tata_scan_dev['label'] == 0) & (tata_scan_dev['pred'] == 0)])

In [32]:
# precision = TP/(TP + FP)
# recall = TP/(TP + FN)
# F1 = 2*precision*recall/(precision+recall)
# MCC = (TP*TN - FP*FN)/(((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))**0.5)

In [10]:
acc = accuracy_score(tata_scan_dev['label'],tata_scan_dev['pred'])
precision = precision_score(tata_scan_dev['label'],tata_scan_dev['pred'])
recall = recall_score(tata_scan_dev['label'],tata_scan_dev['pred'])
F1 = f1_score(tata_scan_dev['label'],tata_scan_dev['pred'])
MCC = matthews_corrcoef(tata_scan_dev['label'],tata_scan_dev['pred'])
AUC = roc_auc_score(tata_scan_dev['label'],probs_tata)
AUPR = average_precision_score(tata_scan_dev['label'],probs_tata)

In [11]:
print("accuracy = ", precision)
print("precision = ", precision)
print("recall = ", recall)
print("F1 = ", F1)
print("MCC = ", MCC)
print("AUC = ", AUC)
print("AUPR = ", AUPR)

accuracy =  0.16945727482678985
precision =  0.16945727482678985
recall =  0.4070735090152566
F1 =  0.23929881777415415
MCC =  0.16078539284007243
AUC =  0.7032102464498875
AUPR =  0.1723718738002334


In [12]:
#############  predict noTATA  ##############
# # multiprocessing
# print("Number of processors: ", mp.cpu_count())
# pool = mp.Pool(mp.cpu_count())

# globals
batch_size = 128
cutoff = 0.5
probs_notata = []


new_graph = tf.Graph()
print("\t - Starting Tensorflow...", end = " ")
with tf.Session(graph=new_graph) as sess:
    print("loading saved model...", end = " ")
    tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING], "/projects/b1017/Jerry/PromID/promid/models/model_scan")
    saver = tf.train.Saver()
    saver.restore(sess, "/projects/b1017/Jerry/PromID/promid/models/model_scan/variables/variables")
    input_x = tf.get_default_graph().get_tensor_by_name("input_prom:0")
    y = tf.get_default_graph().get_tensor_by_name("output_prom:0")
    kr = tf.get_default_graph().get_tensor_by_name("kr:0")
    in_training_mode = tf.get_default_graph().get_tensor_by_name("in_training_mode:0")  
    print("loaded!")

    # predict
    print("\t - One-hot encoding scanned subsequences...", end = " ")
    encoded = [encode(row['seq'], row['strand']) for index, row in notata_scan_dev.iterrows()]
    print("Done!")
#             test_res = pool.starmap(predict, [(batch, sess) for batch in chunks(encoded, batch_size)])
    print("\t - Begin prediction")
    i = 1
    for batch in chunks(encoded, batch_size):
        if i % 100 == 0:
            print("\t\tbatch: ", i, "out of ", math.ceil(len(encoded)/batch_size))
        pred = sess.run(y, feed_dict={input_x: batch, kr: 1.0, in_training_mode: False})
        probs_notata.extend([prob[0] for prob in pred])
        i += 1
    print("\t Prediction finished!")
                

# binary prediction
notata_scan_dev['pred'] = [1 if prob > cutoff else 0 for prob in probs_notata]

# pool.close()    

	 - Starting Tensorflow... loading saved model... loaded!
	 - One-hot encoding scanned subsequences... Done!
	 - Begin prediction
		batch:  100 out of  3840
		batch:  200 out of  3840
		batch:  300 out of  3840
		batch:  400 out of  3840
		batch:  500 out of  3840
		batch:  600 out of  3840
		batch:  700 out of  3840
		batch:  800 out of  3840
		batch:  900 out of  3840
		batch:  1000 out of  3840
		batch:  1100 out of  3840
		batch:  1200 out of  3840
		batch:  1300 out of  3840
		batch:  1400 out of  3840
		batch:  1500 out of  3840
		batch:  1600 out of  3840
		batch:  1700 out of  3840
		batch:  1800 out of  3840
		batch:  1900 out of  3840
		batch:  2000 out of  3840
		batch:  2100 out of  3840
		batch:  2200 out of  3840
		batch:  2300 out of  3840
		batch:  2400 out of  3840
		batch:  2500 out of  3840
		batch:  2600 out of  3840
		batch:  2700 out of  3840
		batch:  2800 out of  3840
		batch:  2900 out of  3840
		batch:  3000 out of  3840
		batch:  3100 out of  3840
		batch:  3

In [35]:
## metrics
# TP = len(notata_scan_dev[(notata_scan_dev['label'] == 1) & (notata_scan_dev['pred'] == 1)])
# FP = len(notata_scan_dev[(notata_scan_dev['label'] == 0) & (notata_scan_dev['pred'] == 1)])
# FN = len(notata_scan_dev[(notata_scan_dev['label'] == 1) & (notata_scan_dev['pred'] == 0)])
# TN = len(notata_scan_dev[(notata_scan_dev['label'] == 0) & (notata_scan_dev['pred'] == 0)])

In [36]:
# precision = TP/(TP + FP)
# recall = TP/(TP + FN)
# F1 = 2*precision*recall/(precision+recall)
# MCC = (TP*TN - FP*FN)/(((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))**0.5)

In [13]:
acc = accuracy_score(notata_scan_dev['label'],notata_scan_dev['pred'])
precision = precision_score(notata_scan_dev['label'],notata_scan_dev['pred'])
recall = recall_score(notata_scan_dev['label'],notata_scan_dev['pred'])
F1 = f1_score(notata_scan_dev['label'],notata_scan_dev['pred'])
MCC = matthews_corrcoef(notata_scan_dev['label'],notata_scan_dev['pred'])
AUC = roc_auc_score(notata_scan_dev['label'],probs_notata)
AUPR = average_precision_score(notata_scan_dev['label'],probs_notata)

In [14]:
print("accuracy = ", precision)
print("precision = ", precision)
print("recall = ", recall)
print("F1 = ", F1)
print("MCC = ", MCC)
print("AUC = ", AUC)
print("AUPR = ", AUPR)

accuracy =  0.20050695825049702
precision =  0.20050695825049702
recall =  0.4479358664031445
F1 =  0.27701519594042473
MCC =  0.1914509030103389
AUC =  0.7207212019816974
AUPR =  0.2091199064919121


In [25]:
#############  combined  ##############
scan_dev = pd.concat([tata_scan_dev, notata_scan_dev])
# probs_tata.extend(probs_notata)
probs = np.asarray(probs_tata)

# ## metrics
# TP = len(scan_dev[(scan_dev['label'] == 1) & (scan_dev['pred'] == 1)])
# FP = len(scan_dev[(scan_dev['label'] == 0) & (scan_dev['pred'] == 1)])
# FN = len(scan_dev[(scan_dev['label'] == 1) & (scan_dev['pred'] == 0)])
# TN = len(scan_dev[(scan_dev['label'] == 0) & (scan_dev['pred'] == 0)])

# precision = TP/(TP + FP)
# recall = TP/(TP + FN)
# F1 = 2*precision*recall/(precision+recall)
# MCC = (TP*TN - FP*FN)/(((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))**0.5)

# print("precision = ", precision)
# print("recall = ", recall)
# print("F1 = ", F1)
# print("MCC = ", MCC)
acc = accuracy_score(scan_dev['label'],scan_dev['pred'])
precision = precision_score(scan_dev['label'],scan_dev['pred'])
recall = recall_score(scan_dev['label'],scan_dev['pred'])
F1 = f1_score(scan_dev['label'],scan_dev['pred'])
MCC = matthews_corrcoef(scan_dev['label'],scan_dev['pred'])
AUC = roc_auc_score(scan_dev['label'],probs)
AUPR = average_precision_score(scan_dev['label'],probs)
print("accuracy = ", precision)
print("precision = ", precision)
print("recall = ", recall)
print("F1 = ", F1)
print("MCC = ", MCC)
print("AUC = ", AUC)
print("AUPR = ", AUPR)

accuracy =  0.1975998270145596
precision =  0.1975998270145596
recall =  0.4443543975525255
F1 =  0.27355331183855214
MCC =  0.18871574212193554
AUC =  0.7192282194431263
AUPR =  0.20549190327538053


In [24]:
len(probs_tata)

545454