In [1]:
#### ::::: FPROM benchmark ::::: ####

In [2]:
import re
import os
import sys
import math
import pybedtools
import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq

In [3]:
# Absolute folder used as the anchor for every data path below.
# The notebook filename is relative, so os.path.realpath resolves it against
# the current working directory.
# NOTE(review): this presumably expects the kernel to be started in the
# notebook's own folder -- confirm.
_notebook_file = os.path.realpath('benchmark_FPROM.ipynb')
dir_path = os.path.dirname(_notebook_file)

In [4]:
# Locations of the two FPROM result files, relative to dir_path.
# Despite the `_dir` suffix, both names hold paths to *files*, not directories.
fprom_pred_tata_dir, fprom_pred_notata_dir = (
    dir_path + "/../../data/promoter/FPROM/tata_results.txt",
    dir_path + "/../../data/promoter/FPROM/notata_results.txt",
)

In [5]:
# load into dict
def load_fprom_predictions(results_path):
    """Parse an FPROM results file into ``{sequence name: 0/1 call}``.

    The file is scanned line by line:

    * a line containing ``'Name: '`` sets the current sequence name to the
      first whitespace-delimited token that follows that marker;
    * a line containing ``'Promoter/Enhancer(s) are predicted'`` is read as a
      summary line whose first token is the number of predictions. The
      current name is recorded as 1 when that number is >= 1, otherwise 0.

    Tokens are split on any run of whitespace, so a trailing newline, a tab
    or repeated spaces can never end up inside a name (which would silently
    prevent it from matching the reference table later on).

    Parameters
    ----------
    results_path : str
        Path to the FPROM text output.

    Returns
    -------
    dict
        Maps sequence name -> 1 (at least one prediction) or 0 (none).
        If the same name is summarised more than once, the last summary wins.

    Raises
    ------
    ValueError
        If the first token of a summary line is not an integer.
    """
    predictions = {}
    name = ''  # same default as before: a summary seen before any name is keyed by ''
    with open(results_path) as handle:
        for line in handle:
            if 'Name: ' in line:
                tokens = line.split('Name: ')[1].split()
                name = tokens[0] if tokens else ''
            elif 'Promoter/Enhancer(s) are predicted' in line:
                # The marker text is in the line, so split() is never empty here.
                count_token = line.split()[0]
                try:
                    n_pred = int(count_token)
                except ValueError as exc:
                    raise ValueError(
                        "cannot read prediction count %r for sequence %r in %s"
                        % (count_token, name, results_path)
                    ) from exc
                predictions[name] = 1 if n_pred >= 1 else 0
    return predictions


# TATA
fprom_pred_tata = load_fprom_predictions(fprom_pred_tata_dir)

# noTATA
fprom_pred_notata = load_fprom_predictions(fprom_pred_notata_dir)

In [6]:
# convert to df
# Each {sequence name: call} dict becomes a one-column frame: the dict keys
# form the index (orient='index') and the 0/1 calls go into column 'pred'.
_frame_opts = {'orient': 'index', 'columns': ['pred']}

# TATA
tata_prom_df = pd.DataFrame.from_dict(fprom_pred_tata, **_frame_opts)

# noTATA
notata_prom_df = pd.DataFrame.from_dict(fprom_pred_notata, **_frame_opts)

In [7]:
# merge with reference
# Each reference CSV is read with its first column as the index. It is then
# joined to the predictions by matching the prediction frame's index (the
# sequence names) against the reference's 'name' column. pandas' default join
# is an inner join, so names missing on either side are dropped.
# NOTE(review): both prediction frames are overwritten with the merged
# result, so this cell looks unsafe to run twice without re-running the
# previous one -- confirm.

# TATA
tata_ref_csv = dir_path + "/../../data/promoter/human_epdnew_hg38_TATA_scan_test.csv"
tata_prom_ref = pd.read_csv(tata_ref_csv, index_col=0)
tata_prom_df = pd.merge(tata_prom_df, tata_prom_ref, left_index=True, right_on='name')

# noTATA
notata_ref_csv = dir_path + "/../../data/promoter/human_epdnew_hg38_noTATA_scan_test.csv"
notata_prom_ref = pd.read_csv(notata_ref_csv, index_col=0)
notata_prom_df = pd.merge(notata_prom_df, notata_prom_ref, left_index=True, right_on='name')

In [8]:
## metrics
# TATA
# Row-wise masks: reference label (1 / 0) versus FPROM call (1 / 0).
ref_pos = tata_prom_df['label'] == 1
ref_neg = tata_prom_df['label'] == 0
call_pos = tata_prom_df['pred'] == 1
call_neg = tata_prom_df['pred'] == 0

# Confusion-matrix counts. int() turns the NumPy sums back into Python ints,
# so the four-factor product in the MCC denominator cannot overflow.
TP = int((ref_pos & call_pos).sum())
FP = int((ref_neg & call_pos).sum())
FN = int((ref_pos & call_neg).sum())
TN = int((ref_neg & call_neg).sum())

n_total = TP+FP+FN+TN
accuracy = (TP + TN)/n_total
precision = TP/(TP + FP)
recall = TP/(TP + FN)
F1 = 2*precision*recall/(precision+recall)
# Matthews correlation coefficient: (TP*TN - FP*FN) / sqrt of the product of the four marginals.
mcc_numerator = TP*TN - FP*FN
mcc_denominator = ((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))**0.5
MCC = mcc_numerator/mcc_denominator

for caption, value in (
    ("accuracy = ", accuracy),
    ("precision = ", precision),
    ("recall = ", recall),
    ("F1 = ", F1),
    ("MCC = ", MCC),
):
    print(caption, value)

accuracy =  0.5707255707255707
precision =  0.1228925289652417
recall =  0.7110494683310218
F1 =  0.20956533587682244
MCC =  0.14669074822347083


In [9]:
# noTATA
# Row-wise masks: reference label (1 / 0) versus FPROM call (1 / 0).
ref_pos = notata_prom_df['label'] == 1
ref_neg = notata_prom_df['label'] == 0
call_pos = notata_prom_df['pred'] == 1
call_neg = notata_prom_df['pred'] == 0

# Confusion-matrix counts. int() turns the NumPy sums back into Python ints,
# so the four-factor product in the MCC denominator cannot overflow.
TP = int((ref_pos & call_pos).sum())
FP = int((ref_neg & call_pos).sum())
FN = int((ref_pos & call_neg).sum())
TN = int((ref_neg & call_neg).sum())

n_total = TP+FP+FN+TN
accuracy = (TP + TN)/n_total
precision = TP/(TP + FP)
recall = TP/(TP + FN)
F1 = 2*precision*recall/(precision+recall)
# Matthews correlation coefficient: (TP*TN - FP*FN) / sqrt of the product of the four marginals.
mcc_numerator = TP*TN - FP*FN
mcc_denominator = ((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))**0.5
MCC = mcc_numerator/mcc_denominator

for caption, value in (
    ("accuracy = ", accuracy),
    ("precision = ", precision),
    ("recall = ", recall),
    ("F1 = ", F1),
    ("MCC = ", MCC),
):
    print(caption, value)

accuracy =  0.5611619861619862
precision =  0.14067646687165664
recall =  0.7416668517243676
F1 =  0.23649540966077873
MCC =  0.16432697412005723


In [10]:
# combined
# Stack the TATA and noTATA result frames (rows of the first, then the second).
scan_dev = pd.concat([tata_prom_df, notata_prom_df])

## metrics
# Row-wise masks: reference label (1 / 0) versus FPROM call (1 / 0).
ref_pos = scan_dev['label'] == 1
ref_neg = scan_dev['label'] == 0
call_pos = scan_dev['pred'] == 1
call_neg = scan_dev['pred'] == 0

# Confusion-matrix counts. int() turns the NumPy sums back into Python ints,
# so the four-factor product in the MCC denominator cannot overflow.
TP = int((ref_pos & call_pos).sum())
FP = int((ref_neg & call_pos).sum())
FN = int((ref_pos & call_neg).sum())
TN = int((ref_neg & call_neg).sum())

n_total = TP+FP+FN+TN
accuracy = (TP + TN)/n_total
precision = TP/(TP + FP)
recall = TP/(TP + FN)
F1 = 2*precision*recall/(precision+recall)
# Matthews correlation coefficient: (TP*TN - FP*FN) / sqrt of the product of the four marginals.
mcc_numerator = TP*TN - FP*FN
mcc_denominator = ((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))**0.5
MCC = mcc_numerator/mcc_denominator

for caption, value in (
    ("accuracy = ", accuracy),
    ("precision = ", precision),
    ("recall = ", recall),
    ("F1 = ", F1),
    ("MCC = ", MCC),
):
    print(caption, value)

accuracy =  0.5621097287763954
precision =  0.1389803383630544
recall =  0.7389833255667889
F1 =  0.23395991622754547
MCC =  0.16277410537307302
