In [None]:
##Copy the genome data to the file "Input_data.txt" (the first line of the file is skipped when the sequence is read).
##Run all the cells and wait for the output to be saved to the file "Motif_list_with_indexes_and_classification.txt".
import tensorflow as tf                                        #imports tensorflow modules.
import numpy as np                                             #imports numpy modules as np.
import re                                                      #imports regular expression module(re).
import matplotlib.pyplot as plt                                #imports matplotlib as plt.

In [None]:
### Main variables used in finding
# Length of every window that is scored. As noted by the authors, the models are
# trained using datasets of range 150 taken from the newEPD database, hence 151.
seq_len = 151
# Distance between the start indexes of two consecutive windows.
stride = 2
# Required minimum confidence for a window to be written out as a promoter.
class_conf = 0.95

In [None]:
#Dictionary of known promoter elements (element name -> regular expression) used for classification.
#Repeat counts are written with regex quantifiers ({m,n}); square brackets such as [3-5] would
#only match a single literal character '3', '4' or '5', which never occurs in a nucleotide sequence.
decs = {
    "imotif_element":r'C{3,5}(A|C|T|G){1,7}C{3,5}(A|C|T|G){1,7}C{3,5}(A|C|T|G){1,7}C{3,5}',
    "g_quadruplex_element":r'G{3,5}(A|C|G|T){1,7}G{3,5}(A|C|G|T){1,7}G{3,5}(A|C|G|T){1,7}G{3,5}',
    #The original pattern ended in a literal '>6', which can never match a nucleotide sequence.
    #NOTE(review): '>6' is read here as "more than 6 consecutive repeats" (i.e. at least 7) - confirm.
    "z_motifs_element":r'((C|T)(A|G)){7,}',
    "G_tract_element":r'(CCCCCCC|GGGGGGG)',
    "A_tract_element":r'(AAAAAAA|TTTTTTT)',
    "TATA_box_element":r'TATA(A|T)A(A|T)(A|G)',
    "inrfly_element":r'TCAGT(C|T)',
    "inrhuman_element":r'(C|T)(A|G)',
    "bbcabw_element":r'(C|G|T)(C|G|T)CA(C|G|T)(A|T)',
    "rgwcgtg_element":r'(A|G)G(A|T)CGTG',
    "rgwyvt_element":r'(A|G)G(A|T)(C|T)(A|C|G)T',
    "GCGWKCGGTTS_element":r'GCG(A|T)(G|T)CGGTT(G|C)',
    "mte_element":r'C(G|C)A(A|G)C(G|C)(G|C)AACG(G|C)',
    "OHLER1_element":r'(C|T)GGTCACACT(A|G)',
    "OHLER6_element":r'(G|T)T(C|T)(A|G)GTAT(A|T)TTT',
    "OHLER7_element":r'(G|T)(A|C|G|T)(A|C|G|T)CA(G|T)C(A|C|G|T)CT(A|G)(A|C|G|T)(C|T)',
    "dre_element":r'(A|T)ATCGAT(A|T)',
    "tct_element":r'(C|T)(C|T)CTTT(C|T)(C|T)',
    "BREu_element":r'(G|C)(G|C)(A|G)CGCC',
    "BREd_element":r'(A|G)T(A|G|T)(G|T)(G|T)(G|T)(G|T)',
    "DCEI_element":r'CTTC',
    "DCEII_element":r'CTGT',
    "DCEIII_element":r'AGC',
    "XCPE1_element":r'(A|G|T)(G|C)G(C|T)GG(A|G)A(G|C)(A|C)',
    "XCPE2_element":r'(A|C|G)C(C|T)C(A|G)TT(A|G)C(A|C)(C|T)',
    "pausebutton_element":r'(G|T)CG(A|G)(A|T)CG',
    "sp1_element":r'(GGCGGG|GGGCGG|CCGCCC|CCCGCC)',
    "atg_element":r'ATG',
}

In [None]:
#This is a method for choosing the organism of choice.
def let_user_pick(options):
    """Show a numbered menu and return the zero-based index of the chosen option.

    Args:
        options: Iterable of items to list; each one is printed next to a
            1-based serial number.

    Returns:
        The zero-based index of the selected option, or None when the entry is
        not a whole number or lies outside 1..len(options).
    """
    # Materialise once so dict views and one-shot iterators can be both
    # listed and counted.
    options = list(options)
    print("Please choose:")
    for idx, element in enumerate(options):
        print("{}) {}".format(idx+1,element))
    i = input("Enter number: ")
    try:
        choice = int(i)
    except ValueError as e:
        # Non-numeric entry: report it and signal "no selection" to the caller.
        print(e)
        return None
    if 0 < choice <= len(options):
        return choice - 1
    print("Choice must be between 1 and {}".format(len(options)))
    return None


In [None]:
#Load the saved model of every organism from its own folder; each organism has a separate model.
_load_saved_model = tf.keras.models.load_model
homo_sapiens_model = _load_saved_model("Models/Homo-Sapiens/model")
c_elegans_model = _load_saved_model("Models/C-elegans/model")
drosophila_melanogaster_model = _load_saved_model("Models/Drosophila-Melanogaster/model")
mus_musculas_model = _load_saved_model("Models/Mus-Musculus/model")
yeast_model = _load_saved_model("Models/Yeast/model")

In [None]:
#Species dictionary: organism name shown in the selection menu -> loaded model.
#Insertion order is significant because the chosen menu number is used to index the values.
species = dict([
    ("Homo-Sapiens", homo_sapiens_model),
    ("C-Elegans", c_elegans_model),
    ("Drosophila-Melanogaster", drosophila_melanogaster_model),
    ("Mus-Musculas", mus_musculas_model),
    ("Yeast", yeast_model),
])

In [None]:
#Data input from file "Input_data.txt" and modifying it as required.
#The first line of the file is dropped and the remaining lines are joined into one sequence string.
inp = "Input_data.txt"
try:
    with open(inp, "r") as fl:
        #strip() removes whitespace around each line and upper() normalises the case, so that
        #such characters do not break the numeric encoding of the bases later on.
        input_seq = "".join(line.strip() for line in fl.read().splitlines()[1:]).upper()
except FileNotFoundError:
    print("File Not found")
    #Stop here: carrying on would leave input_seq undefined, or stale from an earlier run of this cell.
    raise
except Exception as e:
    print(e)
    raise

In [None]:
#After running this cell select the choice of organism based on the serial number.
#Input is an integer only.
#len(range(...)) counts the windows directly, without building a throwaway list of indexes.
print("Matching for {} sequences".format(len(range(0, len(input_seq)-stride-seq_len, stride))))
slected_specie = let_user_pick(species.keys())
if slected_specie is None:
    #Fail with a clear message instead of a TypeError raised by indexing the list with None.
    raise ValueError("Invalid organism selection; re-run this cell and enter one of the listed numbers.")
tmp_model = list(species.values())[slected_specie]

In [None]:
#Main program for predicting and classifying the promoters present in a given genome.
#wait till this runs completely.
#output will be saved to the file "Motif_list_with_indexes_and_classification.txt".
#Integer code of each base; the codes are divided by 4 before being passed to the model.
base_codes = {"N": 0, "A": 1, "T": 2, "G": 3, "C": 4}
with open("Motif_list_with_indexes_and_classification.txt", "w") as fl:
    #NOTE(review): this upper bound excludes the last start indexes whose window would still fit
    #inside input_seq - confirm whether that is intended before changing it (the window count
    #printed in the organism-selection cell uses the same bound).
    for ind in range(0, len(input_seq)-stride-seq_len, stride):
        #Slice the current window once and reuse it for encoding, reporting and motif matching.
        window = input_seq[ind:ind+seq_len]
        #Characters other than N/A/T/G/C are encoded as 0 (the same as N) instead of aborting the scan.
        out = np.array([base_codes.get(base, 0) for base in window])
        #Use seq_len rather than a hard-coded 151 so the window length is defined in one place.
        out = out.reshape(1, seq_len, 1) / 4

        #verbose=0 stops a progress bar from being printed for every single window.
        pred_cat = tmp_model.predict(out, verbose=0)
        #Report the window only when class 1 has the highest score and reaches the confidence threshold.
        if np.argmax(pred_cat[0]) == 1 and pred_cat[0][1] >= class_conf:
            fl.write("{} with {:0.2f}% Confidence at index {}\n".format("Promoter", pred_cat[0][1]*100, ind))
            fl.write("Sequence at {}: {}\n".format(ind, window))
            #flg records whether at least one known element matched this window.
            flg = False
            for key, value in decs.items():
                if re.search(value, window):
                    fl.write(f"{key} is present\n")
                    flg = True
            if not flg:
                fl.write("Unable to classify\n")
            fl.write("\n\n")