In [1]:
import sys
sys.path.append("/home/kelab/m6AAIpy2")

In [2]:
from keras.models import load_model
from pkg_resources import resource_filename
import numpy as np
import pandas as pd
from Bio.Seq import Seq
import keras.backend as kb

Using TensorFlow backend.


In [3]:
def one_hot_encode(seq):
    """One-hot encode a nucleotide string.

    Parameters
    ----------
    seq : str
        Nucleotide sequence; matching is case-insensitive because the input
        is upper-cased first.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(seq), 4)``. ``A``, ``C``, ``G`` and
        ``T`` select columns 0, 1, 2 and 3 respectively; ``N`` yields an
        all-zero row.

    Notes
    -----
    Any other character is mapped through its byte value modulo 5, exactly as
    before, so it lands on an arbitrary row of the table.
    NOTE(review): callers presumably only pass A/C/G/T/N - confirm.
    """
    # Row 0 is the all-zero "unknown" encoding, rows 1-4 are A, C, G, T.
    encoding = np.asarray([[0, 0, 0, 0],
                           [1, 0, 0, 0],
                           [0, 1, 0, 0],
                           [0, 0, 1, 0],
                           [0, 0, 0, 1]])

    # An empty sequence encodes to zero rows; handled explicitly so the
    # result does not depend on how the buffer routine treats empty input.
    if not seq:
        return np.zeros((0, 4), dtype=encoding.dtype)

    # Replace each base with the control character whose byte value is the
    # index of its row in `encoding`.
    seq = seq.upper().replace('A', '\x01').replace('C', '\x02')
    seq = seq.replace('G', '\x03').replace('T', '\x04').replace('N', '\x00')

    # np.fromstring's binary mode is deprecated; np.frombuffer on the encoded
    # bytes reads the same per-character byte values.
    codes = np.frombuffer(seq.encode('latin-1'), dtype=np.int8)

    return encoding[codes % 5]

In [4]:
def categorical_crossentropy_2d(y_true, y_pred):
    """Categorical cross entropy for sequence outputs.

    Both arguments are indexed as ``[:, :, channel]``. Only channels 0 and 1
    of the last axis contribute. A small constant (1e-10) is added inside the
    logarithm so that a predicted value of exactly zero does not produce
    ``log(0)``.

    Returns the negated mean, over all remaining elements, of the summed
    per-channel ``y_true * log(y_pred)`` terms.
    """
    eps = 1e-10

    term_0 = y_true[:, :, 0] * kb.log(y_pred[:, :, 0] + eps)
    term_1 = y_true[:, :, 1] * kb.log(y_pred[:, :, 1] + eps)

    return -kb.mean(term_0 + term_1)

In [5]:
# Total number of flanking 'N' characters added to each sequence before
# prediction: context // 2 on the left and context // 2 on the right.
context = 10000

In [6]:
# Checkpoint files for the five models (suffixes c1 ... c5). Stored as a tuple
# rather than a one-shot generator so that the model-loading cell can be
# re-run without first re-running this cell (an exhausted generator would
# silently produce an empty model list).
# NOTE(review): absolute, machine-specific location - adjust for other hosts.
paths = tuple('/home/kelab/Desktop/iM6A/mouseRAC10000_c{}.h5'.format(x) for x in range(1, 6))

In [7]:
# Load one Keras model per entry in `paths`. The custom loss function is
# supplied through custom_objects under the name 'categorical_crossentropy_2d'.
models = [
    load_model(
        checkpoint,
        custom_objects={'categorical_crossentropy_2d': categorical_crossentropy_2d}
    )
    for checkpoint in paths
]

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [8]:
models

[<keras.engine.training.Model at 0x7f4d36fea650>,
 <keras.engine.training.Model at 0x7f4d31840810>,
 <keras.engine.training.Model at 0x7f4baa1d0d10>,
 <keras.engine.training.Model at 0x7f4a4d19ddd0>,
 <keras.engine.training.Model at 0x7f4a489c7e90>]

### Read data

In [10]:
# Load the transcript table. Later cells read columns including "name",
# "chrom", "strand", "txStart", "txEnd", "exonStarts", "exonEnds" and
# "Sequence" from it.
Fasta = pd.read_csv("Temp/mm10_Fasta.csv")

### Select positive strand

In [12]:
# Keep only the rows whose "strand" value is "+".
Fasta_Pos = Fasta.loc[Fasta["strand"] == "+"]

In [13]:
# Renumber the rows 0..n-1 (discarding the old index) so that a row can be
# addressed by its position with .loc.
Fasta_Pos = Fasta_Pos.reset_index(drop = True)

In [14]:
# Plain Python list of the "Sequence" column of Fasta_Pos, in row order.
Sequence = Fasta_Pos["Sequence"].tolist()

In [None]:
# Score every "+" strand transcript:
#   1. split the transcript sequence into alternating exon / intron segments,
#   2. concatenate the exon segments and run the model ensemble on them,
#   3. put the per-base predictions back in transcript order, filling the
#      intron positions with 0,
#   4. write the positions whose probability is >= 0.001 to one tab-separated
#      file per transcript.
for i in range(len(Fasta_Pos)):

    print(i)

    # Exon boundaries. The last comma-separated element of each field is
    # dropped (presumably empty because the field ends with a comma).
    exon_starts = [int(u) for u in Fasta_Pos.loc[i, "exonStarts"].split(",")[0:-1]]
    exon_ends = [int(v) for v in Fasta_Pos.loc[i, "exonEnds"].split(",")[0:-1]]
    boundaries = sorted(exon_starts + exon_ends)

    # Differences between consecutive sorted boundaries alternate
    # exon, intron, exon, ... starting with the first exon.
    segment_lengths = [boundaries[j] - boundaries[j - 1]
                       for j in range(1, len(boundaries))]
    if not segment_lengths:
        raise ValueError(
            "No exon boundaries found for transcript {}".format(Fasta_Pos.loc[i, "name"]))

    # Read the sequence from the strand-specific table instead of the shared
    # `Sequence` list, which is rebound by the other strand's cells; this
    # keeps the loop correct regardless of cell execution order.
    input_sequence = Fasta_Pos.loc[i, "Sequence"]

    # Cut the sequence into the exon / intron segments.
    segments = []
    offset = 0
    for segment_length in segment_lengths:
        segments.append(input_sequence[offset:offset + segment_length])
        offset += segment_length

    # Even-numbered segments are exons; their concatenation is the model input.
    exon_seqs = segments[0::2]
    sequence = "".join(exon_seqs)

    # Prediction: pad with context // 2 'N' on each side, average the output
    # of every loaded model and keep channel 1 of the last axis.
    x = one_hot_encode('N'*(context//2) + sequence + 'N'*(context//2))[None, :]
    y = np.mean([model.predict(x) for model in models], axis=0)
    exon_prob = y[0, :, 1]

    # Re-insert the introns: each exon's predictions are followed by as many
    # zeros as the intron after it is long (nothing follows the last exon).
    # float64 is used so the written values match what a list of Python
    # floats would produce.
    intron_lengths = segment_lengths[1::2] + [0]
    pieces = []
    offset = 0
    for exon_seq, intron_length in zip(exon_seqs, intron_lengths):
        exon_length = len(exon_seq)
        pieces.append(np.asarray(exon_prob[offset:offset + exon_length], dtype=np.float64))
        pieces.append(np.zeros(intron_length, dtype=np.float64))
        offset += exon_length
    probability = np.concatenate(pieces)

    # One row per position from txStart (inclusive) to txEnd (exclusive).
    # pd.concat aligns the two columns on their 0..n-1 index.
    positions = np.arange(Fasta_Pos.loc[i, "txStart"], Fasta_Pos.loc[i, "txEnd"])
    df = pd.concat([pd.DataFrame({"Start": positions}),
                    pd.DataFrame({"Probabilty": probability})], axis=1)
    # NOTE(review): End equals Start, i.e. a zero-width interval if the file
    # is read as BED - confirm the intended coordinate convention.
    df["End"] = df["Start"]
    df["name"] = Fasta_Pos.loc[i, "name"]
    df["chrom"] = Fasta_Pos.loc[i, "chrom"]
    df["strand"] = Fasta_Pos.loc[i, "strand"]

    # The header spelling "Probabilty" is kept byte-for-byte because existing
    # output files (and anything reading them) already use it.
    df = df[["name", "chrom", "strand", "Start", "End", "Probabilty"]]

    # Keep only positions with a probability of at least 0.001.
    df = df[df["Probabilty"] >= 0.001]
    df.to_csv("./OutputsIL/{}.bed".format(Fasta_Pos.loc[i, "name"]), sep="\t", index=False)

### Select negative strand

In [16]:
# Keep only the rows whose "strand" value is "-".
Fasta_Neg = Fasta.loc[Fasta["strand"] == "-"]

In [17]:
# Renumber the rows 0..n-1 (discarding the old index) so that a row can be
# addressed by its position with .loc.
Fasta_Neg = Fasta_Neg.reset_index(drop = True)

In [18]:
# Plain Python list of the "Sequence" column of Fasta_Neg, in row order.
# NOTE(review): this rebinds the same `Sequence` name that an earlier cell
# fills from Fasta_Pos, so its content depends on which cell ran last.
Sequence = Fasta_Neg["Sequence"].tolist()

In [None]:
# Score every "-" strand transcript:
#   1. split the transcript sequence into alternating exon / intron segments,
#      using the segment lengths in reversed order,
#   2. concatenate the exon segments and run the model ensemble on them,
#   3. put the per-base predictions back in sequence order, filling the
#      intron positions with 0, then reverse the whole vector so it lines up
#      with ascending coordinates,
#   4. write the positions whose probability is >= 0.001 to one tab-separated
#      file per transcript.
for i in range(len(Fasta_Neg)):

    print(i)

    # Exon boundaries. The last comma-separated element of each field is
    # dropped (presumably empty because the field ends with a comma).
    exon_starts = [int(u) for u in Fasta_Neg.loc[i, "exonStarts"].split(",")[0:-1]]
    exon_ends = [int(v) for v in Fasta_Neg.loc[i, "exonEnds"].split(",")[0:-1]]
    boundaries = sorted(exon_starts + exon_ends)

    # Differences between consecutive sorted boundaries alternate
    # exon, intron, exon, ...; the order is reversed for this strand.
    # NOTE(review): this presumably matches a stored sequence that runs
    # from txEnd towards txStart - confirm against how the CSV was built.
    segment_lengths = [boundaries[j] - boundaries[j - 1]
                       for j in range(1, len(boundaries))]
    segment_lengths = segment_lengths[::-1]
    if not segment_lengths:
        raise ValueError(
            "No exon boundaries found for transcript {}".format(Fasta_Neg.loc[i, "name"]))

    # Read the sequence from the strand-specific table instead of the shared
    # `Sequence` list, which is rebound by the other strand's cells; this
    # keeps the loop correct regardless of cell execution order.
    input_sequence = Fasta_Neg.loc[i, "Sequence"]

    # Cut the sequence into the exon / intron segments.
    segments = []
    offset = 0
    for segment_length in segment_lengths:
        segments.append(input_sequence[offset:offset + segment_length])
        offset += segment_length

    # Even-numbered segments are exons; their concatenation is the model input.
    exon_seqs = segments[0::2]
    sequence = "".join(exon_seqs)

    # Prediction: pad with context // 2 'N' on each side, average the output
    # of every loaded model and keep channel 1 of the last axis.
    x = one_hot_encode('N'*(context//2) + sequence + 'N'*(context//2))[None, :]
    y = np.mean([model.predict(x) for model in models], axis=0)
    exon_prob = y[0, :, 1]

    # Re-insert the introns: each exon's predictions are followed by as many
    # zeros as the intron after it is long (nothing follows the last exon).
    # float64 is used so the written values match what a list of Python
    # floats would produce.
    intron_lengths = segment_lengths[1::2] + [0]
    pieces = []
    offset = 0
    for exon_seq, intron_length in zip(exon_seqs, intron_lengths):
        exon_length = len(exon_seq)
        pieces.append(np.asarray(exon_prob[offset:offset + exon_length], dtype=np.float64))
        pieces.append(np.zeros(intron_length, dtype=np.float64))
        offset += exon_length

    # Reverse so that element 0 corresponds to txStart.
    probability = np.concatenate(pieces)[::-1].copy()

    # One row per position from txStart (inclusive) to txEnd (exclusive).
    # pd.concat aligns the two columns on their 0..n-1 index.
    positions = np.arange(Fasta_Neg.loc[i, "txStart"], Fasta_Neg.loc[i, "txEnd"])
    df = pd.concat([pd.DataFrame({"Start": positions}),
                    pd.DataFrame({"Probabilty": probability})], axis=1)
    # NOTE(review): End equals Start, i.e. a zero-width interval if the file
    # is read as BED - confirm the intended coordinate convention.
    df["End"] = df["Start"]
    df["name"] = Fasta_Neg.loc[i, "name"]
    df["chrom"] = Fasta_Neg.loc[i, "chrom"]
    df["strand"] = Fasta_Neg.loc[i, "strand"]

    # The header spelling "Probabilty" is kept byte-for-byte because existing
    # output files (and anything reading them) already use it.
    df = df[["name", "chrom", "strand", "Start", "End", "Probabilty"]]

    # Keep only positions with a probability of at least 0.001.
    df = df[df["Probabilty"] >= 0.001]
    df.to_csv("./OutputsIL/{}.bed".format(Fasta_Neg.loc[i, "name"]), sep="\t", index=False)
