In [1]:
import sys
# NOTE(review): hard-coded absolute directory added to the module search path;
# this only has an effect on a machine where that directory exists.
sys.path.append("/pod/2/ke-lab/LUOZ/Singularity/m6AAIpy2")

In [2]:
from keras.models import load_model
from pkg_resources import resource_filename
import numpy as np
import pandas as pd
from Bio.Seq import Seq
import keras.backend as kb

Using TensorFlow backend.


In [3]:
def one_hot_encode(seq):
    """One-hot encode a nucleotide string.

    Parameters
    ----------
    seq : str
        Sequence of nucleotides; case-insensitive.

    Returns
    -------
    numpy.ndarray
        Integer array with one row per character and four columns.
        A, C, G and T map to a 1 in column 0, 1, 2 and 3 respectively;
        N maps to an all-zero row.

    Notes
    -----
    Any other character is not rejected: it is looked up by its byte value
    modulo 5, so it lands on whichever row that happens to select.
    """
    # Row 0 is the all-zero "N" row; rows 1-4 are A, C, G, T.
    lookup = np.asarray([[0, 0, 0, 0],
                         [1, 0, 0, 0],
                         [0, 1, 0, 0],
                         [0, 0, 1, 0],
                         [0, 0, 0, 1]])

    # Rewrite each base as the character whose code is its row in `lookup`.
    seq = seq.upper().replace('A', '\x01').replace('C', '\x02')
    seq = seq.replace('G', '\x03').replace('T', '\x04').replace('N', '\x00')

    # np.fromstring in binary mode is deprecated; read the raw bytes with
    # np.frombuffer instead (text is converted to bytes one byte per character).
    raw = seq if isinstance(seq, bytes) else seq.encode('latin-1')

    return lookup[np.frombuffer(raw, dtype=np.int8) % 5]

In [4]:
def categorical_crossentropy_2d(y_true, y_pred):
    """Two-class cross entropy over sequence outputs.

    Both arguments are indexed as [:, :, channel]; only channels 0 and 1
    are used. Returns the negated mean of the summed per-position terms.
    """
    eps = 1e-10  # keeps log() away from zero
    term_0 = y_true[:, :, 0] * kb.log(y_pred[:, :, 0] + eps)
    term_1 = y_true[:, :, 1] * kb.log(y_pred[:, :, 1] + eps)
    return - kb.mean(term_0 + term_1)

In [5]:
# Total number of 'N' padding characters placed around each input sequence
# before prediction: context // 2 on each side (see the prediction loops).
context = 10000

In [6]:
# Paths of the five model files (suffixes c1 .. c5). Kept as a list, not a
# generator, so the model-loading cell can be re-run on its own without
# silently producing an empty model list.
paths = ['/pod/2/ke-lab/LUOZ/iM6A/Test/mouseRAC10000_c{}.h5'.format(x) for x in range(1, 6)]

In [7]:
# Load one Keras model per entry in `paths`. The loss function defined above
# is registered under the name 'categorical_crossentropy_2d' so load_model
# can resolve it (presumably the name stored in the saved models — confirm).
models = [load_model(y, custom_objects={'categorical_crossentropy_2d': categorical_crossentropy_2d}) for y in paths]

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [8]:
# Display the loaded model objects as a quick check that every file loaded.
models

[<keras.engine.training.Model at 0x2aaab17db190>,
 <keras.engine.training.Model at 0x2aac1bf3f210>,
 <keras.engine.training.Model at 0x2aac628549d0>,
 <keras.engine.training.Model at 0x2aacc51eb850>,
 <keras.engine.training.Model at 0x2ab4856ad310>]

### Read data

In [9]:
# Read the per-gene input table. Later cells use the columns "strand", "name",
# "exonStarts", "exonEnds", "PreSequence", "LastIntronSequence",
# "LastExonSequence" and "GeneLength".
# NOTE(review): relative path — resolved against the kernel's working directory.
Fasta = pd.read_csv("Temp/mm10_LastIntron_Fasta.csv")

### Select positive strand

In [13]:
# Keep only the rows annotated on the "+" strand.
is_plus_strand = Fasta["strand"] == "+"
Fasta_Pos = Fasta.loc[is_plus_strand]

In [14]:
# Renumber rows 0..n-1 so that the label lookups Fasta_Pos.loc[i, ...] in the
# loop below line up with position i in the column lists.
Fasta_Pos = Fasta_Pos.reset_index(drop = True)

In [16]:
# Pull the per-gene columns out of the "+" strand table as plain Python lists.
PreSequence, LastIntronSequence, LastExonSequence, GeneLength = (
    Fasta_Pos[column_name].tolist()
    for column_name in ("PreSequence", "LastIntronSequence", "LastExonSequence", "GeneLength")
)

In [17]:
# Per-gene window of exon-only (cDNA) probabilities around the start of the
# second-to-last exon: 1000 positions before it and 1001 positions from it on.
upstream_len = 1000
downstream_len = 1001

df = pd.DataFrame(range(-upstream_len, downstream_len), columns=["index"])

# One Series per gene, joined once after the loop. Merging inside the loop
# re-copies the growing frame every iteration and renames repeated gene names
# with _x/_y suffixes.
gene_columns = []

for i in range(len(Fasta_Pos)):
    input_sequence = PreSequence[i] + LastIntronSequence[i] + LastExonSequence[i]

    # Pad with context//2 'N's on each side, add a leading batch axis, and
    # average the predictions of every loaded model; channel 1 is kept.
    x = one_hot_encode('N'*(context//2) + input_sequence + 'N'*(context//2))[None, :]
    y = np.mean([model.predict(x) for model in models], axis=0)
    m6AAI_prob = y[0, :, 1].tolist()

    # Exon boundaries: starts and ends pooled and sorted. The last
    # comma-separated field is dropped (presumably an empty field after a
    # trailing comma — confirm against the input file).
    starts = [int(u) for u in Fasta_Pos.loc[i, "exonStarts"].split(",")[0:-1]]
    ends = [int(v) for v in Fasta_Pos.loc[i, "exonEnds"].split(",")[0:-1]]
    Exon = sorted(starts + ends)
    if len(Exon) < 2:
        raise ValueError("no exon boundaries for gene {!r}".format(Fasta_Pos.loc[i, "name"]))

    # Boundary positions as offsets from the first boundary.
    offsets = [pos - Exon[0] for pos in Exon]

    # Consecutive boundaries delimit segments that alternate
    # exon, intron, exon, ...; keep every other segment starting at the first.
    exon_probs = [m6AAI_prob[offsets[k]:offsets[k + 1]]
                  for k in range(0, len(offsets) - 1, 2)]

    # Number of cDNA positions that precede the second-to-last exon.
    Start = sum(len(segment) for segment in exon_probs[:max(len(exon_probs) - 2, 0)])

    iM6A_prob = [p for segment in exon_probs for p in segment]
    upstream = iM6A_prob[:Start]
    downstream = iM6A_prob[Start:]

    # Zero-pad (left) or trim the upstream part to exactly upstream_len values.
    if len(upstream) < upstream_len:
        upstream = [0]*(upstream_len - len(upstream)) + upstream
    else:
        upstream = upstream[-upstream_len:]

    # Zero-pad (right) or trim the downstream part to exactly downstream_len values.
    if len(downstream) < downstream_len:
        downstream = downstream + [0]*(downstream_len - len(downstream))
    else:
        downstream = downstream[:downstream_len]

    gene_columns.append(pd.Series(upstream + downstream, name=Fasta_Pos.loc[i, "name"]))

# Attach all gene columns to the position frame in a single step.
df = pd.concat([df] + gene_columns, axis=1)

  if sys.path[0] == '':


### Select negative strand

In [18]:
# Keep only the rows annotated on the "-" strand.
is_minus_strand = Fasta["strand"] == "-"
Fasta_Neg = Fasta.loc[is_minus_strand]

In [19]:
# Renumber rows 0..n-1 so that the label lookups Fasta_Neg.loc[i, ...] in the
# loop below line up with position i in the column lists.
Fasta_Neg = Fasta_Neg.reset_index(drop = True)

In [21]:
# Rebind the per-gene column lists to the "-" strand table.
PreSequence, LastIntronSequence, LastExonSequence, GeneLength = (
    Fasta_Neg[column_name].tolist()
    for column_name in ("PreSequence", "LastIntronSequence", "LastExonSequence", "GeneLength")
)

In [22]:
# Same per-gene window as for the "+" strand: 1000 cDNA positions before the
# start of the second-to-last exon and 1001 positions from it on. The results
# are appended as new columns to the existing `df`.
upstream_len = 1000
downstream_len = 1001

# One Series per gene, joined once after the loop. Merging inside the loop
# re-copies the growing frame every iteration and renames repeated gene names
# with _x/_y suffixes.
gene_columns = []

for i in range(len(Fasta_Neg)):
    input_sequence = PreSequence[i] + LastIntronSequence[i] + LastExonSequence[i]

    # Pad with context//2 'N's on each side, add a leading batch axis, and
    # average the predictions of every loaded model; channel 1 is kept.
    x = one_hot_encode('N'*(context//2) + input_sequence + 'N'*(context//2))[None, :]
    y = np.mean([model.predict(x) for model in models], axis=0)
    m6AAI_prob = y[0, :, 1].tolist()

    # Exon boundaries: starts and ends pooled and sorted. The last
    # comma-separated field is dropped (presumably an empty field after a
    # trailing comma — confirm against the input file).
    starts = [int(u) for u in Fasta_Neg.loc[i, "exonStarts"].split(",")[0:-1]]
    ends = [int(v) for v in Fasta_Neg.loc[i, "exonEnds"].split(",")[0:-1]]
    Exon = sorted(starts + ends)
    if len(Exon) < 2:
        raise ValueError("no exon boundaries for gene {!r}".format(Fasta_Neg.loc[i, "name"]))

    # Boundary positions as offsets measured backwards from the largest
    # coordinate, i.e. the segment lengths are walked in reverse order.
    offsets = [Exon[-1] - pos for pos in reversed(Exon)]

    # Consecutive boundaries delimit segments that alternate
    # exon, intron, exon, ...; keep every other segment starting at the first.
    exon_probs = [m6AAI_prob[offsets[k]:offsets[k + 1]]
                  for k in range(0, len(offsets) - 1, 2)]

    # Number of cDNA positions that precede the second-to-last exon.
    Start = sum(len(segment) for segment in exon_probs[:max(len(exon_probs) - 2, 0)])

    iM6A_prob = [p for segment in exon_probs for p in segment]
    upstream = iM6A_prob[:Start]
    downstream = iM6A_prob[Start:]

    # Zero-pad (left) or trim the upstream part to exactly upstream_len values.
    if len(upstream) < upstream_len:
        upstream = [0]*(upstream_len - len(upstream)) + upstream
    else:
        upstream = upstream[-upstream_len:]

    # Zero-pad (right) or trim the downstream part to exactly downstream_len values.
    if len(downstream) < downstream_len:
        downstream = downstream + [0]*(downstream_len - len(downstream))
    else:
        downstream = downstream[:downstream_len]

    gene_columns.append(pd.Series(upstream + downstream, name=Fasta_Neg.loc[i, "name"]))

# Attach all "-" strand gene columns to the frame in a single step.
df = pd.concat([df] + gene_columns, axis=1)

  if sys.path[0] == '':


In [24]:
# Remove the helper "index" column so only the per-gene columns remain, then
# transpose: rows become genes, columns become the 2001 window positions.
# NOTE(review): mutates df in place and flips its orientation, so this cell
# is meant to be run exactly once after the two prediction loops.
df.drop(["index"], axis=1, inplace=True)
df = df.T

In [25]:
# Append a "Sum" row holding each column's total, i.e. the sum over all genes
# at each window position, then transpose back so "Sum" becomes a column.
df.loc["Sum"] = df.sum()
df = df.T

In [26]:
# Keep the per-position totals as a one-column DataFrame.
Value = df.loc[:, ["Sum"]]

In [27]:
# Preview the first five per-position totals.
Value.head(5)

Unnamed: 0,Sum
0,1.274996
1,0.676975
2,2.098657
3,1.247592
4,2.088193


In [28]:
# Remove the "Sum" column again so it is not included in the next aggregate,
# then transpose back to genes-as-rows.
df.drop(["Sum"], axis=1, inplace=True)
df = df.T

In [29]:
# Append a "Number" row counting, for each window position, how many genes
# have a value greater than 0 (positions zero-padded in the loops are therefore
# not counted), then transpose back so "Number" becomes a column.
df.loc["Number"] = (df > 0).sum()
df = df.T

In [30]:
# Keep the per-position counts as a one-column DataFrame.
Number = df.loc[:, ["Number"]]

In [32]:
# Place the per-position totals and counts side by side (columns "Sum", "Number").
Data = pd.concat([Value, Number], axis=1)

In [33]:
# Mean value per window position: total divided by the number of genes counted there.
Data["Mean"] = Data["Sum"].div(Data["Number"])

In [36]:
# Average the per-position totals and counts over consecutive bins of 10 rows.
# Only the first 2000 rows are used (the final window position is dropped), so
# the rows split evenly into 200 bins labelled -1000, -990, ..., 990.
Data = Data.head(2000)
bin_width = 10

# The binned values get their own names: rebinding `Sum` / `Number` to lists
# would replace the `Number` DataFrame that the earlier concat cell needs,
# breaking a re-run of that cell.
binned_sum = []
binned_number = []

for j in range(0, len(Data), bin_width):
    # .loc slicing is label-based and inclusive, so j .. j+9 selects 10 rows
    # (relies on Data having the default 0..n-1 integer index).
    last = j + bin_width - 1
    binned_sum.append(np.mean(Data.loc[j:last, "Sum"], axis=0))
    binned_number.append(np.mean(Data.loc[j:last, "Number"], axis=0))

Result = pd.DataFrame({"Index":range(-1000,1000,10), "Sum":binned_sum, "Number":binned_number})
# Ratio of the binned averages (not the average of the per-position means).
Result["Mean"] = Result["Sum"]/Result["Number"]

In [38]:
# Write the binned table to disk without the row index.
Result.to_csv("Mouse_iM6A_SecondToLastExonStart_10interval.csv", index=False)