In [1]:
# Add a machine-specific directory to the module search path before the
# imports below run.
# NOTE(review): this is a hardcoded absolute path; adjust it when running the
# notebook on another machine.
import sys
sys.path.append("/home/kelab/m6AAIpy2")

In [2]:
from keras.models import load_model
from pkg_resources import resource_filename
import numpy as np
import pandas as pd
from Bio.Seq import Seq
import keras.backend as kb

Using TensorFlow backend.


In [3]:
def one_hot_encode(seq):
    """One-hot encode a nucleotide sequence.

    A, C, G and T (case-insensitive) become the unit vectors
    [1,0,0,0], [0,1,0,0], [0,0,1,0] and [0,0,0,1]; N becomes all zeros.

    Parameters
    ----------
    seq : str
        Nucleotide sequence.

    Returns
    -------
    numpy.ndarray
        Integer array of shape (len(seq), 4), one row per character.

    Notes
    -----
    Any character other than A/C/G/T/N is encoded by its byte value modulo 5
    (unchanged from the original implementation), so other IUPAC codes are
    NOT automatically treated as N.
    """
    # Row k is the encoding of code k: 0 = N, 1..4 = A, C, G, T.
    # (Named `encoding` rather than `map` to avoid shadowing the builtin.)
    encoding = np.asarray([[0, 0, 0, 0],
                           [1, 0, 0, 0],
                           [0, 1, 0, 0],
                           [0, 0, 1, 0],
                           [0, 0, 0, 1]])

    # Rewrite each base as a control character whose byte value is its code.
    seq = seq.upper().replace('A', '\x01').replace('C', '\x02')
    seq = seq.replace('G', '\x03').replace('T', '\x04').replace('N', '\x00')

    # np.fromstring's binary mode is deprecated; read the encoded bytes with
    # np.frombuffer instead (works on both Python 2 and Python 3).
    codes = np.frombuffer(seq.encode('latin-1'), dtype=np.int8)

    return encoding[codes % 5]

In [4]:
def categorical_crossentropy_2d(y_true, y_pred):
    """Categorical cross entropy for sequence outputs.

    Only channels 0 and 1 of the last axis are used. Returns the negated mean,
    over all elements, of ``y_true * log(y_pred + 1e-10)`` summed across those
    two channels; the small constant keeps the logarithm away from log(0).
    """
    eps = 1e-10
    channel0 = y_true[:, :, 0] * kb.log(y_pred[:, :, 0] + eps)
    channel1 = y_true[:, :, 1] * kb.log(y_pred[:, :, 1] + eps)
    return -kb.mean(channel0 + channel1)

In [5]:
# Total number of 'N' padding positions added to each model input:
# context // 2 on each side of the sequence (see the prediction loops below).
context = 10000

In [6]:
# Paths of the five model files (suffixes c1 .. c5).
# Built as a list rather than a generator expression: a generator is exhausted
# after the first pass, so re-running the model-loading cell on its own would
# silently produce an empty model list.
# NOTE(review): hardcoded absolute path; adjust when running elsewhere.
paths = ['/home/kelab/Desktop/iM6A/mouseRAC10000_c{}.h5'.format(x) for x in range(1, 6)]

In [7]:
# Load every model file in `paths`. The custom loss is passed by name so
# load_model can resolve it when rebuilding each model.
models = []
for model_path in paths:
    models.append(
        load_model(model_path,
                   custom_objects={'categorical_crossentropy_2d': categorical_crossentropy_2d})
    )

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [8]:
# Display the loaded models as a quick check of how many were loaded.
models

[<keras.engine.training.Model at 0x7f4d36fea650>,
 <keras.engine.training.Model at 0x7f4d31840810>,
 <keras.engine.training.Model at 0x7f4baa1d0d10>,
 <keras.engine.training.Model at 0x7f4a4d19ddd0>,
 <keras.engine.training.Model at 0x7f4a489c7e90>]

### Read data

In [10]:
# Load the per-transcript table. The cells below read these columns from it:
# name, chrom, strand, txStart, txEnd, Length,
# PreSequence, LastIntronSequence, LastExonSequence.
Fasta = pd.read_csv("Temp/mm10_LastIntron_Fasta.csv")

### Select positive strand

In [12]:
# Keep only the rows annotated on the "+" strand.
Fasta_Pos = Fasta.loc[Fasta["strand"] == "+"]

In [13]:
# Renumber the rows 0..n-1 (discarding the old index) so that the loop below
# can address row i with Fasta_Pos.loc[i, ...].
Fasta_Pos = Fasta_Pos.reset_index(drop = True)

In [14]:
# Pull the three sequence columns of the positive-strand table into plain
# Python lists (one entry per row).
PreSequence, LastIntronSequence, LastExonSequence = (
    Fasta_Pos[column].tolist()
    for column in ("PreSequence", "LastIntronSequence", "LastExonSequence")
)

In [None]:
# For every positive-strand row: predict a per-position probability with the
# model ensemble and write one tab-separated table to ./Outputs/<name>.bed.
#
# Rows are read straight from Fasta_Pos. The original looped over
# len(Sequence), a name that is never defined (NameError), and read the shared
# PreSequence/LastIntronSequence/LastExonSequence lists, which a later cell
# rebinds to the negative-strand data.
# NOTE(review): the ./Outputs directory must already exist.
for i in range(len(Fasta_Pos)):
    Pre = Fasta_Pos.loc[i, "PreSequence"]
    LastIntron = Fasta_Pos.loc[i, "LastIntronSequence"]
    LastExon = Fasta_Pos.loc[i, "LastExonSequence"]

    # The last intron is left out of the model input.
    input_sequence = Pre + LastExon

    # Pad context // 2 'N' bases on each side; [None, :] adds a leading batch axis.
    flank = 'N' * (context // 2)
    x = one_hot_encode(flank + input_sequence + flank)[None, :]

    # Ensemble: average the predictions of every loaded model
    # (previously hard-coded to the first five).
    y = np.mean([model.predict(x) for model in models], axis=0)
    m6AAI_prob = y[0, :, 1].tolist()

    # Put the intron back as zeros so the list covers Pre + intron + last exon.
    # Assumes the models return one value per unpadded input base - TODO confirm.
    m6AAI_prob = m6AAI_prob[:len(Pre)] + [0] * len(LastIntron) + m6AAI_prob[len(Pre):]

    # Constant annotation columns, one row per position ("Length" rows).
    df = pd.DataFrame(
        {"name": Fasta_Pos.loc[i, "name"],
         "chrom": Fasta_Pos.loc[i, "chrom"],
         "strand": Fasta_Pos.loc[i, "strand"]},
        index=range(Fasta_Pos.loc[i, "Length"]),
        columns=["name", "chrom", "strand"],
    )

    # One coordinate per position in [txStart, txEnd).
    # NOTE(review): End equals Start, so each row is a single coordinate rather
    # than a half-open interval, and the file is written with a header row -
    # confirm downstream tools expect this.
    positions = np.arange(Fasta_Pos.loc[i, "txStart"], Fasta_Pos.loc[i, "txEnd"])
    coords = pd.DataFrame({"Start": positions, "End": positions},
                          columns=["Start", "End"])

    # The header spelling "Probabilty" is kept so the output format is unchanged.
    Probability = pd.DataFrame({"Probabilty": m6AAI_prob})

    # The pieces are aligned by row position; Length, txEnd - txStart and the
    # probability list are expected to agree (concat pads with NaN otherwise).
    df = pd.concat([df, coords, Probability], axis=1)

    # Keep only positions whose probability is at least 0.001.
    df = df[df["Probabilty"] >= 0.001]
    df.to_csv("./Outputs/{}.bed".format(Fasta_Pos.loc[i, "name"]), sep="\t", index=False)


### Select negative strand

In [16]:
# Keep only the rows annotated on the "-" strand.
Fasta_Neg = Fasta.loc[Fasta["strand"] == "-"]

In [17]:
# Renumber the rows 0..n-1 (discarding the old index) so that the loop below
# can address row i with Fasta_Neg.loc[i, ...].
Fasta_Neg = Fasta_Neg.reset_index(drop = True)

In [18]:
# Pull the three sequence columns of the negative-strand table into plain
# Python lists (one entry per row). This rebinds the same three names that
# were used for the positive strand above.
PreSequence, LastIntronSequence, LastExonSequence = (
    Fasta_Neg[column].tolist()
    for column in ("PreSequence", "LastIntronSequence", "LastExonSequence")
)

In [None]:
# For every negative-strand row: predict a per-position probability with the
# model ensemble and write one tab-separated table to ./Outputs/<name>.bed.
#
# Rows are read straight from Fasta_Neg. The original looped over
# len(Sequence), a name that is never defined (NameError), and read the shared
# PreSequence/LastIntronSequence/LastExonSequence lists, which are also used
# for the positive strand.
# NOTE(review): the ./Outputs directory must already exist.
for i in range(len(Fasta_Neg)):
    Pre = Fasta_Neg.loc[i, "PreSequence"]
    LastIntron = Fasta_Neg.loc[i, "LastIntronSequence"]
    LastExon = Fasta_Neg.loc[i, "LastExonSequence"]

    # The last intron is left out of the model input.
    input_sequence = Pre + LastExon

    # Pad context // 2 'N' bases on each side; [None, :] adds a leading batch axis.
    flank = 'N' * (context // 2)
    x = one_hot_encode(flank + input_sequence + flank)[None, :]

    # Ensemble: average the predictions of every loaded model
    # (previously hard-coded to the first five).
    y = np.mean([model.predict(x) for model in models], axis=0)
    m6AAI_prob = y[0, :, 1].tolist()

    # Put the intron back as zeros so the list covers Pre + intron + last exon.
    # Assumes the models return one value per unpadded input base - TODO confirm.
    m6AAI_prob = m6AAI_prob[:len(Pre)] + [0] * len(LastIntron) + m6AAI_prob[len(Pre):]

    # Reverse the order so the first probability lines up with txStart
    # (same result as the original sort_index(ascending=False) + reset_index).
    # Presumably the stored negative-strand sequences run opposite to the
    # ascending genomic coordinates - confirm against how the CSV was built.
    # The header spelling "Probabilty" is kept so the output format is unchanged.
    Probability = pd.DataFrame({"Probabilty": m6AAI_prob[::-1]})

    # Constant annotation columns, one row per position ("Length" rows).
    df = pd.DataFrame(
        {"name": Fasta_Neg.loc[i, "name"],
         "chrom": Fasta_Neg.loc[i, "chrom"],
         "strand": Fasta_Neg.loc[i, "strand"]},
        index=range(Fasta_Neg.loc[i, "Length"]),
        columns=["name", "chrom", "strand"],
    )

    # One coordinate per position in [txStart, txEnd).
    # NOTE(review): End equals Start, so each row is a single coordinate rather
    # than a half-open interval, and the file is written with a header row -
    # confirm downstream tools expect this.
    positions = np.arange(Fasta_Neg.loc[i, "txStart"], Fasta_Neg.loc[i, "txEnd"])
    coords = pd.DataFrame({"Start": positions, "End": positions},
                          columns=["Start", "End"])

    # The pieces are aligned by row position; Length, txEnd - txStart and the
    # probability list are expected to agree (concat pads with NaN otherwise).
    df = pd.concat([df, coords, Probability], axis=1)

    # Keep only positions whose probability is at least 0.001.
    df = df[df["Probabilty"] >= 0.001]
    df.to_csv("./Outputs/{}.bed".format(Fasta_Neg.loc[i, "name"]), sep="\t", index=False)