In [1]:
import sys
import random
# Add the iM6A directory to the module search path so imports can resolve from it.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# find that directory on another host unless the same path exists there.
sys.path.append("/pod/2/ke-lab/LUOZ/Singularity/iM6A")

In [2]:
from keras.models import load_model
from pkg_resources import resource_filename
import numpy as np
import pandas as pd
from Bio.Seq import Seq
from Bio.SeqUtils import nt_search
import keras.backend as kb

Using TensorFlow backend.


In [3]:
def one_hot_encode(seq):
    """One-hot encode a nucleotide string.

    The input is upper-cased first, so it is case-insensitive. Each character
    becomes one row of four integers with the channel order A, C, G, T;
    ``N`` becomes an all-zero row.

    Args:
        seq: Nucleotide sequence as a ``str``.

    Returns:
        A 2-D integer ``numpy`` array with one row per encoded byte and four
        columns.

    Note:
        Characters other than A/C/G/T/N are not rejected: they are mapped by
        their byte value modulo 5, exactly as before this rewrite.
    """
    # Row 0 encodes N (all zeros); rows 1-4 encode A, C, G, T respectively.
    lookup = np.asarray([[0, 0, 0, 0],
                         [1, 0, 0, 0],
                         [0, 1, 0, 0],
                         [0, 0, 1, 0],
                         [0, 0, 0, 1]])

    # Replace each base by a control character whose byte value is the row
    # index in ``lookup``.
    seq = seq.upper().replace('A', '\x01').replace('C', '\x02')
    seq = seq.replace('G', '\x03').replace('T', '\x04').replace('N', '\x00')

    # np.fromstring's binary mode is deprecated (and rejected by recent NumPy);
    # np.frombuffer on the encoded bytes yields the same int8 codes.
    codes = np.frombuffer(seq.encode('utf-8'), dtype=np.int8)

    return lookup[codes % 5]

In [4]:
def categorical_crossentropy_2d(y_true, y_pred):
    """Two-class categorical cross entropy for sequence-shaped outputs.

    Both arguments are indexed as ``[:, :, channel]``; only channels 0 and 1
    are used. A small constant is added inside the logarithm so that a
    predicted value of exactly zero does not produce ``log(0)``.
    """
    eps = 1e-10

    class0_term = y_true[:, :, 0] * kb.log(y_pred[:, :, 0] + eps)
    class1_term = y_true[:, :, 1] * kb.log(y_pred[:, :, 1] + eps)

    return - kb.mean(class0_term + class1_term)

In [5]:
# Total amount of 'N' padding placed around each input sequence before
# prediction: context // 2 characters are prepended and context // 2 appended.
context = 10000

In [6]:
# Relative locations of the five model weight files (c1 .. c5).
# Built as a list rather than a generator expression: a generator is exhausted
# after one pass, so re-running the model-loading cell on its own would
# silently produce an empty ensemble.
paths = ['Test/mouseRAC10000_c{}.h5'.format(x) for x in range(1, 6)]

In [None]:
# Load one Keras model per weight file; each file is resolved inside the
# 'm6AAI' package, and the custom loss is supplied so the saved models can be
# deserialised.
models = [
    load_model(
        resource_filename('m6AAI', weights_file),
        custom_objects={'categorical_crossentropy_2d': categorical_crossentropy_2d},
    )
    for weights_file in paths
]

### Read data

In [9]:
# Load the per-site table. keep_default_na=False stops pandas from converting
# its default NA markers (strings such as "NA" or "NULL") into NaN, so those
# cells are kept as the literal text found in the file.
Data = pd.read_csv("Fasta_SNM_Long.csv", keep_default_na=False)

In [10]:
# Display the loaded table for a quick visual check.
Data

Unnamed: 0,name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,LastExonStart,LastExonEnd,m6AStart,...,GeneLength,LastExonLength,Pre,PreSequence,LastExonSequence,m6APosition,POS,Up,Mid,Down
0,Gpr39,chr1,+,125676994,125873862,125677336,125872884,125872369,125873862,125872731,...,196868,1493,195375,GGGACCTGGAAGGGAGAGGAGGATCCGCCGCCAGACAGAGTTCTGC...,GACTGATTGTGGTGACGTTGGCCGTGTGTTGGATGCCCAATCAGAT...,362,195737,GACTGATTGTGGTGACGTTGGCCGTGTGTTGGATGCCCAATCAGAT...,GATCCTCCTGCCCTTCTCTGATACCTTCTTCTACCTCAGCTCTGTG...,CTACTGGGACAGAATGGAAGCTCAAGGCTTTGGGAAAGGCAGATGC...
1,Khdrbs2,chr1,+,32172713,32658568,32172917,32657541,32657443,32658568,32657720,...,485855,1125,484730,GGCCGCACGGGCATCCTCTCCAGCTGAGGCCACGGCCGGAGCCTGG...,CACCAGAAGAATGGGCCACAACTCGCTCCAGCCTGAAGGCACCACC...,277,485007,CACCAGAAGAATGGGCCACAACTCGCT,CCAGCCTGAAGGCACCACCACCAAGGTCAGCCAGAGGGGGATACAG...,TTCGAACTAGCTGATAATAAAGTTGTAGATAAGATGTTTTAACCTG...
2,Sh3bp4,chr1,+,89070414,89155068,89137685,89153351,89153126,89155068,89153400,...,84654,1942,82712,agaccacctctgcgcccgcTGGCGGGGGCCGAGCGCTCTGGGCTGT...,GCCATGTGGAAACCTGCCTATGACTTCTTACTCACCTGGAGCCACC...,274,82986,GCCATGTGGAAACCTGCCTATGAC,TTCTTACTCACCTGGAGCCACCAGATTGGGGACAGCTACCGAGATG...,TGAAAATGTACGTTTAAATTTAAAATCACTTTTTTAAACAGAAGAC...
3,Cdh20,chr1,+,104768528,104995481,104934096,104994385,104993879,104995481,104994314,...,226953,1602,225351,AGAAAACGCGCGAAGGAGGGGAAGAAGCGGACCGCGCGCTGGTCTT...,TGTTGGTGCTGCTCATCCTGTCCATGAGGCGACATCGCAAACAACC...,435,225786,TGTTGGTGCTGCTCATCCTGTCCATGAGGCGACATCGCAAACAACC...,AAGACGCGCCAGGACATGCTCCCCGAGATCGAGAGCCTGTCCCGCT...,AGGAAACCCAGGAGAAGAGGGCAGAATCTCCAATTACCATTTTAAT...
4,Cdh7,chr1,+,109983736,110139001,109994179,110138355,110137861,110139001,110138138,...,155265,1140,154125,TTGGACATCTTCGTGGGTTCACTTGGCACAAGGCTAGTTGCTGGTA...,TGCTGATCCTCCTCATTGTCACTATGAGAAGACGAAAAAAAGAGCC...,277,154402,TGCTGATCCTCCTCATTGTCACTATGA,GAAGACGAAAAAAAGAGCCTCTCATTTTTGATGAGGAGAGGGATAT...,AAGTCAGCAGGAAAAAATTCAAAAGGAAAGAAAAGTGGAAAATACT...
5,Clasp1,chr1,+,118389057,118609497,118419723,118606994,118606907,118609497,118609181,...,220440,2590,217850,AAAGGAAGTTCCGGCTGGAAGCGCTCCTGACTCCTCTCTAGCTCTT...,ATGAAGCTGCTGAACTTATATATAAAGAGGGCCCAGACTACCAACA...,2274,220124,ATGAAGCTGCTGAACTTATATATAAAGAGGGCCCAGACTACCAACA...,GGGGGACCGCTCATCTGACTTCCTGCTTCCTGGCGAGCTGCCCCTC...,AAAAAAGAAAATTAAAAAAAAAAAAGCCGACCTTTTGAGTTTTTCC...
6,Thsd7b,chr1,+,129273343,130219278,129430830,130218180,130218098,130219278,130218485,...,945935,1180,944755,gccccgcggctcccggggACGCACCAGGCAGCCAATGGATGCGCAG...,CAAGAAGCCAAAACCACATCAAAGCACACCTCGCCATCAGAAGCCA...,387,945142,CAAGAAGCCAAAACCACATCAAAGCACACCTCGCCATCAGAAGCCA...,TTGTAGCTCTCAGACTTCTCAGTTCTTTGAGGAATCTCATGATGTG...,TAAATAATGTATTTTATTTTGTAGCCAGGGGATGATGGCACTTTGT...
7,Brinp3,chr1,+,146494759,146902471,146514717,146902117,146901000,146902471,146901878,...,407712,1471,406241,GACAGTAGATTTCCGAGAGGAAAGGAGAAGAGAAGATAGGGATGGA...,AACATCAACCTACTGGCTCACTCGCATCCAGTCTTTTCTCTACTGC...,878,407119,AACATCAACCTACTGGCTCACTCGCATCCAGTCTTTTCTCTACTGC...,CTACAGTGCTATAACTGGACACTGACTCTGGGAAACAAATGGAAGA...,AGCACAACCCAAAATCTTGAAGGAGTTTTTACAGTGCTTTTGTGGA...
8,Stx6,chr1,+,155158714,155205659,155158892,155202016,155201939,155205659,155205087,...,46945,3720,43225,GGAGGAGGCTGCCGTGGGGGCCGGCTTGGAGTGGGATTCGACCCGC...,ATCGGCGCCAGTGGTGTGCCATAGCCATCCTCTTCGCGGTCCTGGT...,3148,46373,ATCGGCGCCAGTGGTGTGCCATAGCCATCCTCTTCGCGGTCCTGGT...,GAGAAATCCAGTCTGATTCCAACAGAAATCTGTCTCTGTTAAGTGT...,TGAAGACTCGGCCACAGGATCCTCCCTGGACCCTCACAGCTGGGAG...
9,Cnih3,chr1,+,181352629,181460641,181353425,181459252,181459224,181460641,181459617,...,108012,1417,106595,ATTCCTCGCGGCGCCCgcggcagcagcagaagcagcagcagcagca...,CATGATTTACACCTTAGTGAGCTCCTAACTCAGACCCTGCTGATGG...,393,106988,CATGATTTACACCTTAGTGAGCTCCTAACTCAGACCCTGCTGATGG...,GGAGTCATACAGCAGTCAGAGCCATGTTACCCACGTGTGTCTGTTT...,AACTGGGGAGGGGGAGAAAATGGATTTCTAAAAATGTCTGCAACAT...


In [11]:
# Keep the rows labelled 0 through 499 (.loc slices are label-based and include
# the end label), then renumber the index from zero.
Data = Data.loc[0:499, :].reset_index(drop=True)

In [12]:
# Display the table again after the row selection above.
Data

Unnamed: 0,name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,LastExonStart,LastExonEnd,m6AStart,...,GeneLength,LastExonLength,Pre,PreSequence,LastExonSequence,m6APosition,POS,Up,Mid,Down
0,Gpr39,chr1,+,125676994,125873862,125677336,125872884,125872369,125873862,125872731,...,196868,1493,195375,GGGACCTGGAAGGGAGAGGAGGATCCGCCGCCAGACAGAGTTCTGC...,GACTGATTGTGGTGACGTTGGCCGTGTGTTGGATGCCCAATCAGAT...,362,195737,GACTGATTGTGGTGACGTTGGCCGTGTGTTGGATGCCCAATCAGAT...,GATCCTCCTGCCCTTCTCTGATACCTTCTTCTACCTCAGCTCTGTG...,CTACTGGGACAGAATGGAAGCTCAAGGCTTTGGGAAAGGCAGATGC...
1,Khdrbs2,chr1,+,32172713,32658568,32172917,32657541,32657443,32658568,32657720,...,485855,1125,484730,GGCCGCACGGGCATCCTCTCCAGCTGAGGCCACGGCCGGAGCCTGG...,CACCAGAAGAATGGGCCACAACTCGCTCCAGCCTGAAGGCACCACC...,277,485007,CACCAGAAGAATGGGCCACAACTCGCT,CCAGCCTGAAGGCACCACCACCAAGGTCAGCCAGAGGGGGATACAG...,TTCGAACTAGCTGATAATAAAGTTGTAGATAAGATGTTTTAACCTG...
2,Sh3bp4,chr1,+,89070414,89155068,89137685,89153351,89153126,89155068,89153400,...,84654,1942,82712,agaccacctctgcgcccgcTGGCGGGGGCCGAGCGCTCTGGGCTGT...,GCCATGTGGAAACCTGCCTATGACTTCTTACTCACCTGGAGCCACC...,274,82986,GCCATGTGGAAACCTGCCTATGAC,TTCTTACTCACCTGGAGCCACCAGATTGGGGACAGCTACCGAGATG...,TGAAAATGTACGTTTAAATTTAAAATCACTTTTTTAAACAGAAGAC...
3,Cdh20,chr1,+,104768528,104995481,104934096,104994385,104993879,104995481,104994314,...,226953,1602,225351,AGAAAACGCGCGAAGGAGGGGAAGAAGCGGACCGCGCGCTGGTCTT...,TGTTGGTGCTGCTCATCCTGTCCATGAGGCGACATCGCAAACAACC...,435,225786,TGTTGGTGCTGCTCATCCTGTCCATGAGGCGACATCGCAAACAACC...,AAGACGCGCCAGGACATGCTCCCCGAGATCGAGAGCCTGTCCCGCT...,AGGAAACCCAGGAGAAGAGGGCAGAATCTCCAATTACCATTTTAAT...
4,Cdh7,chr1,+,109983736,110139001,109994179,110138355,110137861,110139001,110138138,...,155265,1140,154125,TTGGACATCTTCGTGGGTTCACTTGGCACAAGGCTAGTTGCTGGTA...,TGCTGATCCTCCTCATTGTCACTATGAGAAGACGAAAAAAAGAGCC...,277,154402,TGCTGATCCTCCTCATTGTCACTATGA,GAAGACGAAAAAAAGAGCCTCTCATTTTTGATGAGGAGAGGGATAT...,AAGTCAGCAGGAAAAAATTCAAAAGGAAAGAAAAGTGGAAAATACT...
5,Clasp1,chr1,+,118389057,118609497,118419723,118606994,118606907,118609497,118609181,...,220440,2590,217850,AAAGGAAGTTCCGGCTGGAAGCGCTCCTGACTCCTCTCTAGCTCTT...,ATGAAGCTGCTGAACTTATATATAAAGAGGGCCCAGACTACCAACA...,2274,220124,ATGAAGCTGCTGAACTTATATATAAAGAGGGCCCAGACTACCAACA...,GGGGGACCGCTCATCTGACTTCCTGCTTCCTGGCGAGCTGCCCCTC...,AAAAAAGAAAATTAAAAAAAAAAAAGCCGACCTTTTGAGTTTTTCC...
6,Thsd7b,chr1,+,129273343,130219278,129430830,130218180,130218098,130219278,130218485,...,945935,1180,944755,gccccgcggctcccggggACGCACCAGGCAGCCAATGGATGCGCAG...,CAAGAAGCCAAAACCACATCAAAGCACACCTCGCCATCAGAAGCCA...,387,945142,CAAGAAGCCAAAACCACATCAAAGCACACCTCGCCATCAGAAGCCA...,TTGTAGCTCTCAGACTTCTCAGTTCTTTGAGGAATCTCATGATGTG...,TAAATAATGTATTTTATTTTGTAGCCAGGGGATGATGGCACTTTGT...
7,Brinp3,chr1,+,146494759,146902471,146514717,146902117,146901000,146902471,146901878,...,407712,1471,406241,GACAGTAGATTTCCGAGAGGAAAGGAGAAGAGAAGATAGGGATGGA...,AACATCAACCTACTGGCTCACTCGCATCCAGTCTTTTCTCTACTGC...,878,407119,AACATCAACCTACTGGCTCACTCGCATCCAGTCTTTTCTCTACTGC...,CTACAGTGCTATAACTGGACACTGACTCTGGGAAACAAATGGAAGA...,AGCACAACCCAAAATCTTGAAGGAGTTTTTACAGTGCTTTTGTGGA...
8,Stx6,chr1,+,155158714,155205659,155158892,155202016,155201939,155205659,155205087,...,46945,3720,43225,GGAGGAGGCTGCCGTGGGGGCCGGCTTGGAGTGGGATTCGACCCGC...,ATCGGCGCCAGTGGTGTGCCATAGCCATCCTCTTCGCGGTCCTGGT...,3148,46373,ATCGGCGCCAGTGGTGTGCCATAGCCATCCTCTTCGCGGTCCTGGT...,GAGAAATCCAGTCTGATTCCAACAGAAATCTGTCTCTGTTAAGTGT...,TGAAGACTCGGCCACAGGATCCTCCCTGGACCCTCACAGCTGGGAG...
9,Cnih3,chr1,+,181352629,181460641,181353425,181459252,181459224,181460641,181459617,...,108012,1417,106595,ATTCCTCGCGGCGCCCgcggcagcagcagaagcagcagcagcagca...,CATGATTTACACCTTAGTGAGCTCCTAACTCAGACCCTGCTGATGG...,393,106988,CATGATTTACACCTTAGTGAGCTCCTAACTCAGACCCTGCTGATGG...,GGAGTCATACAGCAGTCAGAGCCATGTTACCCACGTGTGTCTGTTT...,AACTGGGGAGGGGGAGAAAATGGATTTCTAAAAATGTCTGCAACAT...


In [13]:
# Pull the four sequence columns out of the table as plain Python lists so the
# mutation loop can index them by row number.
PreSequence, Up, Mid, Down = (
    Data[column].tolist() for column in ("PreSequence", "Up", "Mid", "Down")
)

### Saturation Single Nucleotide Mutation

In [14]:
def SingleNucleotideMutation(DNA):
    """Return the three alternative bases for a single reference nucleotide.

    Args:
        DNA: A one-character string, ``A``, ``C``, ``G`` or ``T`` in either
            case.

    Returns:
        A new list of the three other bases, in the fixed order used to fill
        the "I", "II" and "III" result tables.

    Raises:
        ValueError: If ``DNA`` is not one of A/C/G/T (for example ``N`` or an
            empty string). Previously this surfaced as an
            ``UnboundLocalError`` with no indication of the offending input.
    """
    # Order matters: position k of each tuple feeds the k-th output table.
    alternatives = {
        "A": ("T", "C", "G"),
        "T": ("A", "C", "G"),
        "C": ("T", "A", "G"),
        "G": ("T", "C", "A"),
    }

    base = DNA.upper()
    if base not in alternatives:
        raise ValueError(
            "Cannot mutate {!r}: expected a single A, C, G or T".format(DNA)
        )

    # Return a fresh list so callers cannot mutate the lookup table.
    return list(alternatives[base])


In [15]:
# Three independent result tables (one per alternative base), each starting
# with a single "Position" column running from -250 to 250 inclusive.
df0, df1, df2 = (
    pd.DataFrame({"Position": range(-250, 251)}) for _ in range(3)
)

In [None]:
def _site_probability(sequence, site_index):
    """Return the ensemble-averaged channel-1 prediction at ``site_index``.

    ``sequence`` is padded with ``context // 2`` 'N' characters on each side,
    one-hot encoded, and passed to every model in ``models``; the predictions
    are averaged across models before the value is read out.
    """
    flank = 'N' * (context // 2)
    x = one_hot_encode(flank + sequence + flank)[None, :]
    # Average over however many models were loaded instead of assuming five.
    y = np.mean([model.predict(x) for model in models], axis=0)
    return y[0, :, 1][site_index]


# Number of positions mutated in each "Mid" window (rows -250 .. 250).
n_positions = 501

# One list of per-gene result columns for each of the three alternative bases.
delta_columns = ([], [], [])

for i in range(len(Data)):

    # Sequence pieces for this row; only ``mid`` is altered below.
    upstream = PreSequence[i].upper() + Up[i].upper()
    mid = Mid[i].upper()
    down = Down[i].upper()

    # Loop-invariant lookups: "POS" is used as a 1-based index into the
    # prediction track, and the stored reference value is subtracted from
    # every mutant prediction.
    site_index = Data.loc[i, "POS"] - 1
    # NOTE(review): the column key is spelled "Probabilty"; it is kept as-is
    # because the full CSV header is not visible here - confirm it matches.
    baseline = Data.loc[i, "Probabilty"]

    deltas = ([], [], [])
    for j in range(n_positions):
        Mutation = SingleNucleotideMutation(mid[j:(j+1)])

        # The k-th alternative base feeds the k-th result table.
        for bucket, alt_base in zip(deltas, Mutation):
            # As before, anything in ``mid`` beyond n_positions is dropped.
            seq = mid[0:j] + alt_base + mid[(j+1):n_positions]
            mutant_value = _site_probability(upstream + seq + down, site_index)
            bucket.append(mutant_value - baseline)

    gene = Data.loc[i, "name"]
    for columns, suffix, values in zip(delta_columns, ("I", "II", "III"), deltas):
        # Series are collected (not dict entries) so that repeated gene names
        # still yield separate columns, as repeated pd.concat did.
        columns.append(pd.Series(values, name=gene + "_" + suffix))

# Attach all collected columns in one concat per table rather than once per gene.
df0 = pd.concat([df0] + delta_columns[0], axis=1)
df1 = pd.concat([df1] + delta_columns[1], axis=1)
df2 = pd.concat([df2] + delta_columns[2], axis=1)

  if sys.path[0] == '':


In [None]:
# Write the three result tables to CSV without the row index.
df0.to_csv("Prob_SNM_I_Long.csv", index=False)
df1.to_csv("Prob_SNM_II_Long.csv", index=False)
df2.to_csv("Prob_SNM_III_Long.csv", index=False)