In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

In [2]:
# Load the database of alignments predicted (and partially confirmed) by other researchers.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0
# (replaced by `on_bad_lines='skip'`); kept as written for the pandas version this
# notebook was developed against.
rawtxt = pd.read_csv(
    'miRNA_targets_hsa.txt',
    sep='\t',
    engine='c',
    error_bad_lines=False,
)

In [3]:
# Keep only the TargetScan rows, because they carry the target sequences.
is_targetscan = rawtxt["tool name"].eq("TargetScan")
tarscan = rawtxt.loc[is_targetscan]

In [4]:
# Show the three alignment columns of the TargetScan rows.
tarscan.loc[:, ['miRNA 3-5', 'alinment', 'target 5-3']]

Unnamed: 0,miRNA 3-5,alinment,target 5-3
0,-------------------UUGAUAUGUUGGAU--GAUGGAGU,|| ||:| :|: ||||||||,GGGUGAUGUCCUUCUAGCCAAAGAUGCUGCUGCUCCUACCUCAC
1,-------------------UUGAUAUGUUGGAU--GAUGGAGU,|| ||:| :|: ||||||||,GGGUGAUGUCCUUCUAGCCAAAGAUGCUGCUGCUCCUACCUCAC
2,---------UUGAUAUGU---UGGAU---------GAUGGAGU,::||| ||: ||| |||||||:,UGUGCCCCCGGCUACACGCGCACCGGGAGUGGGCUCUACCUCGG
3,U--------------------UGAUAU--GUUGGAUGAUGGAGU,:||: |::|:| |||||||,CUCAUCCCAGCCAUCACGACUGCUGACGCCGGCUU-CUACCUCU
4,---------UUGAUAUGU---UGGAU---------GAUGGAGU,::||| ||: ||| |||||||:,UGUGCCCCCGGCUACACGCGCACCGGGAGUGGGCUCUACCUCGG
5,U--------------------UGAUAU--GUUGGAUGAUGGAGU,:||: |::|:| |||||||,CUCAUCCCAGCCAUCACGACUGCUGACGCCGGCUU-CUACCUCU
6,U----------UGAUA-UG--------UUGGAU--GAUGGAGU,:||: :| ::||| ||||||||,GCCCUGGAGCUGCUGGGGCCAAGCCAAGGCCUCCCCUACCUCAA
7,U----------UGAUA-UG--------UUGGAU--GAUGGAGU,:||: :| ::||| ||||||||,GCCCUGGAGCUGCUGGGGCCAAGCCAAGGCCUCCCCUACCUCAA
8,U----------UGAUA-UG--------UUGGAU--GAUGGAGU,:||: :| ::||| ||||||||,GCCCUGGAGCUGCUGGGGCCAAGCCAAGGCCUCCCCUACCUCAA
9,UU---------------------GAUAUGUU-GGAUGAUGGAGU,||| :| || ||||||||,CAAUAGACUGCCCUGGCUCUUUCCUAGGCCUUCC-ACUACCUCC


In [5]:
# Reduce to the alignment columns, drop repeated alignments and renumber the rows from 0.
alignment_cols = ['miRNA 3-5', 'alinment', 'target 5-3']
dropdup = (
    tarscan.loc[:, alignment_cols]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [6]:
# Inspect the de-duplicated alignment table.
dropdup

Unnamed: 0,miRNA 3-5,alinment,target 5-3
0,-------------------UUGAUAUGUUGGAU--GAUGGAGU,|| ||:| :|: ||||||||,GGGUGAUGUCCUUCUAGCCAAAGAUGCUGCUGCUCCUACCUCAC
1,---------UUGAUAUGU---UGGAU---------GAUGGAGU,::||| ||: ||| |||||||:,UGUGCCCCCGGCUACACGCGCACCGGGAGUGGGCUCUACCUCGG
2,U--------------------UGAUAU--GUUGGAUGAUGGAGU,:||: |::|:| |||||||,CUCAUCCCAGCCAUCACGACUGCUGACGCCGGCUU-CUACCUCU
3,U----------UGAUA-UG--------UUGGAU--GAUGGAGU,:||: :| ::||| ||||||||,GCCCUGGAGCUGCUGGGGCCAAGCCAAGGCCUCCCCUACCUCAA
4,UU---------------------GAUAUGUU-GGAUGAUGGAGU,||| :| || ||||||||,CAAUAGACUGCCCUGGCUCUUUCCUAGGCCUUCC-ACUACCUCC
5,-------------UUGAUAUGU-------UGGAU-GAUGGAGU,||||:|| | ||: |||||||:,UUCCCCAUCUGUAAACUGUAGAUAUGACUACUGACCUACCUCGC
6,---------------UUGAUAUGU----UGGA--UGAUGGAGU,::| :|| :|| ||||||||,CGGGAGGCGGUCAGUGGCCUGGCAAAGAGCCGGGACUACCUCC
7,U------------------------UGAUAUGUUGGAUGAUGGAGU,||| ::|||:|||||||,AGGCAGGGGGCAGAAUCUUUUUUUCACUU---GGCCUGCUACCUCC
8,UUGA-------------UAUGUUGGAU---------GAUGGAGU,||:| |||: |||||||,UAGGUUAGUGAUGUGAAAUGCU-CCUGUCCCUGGCCCUACCUCC
9,UU----------------GAUA--UGUUGGAU---GAUGGAGU,||: || ||| |||||||:,UGGCCCUCCCCUGCUGCCCUGAAGACCACCCCAGUCUACCUCGG


In [7]:
# Add the character length of each aligned miRNA and target string as new columns.
dropdup['lenmi'] = list(map(len, dropdup['miRNA 3-5']))
dropdup['lenta'] = list(map(len, dropdup['target 5-3']))

In [8]:
# Longest aligned miRNA string; used as the width of every encoded row.
maxlen = int(dropdup['lenmi'].max())
# Score of each ordered pair (miRNA base + target base), graded by biological importance.
# Every pair that is not listed here scores 0.
_PAIR_SCORES = {'AU': 2, 'UA': 2, 'CG': 3, 'GC': 3, 'UG': 1, 'GU': 1}


def encoding(miseq, taseq, alinl):
    """Score an aligned miRNA/target pair column by column, starting from the right end.

    Both strings are cut to the length of the shorter one (keeping their left
    part), reversed, and then compared position by position. Each position is
    scored 3 for G-C, 2 for A-U, 1 for G-U and 0 for everything else: a gap
    ('-') on either side, two identical symbols, any other mismatch, or a
    symbol outside A/C/G/U (which previously raised ``KeyError``).

    Parameters
    ----------
    miseq : str
        Aligned miRNA string.
    taseq : str
        Aligned target string.
    alinl : list or 1-D numpy array
        Output buffer, modified in place. Only the first
        ``min(len(miseq), len(taseq), len(alinl))`` entries are overwritten;
        the remaining entries keep whatever value they had.

    Returns
    -------
    The same ``alinl`` object that was passed in.
    """
    # Cut the "tails" that cannot take part in the alignment.
    minlen = min(len(miseq), len(taseq))
    # Reverse so that the seed region ends up at the start of the encoded vector.
    mi_rev = miseq[:minlen][::-1]
    ta_rev = taseq[:minlen][::-1]

    # Never write past the end of the output buffer.
    n_scored = min(minlen, len(alinl))
    for j in range(n_scored):
        alinl[j] = _PAIR_SCORES.get(mi_rev[j] + ta_rev[j], 0)
    return alinl

In [11]:
# Discard the first row of the table.
# NOTE(review): this cell is not idempotent — every re-run removes one more row.
dropdup = dropdup.iloc[1:]

In [None]:
# Build the encoded data set: mismatched (negative) pairs first, then the true pairs.
# The database is sorted by miRNA, so every negative pairs a miRNA taken from one half
# of the table with a target taken from the other half, each half shuffled independently.
SHUFFLE_SEED = 0  # fixed seed so that Restart & Run All reproduces the same negatives
shuffle_rng = np.random.RandomState(SHUFFLE_SEED)

halflen = len(dropdup) // 2
mirna_col = dropdup['miRNA 3-5']
target_col = dropdup['target 5-3']

# Row layout: [0, 2*halflen) negatives, [2*halflen, 4*halflen) positives.
# Sizing by halflen (not len(dropdup)) keeps this layout valid for an odd row count.
alin = np.zeros((halflen * 4, maxlen))


def _encode_pairs(mirnas, targets, out, first_row):
    """Encode each (miRNA, target) pair into consecutive rows of `out`, starting at `first_row`."""
    # zip stops at the shorter input, so halves of unequal size cannot index out of range.
    for offset, (mirna, target) in enumerate(zip(mirnas, targets)):
        row = first_row + offset
        out[row] = encoding(mirna, target, out[row])


# negative examples, part 1: miRNAs of the second half against targets of the first half
mirs0 = shuffle(mirna_col.iloc[halflen:], random_state=shuffle_rng).reset_index(drop=True)
gens0 = shuffle(target_col.iloc[:halflen], random_state=shuffle_rng).reset_index(drop=True)
_encode_pairs(mirs0, gens0, alin, 0)

# negative examples, part 2: miRNAs of the first half against targets of the second half
mirs0 = shuffle(mirna_col.iloc[:halflen], random_state=shuffle_rng).reset_index(drop=True)
gens0 = shuffle(target_col.iloc[halflen:], random_state=shuffle_rng).reset_index(drop=True)
_encode_pairs(mirs0, gens0, alin, halflen)

# positive examples: the original pairs, limited to the 2*halflen rows that have a slot
mirs1 = mirna_col.iloc[:halflen * 2].reset_index(drop=True)
gens1 = target_col.iloc[:halflen * 2].reset_index(drop=True)
_encode_pairs(mirs1, gens1, alin, halflen * 2)

In [15]:
# Label the encoded rows and shuffle examples and labels together for the neural net.
from numpy.random import permutation

n_per_class = halflen * 2
# The first block of rows (the negatives) gets label 0, the second block (the positives) label 1.
y0 = np.zeros(n_per_class)
y1 = np.ones(n_per_class)
# A single shared permutation keeps every row of `alin` aligned with its label.
y_perm = permutation(n_per_class * 2)
y_target = np.concatenate((y0, y1))[y_perm]
alin = alin[y_perm]

In [16]:
# Insert a singleton middle axis: (rows, features) -> (rows, 1, features).
# NOTE(review): not idempotent — on the already 3-D array shape[1] is 1 and the reshape fails.
n_rows, n_features = alin.shape[0], alin.shape[1]
alin = alin.reshape(n_rows, 1, n_features)

In [17]:
# Check the array shape after adding the singleton axis.
alin.shape

(273192, 1, 56)

In [201]:
# Persist the encoded alignments and their labels as .npy files in the working directory.
np.save('alinment.npy', alin)
np.save('y_target.npy', y_target)

In [18]:
import keras
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, Reshape, Merge
from keras.layers import Conv2D,Conv1D, MaxPooling2D, MaxPooling1D
from keras.optimizers import SGD, RMSprop

Using TensorFlow backend.


In [31]:
# Small convolutional binary classifier over the encoded alignment vectors.
# NOTE(review): `alin` is reshaped to (rows, 1, features) before this cell, so the Conv1D
# below sees a length-1 sequence with alin.shape[2] channels — confirm this is intended.
model = Sequential([
    Conv1D(64, 3, input_shape=(None, alin.shape[2]), padding='same'),
    Activation('relu'),
    MaxPooling1D(pool_size=3, padding='same'),
    Dropout(0.25),
    Dense(32),
    Activation('relu'),
    Dropout(0.5),
    Dense(16),
    Activation('relu'),
    Dense(1),
    # NOTE(review): ReLU output trained with MSE; sigmoid + binary_crossentropy was
    # noted by the author as the alternative — confirm which one is wanted.
    Activation('relu'),
])

sgd = SGD(lr=0.001, decay=1e-6, momentum=0.9, nesterov=True)

model.compile(
    loss='mean_squared_error',
    optimizer=sgd,
    metrics=['accuracy'],
)


In [32]:
# Give the labels two trailing singleton axes: (rows,) -> (rows, 1, 1).
n_labels = y_target.shape[0]
y_target = y_target.reshape(n_labels, 1, 1)

In [33]:
# Hold out the final 20000 shuffled examples as the test set.
test_rows = slice(-20000, None)
y_target_test = y_target[test_rows]
alin_test = alin[test_rows]

In [34]:
# Train on every row except the last 20000 (kept aside as the test set);
# validation_split=0.3 holds out 30% of the training rows for validation.
# `epochs` is the Keras 2 name of the argument formerly called `nb_epoch`.
model.fit(alin[:-20000], y_target[:-20000],
          batch_size=50,
          epochs=30,
          validation_split=0.3)



Train on 177234 samples, validate on 75958 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f0d9c875c18>

In [28]:
# Shape of the slice passed to model.fit (all rows except the last 20000).
alin[:-20000].shape

(253192, 1, 56)

In [29]:
# Shape of the full encoded array, for comparison with the training slice.
alin.shape

(273192, 1, 56)

In [35]:
# Score the held-out examples; with the model's compile settings this yields [loss, accuracy].
model.evaluate(x=alin_test, y=y_target_test)



[0.001506077717943117, 0.99845]