In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

In [2]:
# Read the database of alignments predicted (and most probably confirmed) by other researchers.
# The file is tab-separated; error_bad_lines=False asks pandas to skip malformed rows instead of raising.
# NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed in 2.0
# (its replacement is on_bad_lines='skip') - update when the environment is upgraded.
rawtxt = pd.read_csv('miRNA_targets_hsa.txt',sep='\t',engine='c',error_bad_lines=False)

In [3]:
# Keep only the rows produced by one prediction tool. miRanda is the tool currently
# selected; the TargetScan filter survives only as the trailing comment.
# NOTE(review): the variable is still named `tarscan` although it holds miRanda rows.
tarscan = rawtxt[rawtxt["tool name"]=="miRanda"] #"TargetScan"]

In [4]:
# Preview the three alignment columns ('alinment' is the spelling used by the column header).
tarscan[['miRNA 3-5','alinment','target 5-3']]

Unnamed: 0,miRNA 3-5,alinment,target 5-3
1254005,uUGGUGUGUUGG-AUGAUGGAGU,|||| || || ||| |||||,gACCAGACTCCCGGACT-CCTCA
1254006,uUGGUAUGUUGG-AUGAUGGAGU,|||| || || ||| |||||,gACCAGACTCCCGGACT-CCTCA
1254007,ugauAUGUUGGAGGAUGGAgu,| || :|||||:|||,ggggTCCACTCTCCTGCCTgc
1254008,gugUUCAAGCCUAGAUGCCCAa,||| | |: || |||||,gggAAGGTGAGGGCTTCGGGTg
1254009,uguuUGUGGUAACA-GUGUGAGGu,||| : || ||| ||||,gggcACAGAGGGGTCCACTCTCCt
1254010,gugUCCAAUUUCCCAGAGUCCCu,|||| :|||| ||: |||,ggaAGGT--GAGGG-CTTCGGGt
1254011,ucGGUUCGAGUCUGCCUAGGcu,|:::|| |||| ||:|||,ctCTGGGCACAGAGGGGTCCac
1254012,ugucGACCAACUUCCCCUGGuu,||| || ||||||,tctcCTGCCTG-CTGGGACCta
1254013,aucGACCAACUUCCCCUGGuu,||| || ||||||,ctcCTGCCTG-CTGGGACCta
1254014,GGUAGAAAUGGUCUGUCACaau,||: | :|||:|||||,CCGGACTCCTCAGGCAGTGccc


In [5]:
# Keep the three alignment columns, discard repeated rows and renumber the index from 0.
dropdup = (
    tarscan.loc[:, ['miRNA 3-5', 'alinment', 'target 5-3']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [6]:
# Show the de-duplicated alignment table.
dropdup

Unnamed: 0,miRNA 3-5,alinment,target 5-3
0,uUGGUGUGUUGG-AUGAUGGAGU,|||| || || ||| |||||,gACCAGACTCCCGGACT-CCTCA
1,uUGGUAUGUUGG-AUGAUGGAGU,|||| || || ||| |||||,gACCAGACTCCCGGACT-CCTCA
2,ugauAUGUUGGAGGAUGGAgu,| || :|||||:|||,ggggTCCACTCTCCTGCCTgc
3,gugUUCAAGCCUAGAUGCCCAa,||| | |: || |||||,gggAAGGTGAGGGCTTCGGGTg
4,uguuUGUGGUAACA-GUGUGAGGu,||| : || ||| ||||,gggcACAGAGGGGTCCACTCTCCt
5,gugUCCAAUUUCCCAGAGUCCCu,|||| :|||| ||: |||,ggaAGGT--GAGGG-CTTCGGGt
6,ucGGUUCGAGUCUGCCUAGGcu,|:::|| |||| ||:|||,ctCTGGGCACAGAGGGGTCCac
7,ugucGACCAACUUCCCCUGGuu,||| || ||||||,tctcCTGCCTG-CTGGGACCta
8,aucGACCAACUUCCCCUGGuu,||| || ||||||,ctcCTGCCTG-CTGGGACCta
9,GGUAGAAAUGGUCUGUCACaau,||: | :|||:|||||,CCGGACTCCTCAGGCAGTGccc


In [7]:
# Add the string length of each miRNA and each target sequence as new columns.
dropdup['lenmi'] = dropdup['miRNA 3-5'].map(len)
dropdup['lenta'] = dropdup['target 5-3'].map(len)

In [18]:
# Longest miRNA string in the table; used as the width of every encoded row.
maxlen = int(dropdup['lenmi'].max())
def encoding(miseq,taseq,alinl):
    """Score a miRNA/target pair position by position, writing the scores into ``alinl``.

    Both sequences are upper-cased, cut to the length of the shorter one and then
    reversed, so that index 0 of ``alinl`` corresponds to the last kept character
    (the reversal is meant to put the seed region first).

    Scores per position:
        3 - G/C pair
        2 - A/U or A/T pair
        1 - G/U or G/T pair
        0 - any other listed pair, a gap ('-') on either side, or identical symbols

    Parameters
    ----------
    miseq : str
        miRNA sequence.
    taseq : str
        Target sequence.
    alinl : mutable sequence (e.g. a list or a numpy row)
        Output buffer; only the first ``min(len(miseq), len(taseq))`` entries are
        overwritten, the rest are left as they were.

    Returns
    -------
    The same ``alinl`` object, modified in place.

    Raises
    ------
    KeyError
        If two different symbols form a pair that is not in the score table.
    IndexError
        If ``alinl`` is shorter than the overlap of the two sequences.
    """
    # Pair scores, grouped by how strongly the two bases are considered to bind.
    pair_score = {
        'CG': 3, 'GC': 3,
        'AU': 2, 'UA': 2, 'AT': 2, 'TA': 2,
        'UG': 1, 'GU': 1, 'TG': 1, 'GT': 1,
        'AC': 0, 'CA': 0, 'AG': 0, 'GA': 0,
        'CU': 0, 'UC': 0, 'CT': 0, 'TC': 0,
        'TU': 0, 'UT': 0,
    }

    mi_upper = miseq.upper()
    ta_upper = taseq.upper()
    # Drop the tail of the longer sequence: nothing can align there.
    overlap = min(len(mi_upper), len(ta_upper))
    mi_rev = mi_upper[:overlap][::-1]
    ta_rev = ta_upper[:overlap][::-1]

    for pos, (mi, ta) in enumerate(zip(mi_rev, ta_rev)):
        if '-' in (mi, ta) or mi == ta:
            # a gap or two identical symbols cannot pair
            alinl[pos] = 0
            continue
        alinl[pos] = pair_score[mi + ta]
    return alinl

In [9]:
# Drop the first row of the table (the reason is not recorded in the notebook).
# NOTE(review): this rebinds `dropdup` in place, so every re-run of the cell removes
# one more row; the index is not reset afterwards.
dropdup = dropdup[1:]

In [19]:
# Build the feature matrix `alin`: 2*halflen negative rows followed by 2*halflen positive rows.
#
# Negative examples: the database is sorted by miRNA, so miRNAs from one half of the
# table are paired with targets from the other half, each side shuffled independently.
# NOTE(review): shuffle() is not seeded, so the negatives differ on every run, and a
# random pair may by chance be a real interaction.
halflen = len(dropdup) // 2
# Allocate exactly the rows that get filled (equals len(dropdup)*2 for an even row count).
alin = np.zeros((halflen * 4, maxlen))

# negatives, rows [0, halflen): miRNAs of the second half vs targets of the first half
mirs0 = shuffle(dropdup['miRNA 3-5'][halflen:]).reset_index(drop=True)
gens0 = shuffle(dropdup['target 5-3'][:halflen]).reset_index(drop=True)
# zip() stops at the shorter series. With an odd number of rows the second half holds
# one extra miRNA, and indexing gens0 with it used to raise a KeyError.
for l, (mir, gen) in enumerate(zip(mirs0, gens0)):
    alin[l] = encoding(mir, gen, alin[l])

# negatives, rows [halflen, 2*halflen): miRNAs of the first half vs targets of the second half
mirs0 = shuffle(dropdup['miRNA 3-5'][:halflen]).reset_index(drop=True)
gens0 = shuffle(dropdup['target 5-3'][halflen:]).reset_index(drop=True)
for l, (mir, gen) in enumerate(zip(mirs0, gens0)):
    alin[l + halflen] = encoding(mir, gen, alin[l + halflen])

# positive examples, rows [2*halflen, 4*halflen): the pairs exactly as listed in the table
# (with an odd row count the last pair is left out so both classes have the same size)
mirs1 = dropdup['miRNA 3-5'].reset_index(drop=True)
gens1 = dropdup['target 5-3'].reset_index(drop=True)
for l in range(halflen * 2):
    alin[l + halflen * 2] = encoding(mirs1[l], gens1[l], alin[l + halflen * 2])

In [20]:
# prepare the data for the neural net
# Labels: 2*halflen zeros (negatives) followed by 2*halflen ones (positives).
from numpy.random import permutation
y0 = np.zeros(2 * halflen)
y1 = np.ones(2 * halflen)
# A single random order is applied to labels and features so rows stay paired.
# NOTE(review): the labels are rebuilt in sorted order on every run of this cell while
# `alin` is permuted in place - running the cell twice breaks the row/label pairing.
y_perm = permutation(4 * halflen)
y_target = np.concatenate((y0, y1))[y_perm]
alin = alin[y_perm]

In [21]:
# Insert a length-1 middle axis: (samples, features) -> (samples, 1, features).
alin = np.reshape(alin, (alin.shape[0], 1, alin.shape[1]))

In [22]:
# Check the shape after the reshape: (samples, 1, features).
alin.shape

(1204244, 1, 53)

In [201]:
# Persist the shuffled labels and encoded alignments so another run can reload them.
# NOTE(review): the arrays are saved in whatever shape they have when this cell runs;
# at this position in the notebook y_target has not yet been reshaped to (N, 1, 1),
# while the cell that reloads these files prints a 3-D label array - confirm cell order.
np.save('y_target.npy',y_target)
np.save('alinment.npy',alin)

In [23]:
import keras
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, Reshape, Merge
from keras.layers import Conv2D,Conv1D, MaxPooling2D, MaxPooling1D
from keras.optimizers import SGD, RMSprop

Using TensorFlow backend.


In [24]:
# Classifier over the encoded alignment rows: one Conv1D stage followed by dense layers
# down to a single output unit.
# NOTE(review): Keras Conv1D reads its input as (batch, steps, channels); with `alin`
# shaped (N, 1, features) the convolution would see a single step - confirm this is intended.
# NOTE(review): the output uses ReLU with a mean-squared-error loss; the inline markers
# name sigmoid / binary_crossentropy as the alternatives that were considered.
model = Sequential([
    Conv1D(64, 3, input_shape=(None, alin.shape[2]), padding='same'),
    Activation('relu'),
    MaxPooling1D(pool_size=3, padding='same'),
    Dropout(0.25),
    Dense(32),
    Activation('relu'),
    Dropout(0.5),
    Dense(16),
    Activation('relu'),
    Dense(1),
    Activation('relu'),  # sigmoid
])

# SGD with Nesterov momentum and a small learning-rate decay
sgd = SGD(lr=0.001, decay=1e-6, momentum=0.9, nesterov=True)

model.compile(
    optimizer=sgd,
    loss='mean_squared_error',  # 'binary_crossentropy'
    metrics=['accuracy'],
)


In [26]:
# Give the labels two trailing length-1 axes: (samples,) -> (samples, 1, 1).
y_target = y_target.reshape(-1, 1, 1)

In [27]:
# test set: hold out the last 100000 (already shuffled) samples
alin_test, y_target_test = alin[-100000:], y_target[-100000:]

In [28]:
# Train on everything except the last 100000 samples, which form the held-out test set.
# validation_split=0.3 additionally reserves 30% of this training slice for validation.
# `epochs` is the Keras 2 keyword for what Keras 1 called `nb_epoch`.
model.fit(alin[:-100000], y_target[:-100000],
          batch_size=50,
          epochs=30,
          validation_split=0.3)



Train on 772970 samples, validate on 331274 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7ff1b9ad2c18>

In [28]:
# Shape of the data with the last 20000 samples removed.
# NOTE(review): the recorded output has 56 features while the shape shown after the
# reshape cell has 53, so this output stems from a different run than the cells above.
alin[:-20000].shape

(253192, 1, 56)

In [29]:
# Shape of the full feature array.
# NOTE(review): the recorded output (56 features) does not match the 53-feature shape
# shown earlier in the notebook - it was produced by a different run.
alin.shape

(273192, 1, 56)

In [29]:
# Score the in-memory model on the held-out test set; the result is [loss, accuracy]
# because the model was compiled with metrics=['accuracy'].
model.evaluate(alin_test,y_target_test)



[0.002091595371439762, 0.99764]

In [31]:
# Save the trained model to an HDF5 file; the file name records the architecture and the data source.
model.save('conv64-maxpool1d-32-16-1-sgd-relus-miRanda.h5')

In [32]:
# Load a previously saved model from disk.
# NOTE(review): this is NOT the file written by the model.save cell in this notebook
# (the name has no '-miRanda' suffix); presumably it is the model trained on TargetScan
# data - confirm that the file exists before running.
from keras.models import load_model
tarscan_model = load_model('conv64-maxpool1d-32-16-1-sgd-relus.h5')

In [34]:
# Override the computed row width with a fixed value.
# NOTE(review): presumably 56 is the input width the loaded model expects - confirm;
# encoding() will fail if a sequence pair overlaps by more than this many characters.
maxlen = 56

In [35]:
# Rebuild the feature matrix `alin` with the current `maxlen`: 2*halflen negative rows
# followed by 2*halflen positive rows.
#
# Negative examples: the database is sorted by miRNA, so miRNAs from one half of the
# table are paired with targets from the other half, each side shuffled independently.
# NOTE(review): shuffle() is not seeded, so the negatives differ on every run.
halflen = len(dropdup) // 2
# Allocate exactly the rows that get filled (equals len(dropdup)*2 for an even row count).
alin = np.zeros((halflen * 4, maxlen))

# negatives, rows [0, halflen): miRNAs of the second half vs targets of the first half
mirs0 = shuffle(dropdup['miRNA 3-5'][halflen:]).reset_index(drop=True)
gens0 = shuffle(dropdup['target 5-3'][:halflen]).reset_index(drop=True)
# zip() stops at the shorter series. With an odd number of rows the second half holds
# one extra miRNA, and indexing gens0 with it used to raise a KeyError.
for l, (mir, gen) in enumerate(zip(mirs0, gens0)):
    alin[l] = encoding(mir, gen, alin[l])

# negatives, rows [halflen, 2*halflen): miRNAs of the first half vs targets of the second half
mirs0 = shuffle(dropdup['miRNA 3-5'][:halflen]).reset_index(drop=True)
gens0 = shuffle(dropdup['target 5-3'][halflen:]).reset_index(drop=True)
for l, (mir, gen) in enumerate(zip(mirs0, gens0)):
    alin[l + halflen] = encoding(mir, gen, alin[l + halflen])

# positive examples, rows [2*halflen, 4*halflen): the pairs exactly as listed in the table
# (with an odd row count the last pair is left out so both classes have the same size)
mirs1 = dropdup['miRNA 3-5'].reset_index(drop=True)
gens1 = dropdup['target 5-3'].reset_index(drop=True)
for l in range(halflen * 2):
    alin[l + halflen * 2] = encoding(mirs1[l], gens1[l], alin[l + halflen * 2])

In [36]:
# prepare the data for the neural net
# Labels: 2*halflen zeros (negatives) followed by 2*halflen ones (positives),
# shuffled with the same random order as the feature rows.
from numpy.random import permutation
y0 = np.zeros(2 * halflen)
y1 = np.ones(2 * halflen)
y_perm = permutation(4 * halflen)
y_target = np.concatenate((y0, y1))[y_perm]
alin = alin[y_perm]

# add the length-1 axes: features -> (N, 1, width), labels -> (N, 1, 1)
alin = np.reshape(alin, (alin.shape[0], 1, alin.shape[1]))
y_target = y_target.reshape(-1, 1, 1)

# test set: the last 100000 shuffled samples
alin_test, y_target_test = alin[-100000:], y_target[-100000:]

In [37]:
# Score the model loaded from disk on the test slice built from the current table.
tarscan_model.evaluate(alin_test,y_target_test)



[0.043626307227732614, 0.94826]

In [41]:
# evaluate miRanda model on targetscan set
# The arrays come from the .npy files on disk; they are used exactly as stored
# (no reshape is applied here).
y_target = np.load('y_target.npy')
alin = np.load('alinment.npy')
print(alin.shape,y_target.shape)

# test set: last 20000 samples, features cut to the first 53 columns
# NOTE(review): presumably 53 is the input width of `model` - confirm.
alin_test, y_target_test = alin[-20000:, :, :53], y_target[-20000:]

model.evaluate(alin_test, y_target_test)

(273192, 1, 56) (273192, 1, 1)


[0.003357299303288164, 0.99625]