In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from numpy.random import permutation

In [2]:
# Read the data base with other researchers' predicted (and partially
# confirmed) alignments. The file is tab-separated; malformed rows are skipped
# with a warning instead of aborting the load.
try:
    # pandas >= 1.3: `on_bad_lines` replaces `error_bad_lines`, which was
    # deprecated in 1.3 and removed in 2.0. 'warn' keeps the previous
    # behaviour of reporting every skipped line.
    rawtxt = pd.read_csv('miRNA_targets_hsa.txt', sep='\t', engine='c', on_bad_lines='warn')
except TypeError:
    # Older pandas rejects the unknown keyword; fall back to the legacy flag.
    rawtxt = pd.read_csv('miRNA_targets_hsa.txt', sep='\t', engine='c', error_bad_lines=False)

In [3]:
# Keep only the rows produced by TargetScan, because that tool provides the
# target sequences.
tarscan = rawtxt.loc[rawtxt["tool name"].eq("TargetScan")]

In [4]:
# Show the relevant columns. Only the first rows are rendered: a preview is
# enough here and the full frame is too long to display usefully.
tarscan[['miRNA 3-5','alinment','target 5-3']].head(10)

Unnamed: 0,miRNA 3-5,alinment,target 5-3
0,-------------------UUGAUAUGUUGGAU--GAUGGAGU,|| ||:| :|: ||||||||,GGGUGAUGUCCUUCUAGCCAAAGAUGCUGCUGCUCCUACCUCAC
1,-------------------UUGAUAUGUUGGAU--GAUGGAGU,|| ||:| :|: ||||||||,GGGUGAUGUCCUUCUAGCCAAAGAUGCUGCUGCUCCUACCUCAC
2,---------UUGAUAUGU---UGGAU---------GAUGGAGU,::||| ||: ||| |||||||:,UGUGCCCCCGGCUACACGCGCACCGGGAGUGGGCUCUACCUCGG
3,U--------------------UGAUAU--GUUGGAUGAUGGAGU,:||: |::|:| |||||||,CUCAUCCCAGCCAUCACGACUGCUGACGCCGGCUU-CUACCUCU
4,---------UUGAUAUGU---UGGAU---------GAUGGAGU,::||| ||: ||| |||||||:,UGUGCCCCCGGCUACACGCGCACCGGGAGUGGGCUCUACCUCGG
5,U--------------------UGAUAU--GUUGGAUGAUGGAGU,:||: |::|:| |||||||,CUCAUCCCAGCCAUCACGACUGCUGACGCCGGCUU-CUACCUCU
6,U----------UGAUA-UG--------UUGGAU--GAUGGAGU,:||: :| ::||| ||||||||,GCCCUGGAGCUGCUGGGGCCAAGCCAAGGCCUCCCCUACCUCAA
7,U----------UGAUA-UG--------UUGGAU--GAUGGAGU,:||: :| ::||| ||||||||,GCCCUGGAGCUGCUGGGGCCAAGCCAAGGCCUCCCCUACCUCAA
8,U----------UGAUA-UG--------UUGGAU--GAUGGAGU,:||: :| ::||| ||||||||,GCCCUGGAGCUGCUGGGGCCAAGCCAAGGCCUCCCCUACCUCAA
9,UU---------------------GAUAUGUU-GGAUGAUGGAGU,||| :| || ||||||||,CAAUAGACUGCCCUGGCUCUUUCCUAGGCCUUCC-ACUACCUCC


In [5]:
# Restrict to the three alignment columns, discard repeated rows and renumber
# the remaining rows from 0.
dropdup = (
    tarscan.loc[:, ['miRNA 3-5', 'alinment', 'target 5-3']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [6]:
# Preview the de-duplicated alignments (first rows only; the frame is large).
dropdup.head(10)

Unnamed: 0,miRNA 3-5,alinment,target 5-3
0,-------------------UUGAUAUGUUGGAU--GAUGGAGU,|| ||:| :|: ||||||||,GGGUGAUGUCCUUCUAGCCAAAGAUGCUGCUGCUCCUACCUCAC
1,---------UUGAUAUGU---UGGAU---------GAUGGAGU,::||| ||: ||| |||||||:,UGUGCCCCCGGCUACACGCGCACCGGGAGUGGGCUCUACCUCGG
2,U--------------------UGAUAU--GUUGGAUGAUGGAGU,:||: |::|:| |||||||,CUCAUCCCAGCCAUCACGACUGCUGACGCCGGCUU-CUACCUCU
3,U----------UGAUA-UG--------UUGGAU--GAUGGAGU,:||: :| ::||| ||||||||,GCCCUGGAGCUGCUGGGGCCAAGCCAAGGCCUCCCCUACCUCAA
4,UU---------------------GAUAUGUU-GGAUGAUGGAGU,||| :| || ||||||||,CAAUAGACUGCCCUGGCUCUUUCCUAGGCCUUCC-ACUACCUCC
5,-------------UUGAUAUGU-------UGGAU-GAUGGAGU,||||:|| | ||: |||||||:,UUCCCCAUCUGUAAACUGUAGAUAUGACUACUGACCUACCUCGC
6,---------------UUGAUAUGU----UGGA--UGAUGGAGU,::| :|| :|| ||||||||,CGGGAGGCGGUCAGUGGCCUGGCAAAGAGCCGGGACUACCUCC
7,U------------------------UGAUAUGUUGGAUGAUGGAGU,||| ::|||:|||||||,AGGCAGGGGGCAGAAUCUUUUUUUCACUU---GGCCUGCUACCUCC
8,UUGA-------------UAUGUUGGAU---------GAUGGAGU,||:| |||: |||||||,UAGGUUAGUGAUGUGAAAUGCU-CCUGUCCCUGGCCCUACCUCC
9,UU----------------GAUA--UGUUGGAU---GAUGGAGU,||: || ||| |||||||:,UGGCCCUCCCCUGCUGCCCUGAAGACCACCCCAGUCUACCUCGG


In [7]:
# Add two helper columns holding the length of each miRNA string and of each
# target string.
dropdup['lenmi'] = dropdup['miRNA 3-5'].map(len)
dropdup['lenta'] = dropdup['target 5-3'].map(len)

In [8]:
# Longest miRNA string; used below as the width of the encoded matrix.
maxlen = int(dropdup['lenmi'].max())
# Discard the first row.
# NOTE(review): the original comment said this removes "the first line with
# the column names", but the preview of `dropdup` earlier in the notebook shows
# an ordinary alignment in row 0 -- confirm the intent. Also note that every
# re-run of this cell removes one more row.
dropdup = dropdup.iloc[1:]
# define a function for data preprocessing
def encoding(miseq,taseq,alinl):
    """Score a miRNA/target pair position by position, seed region first.

    Both strings are cut to the length of the shorter one (the surplus tail
    cannot be aligned), reversed so that the seed region comes first, and then
    compared character by character.

    Parameters
    ----------
    miseq : str
        miRNA string; may contain '-' gap characters.
    taseq : str
        Target string; may contain '-' gap characters.
    alinl : mutable sequence of numbers (e.g. a list or a numpy row)
        Output buffer, modified in place. It needs at least
        ``min(len(miseq), len(taseq))`` slots; slots beyond that are left
        untouched.

    Returns
    -------
    The same ``alinl`` object. Position ``j`` is set to 3 for a C/G pair,
    2 for an A/U pair, 1 for a G/U pair and 0 for everything else: a gap,
    two identical symbols, a non-binding combination, or a symbol outside
    A/C/G/U (such as 'N'), which previously raised ``KeyError``.

    Raises
    ------
    IndexError
        If ``alinl`` is shorter than the overlap of the two strings.
    """
    # Possible bindings ranked by their biological importance; any combination
    # that is not listed scores 0.
    pair_score = {'AU':2,'UA':2,'CG':3,'GC':3,'UG':1,'GU':1}
    # Cut the "tails" where there is no alignment for sure.
    minlen = min(len(miseq),len(taseq))
    # Reverse both strings in order to place the seed region at the start.
    mi_rev = miseq[:minlen][::-1]
    ta_rev = taseq[:minlen][::-1]
    for j, (mi, ta) in enumerate(zip(mi_rev, ta_rev)):
        alinl[j] = pair_score.get(mi + ta, 0)
    return alinl

In [9]:
# make negative examples, then append the positive ones

NEG_SEED = 0  # fixed so the shuffled (negative) pairs are identical on every run
# One generator shared by all shuffles, so that each call draws a different order.
rng = np.random.RandomState(NEG_SEED)

# Both halves must have exactly `halflen` rows; with an odd row count the last
# row is left out (the unequal halves previously made the first loop fail).
halflen = len(dropdup) // 2
mirs_all = dropdup['miRNA 3-5'].iloc[:halflen*2]
gens_all = dropdup['target 5-3'].iloc[:halflen*2]

# rows [0, 2*halflen) hold the negatives, rows [2*halflen, 4*halflen) the positives
alin = np.zeros((halflen*4, maxlen))

def encode_rows(mirs, gens, first_row):
    """Encode every pair (mirs[i], gens[i]) into row first_row + i of `alin`."""
    for i, (mir, gen) in enumerate(zip(mirs, gens)):
        alin[first_row + i] = encoding(mir, gen, alin[first_row + i])

# the data base is sorted by mirnas, so pair the miRNAs of one half with the
# targets of the other half (both shuffled) to make the shuffling even more random
mirs0 = shuffle(mirs_all.iloc[halflen:], random_state=rng).reset_index(drop=True)
gens0 = shuffle(gens_all.iloc[:halflen], random_state=rng).reset_index(drop=True)
encode_rows(mirs0, gens0, 0)

mirs0 = shuffle(mirs_all.iloc[:halflen], random_state=rng).reset_index(drop=True)
gens0 = shuffle(gens_all.iloc[halflen:], random_state=rng).reset_index(drop=True)
encode_rows(mirs0, gens0, halflen)

# positive examples: every miRNA with its own target
mirs1 = mirs_all.reset_index(drop=True)
gens1 = gens_all.reset_index(drop=True)
encode_rows(mirs1, gens1, halflen*2)

In [10]:
# prepare the data for the neural net

# This cell overwrites `alin` (shuffle + reshape). A second run would reshuffle
# the already shuffled rows, so they would no longer match the freshly built
# labels, and only then fail in the reshape. Stop before touching anything.
if alin.ndim != 2:
    raise RuntimeError("alin has already been shuffled/reshaped; re-run the cell that builds it first")

# for the negative examples
y0 = np.zeros(halflen*2)
# for the positive examples
y1 = np.ones(halflen*2)
# target ("answer")
y_target = np.concatenate((y0,y1))
# fixed seed so that the saved ordering is reproducible
y_perm = np.random.RandomState(0).permutation(halflen*4)
# shuffle the labels and the input data in the same order so they stay matched
y_target = y_target[y_perm]
alin = alin[y_perm]
# make a proper shape for the neural net's input: (examples, 1, positions)
alin = alin.reshape(alin.shape[0],1,alin.shape[1])

In [12]:
# Sanity check of the final input shape: (examples, 1, encoded positions).
print(alin.shape)

(273192, 1, 56)


In [201]:
# Write the shuffled labels and the encoded inputs to disk as .npy files.
np.save(file='y_target.npy', arr=y_target)
np.save(file='alinment.npy', arr=alin)