### Réseau de neurones basé sur iDeep

In [48]:
import keras
from keras.models import Sequential,load_model
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers import Concatenate, concatenate
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.utils import np_utils, generic_utils
from keras.optimizers import SGD, RMSprop, Adadelta, Adagrad, Adam
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Conv2D, MaxPooling2D,Conv1D, MaxPooling1D
from keras.models import model_from_config
from keras import regularizers
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.constraints import maxnorm
from keras.layers import Merge
#from keras.optimizers import kl_divergence

In [49]:
from sklearn import svm, grid_search
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.cross_validation import train_test_split, StratifiedKFold
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, roc_curve
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.grid_search import GridSearchCV
from sklearn.externals import joblib 
from scipy import sparse

In [50]:
import numpy as np
import random
from random import shuffle
import gzip
import pdb
from math import  sqrt
#import theano
import subprocess as sp
import scipy.stats as stats
import argparse
import time
import pandas as pd

In [51]:
def transformer_structure(structure):
    """Zero-pad a structure vector on the right to a fixed length of 25.

    Args:
        structure: sequence of numbers (list or 1-D array) of length <= 25.

    Returns:
        A float numpy array of shape (25,) whose first ``len(structure)``
        entries are the input values and whose remaining entries are 0.
    """
    padded = np.zeros(25)
    # Slice assignment copies the values in place; the tail stays at zero.
    padded[:len(structure)] = structure
    return padded

In [52]:
def transformer_sequence(sequence,long):
    """One-hot encode a nucleotide sequence into a (4, width) matrix.

    Rows 0, 1 and 2 mark the lowercase letters 'a', 'c' and 'g'; every other
    character is marked on row 3. Column k corresponds to the k-th character.

    Args:
        sequence: iterable of single characters.
        long: requested number of columns; the matrix is never narrower
            than 25 columns.

    Returns:
        A float numpy array of shape (4, max(long, 25)); columns beyond the
        end of the sequence are all zero.
    """
    largeur = int(max(long,25))
    encodage = np.zeros((4,largeur))
    ligne_par_lettre = {'a': 0, 'c': 1, 'g': 2}
    for position, lettre in enumerate(sequence):
        # Anything that is not a/c/g falls through to the last row.
        encodage[ligne_par_lettre.get(lettre, 3), position] = 1
    return encodage

In [53]:
def get_cnn_network_microRNA(nbfilter = 22):
    """Build the uncompiled 1-D CNN branch for the one-hot microRNA sequence.

    The branch takes inputs of shape (25, 4) and stacks: Conv1D (kernel 7,
    ReLU) -> MaxPooling1D(3) -> Dropout(0.3) -> Flatten ->
    Dense(nbfilter, ReLU) -> Dropout(0.2).

    Args:
        nbfilter: number of convolution filters; also the width of the
            dense layer that closes the branch.

    Returns:
        The Sequential model (not compiled, not trained).
    """
    model = Sequential()
    couches = [
        Conv1D(filters=nbfilter, kernel_size=7, strides=1, padding="valid",
               activation="relu", input_shape=(25, 4)),
        MaxPooling1D(pool_size=3),
        Dropout(0.3),
        Flatten(),
        Dense(nbfilter, activation='relu'),
        Dropout(0.2),
    ]
    for couche in couches:
        model.add(couche)
    return model

In [54]:
def get_cnn_network_messengerRNA(nbfilter = 100):
    """Build the uncompiled 1-D CNN branch for the one-hot messenger-RNA sequence.

    The branch takes inputs of shape (101, 4) and stacks: Conv1D (kernel 7,
    ReLU) -> MaxPooling1D(3) -> Dropout(0.5) -> Flatten -> Dropout(0.25).
    There is no dense layer, so the output is the flattened feature map.

    Args:
        nbfilter: number of convolution filters.

    Returns:
        The Sequential model (not compiled, not trained).
    """
    model = Sequential()
    couches = [
        Conv1D(filters=nbfilter, kernel_size=7, strides=1, padding="valid",
               activation="relu", input_shape=(101, 4)),
        MaxPooling1D(pool_size=3),
        Dropout(0.5),
        Flatten(),
        Dropout(0.25),
    ]
    for couche in couches:
        model.add(couche)
    return model

In [55]:
def cnn2D():
    """Build an uncompiled 2-D CNN feature extractor for a (1, 101, 4) input.

    Stack: Conv2D(64, 4x4, valid) -> ReLU -> Conv2D(64, 4x4) -> ReLU ->
    MaxPooling2D(2x2) -> Dropout(0.25) -> Flatten.

    Returns:
        The Sequential model. The function used to build the model and then
        fall off the end, so callers received None.
    """
    # NOTE(review): with 'valid' 4x4 kernels, one axis of the (1, 101, 4)
    # input looks too short for two successive convolutions, whichever data
    # format the backend uses - confirm the intended input layout before use.
    nb_conv = 4
    nb_pool = 2
    model = Sequential()
    model.add(Conv2D(64, (nb_conv, nb_conv), padding='valid', input_shape=(1, 101,4),strides=(1,1)))
    model.add(Activation('relu'))
    model.add(Conv2D(64, (4, 4)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(nb_pool, nb_pool)))
    model.add(Dropout(0.25))
    model.add(Flatten())
    return model

In [56]:
def get_mlp_microRNA(num_hidden = 64):
    """Build the uncompiled MLP branch for the microRNA structure vector.

    The branch takes inputs of shape (25,) and applies two blocks of
    Dense(num_hidden, ReLU) -> PReLU -> BatchNormalization -> Dropout,
    with dropout rates 0.25 and 0.5 respectively.

    Args:
        num_hidden: width of both dense layers.

    Returns:
        The Sequential model (not compiled, not trained).
    """
    model = Sequential()
    couches = [
        # First hidden block.
        Dense(num_hidden, input_shape=(25,), activation='relu'),
        PReLU(),
        BatchNormalization(),
        Dropout(0.25),
        # Second hidden block.
        Dense(num_hidden, input_dim=num_hidden, activation='relu'),
        PReLU(),
        BatchNormalization(),
        Dropout(0.5),
    ]
    for couche in couches:
        model.add(couche)
    return model

In [57]:
def get_mlp_messengerRNA(num_hidden = 128):
    """Build the uncompiled MLP branch for the messenger-RNA structure vector.

    The branch takes inputs of shape (101,) and applies two blocks of
    Dense(num_hidden, ReLU) -> PReLU -> BatchNormalization -> Dropout(0.5).

    Args:
        num_hidden: width of both dense layers.

    Returns:
        The Sequential model (not compiled, not trained).
    """
    model = Sequential()
    couches = [
        # First hidden block.
        Dense(num_hidden, input_shape=(101,), activation='relu'),
        PReLU(),
        BatchNormalization(),
        Dropout(0.5),
        # Second hidden block.
        Dense(num_hidden, input_dim=num_hidden, activation='relu'),
        PReLU(),
        BatchNormalization(),
        Dropout(0.5),
    ]
    for couche in couches:
        model.add(couche)
    return model

In [58]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 when the denominator is zero."""
    return float(numerator) / denominator if denominator else 0.0


def calculate_performance(test_num, pred_y,  labels):
    """Compute binary-classification metrics from predicted and true labels.

    A sample is positive when its true label equals 1; any other label is
    treated as negative. A prediction is correct when it equals the label.

    Args:
        test_num: number of leading samples to evaluate.
        pred_y: indexable collection of predicted labels.
        labels: indexable collection of true labels.

    Returns:
        Tuple ``(acc, precision, sensitivity, specificity, MCC)``.
        A metric whose denominator is zero (for example precision when no
        sample is predicted positive, or every metric when ``test_num`` is 0)
        is reported as 0.0 instead of raising ZeroDivisionError or yielding
        nan/inf.
    """
    tp = fp = tn = fn = 0
    for index in range(test_num):
        correct = labels[index] == pred_y[index]
        if labels[index] == 1:
            if correct:
                tp += 1
            else:
                fn += 1
        else:
            if correct:
                tn += 1
            else:
                fp += 1

    acc = _safe_ratio(tp + tn, test_num)
    precision = _safe_ratio(tp, tp + fp)
    sensitivity = _safe_ratio(tp, tp + fn)
    specificity = _safe_ratio(tn, tn + fp)
    # The MCC denominator vanishes as soon as one row or column of the
    # confusion matrix is empty; 0.0 is the conventional value in that case.
    mcc_denominator = float(np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)))
    MCC = _safe_ratio(tp * tn - fp * fn, mcc_denominator)
    return acc, precision, sensitivity, specificity, MCC


In [59]:
def clean(seq):
    """Parse the text form of a bracketed, space-separated number list.

    Line breaks are removed, the first two characters and the last character
    are stripped, and the remainder is split on single spaces; empty tokens
    (produced by repeated spaces) are ignored.

    Args:
        seq: string such as "[ 0.5 1.0\n 2.0]".

    Returns:
        List of floats in their order of appearance.
    """
    sans_retours = seq.replace('\n', '')
    # Drop the two leading characters and the trailing one.
    contenu = sans_retours[2:-1]
    return [float(morceau) for morceau in contenu.split(' ') if morceau != '']

### Lecture des données

In [60]:
def _load_pairs(path):
    """Load one tab-separated file of pairs into an object array.

    Each of the first four cells of a row is read as text; a cell of 15
    characters or fewer is reported with its row index. Columns 2 and 3 are
    then parsed into float arrays with clean().
    """
    table = np.array(pd.read_csv(path, sep = "\t", header=None))
    for i in range(len(table)):
        for j in range(4):
            if len(table[i,j]) <= 15:
                print(i,"erreur")
    for i in range(len(table)):
        table[i,2] = np.array(clean(table[i,2]))
        table[i,3] = np.array(clean(table[i,3]))
    return table


neg = _load_pairs("negatifs.csv")
pos = _load_pairs("positifs.csv")

# Positives come first. Remember how many there are so that labelling does
# not rely on both files having the same number of rows.
nb_pos = len(pos)
bdd = np.concatenate((pos,neg))
del pos, neg

# Column 4 holds the label: 1 for positives, 0 for negatives.
labels = np.zeros((len(bdd),1))
bdd = np.concatenate((bdd,labels),axis=1)
del labels
bdd[:nb_pos,4] = 1

# Column 0: microRNA sequence, column 1: messenger-RNA sequence (one-hot,
# stored as (positions, 4)); column 2: microRNA structure padded to 25.
for i in range(len(bdd)):
    bdd[i,0] = transformer_sequence(bdd[i,0],len(bdd[i,0])).transpose()
    bdd[i,1] = transformer_sequence(bdd[i,1],101).transpose()
    bdd[i,2] = transformer_structure(bdd[i,2])

# One uniform shuffle mixes positives and negatives. A separate interleaving
# pass beforehand is unnecessary, and it duplicated row 0 / dropped the last
# row whenever the total number of rows was odd.
# NOTE(review): no random seed is set in the visible cells, so the split
# differs from run to run.
indices = np.arange(len(bdd))
shuffle(indices)
bdd = bdd[indices]
del indices

# Dense arrays fed to the networks, sized from the data actually loaded
# rather than from a hard-coded row count.
nb_total = len(bdd)
resh0 = np.zeros((nb_total,25,4))
resh1 = np.zeros((nb_total,101,4))
resh2 = np.zeros((nb_total,25))
resh3 = np.zeros((nb_total,101))

for i in range(nb_total):
    resh0[i,:,:] = bdd[i,0]
    resh1[i,:,:] = bdd[i,1]
    resh2[i,:] = bdd[i,2]
    resh3[i,:] = bdd[i,3]

In [61]:
# Number of samples in the training, validation and test subsets.
nb_train, nb_val, nb_test = 17000, 500, 2500

In [62]:
# Row ranges of the three subsets: training rows first, validation rows
# right after them, test rows taken from the end of the shuffled data.
lignes_train = slice(None, nb_train)
lignes_val = slice(nb_train, nb_train+nb_val)
lignes_test = slice(-nb_test, None)

# Network inputs, one triple per input array.
resh0_train, resh0_val, resh0_test = resh0[lignes_train], resh0[lignes_val], resh0[lignes_test]
resh1_train, resh1_val, resh1_test = resh1[lignes_train], resh1[lignes_val], resh1[lignes_test]
resh2_train, resh2_val, resh2_test = resh2[lignes_train], resh2[lignes_val], resh2[lignes_test]
resh3_train, resh3_val, resh3_test = resh3[lignes_train], resh3[lignes_val], resh3[lignes_test]
resh0, resh1, resh2, resh3 = [], [], [], []

# Labels (column 4) as two-class one-hot targets for training/validation;
# the test rows are kept whole so the raw labels can be read back later.
train = bdd[lignes_train]
valid = bdd[lignes_val]
test = bdd[lignes_test]
y = keras.utils.np_utils.to_categorical(train[:,4],2)
val_y = keras.utils.np_utils.to_categorical(valid[:,4],2)
train, valid, bdd = [], [], []

In [None]:
def merge_networks_train_predict(micro_seq_hid = 22, messenger_seq_hid = 100, batch=10000, epoch=1000, mode = True):
    """Build the four-branch network, train it, score it on the test split and return it.

    Four sub-networks are created: a CNN on the microRNA sequence, a CNN on
    the messenger-RNA sequence, and one MLP per structure vector. When
    ``mode`` is true, the two sequence branches are concatenated into
    ``left`` and the two structure branches into ``right`` (each followed by
    Dropout, Dense and softmax) before those two are concatenated; when it is
    false, the four branches are concatenated in a single step. A final
    Dropout -> Dense(2) -> softmax head is added, the model is compiled with
    categorical cross-entropy and RMSprop, then fitted with early stopping on
    ``val_loss`` (patience 5).

    The function reads module-level data rather than taking it as arguments:
    ``resh0..3_train``, ``resh0..3_val``, ``resh0..3_test``, ``y``, ``val_y``,
    ``test`` and ``nb_train``. It prints the run parameters, the metrics
    returned by calculate_performance and the elapsed time.

    Args:
        micro_seq_hid: filter count passed to get_cnn_network_microRNA.
        messenger_seq_hid: filter count passed to get_cnn_network_messengerRNA.
        batch: batch size used by ``model.fit``.
        epoch: maximum number of epochs used by ``model.fit``.
        mode: True for pairwise merging (sequences, structures, then both),
            False for merging the four branches at once.

    Returns:
        The trained Keras model.
    """
    # NOTE(review): `Merge` is the legacy merge layer imported from
    # keras.layers; presumably this requires an old Keras release - confirm
    # against the installed version.
    start_time = time.time()

    if mode :
        print("Fusion des réseaux deux à deux")
    else :
        print("Fusion simultané")
    
    print("Neural Network run with the following parameter : ")
    print("Number of filter for the micro sequence CNN : ", micro_seq_hid)
    print("Number of filter for the messenger sequence CNN : ", messenger_seq_hid)
    print("Batch size : ",batch)
    print("Number of epochs : ", epoch)
    
    print('Size of training database : ', nb_train, 'elements')
    
    print("Settings the Neural Network")
    
    # Nominal widths of the two structure MLPs; they equal the defaults of
    # get_mlp_microRNA / get_mlp_messengerRNA, which are called without
    # arguments below.
    micro_structure_hid = 64
    messenger_structure_hid = 128
    
    # One (train data, validation data, sub-network) triple per input.
    micro_seq_train = resh0_train
    micro_seq_validation = resh0_val
    micro_seq_net =  get_cnn_network_microRNA(micro_seq_hid)
    
    messenger_seq_train = resh1_train
    messenger_seq_validation = resh1_val
    messenger_seq_net = get_cnn_network_messengerRNA(messenger_seq_hid)
    
    micro_structure_train = resh2_train
    micro_structure_validation = resh2_val
    micro_structure_net = get_mlp_microRNA()
    
    messenger_structure_train = resh3_train
    messenger_structure_validation = resh3_val
    messenger_structure_net = get_mlp_messengerRNA()        
    
    # `training` / `validation` collect the input arrays in the same order
    # as the sub-networks are appended to `training_net`.
    # `total_hid` is a running sum of the nominal branch widths; it is only
    # used as the `input_shape` argument of the final Dense layer.
    training = []
    validation = []
    total_hid = 0

    # Sequence branches: microRNA CNN first, then messenger-RNA CNN.

    training_net=[]
    training_net.append(micro_seq_net)
    training.append(micro_seq_train)
    validation.append(micro_seq_validation)
    total_hid = total_hid + micro_seq_hid
    micro_seq_train = []
    micro_seq_validation = [] 
    
    training_net.append(messenger_seq_net)
    training.append(messenger_seq_train)
    validation.append(messenger_seq_validation)
    total_hid = total_hid + messenger_seq_hid
    messenger_seq_train = []
    messenger_seq_validation = []
    
    if mode :
        # Pairwise mode: concatenate the two sequence branches into `left`.
    
        left = Sequential()
        left.add(Merge(training_net, mode='concat'))
        left.add(Dropout(0.6))
        left.add(Dense((micro_seq_hid+messenger_seq_hid), input_shape=((micro_seq_hid+messenger_seq_hid),)))
        left.add(Activation('softmax'))
    
        # Start a fresh list so that `right` only receives the structure
        # branches appended below.
    
        training_net=[]
    
    # Structure branches: microRNA MLP first, then messenger-RNA MLP.
    training_net.append(micro_structure_net)
    training.append(micro_structure_train)
    validation.append(micro_structure_validation)
    total_hid = total_hid + micro_structure_hid
    micro_structure_train = []
    micro_structure_validation = []
    
    training_net.append(messenger_structure_net)
    training.append(messenger_structure_train)
    validation.append(messenger_structure_validation)
    total_hid = total_hid + messenger_structure_hid
    messenger_structure_train = []
    messenger_structure_validation = []
    
    if mode :
        # Pairwise mode: concatenate the two structure branches into `right`.
    
        right = Sequential()
        right.add(Merge(training_net, mode='concat'))
        right.add(Dropout(0.6))
        # 192 = micro_structure_hid + messenger_structure_hid (64 + 128),
        # written here as a literal.
        right.add(Dense(192, input_shape=(192,)))
        right.add(Activation('softmax'))
    
    # Final model: merge `left` and `right` in pairwise mode, otherwise merge
    # the four branches still held in `training_net`.
    model = Sequential()
    if mode :
        model.add(Merge([left,right], mode='concat'))
    else :
        model.add(Merge(training_net, mode='concat'))
 
    # Classification head: two-way softmax.
    model.add(Dropout(0.6))
    model.add(Dense(2, input_shape=(total_hid,)))
    model.add(Activation('softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    
    
    # Stop training once the validation loss has not improved for 5 epochs.
    earlystopper = EarlyStopping(monitor='val_loss', patience=5, verbose=0)
    print('model training')
    model.fit(training, y, batch_size=batch, epochs=epoch, verbose=0, validation_data=(validation, val_y), callbacks=[earlystopper])
    print('training finished')
    training = []
    validation = []
    
    # Evaluation on the held-out test rows; column 4 of `test` is the label.
    true_y = test[:,4]
    
    print('predicting')
    # Test inputs in the same order as the training inputs.
    testing = []
    testing.append(resh0_test)
    testing.append(resh1_test)
    testing.append(resh2_test)
    testing.append(resh3_test)
        
    # Keep column 1 of the predicted output (the score of class 1) and round
    # it in place to obtain a hard 0/1 prediction.
    predictions = model.predict_proba(testing)[:,1]
    for i,nulll in enumerate(predictions):
        predictions[i] = round(predictions[i])
    perfs = calculate_performance(len(predictions), predictions, true_y)
    print("acc : ", perfs[0])
    print("precision : ", perfs[1])
    print("sensitivity : ", perfs[2])
    print("specificity : ", perfs[3])
    print("MCC : ", perfs[4])

    elapsed_time = time.time() - start_time
    
    print("Elapsed Time : ",elapsed_time//60," min ",elapsed_time%60," sec")
    
    print('\n')
    
    return model

In [None]:
# Sweep over the batch size k (epochs = k // 10) with fixed filter counts,
# training the pairwise-merge variant and then the all-at-once variant for
# each setting. `m` ends up holding the last trained model.
start_time = time.time()
i, j = 25, 116
for k in range(750, 1200, 10):
    for mode in (True, False):
        m = merge_networks_train_predict(i, j, k, k//10, mode)
elapsed_time = time.time() - start_time
print("Total Elapsed Time : ",elapsed_time//60," min ",elapsed_time%60," sec")

Fusion des réseaux deux à deux
Neural Network run with the following parameter : 
Number of filter for the micro sequence CNN :  25
Number of filter for the messenger sequence CNN :  116
Batch size :  750
Number of epochs :  75
Size of training database :  17000 elements
Settings the Neural Network




model training
training finished
predicting
acc :  0.5968
precision :  0.6003223207091055
sensitivity :  0.5926809864757359
specificity :  0.6009654062751408
MCC :  0.19364837578258573
Elapsed Time :  6.0  min  4.4944376945495605  sec


Fusion simultané
Neural Network run with the following parameter : 
Number of filter for the micro sequence CNN :  25
Number of filter for the messenger sequence CNN :  116
Batch size :  750
Number of epochs :  75
Size of training database :  17000 elements
Settings the Neural Network




model training
training finished
predicting
acc :  0.586
precision :  0.5806686046511628
sensitivity :  0.6356404136833731
specificity :  0.5358004827031375
MCC :  0.1723158629727202
Elapsed Time :  2.0  min  47.03413772583008  sec


Fusion des réseaux deux à deux
Neural Network run with the following parameter : 
Number of filter for the micro sequence CNN :  25
Number of filter for the messenger sequence CNN :  116
Batch size :  760
Number of epochs :  76
Size of training database :  17000 elements
Settings the Neural Network
model training
training finished
predicting
acc :  0.5964
precision :  0.596875
sensitivity :  0.6077963404932378
specificity :  0.584875301689461
MCC :  0.19272413361310226
Elapsed Time :  5.0  min  20.44149661064148  sec


Fusion simultané
Neural Network run with the following parameter : 
Number of filter for the micro sequence CNN :  25
Number of filter for the messenger sequence CNN :  116
Batch size :  760
Number of epochs :  76
Size of training database :

model training
training finished
predicting
acc :  0.5644
precision :  0.6063291139240506
sensitivity :  0.381066030230708
specificity :  0.7497988736926791
MCC :  0.1407391076199548
Elapsed Time :  3.0  min  21.014784336090088  sec


Fusion des réseaux deux à deux
Neural Network run with the following parameter : 
Number of filter for the micro sequence CNN :  25
Number of filter for the messenger sequence CNN :  116
Batch size :  840
Number of epochs :  84
Size of training database :  17000 elements
Settings the Neural Network
model training
training finished
predicting
acc :  0.59
precision :  0.5823863636363636
sensitivity :  0.6523468575974543
specificity :  0.5269509251810137
MCC :  0.18074466019020616
Elapsed Time :  9.0  min  53.256837368011475  sec


Fusion simultané
Neural Network run with the following parameter : 
Number of filter for the micro sequence CNN :  25
Number of filter for the messenger sequence CNN :  116
Batch size :  840
Number of epochs :  84
Size of training

model training
training finished
predicting
acc :  0.5912
precision :  0.5796610169491525
sensitivity :  0.6801909307875895
specificity :  0.501206757843926
MCC :  0.18440683298040622
Elapsed Time :  4.0  min  44.244300365448  sec


Fusion des réseaux deux à deux
Neural Network run with the following parameter : 
Number of filter for the micro sequence CNN :  25
Number of filter for the messenger sequence CNN :  116
Batch size :  920
Number of epochs :  92
Size of training database :  17000 elements
Settings the Neural Network
model training
training finished
predicting
acc :  0.5908
precision :  0.5986509274873525
sensitivity :  0.5648369132856006
specificity :  0.6170555108608206
MCC :  0.18212844810288806
Elapsed Time :  7.0  min  7.848782539367676  sec


Fusion simultané
Neural Network run with the following parameter : 
Number of filter for the micro sequence CNN :  25
Number of filter for the messenger sequence CNN :  116
Batch size :  920
Number of epochs :  92
Size of training 

model training
training finished
predicting
acc :  0.5948
precision :  0.5860366713681241
sensitivity :  0.6610978520286396
specificity :  0.5277554304102977
MCC :  0.19057941741402723
Elapsed Time :  3.0  min  22.78941559791565  sec


Fusion des réseaux deux à deux
Neural Network run with the following parameter : 
Number of filter for the micro sequence CNN :  25
Number of filter for the messenger sequence CNN :  116
Batch size :  1000
Number of epochs :  100
Size of training database :  17000 elements
Settings the Neural Network
model training
training finished
predicting
acc :  0.5968
precision :  0.5905454545454546
sensitivity :  0.6459824980111376
specificity :  0.5470635559131134
MCC :  0.1940155420451885
Elapsed Time :  5.0  min  16.07658576965332  sec


Fusion simultané
Neural Network run with the following parameter : 
Number of filter for the micro sequence CNN :  25
Number of filter for the messenger sequence CNN :  116
Batch size :  1000
Number of epochs :  100
Size of tra

model training
training finished
predicting
acc :  0.59
precision :  0.612621359223301
sensitivity :  0.5019888623707239
specificity :  0.6790024135156878
MCC :  0.18385843750887834
Elapsed Time :  4.0  min  8.24721384048462  sec


Fusion des réseaux deux à deux
Neural Network run with the following parameter : 
Number of filter for the micro sequence CNN :  25
Number of filter for the messenger sequence CNN :  116
Batch size :  1080
Number of epochs :  108
Size of training database :  17000 elements
Settings the Neural Network
model training
training finished
predicting
acc :  0.5888
precision :  0.5893832943013271
sensitivity :  0.6006364359586317
specificity :  0.5768302493966211
MCC :  0.17751850154181992
Elapsed Time :  8.0  min  9.275260925292969  sec


Fusion simultané
Neural Network run with the following parameter : 
Number of filter for the micro sequence CNN :  25
Number of filter for the messenger sequence CNN :  116
Batch size :  1080
Number of epochs :  108
Size of traini

model training
training finished
predicting
acc :  0.566
precision :  0.5451680672268907
sensitivity :  0.8257756563245824
specificity :  0.3032984714400644
MCC :  0.15145576195310453
Elapsed Time :  3.0  min  5.59475040435791  sec


Fusion des réseaux deux à deux
Neural Network run with the following parameter : 
Number of filter for the micro sequence CNN :  25
Number of filter for the messenger sequence CNN :  116
Batch size :  1160
Number of epochs :  116
Size of training database :  17000 elements
Settings the Neural Network
model training
training finished
predicting
acc :  0.586
precision :  0.5823442136498517
sensitivity :  0.624502784407319
specificity :  0.5470635559131134
MCC :  0.17209335643209311
Elapsed Time :  7.0  min  48.86770009994507  sec


Fusion simultané
Neural Network run with the following parameter : 
Number of filter for the micro sequence CNN :  25
Number of filter for the messenger sequence CNN :  116
Batch size :  1160
Number of epochs :  116
Size of traini

Neural Network run with the following parameter : 
Number of filter for the micro sequence CNN :  27
Number of filter for the messenger sequence CNN :  115
Batch size :  800
Number of epochs :  80
Size of training database :  17000 elements
training finished
predicting
[0.66268104 0.5558354  0.33351898 ... 0.6252597  0.34933755 0.5189561 ]
[1. 1. 0. ... 1. 0. 1.] [1 1 0.0 ... 1 1 1]
acc :  0.6008
precision :  0.61875
sensitivity :  0.48450244698205547
specificity :  0.7127158555729984
MCC :  0.2027128075842045
Elapsed Time :  9.0  min  54.87723112106323  sec

acc :  0.6096
precision :  0.6086956521739131
sensitivity :  0.5979049153908138
specificity :  0.6211278792692613
MCC :  0.21909450356110444

##### 100 trains : 
acc :  0.5
precision :  0.5333333333333333
sensitivity :  0.7272727272727273
specificity :  0.2222222222222222
MCC :  -0.058025885318565944

##### 17000 trains
acc :  0.5964
precision :  0.6069017254313578
sensitivity :  0.625193199381762
specificity :  0.5655058043117744
MCC :  0.19100235122777048
##### 
acc :  0.5564
precision :  0.5346628679962013
sensitivity :  0.8972111553784861
specificity :  0.21285140562248997
MCC :  0.15103195995226454
(10000 batch et 1000 epochs)