In [1]:
from __future__ import print_function
import keras
from keras.datasets import mnist
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
from keras.models import load_model

import numpy as np
import scipy
import scipy.sparse as sp

import matplotlib.pyplot as plt

from sklearn import preprocessing
import pandas as pd

import random, os, h5py, math, time, glob

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

class IdentityEncoder:
    """One-hot encoder/decoder between symbol sequences and 2-D arrays.

    An encoded sequence is a ``(seq_len, n_channels)`` array in which row
    ``i`` has a 1 in the column assigned to the i-th symbol by ``channel_map``.
    Symbols missing from ``channel_map`` leave their row all-zero.
    """

    def __init__(self, seq_len, channel_map):
        """Store the layout and build the reverse lookup.

        Args:
            seq_len: Number of rows allocated by ``encode``.
            channel_map: Mapping from symbol to column index.
        """
        self.seq_len = seq_len
        self.n_channels = len(channel_map)
        self.encode_map = channel_map
        # Reverse lookup used by ``decode``: column index -> symbol.
        self.decode_map = {
            channel_ix: symbol for symbol, channel_ix in self.encode_map.items()
        }

    def encode(self, seq):
        """Return a fresh ``(seq_len, n_channels)`` one-hot array for ``seq``.

        Raises:
            IndexError: If a mapped symbol sits at a position >= ``seq_len``.
        """
        encoding = np.zeros((self.seq_len, self.n_channels))
        self.encode_inplace(seq, encoding)
        return encoding

    def encode_inplace(self, seq, encoding):
        """Set the one-hot entries for ``seq`` inside a caller-owned array.

        Only the entries for mapped symbols are written (set to 1); nothing is
        cleared, so ``encoding`` is expected to be zero-filled beforehand.
        """
        for pos, symbol in enumerate(seq):
            if symbol in self.encode_map:
                encoding[pos, self.encode_map[symbol]] = 1.

    def encode_inplace_sparse(self, seq, encoding_mat, row_index):
        """Sparse encoding is not supported by this encoder."""
        raise NotImplementedError()

    def decode(self, encoding):
        """Return the string of per-row argmax symbols of ``encoding``.

        An all-zero row decodes to the symbol mapped to channel 0, because
        ``np.argmax`` returns the first index on ties.
        """
        return ''.join(
            self.decode_map[int(np.argmax(encoding[pos, :]))]
            for pos in range(encoding.shape[0])
        )

    def decode_sparse(self, encoding_mat, row_index):
        """Sparse decoding is not supported by this encoder."""
        raise NotImplementedError()


Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:

class CNNClassifier(nn.Module):
    """Three-layer 1-D CNN (expressed with ``Conv2d``) followed by two dense layers.

    Expects input shaped ``(batch, n_channels, 1, seq_len)`` and returns a
    ``(batch, 1)`` tensor with no output activation.

    Args:
        batch_size: Stored on the module as ``self.batch_size``; not used by
            ``forward``.
        seq_len: Sequence length (width of the input). Defaults to 50.
        n_filters: Number of filters in every convolution. Defaults to 120.
        n_channels: Number of input channels. Defaults to 4.
    """

    def __init__(self, batch_size, seq_len=50, n_filters=120, n_channels=4):
        super(CNNClassifier, self).__init__()

        self.seq_len = seq_len
        self.n_filters = n_filters

        # Kernel width 8 with padding 4 on both sides produces seq_len + 1
        # outputs; forward() drops the first one to get back to seq_len.
        self.conv1 = nn.Conv2d(n_channels, n_filters, kernel_size=(1, 8), padding=(0, 4))
        self.conv2 = nn.Conv2d(n_filters, n_filters, kernel_size=(1, 8), padding=(0, 4))
        self.conv3 = nn.Conv2d(n_filters, n_filters, kernel_size=(1, 8), padding=(0, 4))

        self.fc1 = nn.Linear(in_features=seq_len * n_filters, out_features=40)
        # Kept as an attribute, but currently not applied in forward().
        self.drop1 = nn.Dropout(p=0.2)
        self.fc2 = nn.Linear(in_features=40, out_features=1)

        self.batch_size = batch_size
        self.use_cuda = torch.cuda.is_available()

    def forward(self, x):
        """Run the network on ``x`` of shape ``(batch, n_channels, 1, seq_len)``."""
        # Each conv yields width seq_len + 1; slicing off index 0 restores seq_len.
        x = F.relu(self.conv1(x))[..., 1:]
        x = F.relu(self.conv2(x))[..., 1:]
        x = F.relu(self.conv3(x))[..., 1:]

        # (batch, filters, 1, width) -> (batch, width, 1, filters), so that the
        # flattened features are ordered position-major, filter-minor.
        x = x.transpose(1, 3)
        x = x.reshape(-1, self.seq_len * self.n_filters)

        x = F.relu(self.fc1(x))  # dropout variant: F.relu(self.drop1(self.fc1(x)))
        x = self.fc2(x)

        return x


In [53]:
#Load pytorch model skeleton
# Build the PyTorch network with freshly constructed layers; no trained
# weights are loaded here. batch_size is only stored on the module.

model_pytorch = CNNClassifier(batch_size=32)


In [8]:
# Restore the trained Keras predictor from its HDF5 checkpoint.
predictor_path = 'optimusRetrainedMain.hdf5'
saved_predictor = load_model(predictor_path)

# Freeze the model, then compile it again with a plain SGD optimizer and MSE loss.
saved_predictor.trainable = False
predictor_optimizer = keras.optimizers.SGD(lr=0.1)
saved_predictor.compile(loss='mean_squared_error', optimizer=predictor_optimizer)


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [55]:
# Print the layer-by-layer architecture (output shapes and parameter counts)
# of the loaded Keras predictor, as a reference for the PyTorch port.
saved_predictor.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 50, 120)           3960      
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 50, 120)           115320    
_________________________________________________________________
dropout_1 (Dropout)          (None, 50, 120)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 50, 120)           115320    
_________________________________________________________________
dropout_2 (Dropout)          (None, 50, 120)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 6000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 40)               

In [56]:
# Pull the [kernel, bias] pair of every trainable layer out of the Keras model.
keras_params = {
    layer_name: saved_predictor.get_layer(layer_name).get_weights()
    for layer_name in ('conv1d_1', 'conv1d_2', 'conv1d_3', 'dense_1', 'dense_2')
}

# Convolution kernels get a singleton axis inserted before their last axis so
# they become 4-D, matching the rank of the Conv2d weights they are copied into.
conv_1_kernel, conv_1_bias = keras_params['conv1d_1']
conv_1_weight = conv_1_kernel[..., None, :]

conv_2_kernel, conv_2_bias = keras_params['conv1d_2']
conv_2_weight = conv_2_kernel[..., None, :]

conv_3_kernel, conv_3_bias = keras_params['conv1d_3']
conv_3_weight = conv_3_kernel[..., None, :]

# Dense layers are used as returned.
dense_1_weight, dense_1_bias = keras_params['dense_1']
dense_2_weight, dense_2_bias = keras_params['dense_2']


In [57]:
# Copy the Keras parameters into the matching PyTorch layers.

with torch.no_grad():
    conv_transfers = (
        (model_pytorch.conv1, conv_1_weight, conv_1_bias),
        (model_pytorch.conv2, conv_2_weight, conv_2_bias),
        (model_pytorch.conv3, conv_3_weight, conv_3_bias),
    )
    for conv_layer, conv_kernel, conv_bias in conv_transfers:
        # Swap the first and last kernel axes; presumably this maps the Keras
        # (width, in, 1, out) layout onto PyTorch's (out, in, 1, width) layout.
        torch_kernel = np.transpose(conv_kernel, (3, 1, 2, 0))
        conv_layer.weight = nn.Parameter(torch.FloatTensor(torch_kernel))
        conv_layer.bias = nn.Parameter(torch.FloatTensor(conv_bias))

    dense_transfers = (
        (model_pytorch.fc1, dense_1_weight, dense_1_bias),
        (model_pytorch.fc2, dense_2_weight, dense_2_bias),
    )
    for fc_layer, dense_kernel, dense_bias in dense_transfers:
        # Dense kernels are transposed before being stored in nn.Linear.
        torch_kernel = np.transpose(dense_kernel, (1, 0))
        fc_layer.weight = nn.Parameter(torch.FloatTensor(torch_kernel))
        fc_layer.bias = nn.Parameter(torch.FloatTensor(dense_bias))


In [58]:
#Save pytorch model

# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing the state dict.
model_save_path = "saved_models/optimusRetrainedMain_pytorch.pth"
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
torch.save(model_pytorch.state_dict(), model_save_path)


In [3]:

#optimus 5-prime functions 
def test_data(df, model, test_seq, obs_col, output_col='pred'):
    '''Predict mean ribosome load using model and test set UTRs.

    Args:
        df: DataFrame holding the observed values; modified in place.
        model: Object exposing ``predict(test_seq)``.
        test_seq: Model input passed straight to ``model.predict``.
        obs_col: Name of the column with observed values; used to fit the scaler.
        output_col: Name of the column that receives the predictions.

    Returns:
        The same ``df`` with ``output_col`` filled with predictions mapped back
        to the scale of ``obs_col``.
    '''

    # Fit the scaler on the observed values. The scaler needs a 2-D
    # (n_samples, 1) array, so go through the underlying NumPy values.
    scaler = preprocessing.StandardScaler()
    scaler.fit(df[obs_col].values.reshape(-1, 1))

    # Make predictions, shaped as a single-feature column for the scaler.
    predictions = np.asarray(model.predict(test_seq)).reshape(-1, 1)

    # Undo the scaling and store the result as a flat column.
    df.loc[:, output_col] = scaler.inverse_transform(predictions).reshape(-1)
    return df


def one_hot_encode(df, col='utr', seq_len=50):
    '''One-hot encode the first ``seq_len`` characters of each sequence in ``df[col]``.

    Args:
        df: DataFrame containing the sequences.
        col: Name of the column holding the sequence strings.
        seq_len: Number of positions per encoded sequence.

    Returns:
        Array of shape ``(len(df), seq_len, 4)`` with channel order a, c, g, t.
        ``n`` is encoded as all zeros. Sequences shorter than ``seq_len`` are
        zero-padded at the end.

    Raises:
        KeyError: If a sequence contains a character other than a/c/g/t/n
            (case-insensitive).
    '''
    # Dictionary returning one-hot encoding of nucleotides.
    nuc_d = {'a':[1,0,0,0],'c':[0,1,0,0],'g':[0,0,1,0],'t':[0,0,0,1], 'n':[0,0,0,0]}

    # Create a zero-filled matrix so unwritten (padded) positions are
    # well-defined rather than uninitialised memory.
    vectors = np.zeros([len(df), seq_len, 4])

    # Iterate through UTRs and one-hot encode
    for i, seq in enumerate(df[col].str[:seq_len]):
        encoded = [nuc_d[x] for x in seq.lower()]
        if encoded:
            # Write only the positions that exist; assigning a short block to
            # the whole row would fail or broadcast a single nucleotide.
            vectors[i, :len(encoded)] = encoded
    return vectors


def r2(x,y):
    '''Return the squared correlation coefficient of a linear fit of y on x.'''
    # Imported locally: the notebook only does ``import scipy``, which does not
    # bind the name ``stats``.
    from scipy import stats

    _slope, _intercept, r_value, _p_value, _std_err = stats.linregress(x,y)
    return r_value**2


# Training set: read the CSV and add a standardised copy of the 'rl' column.
e_train = pd.read_csv("bottom5KIFuAUGTop5KIFuAUG.csv")

rl_column = e_train.loc[:, 'rl'].values.reshape(-1, 1)
rl_scaler = preprocessing.StandardScaler()
e_train.loc[:, 'scaled_rl'] = rl_scaler.fit_transform(rl_column)

# Inputs: one-hot sequences with a singleton axis inserted after the batch axis.
seq_e_train = one_hot_encode(e_train, seq_len=50)
n_seqs, n_positions, n_nucleotides = seq_e_train.shape
x_train = np.reshape(seq_e_train, (n_seqs, 1, n_positions, n_nucleotides))

# Targets: the scaled values as a column vector.
y_train = np.array(e_train['scaled_rl'].values)
y_train = np.reshape(y_train, (y_train.shape[0], 1))

print("x_train.shape = " + str(x_train.shape))
print("y_train.shape = " + str(y_train.shape))


x_train.shape = (15008, 1, 50, 4)
y_train.shape = (15008, 1)


In [4]:
# Rebuild the PyTorch network and restore the converted weights from disk.

model_pytorch = CNNClassifier(batch_size=32)
converted_state = torch.load("saved_models/optimusRetrainedMain_pytorch.pth")
# Bind the return value so the notebook does not echo it.
_ = model_pytorch.load_state_dict(converted_state)


In [5]:
# Sanity check: moving the last axis to position 1 gives the channels-first
# layout that is fed to the PyTorch model.
x_train[:1].transpose(0, 3, 1, 2).shape

(1, 4, 1, 50)

In [9]:
#Predict using keras model

y_pred_keras = saved_predictor.predict(x=[x_train[:32, 0, ...]], batch_size=1)

#Predict using pytorch model
model_pytorch.eval()

# Channels-first layout: move the last axis of x_train to position 1.
input_var = torch.FloatTensor(np.transpose(x_train[:32], (0, 3, 1, 2)))

# Model and input must live on the same device; moving only the input to the
# GPU fails with a device-mismatch error when CUDA is available.
if model_pytorch.use_cuda:
    model_pytorch = model_pytorch.cuda()
    input_var = input_var.cuda()

# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    y_pred_pytorch = model_pytorch(input_var).cpu().numpy()


In [10]:

# Print the Keras and PyTorch predictions side by side, one block per sequence.
paired_predictions = zip(y_pred_keras.tolist(), y_pred_pytorch.tolist())

for i, (p_keras, p_pytorch) in enumerate(paired_predictions):
    report_lines = [
        "--------------------",
        "Sequence " + str(i),
        "prob (keras)   = " + str(np.round(p_keras, 4)),
        "prob (pytorch) = " + str(np.round(p_pytorch, 4)),
    ]
    print("\n".join(report_lines))


--------------------
Sequence 0
prob (keras)   = [1.0168]
prob (pytorch) = [1.0168]
--------------------
Sequence 1
prob (keras)   = [0.5764]
prob (pytorch) = [0.5764]
--------------------
Sequence 2
prob (keras)   = [0.5024]
prob (pytorch) = [0.5024]
--------------------
Sequence 3
prob (keras)   = [0.8501]
prob (pytorch) = [0.8501]
--------------------
Sequence 4
prob (keras)   = [0.797]
prob (pytorch) = [0.797]
--------------------
Sequence 5
prob (keras)   = [0.6108]
prob (pytorch) = [0.6108]
--------------------
Sequence 6
prob (keras)   = [0.7627]
prob (pytorch) = [0.7627]
--------------------
Sequence 7
prob (keras)   = [0.8833]
prob (pytorch) = [0.8833]
--------------------
Sequence 8
prob (keras)   = [0.887]
prob (pytorch) = [0.887]
--------------------
Sequence 9
prob (keras)   = [0.9337]
prob (pytorch) = [0.9337]
--------------------
Sequence 10
prob (keras)   = [0.804]
prob (pytorch) = [0.804]
--------------------
Sequence 11
prob (keras)   = [0.7199]
prob (pytorch) = [0.71