In [1]:
from __future__ import print_function
import keras
from keras.datasets import mnist
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
from keras.models import load_model

import numpy as np
import scipy
import scipy.sparse as sp

import matplotlib.pyplot as plt

import random, os, h5py, math, time, glob

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

class IdentityEncoder :
    """One-hot encoder/decoder between symbol sequences and numpy matrices.

    Each position of a sequence becomes one row of a (seq_len, n_channels)
    matrix; the column given by channel_map[symbol] is set to 1. Symbols that
    are missing from channel_map leave their row untouched (all zeros when the
    matrix starts out zeroed).

    Args:
        seq_len: Number of rows of the matrices created by encode().
        channel_map: Dict mapping each symbol to its channel (column) index.
    """
    
    def __init__(self, seq_len, channel_map) :
        self.seq_len = seq_len
        self.n_channels = len(channel_map)
        self.encode_map = channel_map
        #Inverse lookup (channel index -> symbol) used by decode()
        self.decode_map = {
            channel_ix: symbol for symbol, channel_ix in self.encode_map.items()
        }
    
    def encode(self, seq) :
        """Return a new (seq_len, n_channels) float matrix holding the one-hot encoding of seq.

        Sequences shorter than seq_len leave the trailing rows zero. A mapped
        symbol at a position >= seq_len raises IndexError.
        """
        encoding = np.zeros((self.seq_len, self.n_channels))
        self.encode_inplace(seq, encoding)

        return encoding
    
    def encode_inplace(self, seq, encoding) :
        """Write the one-hot encoding of seq into the caller-supplied matrix.

        Only the entries that become 1 are written; nothing is cleared first.
        """
        for i, symbol in enumerate(seq) :
            if symbol in self.encode_map :
                encoding[i, self.encode_map[symbol]] = 1.
    
    def encode_inplace_sparse(self, seq, encoding_mat, row_index) :
        """Sparse encoding is not supported by this encoder."""
        raise NotImplementedError()
    
    def decode(self, encoding) :
        """Return the string whose i-th symbol is the arg-max channel of row i.

        np.argmax returns the first maximum, so an all-zero row decodes to the
        symbol mapped to channel 0.
        """
        return ''.join(
            self.decode_map[np.argmax(encoding[pos, :])]
            for pos in range(0, encoding.shape[0])
        )
    
    def decode_sparse(self, encoding_mat, row_index) :
        """Sparse decoding is not supported by this encoder."""
        raise NotImplementedError()


Using TensorFlow backend.


In [2]:

class CNNClassifier(nn.Module) :
    """Two-layer convolutional network with two constant side inputs.

    The network expects inputs of shape (batch, 4, 1, 205): conv1 (width 8),
    max-pooling (width 2) and conv2 (width 6) reduce the last axis to 94
    positions with 128 channels, which is the size hard-coded into fc1.

    A constant scalar (distal_pas) is appended to the flattened conv features
    before fc1, and a constant 13-wide one-hot vector (lib_index) is appended
    to the fc1 activations before fc2. The output is a sigmoid value of shape
    (batch, 1).

    Args:
        batch_size: Number of rows pre-allocated for the constant side inputs.
            forward() also accepts other batch sizes (needs batch_size >= 1).
        lib_index: Column of the 13-wide one-hot side input that is set to 1.
        distal_pas: Value of the scalar side input.
    """
    
    def __init__(self, batch_size, lib_index=5, distal_pas=1.) :
        super(CNNClassifier, self).__init__()
        
        #Constant one-hot side input; every row is identical
        lib_inp_numpy = np.zeros((batch_size, 13))
        lib_inp_numpy[:, lib_index] = 1.
        self.lib_inp = torch.FloatTensor(lib_inp_numpy)
        
        #Constant scalar side input; every row is identical
        d_inp_numpy = np.zeros((batch_size, 1))
        d_inp_numpy[:, 0] = distal_pas
        self.d_inp = torch.FloatTensor(d_inp_numpy)
        
        self.conv1 = nn.Conv2d(4, 96, kernel_size=(1, 8))
        self.maxpool_1 = nn.MaxPool2d((1, 2))
        self.conv2 = nn.Conv2d(96, 128, kernel_size=(1, 6))
        
        self.fc1 = nn.Linear(in_features=94 * 128 + 1, out_features=256)
        self.drop1 = nn.Dropout(p=0.2)
        self.fc2 = nn.Linear(in_features=256 + 13, out_features=1)
        
        self.batch_size = batch_size
        self.use_cuda = torch.cuda.is_available()
    
    @staticmethod
    def _constant_rows(template, like) :
        """Return template with one row per row of like, on like's device and dtype.

        The stored tensor is used as-is when its row count already matches;
        otherwise its first row is broadcast to the requested number of rows.
        """
        n_rows = like.shape[0]
        if template.shape[0] != n_rows :
            template = template[:1].expand(n_rows, -1)
        return template.to(device=like.device, dtype=like.dtype)
        
    def forward(self, x):
        """Map a (batch, 4, 1, 205) input to sigmoid outputs of shape (batch, 1)."""
        n_examples = x.shape[0]
        
        x = F.relu(self.conv1(x))
        x = self.maxpool_1(x)
        x = F.relu(self.conv2(x))
        
        #Flatten position-major (position, channel) to match the fc1 weight layout
        x = x.transpose(1, 3)
        #Explicit batch dimension: a wrong input length raises instead of mixing examples
        x = x.reshape(n_examples, 94 * 128)
        
        x = torch.cat([x, self._constant_rows(self.d_inp, x)], dim=1)
        x = F.relu(self.drop1(self.fc1(x)))
        x = torch.cat([x, self._constant_rows(self.lib_inp, x)], dim=1)
        
        #torch.sigmoid replaces the deprecated F.sigmoid
        x = torch.sigmoid(self.fc2(x))
        
        return x


In [12]:
#Instantiate the pytorch network (default-initialised weights) that will receive the keras weights

model_pytorch = CNNClassifier(
    batch_size=32,
    lib_index=4,
    distal_pas=1.
)


In [3]:
#Load the pre-trained APARENT predictor (keras) from disk

#Specify the directory and file name of the pre-trained predictor network
saved_predictor_model_name = 'aparent_plasmid_iso_cut_distalpas_all_libs_no_sampleweights_sgd.h5'
save_dir = os.path.join(os.getcwd(), '../../../aparent/saved_models')
saved_predictor_model_path = os.path.join(save_dir, saved_predictor_model_name)

saved_predictor = load_model(saved_predictor_model_path)

#One-hot encoder over the A/C/G/T alphabet for sequences of length 205
acgt_encoder = IdentityEncoder(
    seq_len=205,
    channel_map={'A':0, 'C':1, 'G':2, 'T':3}
)




In [14]:
#Pull the weight and bias arrays out of the keras layers that get ported

def _keras_layer_weights(layer_name) :
    #get_weights() is unpacked by the callers below as a (weight, bias) pair
    return saved_predictor.get_layer(layer_name).get_weights()

conv_1_weight, conv_1_bias = _keras_layer_weights('conv2d_1')
conv_2_weight, conv_2_bias = _keras_layer_weights('conv2d_2')

dense_1_weight, dense_1_bias = _keras_layer_weights('dense_1')
#The final pytorch layer (fc2) takes its weights from the keras layer named 'dense_3'
dense_iso_weight, dense_iso_bias = _keras_layer_weights('dense_3')


In [15]:
#Copy the keras weights into the pytorch layers, permuting axes to match pytorch's layout

def _as_parameter(array, axes=None) :
    #Optionally permute the numpy axes, then wrap the values as a float nn.Parameter
    if axes is not None :
        array = np.transpose(array, axes)
    return nn.Parameter(torch.FloatTensor(array))

with torch.no_grad() :
    #The two conv kernels use different axis permutations.
    #NOTE(review): presumably the keras conv1 kernel spans the 4 nucleotide channels
    #along its second axis, which become the pytorch input channels - confirm against
    #the keras model summary before changing either permutation.
    model_pytorch.conv1.weight = _as_parameter(conv_1_weight, (3, 1, 2, 0))
    model_pytorch.conv1.bias = _as_parameter(conv_1_bias)
    
    model_pytorch.conv2.weight = _as_parameter(conv_2_weight, (3, 2, 1, 0))
    model_pytorch.conv2.bias = _as_parameter(conv_2_bias)
    
    #Dense kernels are transposed before being assigned to nn.Linear.weight
    model_pytorch.fc1.weight = _as_parameter(dense_1_weight, (1, 0))
    model_pytorch.fc1.bias = _as_parameter(dense_1_bias)
    
    model_pytorch.fc2.weight = _as_parameter(dense_iso_weight, (1, 0))
    model_pytorch.fc2.bias = _as_parameter(dense_iso_bias)


In [16]:
#Save pytorch model

pytorch_model_path = "saved_models/aparent_plasmid_iso_cut_distalpas_all_libs_no_sampleweights_sgd_pytorch.pth"

#torch.save does not create missing parent directories, so make sure the target folder exists
pytorch_model_dir = os.path.dirname(pytorch_model_path)
if not os.path.isdir(pytorch_model_dir) :
    os.makedirs(pytorch_model_dir)

torch.save(model_pytorch.state_dict(), pytorch_model_path)


In [4]:
class MySequence :
    """Minimal placeholder class; instances only carry a `dummy` attribute."""
    
    def __init__(self) :
        #Single attribute so that instances are not completely empty
        self.dummy = 1

#Swap keras.utils.Sequence for the placeholder class BEFORE isolearn is imported
#NOTE(review): presumably isolearn.keras subclasses keras.utils.Sequence at import time - confirm
keras.utils.Sequence = MySequence

import isolearn.io as isoio
import isolearn.keras as isol

import pickle

#Define dataset/experiment name
dataset_name = "apa_doubledope"

#Load cached dataframe
#NOTE: unpickling can execute arbitrary code; only load cache files from a trusted source
with open('apa_doubledope_cached_set.pickle', 'rb') as cache_file :
    #The context manager closes the file handle once the pickle has been read
    cached_dict = pickle.load(cache_file)
data_df = cached_dict['data_df']

print("len(data_df) = " + str(len(data_df)) + " (loaded)")


len(data_df) = 34748 (loaded)


In [5]:
#Make generators

valid_set_size = 0.05
test_set_size = 0.05

batch_size = 32

#Generate training and test set indexes
#(builtin int replaces the np.int alias, which has been removed from recent NumPy releases)
data_index = np.arange(len(data_df), dtype=int)

#Explicit split boundaries: the last n_holdout rows are held out, the last n_test of those
#form the test set. Unlike negative slicing (data_index[:-0] is empty), this stays correct
#when a hold-out fraction rounds down to zero examples.
n_total = len(data_df)
n_holdout = int(n_total * (valid_set_size + test_set_size))
n_test = int(n_total * test_set_size)

train_index = data_index[:n_total - n_holdout]
valid_index = data_index[n_total - n_holdout:n_total - n_test]
test_index = data_index[n_total - n_test:]

print('Training set size = ' + str(train_index.shape[0]))
print('Validation set size = ' + str(valid_index.shape[0]))
print('Test set size = ' + str(test_index.shape[0]))


#One generator per split; only the training generator is shuffled
data_gens = {
    gen_id : isol.DataGenerator(
        idx,
        {'df' : data_df},
        batch_size=batch_size,
        inputs = [
            {
                'id' : 'seq',
                'source_type' : 'dataframe',
                'source' : 'df',
                'extractor' : isol.SequenceExtractor('padded_seq', start_pos=180, end_pos=180 + 205),
                'encoder' : isol.OneHotEncoder(seq_length=205),
                'dim' : (1, 205, 4),
                'sparsify' : False
            }
        ],
        outputs = [
            {
                'id' : 'hairpin',
                'source_type' : 'dataframe',
                'source' : 'df',
                'extractor' : lambda row, index: row['proximal_usage'],
                'transformer' : lambda t: t,
                'dim' : (1,),
                'sparsify' : False
            }
        ],
        randomizers = [],
        shuffle = (gen_id == 'train')
    ) for gen_id, idx in [('all', data_index), ('train', train_index), ('valid', valid_index), ('test', test_index)]
}

#Load data matrices

def _load_generator_arrays(gen) :
    #Fetch every batch exactly once, so inputs and targets always come from the same
    #generator call, then stack the first input / first output along the batch axis
    batches = [gen[i] for i in range(len(gen))]
    x = np.concatenate([batch[0][0] for batch in batches], axis=0)
    y = np.concatenate([batch[1][0] for batch in batches], axis=0)
    return x, y

x_train, y_train = _load_generator_arrays(data_gens['train'])
x_test, y_test = _load_generator_arrays(data_gens['test'])

print("x_train.shape = " + str(x_train.shape))
print("x_test.shape = " + str(x_test.shape))

print("y_train.shape = " + str(y_train.shape))
print("y_test.shape = " + str(y_test.shape))


Training set size = 31274
Validation set size = 1737
Test set size = 1737
x_train.shape = (31264, 1, 205, 4)
x_test.shape = (1728, 1, 205, 4)
y_train.shape = (31264, 1)
y_test.shape = (1728, 1)


In [6]:
#Rebuild the pytorch network and restore the ported weights from disk

pytorch_weights_path = "saved_models/aparent_plasmid_iso_cut_distalpas_all_libs_no_sampleweights_sgd_pytorch.pth"

model_pytorch = CNNClassifier(batch_size=32, lib_index=4, distal_pas=1.)
pytorch_state = torch.load(pytorch_weights_path)
#Assign the return value so the notebook does not echo it as cell output
_ = model_pytorch.load_state_dict(pytorch_state)


In [7]:
#Shape of a single test example after moving the last (size-4) axis to position 1
x_test[:1].transpose(0, 3, 1, 2).shape

(1, 4, 1, 205)

In [10]:
#Predict using keras model

#Number of test sequences to compare; tied to the batch size the pytorch model was built with
n_compare = model_pytorch.batch_size

#Side inputs for the keras model: one-hot library vector (column 4, matching lib_index=4
#of the pytorch model) and a distal-PAS indicator of 1
aparent_l_test = np.zeros((x_test.shape[0], 13))
aparent_l_test[:, 4] = 1.

aparent_d_test = np.ones((x_test.shape[0], 1))

y_pred_keras = saved_predictor.predict(x=[np.transpose(x_test[:n_compare], (0, 2, 3, 1)), aparent_l_test[:n_compare], aparent_d_test[:n_compare]], batch_size=1)[0]

#Predict using pytorch model
model_pytorch.eval()

input_var = torch.FloatTensor(np.transpose(x_test[:n_compare], (0, 3, 1, 2)))

if model_pytorch.use_cuda :
    #The input, the layer weights and the constant side inputs must all live on the same
    #device; moving only the input raises a device-mismatch error on CUDA machines
    model_pytorch = model_pytorch.cuda()
    model_pytorch.lib_inp = model_pytorch.lib_inp.cuda()
    model_pytorch.d_inp = model_pytorch.d_inp.cuda()
    input_var = input_var.cuda()

#Inference only: skip building the autograd graph
with torch.no_grad() :
    y_pred_pytorch = model_pytorch(input_var).detach().cpu().numpy()


In [11]:

#Print the keras and pytorch predictions side by side, one sequence at a time
paired_probs = zip(y_pred_keras.tolist(), y_pred_pytorch.tolist())

for seq_ix, (prob_keras, prob_pytorch) in enumerate(paired_probs) :
    report_lines = [
        "--------------------",
        "Sequence " + str(seq_ix),
        "prob (keras)   = " + str(np.round(prob_keras, 4)),
        "prob (pytorch) = " + str(np.round(prob_pytorch, 4)),
    ]
    print("\n".join(report_lines))


--------------------
Sequence 0
prob (keras)   = [0.0857]
prob (pytorch) = [0.0857]
--------------------
Sequence 1
prob (keras)   = [0.181]
prob (pytorch) = [0.181]
--------------------
Sequence 2
prob (keras)   = [0.7814]
prob (pytorch) = [0.7814]
--------------------
Sequence 3
prob (keras)   = [0.1501]
prob (pytorch) = [0.1501]
--------------------
Sequence 4
prob (keras)   = [0.27]
prob (pytorch) = [0.27]
--------------------
Sequence 5
prob (keras)   = [0.6723]
prob (pytorch) = [0.6723]
--------------------
Sequence 6
prob (keras)   = [0.9261]
prob (pytorch) = [0.9261]
--------------------
Sequence 7
prob (keras)   = [0.8405]
prob (pytorch) = [0.8405]
--------------------
Sequence 8
prob (keras)   = [0.5964]
prob (pytorch) = [0.5964]
--------------------
Sequence 9
prob (keras)   = [0.6466]
prob (pytorch) = [0.6466]
--------------------
Sequence 10
prob (keras)   = [0.0147]
prob (pytorch) = [0.0147]
--------------------
Sequence 11
prob (keras)   = [0.8999]
prob (pytorch) = [0.89