In [1]:
from __future__ import print_function
import keras
from keras.models import Sequential, Model, load_model

import tensorflow as tf

import pandas as pd

import os
import pickle
import numpy as np

import scipy.sparse as sp
import scipy.io as spio

import isolearn.io as isoio
import isolearn.keras as isol


Using TensorFlow backend.


In [6]:
#Read the tab-separated variant table into a dataframe

df = pd.read_csv(
    filepath_or_buffer='../../data/native_data/segal_variants.tab',
    delimiter='\t'
)


In [9]:
#Build the one-hot feature tensor from the 'Sequence' column

encoder = isol.OneHotEncoder(250)

#Stack the per-sequence encodings along a new leading axis, then insert a
#singleton axis right after it (same layout as concatenating [None, None, :, :] views)
x = np.stack([encoder.encode(seq) for seq in df['Sequence']], axis=0)[:, None, :, :]


In [15]:
#Pad with all-zero rows so the number of rows is a multiple of the batch size (32)

remainder = x.shape[0] % 32
#The second modulo keeps to_fill at 0 when the row count is already a multiple of 32;
#without it a full batch of 32 dummy rows is appended (and again on every re-run of this cell)
to_fill = (32 - remainder) % 32

x = np.concatenate([x, np.zeros((to_fill, 1, 250, 4))], axis=0)


In [50]:

def predict_variable_len(aparent_model, onehots, sequence_stride=10) :
    """Average per-position model outputs over sliding 205-position windows.

    The model is run on consecutive 205-position windows of `onehots`, one
    window every `sequence_stride` positions. A window that runs past the end
    of the sequences is zero-padded up to 205 positions and only its outputs
    for real positions are kept. The value returned for each position is the
    mean of the outputs of every window that covered that position.

    Parameters
    ----------
    aparent_model : object
        Must provide `predict(x=[window, l_fake], batch_size=32, verbose=True)`
        returning a pair whose second element is a 2-D array (rows = batch)
        with at least 205 columns. Only the first 205 columns are used; any
        further column is ignored.
    onehots : np.ndarray
        4-D array indexed as (n, 1, seq_len, 4); the zero padding of the last
        window is built with exactly that layout.
    sequence_stride : int
        Offset between the start positions of consecutive windows. Must be at
        least 1, and at most 205 whenever seq_len exceeds 205.

    Returns
    -------
    np.ndarray
        Array of shape (n, seq_len) holding the window-averaged outputs.

    Raises
    ------
    ValueError
        If `sequence_stride` is smaller than 1 (the window would never
        advance), or larger than the window while the sequences are longer
        than one window (some positions would never be covered).
    """
    window_len = 205

    n = onehots.shape[0]
    seq_len = onehots.shape[2]

    if sequence_stride < 1 :
        raise ValueError("sequence_stride must be >= 1, got " + str(sequence_stride))
    if seq_len > window_len and sequence_stride > window_len :
        raise ValueError(
            "sequence_stride (" + str(sequence_stride) + ") exceeds the window length (" +
            str(window_len) + "); some positions would never be covered"
        )

    #Second model input: same 13-column indicator (column 11 set) for every row
    #NOTE(review): presumably a one-hot library selector -- confirm against the model definition
    l_fake = np.zeros((n, 13))
    l_fake[:, 11] = 1.

    #Running totals replace the per-window padded slice/mask arrays
    cut_sum = np.zeros((n, seq_len))
    cover_count = np.zeros(seq_len)

    start_pos = 0
    while True :
        end_pos = start_pos + window_len

        #Number of real (non-padded) positions in this window
        effective_len = min(window_len, seq_len - start_pos)

        onehot_slice = onehots[:, :, start_pos: end_pos, :]
        if effective_len < window_len :
            #Last window overhangs the sequence end: zero-pad it to the full window length
            padding = np.zeros((n, 1, window_len - effective_len, 4))
            onehot_slice = np.concatenate([onehot_slice, padding], axis=2)

        _, cut_pred = aparent_model.predict(x=[onehot_slice, l_fake], batch_size=32, verbose=True)

        #Accumulate only the outputs that belong to real sequence positions
        cut_sum[:, start_pos: start_pos + effective_len] += cut_pred[:, :effective_len]
        cover_count[start_pos: start_pos + effective_len] += 1.

        if end_pos >= seq_len :
            break

        start_pos += sequence_stride

    #Every position is covered by at least one window, so the division is well-defined
    avg_cut_pred = cut_sum / cover_count

    return avg_cut_pred


In [51]:
#Load the saved APARENT Resnet model

model_name = 'aparent_all_libs_resnet_no_clinvar_wt_ep_5'

#The model directory is resolved relative to the current working directory
save_dir = os.path.join(os.getcwd(), '../../saved_models')
model_path = os.path.join(save_dir, '{}.h5'.format(model_name))

aparent_model = load_model(model_path)



In [52]:
#Predict APA on sequences

#x may carry trailing all-zero padding rows (added to fill the last batch);
#keep only the predictions that correspond to rows of df
cut_pred = predict_variable_len(aparent_model, x, sequence_stride=10)[:len(df)]




In [53]:
#Build the prediction dataframe: a re-indexed copy of df restricted to the columns that get saved

pred_df = df.reset_index(drop=True)[['Sequence', 'Expression', 'ID']]


In [54]:
#Save the prediction dataframe together with the cut matrix (stored as a sparse CSR matrix)

isoio.dump(
    {
        'pred_df' : pred_df,
        'cut_pred' : sp.csr_matrix(cut_pred)
    },
    'apa_segal_data/' + model_name + '_predictions'
)