In [1]:
from __future__ import print_function
import keras
from keras.models import Sequential, Model, load_model

import tensorflow as tf

import pandas as pd

import os
import pickle
import numpy as np

import scipy.sparse as sp
import scipy.io as spio

import isolearn.io as isoio
import isolearn.keras as iso


Using TensorFlow backend.


In [2]:
#Load sequence data
#Two inputs are read from the aparent data folder: a tab-separated feature table (df)
#and an .npz archive from which the arrays stored under the keys 'm' and 'l' are taken.

features_csv_path = '../../../aparent/data/polyadb_features_pas_3_utr3.csv'
arrays_npz_path = "../../../aparent/data/polyadb_features_pas_3_utr3_no_x.npz"

df = pd.read_csv(features_csv_path, sep='\t')

#save_dict is kept around (not closed) so that other entries can still be read from it
save_dict = np.load(arrays_npz_path)
m = save_dict['m']
l = save_dict['l']


In [3]:
#Load legacy APARENT model (lifted from theano)
#The .h5 file is looked up relative to the current working directory.

model_name = 'aparent_theano_legacy_30_31_34_pasaligned'
model_file = model_name + '.h5'

save_dir = os.path.join(os.getcwd(), '../../../aparent/saved_models/legacy_models')
model_path = os.path.join(save_dir, model_file)

aparent_model = load_model(model_path)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.




In [4]:
#Score all sequences with APARENT (use sum of cuts to capture OR-like logic)
#
#For each of the max_n_pas sequence columns ('wide_seq_ext_0' ... 'wide_seq_ext_9') this cell:
#  1. one-hot encodes a fixed 185-character window of every row's sequence,
#  2. runs the legacy model on it (inputs zero-padded to a multiple of the batch size),
#  3. converts the predicted probabilities to log-odds scores,
#  4. stores the scores in column k of a (narrow cut window) and a_all_cuts (all cut positions).
#Finally both matrices are multiplied by m and clipped to [-8, 8].

max_n_pas = 10

#Window of each wide sequence that is fed to the model (identical to [175-50 : 175-50+185])
seq_start = 175 - 50
seq_len = 185

#Batch size handed to predict(); inputs are padded so that the row count is a multiple of it
batch_size = 32

encoder = iso.OneHotEncoder(seq_len)

a = np.zeros((len(df), max_n_pas))
a_all_cuts = np.zeros((len(df), max_n_pas))

for k in range(max_n_pas) :
    
    print("Predicting for PAS #" + str(k) + "...")
    
    seq_col = 'wide_seq_ext_' + str(k)
    
    #Rows without a sequence in this column get an all-'X' placeholder so every row can be encoded
    #NOTE: this fills the missing values in df in place
    df.loc[df[seq_col].isnull(), seq_col] = 'X' * 356
    
    #Iterate over the column directly; df.iterrows() would build a full Series per row just to read one field
    onehots = np.concatenate([encoder.encode(seq[seq_start:seq_start + seq_len])[None, None, :, :] for seq in df[seq_col]], axis=0)
    
    #Auxiliary model inputs: a 36-wide vector with only column 20 set, and a column of ones
    #(presumably a library selector and a distal-site flag expected by the legacy model -- TODO confirm)
    fake_lib = np.zeros((onehots.shape[0], 36))
    fake_lib[:, 20] = 1.

    fake_d = np.ones((onehots.shape[0], 1))
    
    #Pad with all-zero rows up to the next multiple of batch_size (0 rows if already a multiple)
    n_pad = (-onehots.shape[0]) % batch_size

    fake_lib = np.concatenate([fake_lib, np.zeros((n_pad, 36))], axis=0)
    fake_d = np.concatenate([fake_d, np.zeros((n_pad, 1))], axis=0)
    onehots = np.concatenate([onehots, np.zeros((n_pad, 1, seq_len, 4))], axis=0)
    
    pred_iso, pred_cuts = aparent_model.predict(x=[onehots, fake_lib, fake_d], batch_size=batch_size, verbose=1)
    
    #Drop the predictions that belong to the padding rows
    if n_pad > 0 :
        pred_iso = pred_iso[:-n_pad, :]
        pred_cuts = pred_cuts[:-n_pad, :]
    
    pred_iso = pred_iso[:, 0]
    
    #Score 1: average of the isoform logit and the logit of the cut mass in positions [57, 107)
    isoform_start = 57
    isoform_end = 107

    pred_iso_from_cuts = np.sum(pred_cuts[:, isoform_start:isoform_end], axis=1)
    score_from_iso = np.log(pred_iso / (1. - pred_iso))
    score_from_cuts = np.log(pred_iso_from_cuts / (1. - pred_iso_from_cuts))
    score = (score_from_iso + score_from_cuts) / 2.

    #Score 2: same, but the cut mass is summed over positions [0, 185)
    isoform_start = 0
    isoform_end = 185

    pred_iso_all_cuts_from_cuts = np.sum(pred_cuts[:, isoform_start:isoform_end], axis=1)
    score_all_cuts_from_cuts = np.log(pred_iso_all_cuts_from_cuts / (1. - pred_iso_all_cuts_from_cuts))
    score_all_cuts = (score_from_iso + score_all_cuts_from_cuts) / 2.
    
    a[:, k] = score[:]
    a_all_cuts[:, k] = score_all_cuts[:]

#Apply m (presumably a 0/1 presence mask per row and PAS -- TODO confirm) and clip.
#The logits above become +/-inf when a probability saturates at 0 or 1, and inf * 0 is NaN,
#which np.clip would let through. Entries with m == 0 are therefore forced to exactly 0;
#every other entry is a * m exactly as before.
with np.errstate(invalid='ignore') :
    a = np.where(m != 0, a * m, 0.)
    a_all_cuts = np.where(m != 0, a_all_cuts * m, 0.)

a = np.clip(a, -8., 8.)
a_all_cuts = np.clip(a_all_cuts, -8., 8.)


Predicting for PAS #0...
Predicting for PAS #1...
Predicting for PAS #2...
Predicting for PAS #3...
Predicting for PAS #4...
Predicting for PAS #5...
Predicting for PAS #6...
Predicting for PAS #7...
Predicting for PAS #8...
Predicting for PAS #9...


In [5]:
#Dump prediction matrix
#Writes the two score matrices (a and a_all_cuts) as .npy files, named after the model.

#np.save does not create missing directories, so make sure the output folder exists first
out_dir = 'apa_polyadb_data'
if not os.path.isdir(out_dir) :
    os.makedirs(out_dir)

#np.save appends the '.npy' extension to the given paths
np.save('apa_polyadb_data/' + model_name + '_native_scores_utr3', a)
np.save('apa_polyadb_data/' + model_name + '_native_scores_utr3_all_cuts', a_all_cuts)
