In [1]:
from __future__ import print_function
import keras
from keras.models import Sequential, Model, load_model

import tensorflow as tf

import pandas as pd

import os
import pickle
import numpy as np

import scipy.sparse as sp
import scipy.io as spio

import isolearn.io as isoio
import isolearn.keras as iso


Using TensorFlow backend.


In [2]:
#Read the tab-separated pair table that holds the sequences to be scored

pair_data_path = '../../../aparent/data/prepared_data/apa_leslie_derti_apadb_pair_data/apa_leslie_derti_apadb_pair_data_df_pair.csv'

df = pd.read_csv(pair_data_path, sep='\t')


In [3]:
#Restore the legacy APARENT network (lifted from theano) from its saved .h5 file

model_name = 'aparent_theano_legacy_30_31_34_pasaligned'

#The model folder is resolved relative to the current working directory
save_dir = os.path.join(os.getcwd(), '../../../aparent/saved_models/legacy_models')

model_filename = model_name + '.h5'
model_path = os.path.join(save_dir, model_filename)

aparent_model = load_model(model_path)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.




In [4]:
#Score all sequences with APARENT (use sum of cuts to capture OR-like logic)

max_n_pas = 10

encoder = iso.OneHotEncoder(185)

#Slice of each wide sequence handed to the encoder: 185 characters starting at offset 175 - 50
seq_start = 175 - 50
seq_end = seq_start + 185

def _encode_column(seqs) :
    #One-hot encode the [seq_start, seq_end) slice of every sequence and stack the results
    #along a new first axis, keeping a singleton second axis: (n_sequences, 1, 185, 4)
    return np.concatenate([encoder.encode(seq[seq_start:seq_end])[None, None, :, :] for seq in seqs], axis=0)

x_prox = _encode_column(df['wide_seq_ext_prox'])
x_dist = _encode_column(df['wide_seq_ext_dist'])

n_seqs = x_prox.shape[0]

#Pad every input up to the next multiple of the batch size
#NOTE(review): presumably the legacy model only accepts complete batches - confirm
batch_size = 32
n_pad = (batch_size - n_seqs % batch_size) % batch_size

#Auxiliary model inputs: a 36-wide vector with column 20 switched on, and a single value set to 1.
#Rows belonging to the padding stay all-zero.
fake_lib = np.zeros((n_seqs + n_pad, 36))
fake_lib[:n_seqs, 20] = 1.

fake_d = np.zeros((n_seqs + n_pad, 1))
fake_d[:n_seqs, :] = 1.

zero_rows = np.zeros((n_pad, 1, 185, 4))
x_prox = np.concatenate([x_prox, zero_rows], axis=0)
x_dist = np.concatenate([x_dist, zero_rows], axis=0)

pred_iso_prox, pred_cuts_prox = aparent_model.predict(x=[x_prox, fake_lib, fake_d], batch_size=batch_size, verbose=1)
pred_iso_dist, pred_cuts_dist = aparent_model.predict(x=[x_dist, fake_lib, fake_d], batch_size=batch_size, verbose=1)

#Drop the predictions made for padding rows (a no-op when nothing was padded)
pred_cuts_prox = pred_cuts_prox[:n_seqs, :]
pred_cuts_dist = pred_cuts_dist[:n_seqs, :]

#Keep only the first column of the isoform output, giving one value per sequence
pred_iso_prox = pred_iso_prox[:n_seqs, 0]
pred_iso_dist = pred_iso_dist[:n_seqs, 0]



In [5]:

def _logit(p) :
    #Log-odds log(p / (1 - p)); applied elementwise when p is an array
    return np.log(p / (1. - p))

#Score 1: total predicted cut probability over the fixed positions [57, 107), as log-odds
isoform_start = 57
isoform_end = 107

pred_iso_prox_from_cuts = np.sum(pred_cuts_prox[:, isoform_start:isoform_end], axis=1)
pred_iso_dist_from_cuts = np.sum(pred_cuts_dist[:, isoform_start:isoform_end], axis=1)

score_prox = _logit(pred_iso_prox_from_cuts)
score_dist = _logit(pred_iso_dist_from_cuts)

#Score 2: same computation, summing over positions [0, 185)
isoform_start = 0
isoform_end = 185

pred_iso_prox_all_cuts_from_cuts = np.sum(pred_cuts_prox[:, isoform_start:isoform_end], axis=1)
pred_iso_dist_all_cuts_from_cuts = np.sum(pred_cuts_dist[:, isoform_start:isoform_end], axis=1)

score_prox_all_cuts = _logit(pred_iso_prox_all_cuts_from_cuts)
score_dist_all_cuts = _logit(pred_iso_dist_all_cuts_from_cuts)

#Score 3: per-row region derived from the annotated cut_start/cut_end columns, expressed
#relative to pas_pos. The '+' strand uses offset 50; any other strand value mirrors the
#interval around pas_pos and uses offset 56.
score_prox_apadb_region = []
score_dist_apadb_region = []

#Rows are matched to prediction rows by position, not by dataframe index
for i, (_, row) in enumerate(df.iterrows()) :
    
    pas_prox = row['pas_pos_prox']
    pas_dist = row['pas_pos_dist']
    
    if row['strand'] == '+' :
        bounds_prox = (row['cut_start_prox'] - pas_prox + 50, row['cut_end_prox'] - pas_prox + 50)
        bounds_dist = (row['cut_start_dist'] - pas_dist + 50, row['cut_end_dist'] - pas_dist + 50)
    else :
        bounds_prox = (pas_prox - row['cut_end_prox'] + 56, pas_prox - row['cut_start_prox'] + 56)
        bounds_dist = (pas_dist - row['cut_end_dist'] + 56, pas_dist - row['cut_start_dist'] + 56)
    
    region_mass_prox = np.sum(pred_cuts_prox[i, bounds_prox[0]:bounds_prox[1]])
    region_mass_dist = np.sum(pred_cuts_dist[i, bounds_dist[0]:bounds_dist[1]])
    
    score_prox_apadb_region.append(_logit(region_mass_prox))
    score_dist_apadb_region.append(_logit(region_mass_dist))

score_prox_apadb_region = np.array(score_prox_apadb_region)
score_dist_apadb_region = np.array(score_dist_apadb_region)


In [6]:
#Work on a re-indexed copy of the input table and attach every score as a new column

pred_df = df.copy()
pred_df = pred_df.reset_index(drop=True)

#Columns are added in exactly this order
score_columns = [
    ('score_prox', score_prox),
    ('score_dist', score_dist),
    ('score_prox_all_cuts', score_prox_all_cuts),
    ('score_dist_all_cuts', score_dist_all_cuts),
    ('score_prox_apadb_region', score_prox_apadb_region),
    ('score_dist_apadb_region', score_dist_apadb_region),
]

for column_name, column_values in score_columns :
    pred_df[column_name] = column_values


In [7]:
#Dump the prediction dataframe (only 'pred_df' is stored here; no cut probability matrix is written)

out_dir = 'apa_leslie_derti_apadb_pair_data'

#The dump target is a path prefix inside out_dir. Create the folder up front so the write
#does not fail on a fresh checkout, in case the writer does not create it itself.
if not os.path.isdir(out_dir) :
    os.makedirs(out_dir)

isoio.dump({'pred_df' : pred_df}, out_dir + '/' + model_name + '_predictions_cuts_only')