In [1]:
from __future__ import print_function
import keras
from keras.models import Sequential, Model, load_model

import tensorflow as tf

import pandas as pd

import os
import pickle
import numpy as np

import scipy.sparse as sp
import scipy.io as spio

import isolearn.io as isoio
import isolearn.keras as iso


Using TensorFlow backend.


In [2]:
#Load sequence data

df = pd.read_csv('../../../aparent/data/leslie_derti_apadb_features_pas_3_utr3_large.csv', sep='\t')

save_dict = np.load("../../../aparent/data/leslie_derti_apadb_features_pas_3_utr3_large_no_x.npz")
m, l, c, y, s = save_dict['m'], save_dict['l'], save_dict['c'], save_dict['y'], save_dict['s']


In [3]:
#Load APARENT Resnet

model_name = 'aparent_all_libs_resnet_no_clinvar_wt_ep_5'

save_dir = os.path.join(os.getcwd(), '../../../aparent-resnet/saved_models')
model_path = os.path.join(save_dir, model_name + '.h5')

aparent_model = load_model(model_path)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.




In [4]:
#Score all sequences with APARENT (use sum of cuts to capture OR-like logic)

max_n_pas = 20

encoder = iso.OneHotEncoder(205)

a = np.zeros((len(df), max_n_pas))
a_all_cuts = np.zeros((len(df), max_n_pas))
a_apadb_region = np.zeros((len(df), max_n_pas))

for k in range(max_n_pas) :
    
    print("Predicting for PAS #" + str(k) + "...")
    
    df.loc[df['wide_seq_ext_' + str(k)].isnull(), 'wide_seq_ext_' + str(k)] = 'X' * 205
    
    onehots = np.concatenate([encoder.encode(row['wide_seq_ext_' + str(k)][175-70:175-70+205])[None, None, :, :] for _, row in df.iterrows()], axis=0)
    
    fake_lib = np.zeros((onehots.shape[0], 13))
    fake_lib[:, 11] = 1.
    
    #Pad
    n_pad = 32 - onehots.shape[0] % 32 if onehots.shape[0] % 32 != 0 else 0

    fake_lib = np.concatenate([fake_lib, np.zeros((n_pad, 13))], axis=0)
    onehots = np.concatenate([onehots, np.zeros((n_pad, 1, 205, 4))], axis=0)
    
    _, pred_cuts = aparent_model.predict(x=[onehots, fake_lib], batch_size=32, verbose=1)
    
    if n_pad > 0 :
        pred_cuts = pred_cuts[:-n_pad, :]
    
    isoform_start = 77
    isoform_end = 127

    pred_iso = np.sum(pred_cuts[:, isoform_start:isoform_end], axis=1)
    score = np.log(pred_iso / (1. - pred_iso))

    isoform_start = 0
    isoform_end = 205

    pred_iso_all_cuts = np.sum(pred_cuts[:, isoform_start:isoform_end], axis=1)
    score_all_cuts = np.log(pred_iso_all_cuts / (1. - pred_iso_all_cuts))

    score_apadb_region = []

    i = 0
    for _, row in df.iterrows() :
        
        if row['wide_seq_ext_' + str(k)][0] == 'X' :
            score_apadb_region.append(0.)
            continue

        strand = row['strand']

        cut_start = 0
        cut_end = 205

        if strand == '+' :
            cut_start = int(row['cut_start_' + str(k)] - row['pas_pos_' + str(k)] + 70)
            cut_end = int(row['cut_end_' + str(k)] - row['pas_pos_' + str(k)] + 70)
        else :
            cut_start = int(row['pas_pos_' + str(k)] - row['cut_end_' + str(k)] + 76)
            cut_end = int(row['pas_pos_' + str(k)] - row['cut_start_' + str(k)] + 76)

        pred_iso_p = np.sum(pred_cuts[i, cut_start:cut_end])
        score_p = np.log(pred_iso_p / (1. - pred_iso_p))

        score_apadb_region.append(score_p)

        i += 1

    score_apadb_region = np.array(score_apadb_region)
    
    a[:, k] = score[:]
    a_all_cuts[:, k] = score_all_cuts[:]
    a_apadb_region[:, k] = score_apadb_region[:]

a = a * m
a = np.clip(a, -8., 8.)

a_all_cuts = a_all_cuts * m
a_all_cuts = np.clip(a_all_cuts, -8., 8.)

a_apadb_region = a_apadb_region * m
a_apadb_region = np.clip(a_apadb_region, -8., 8.)


Predicting for PAS #0...
Predicting for PAS #1...
Predicting for PAS #2...
Predicting for PAS #3...
Predicting for PAS #4...
Predicting for PAS #5...
Predicting for PAS #6...
Predicting for PAS #7...
Predicting for PAS #8...
Predicting for PAS #9...
Predicting for PAS #10...
Predicting for PAS #11...
Predicting for PAS #12...
Predicting for PAS #13...
Predicting for PAS #14...
Predicting for PAS #15...
Predicting for PAS #16...
Predicting for PAS #17...
Predicting for PAS #18...
Predicting for PAS #19...


KeyError: 'wide_seq_ext_19'

In [7]:
#Dump prediction matrix

np.save('apa_leslie_derti_apadb_data/' + model_name + '_native_scores_large', a)
np.save('apa_leslie_derti_apadb_data/' + model_name + '_native_scores_large_all_cuts', a_all_cuts)
np.save('apa_leslie_derti_apadb_data/' + model_name + '_native_scores_large_apadb_region', a_apadb_region)
