In [1]:
from __future__ import print_function
import keras
from keras.models import Sequential, Model, load_model
from keras import backend as K

import tensorflow as tf

import os
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt
import matplotlib.cm as cm

#import aparent.visualization as vis

#from aparent_predictor import *


Using TensorFlow backend.


<h2>Load APARENT model</h2>
<br/>
-- Load APARENT, which has been trained to predict the isoform abundance and cut profile of a proximal PAS given a fixed background distal PAS (trained on random 3' UTR APA MPRA data).<br/>

In [2]:
#Load the APARENT Resnet model from its saved Keras .h5 file

#Alternative model / location (disabled):
#model_name = 'aparent_all_libs_resnet_no_clinvar_wt_ep_5'
#save_dir = os.path.join(os.getcwd(), './')

#Directory holding the saved model, resolved relative to the current working directory
save_dir = os.path.join(os.getcwd(), '../../autoscrambler/analysis/apa/saved_models')
model_name = 'aparent_all_libs_resnet_no_clinvar_wt_ep_5_var_batch_size'

#Full path of the model file: <save_dir>/<model_name>.h5
model_file = model_name + '.h5'
model_path = os.path.join(save_dir, model_file)

aparent_model = load_model(model_path)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.




In [3]:
#Load the processed PolyADB table and drop sites without a usable gene or PAS annotation

#Optional gene whitelist (disabled):
#genes = ['RUNX1', 'CEBPA', 'GATA2', 'ANKRD26', 'DDX41', 'ETV6', 'PTEN', 'BRCA1', 'BRCA2', 'TP53', 'APC', 'ATM', 'PALB2', 'MSH2', 'MLH1', 'MSH6', 'PMS2', 'MUTYH']

polyadb_df = pd.read_csv('polyadb_processed.csv', sep=',')

#polyadb_df = polyadb_df.loc[polyadb_df['gene'].isin(genes)].reset_index(drop=True).copy()

#Keep rows whose gene is neither missing nor the literal string 'na' ...
has_gene = polyadb_df['gene'].notnull() & (polyadb_df['gene'] != 'na')
#... and whose 'pas' column is not the sentinel value -1
has_pas = polyadb_df['pas'] != -1

polyadb_df = polyadb_df.loc[has_gene & has_pas].reset_index(drop=True).copy()

print('len(polyadb_df) = ' + str(len(polyadb_df)))


len(polyadb_df) = 175451


In [4]:
#Number the sites of every gene (per strand) and derive a 'gene_id' of the form '<gene>.<sitenum>'

def _number_sites_by_gene(strand_df, ascending) :
    """Order the sites of one strand by 'pas_pos' and number them within each gene.

    Parameters
    ----------
    strand_df : pandas.DataFrame
        Rows of a single strand; must contain the columns 'gene' and 'pas_pos'.
    ascending : bool
        Sort direction applied to 'pas_pos' before numbering.

    Returns
    -------
    pandas.DataFrame
        A new frame, sorted by 'pas_pos' with a fresh 0..n-1 index, carrying two
        additional columns: 'gene_id' ('<gene>.<sitenum>') and 'sitenum' (1-based
        running count of the gene's sites in the sorted order).
    """
    ordered_df = strand_df.sort_values(by='pas_pos', ascending=ascending).reset_index(drop=True)

    #cumcount() is the 0-based occurrence index of each row within its gene group
    sitenum = ordered_df.groupby('gene').cumcount() + 1

    ordered_df['gene_id'] = ordered_df['gene'] + "." + sitenum.astype(str)
    ordered_df['sitenum'] = sitenum

    return ordered_df

#Minus-strand sites are numbered by descending pas_pos, plus-strand sites by ascending pas_pos.
#The counters are independent per strand, so a gene name present on both strands would
#receive the same gene_id on each strand.
polyadb_df_minus = _number_sites_by_gene(polyadb_df.query("strand == '-'"), ascending=False)
polyadb_df_plus = _number_sites_by_gene(polyadb_df.query("strand == '+'"), ascending=True)

#Recombine both strands and order the table by gene, then by site number
polyadb_df = pd.concat([polyadb_df_plus, polyadb_df_minus])

polyadb_df = polyadb_df.sort_values(by=['gene', 'sitenum'], ascending=True).reset_index(drop=True).copy()

In [5]:

#Report the number of rows remaining after site numbering
print('len(polyadb_df) = ' + str(polyadb_df.shape[0]))


len(polyadb_df) = 175451


In [6]:
class OneHotEncoder :
    """One-hot encoder mapping a nucleotide string to a (seq_length, 4) matrix.

    Columns are ordered A, C, G, T. Characters outside that alphabet leave their
    row at zero, unless `default_fill_value` is non-zero, in which case the whole
    row is set to that value. Sequences shorter than `seq_length` leave the
    trailing rows at zero.
    """

    def __init__(self, seq_length=100, default_fill_value=0) :
        self.seq_length = seq_length
        self.default_fill_value = default_fill_value

        #Nucleotide -> column index
        self.encode_map = dict(zip('ACGT', range(4)))

        #Column index -> nucleotide, plus -1 -> 'X' as an extra entry
        self.decode_map = {channel : nt for nt, channel in self.encode_map.items()}
        self.decode_map[-1] = 'X'

    def __call__(self, seq) :
        """Shorthand for `encode(seq)`."""
        return self.encode(seq)

    def encode(self, seq) :
        """Return a freshly allocated (seq_length, 4) one-hot matrix for `seq`."""
        encoding = np.zeros((self.seq_length, 4))
        self.encode_inplace(seq, encoding)

        return encoding

    def encode_inplace(self, seq, encoding) :
        """Write the one-hot encoding of `seq` into the existing array `encoding`.

        Only the rows addressed by `seq` are modified; nothing is returned.
        """
        for row_ix, nt in enumerate(seq) :
            channel = self.encode_map.get(nt)

            if channel is not None :
                encoding[row_ix, channel] = 1
            elif self.default_fill_value != 0 :
                #Unknown character: fill the entire row with the configured value
                encoding[row_ix, :] = self.default_fill_value

def get_aparent_encoder(lib_bias=11) :
    """Build a function that turns a list of sequences into APARENT model inputs.

    Parameters
    ----------
    lib_bias : int or None
        Column of the 13-wide library indicator that is set to 1 for every
        sequence. With None the indicator stays all-zero.

    Returns
    -------
    callable
        `encode_for_aparent(sequences)`, which returns a two-element list:
        an array of shape (len(sequences), 1, 205, 4) holding the one-hot
        encoded sequences, and an array of shape (len(sequences), 13) holding
        the library indicator.
    """
    onehot_encoder = OneHotEncoder(205)

    def encode_for_aparent(sequences) :
        #Each (205, 4) matrix gains two leading singleton axes before stacking along axis 0
        expanded = [
            onehot_encoder(sequence)[np.newaxis, np.newaxis, ...]
            for sequence in sequences
        ]
        one_hots = np.concatenate(expanded, axis=0)

        #Library indicator: one row per sequence, optionally with a single active column
        fake_lib = np.zeros((len(sequences), 13))
        if lib_bias is not None :
            fake_lib[:, lib_bias] = 1.

        return [one_hots, fake_lib]

    return encode_for_aparent


In [7]:
#Perform in-silico saturation mutagenesis
#
#For every row of polyadb_df, a 205-nt window is cut out of 'wide_seq_ext' and each of
#its first `mutagenesis_end` positions is substituted with every base that differs from
#the reference. The model's cut predictions for reference and variants are summed over
#two windows (77:127 and 0:205), converted to log odds, and the variant-minus-reference
#difference is recorded together with the variant's position and alleles.

aparent_encoder = get_aparent_encoder()

#Number of leading window positions that are mutated
mutagenesis_end = 146

#Variants whose |delta_logodds_77_127| is below this threshold are not recorded (0.0 keeps all)
min_abs_logodds = 0.0

#Complement table applied to bases of sites on the non-'+' strand before they are recorded
complement_map = {'A' : 'T', 'C' : 'G', 'G' : 'C', 'T' : 'A'}

def _window_logodds(cut_pred, start, end) :
    """Log odds of the summed cut prediction in cut_pred[start:end].

    The sum is clipped to [1e-7, 1 - 1e-7] first so the log odds stay finite.
    """
    iso_pred = np.clip(np.sum(cut_pred[start:end]), 1e-7, 1. - 1e-7)
    return np.log(iso_pred / (1. - iso_pred))

variant_dict = {
    'gene' : [],
    'gene_id' : [],
    #'ref_seq' : [],
    'chrom' : [],
    'strand' : [],
    'site_type' : [],
    'native_usage' : [],
    #'var_seq' : [],
    'var_position' : [],
    'ref_nucleotide' : [],
    'var_nucleotide' : [],
    'delta_logodds_77_127' : [],
    'delta_logodds_0_205' : []
}

for pd_index, row in polyadb_df.iterrows() :
    gene = row['gene']
    gene_id = row['gene_id']
    
    if pd_index % 100 == 0 :
        print("Predicting variants for PAS = " + str(gene_id) + " (" + str(pd_index) + ")")
    
    #205-nt model input window starting 70 nt before index 175 of the wide sequence
    #(presumably index 175 is the annotated site -- TODO confirm against the preprocessing)
    ref_seq = row['wide_seq_ext'][175-70:175-70+205]
    
    #Fail early and clearly; a shorter window would otherwise only surface as an
    #IndexError after the variant predictions had already been computed
    if len(ref_seq) < mutagenesis_end :
        raise ValueError("Sequence window for PAS = " + str(gene_id) + " has length " + str(len(ref_seq)) + ", need at least " + str(mutagenesis_end))
    
    chrom = row['chrom']
    strand = row['strand']
    site_type = row['site_type']
    ref_usage = row['ratio']
    
    pas_pos = row['pas_pos']
    
    #Coordinates of the window relative to pas_pos; positions are counted upwards from
    #seq_start on the '+' strand and downwards from seq_end otherwise
    if strand == '+' :
        seq_start = pas_pos - 70 + 1
    else :
        seq_start = pas_pos - (205 - 70)
    
    seq_end = seq_start + 205
    
    #Predict reference sequence with APARENT model
    _, ref_cut_pred = aparent_model.predict(x=aparent_encoder([ref_seq]), batch_size=1)
    
    #Reference log odds are constant for all variants of this site
    ref_logodds_77_127 = _window_logodds(ref_cut_pred[0], 77, 127)
    ref_logodds_0_205 = _window_logodds(ref_cut_pred[0], 0, 205)
    
    #Enumerate every substitution once; (pos, base) pairs equal to the reference are skipped
    variants = [
        (pos, base)
        for pos in range(mutagenesis_end)
        for base in ['A', 'C', 'G', 'T']
        if base != ref_seq[pos]
    ]
    var_seqs = [ref_seq[:pos] + base + ref_seq[pos+1:] for pos, base in variants]
    
    #Predict all variants of this site in a single batch
    _, var_cut_preds = aparent_model.predict(x=aparent_encoder(var_seqs), batch_size=len(var_seqs))
    
    #Score all variants; row ii of var_cut_preds belongs to variants[ii]
    for ii, (pos, base) in enumerate(variants) :
        var_cut_pred = var_cut_preds[ii, :]
        
        delta_logodds_77_127 = _window_logodds(var_cut_pred, 77, 127) - ref_logodds_77_127
        delta_logodds_0_205 = _window_logodds(var_cut_pred, 0, 205) - ref_logodds_0_205
        
        if strand == '+' :
            var_position = seq_start + pos
            ref_nucleotide = ref_seq[pos]
            var_nucleotide = base
        else :
            var_position = seq_end - pos
            #Characters outside A/C/G/T (e.g. 'N') are passed through unchanged instead of
            #being reported as 'A', matching what the '+' branch records
            ref_nucleotide = complement_map.get(ref_seq[pos], ref_seq[pos])
            var_nucleotide = complement_map[base]
        
        if np.abs(delta_logodds_77_127) >= min_abs_logodds :
            variant_dict['gene'].append(gene)
            variant_dict['gene_id'].append(gene_id)
            #variant_dict['ref_seq'].append(ref_seq)
            variant_dict['chrom'].append(chrom)
            variant_dict['strand'].append(strand)
            variant_dict['site_type'].append(site_type)
            variant_dict['native_usage'].append(np.round(ref_usage, 5))
            #variant_dict['var_seq'].append(var_seqs[ii])
            variant_dict['var_position'].append(var_position)
            variant_dict['ref_nucleotide'].append(ref_nucleotide)
            variant_dict['var_nucleotide'].append(var_nucleotide)
            variant_dict['delta_logodds_77_127'].append(np.round(delta_logodds_77_127, 5))
            variant_dict['delta_logodds_0_205'].append(np.round(delta_logodds_0_205, 5))

variant_df = pd.DataFrame(variant_dict)
variant_df = variant_df[['gene','gene_id','chrom','strand','site_type','native_usage','var_position','ref_nucleotide','var_nucleotide','delta_logodds_77_127','delta_logodds_0_205']]


Predicting variants for PAS = A1BG.1 (0)
Predicting variants for PAS = AAK1.10 (100)
Predicting variants for PAS = AASDH.14 (200)
Predicting variants for PAS = AC007663.1 (1200)
Predicting variants for PAS = AC010542.2 (1300)
Predicting variants for PAS = AC020893.1 (1400)
Predicting variants for PAS = AC061975.1 (1500)
Predicting variants for PAS = AC091551.6 (1600)
Predicting variants for PAS = AC098934.2 (1700)
Predicting variants for PAS = AC117498.1 (1800)
Predicting variants for PAS = ACAA2.2 (1900)
Predicting variants for PAS = ACAD8.11 (2000)
Predicting variants for PAS = ACAT1.5 (2100)
Predicting variants for PAS = ACER2.3 (2200)
Predicting variants for PAS = ACO2.12 (2300)
Predicting variants for PAS = ACOX1.21 (2400)
Predicting variants for PAS = ACSBG1.3 (2500)
Predicting variants for PAS = ACSM3.3 (2600)
Predicting variants for PAS = ACTL6A.6 (2700)
Predicting variants for PAS = ACTR2.7 (2800)
Predicting variants for PAS = ACTR8.18 (2900)
Predicting variants for PAS = ACYP

Predicting variants for PAS = C17orf49.3 (18600)
Predicting variants for PAS = C18orf25.10 (18700)
Predicting variants for PAS = C19orf66.2 (18800)
Predicting variants for PAS = C1QTNF3.10 (18900)
Predicting variants for PAS = C1orf122.2 (19000)
Predicting variants for PAS = C1orf228.4 (19100)
Predicting variants for PAS = C20orf144.2 (19200)
Predicting variants for PAS = C21orf58.24 (19300)
Predicting variants for PAS = C2CD2.5 (19400)
Predicting variants for PAS = C2orf27A.24 (19500)
Predicting variants for PAS = C2orf76.6 (19600)
Predicting variants for PAS = C3orf67.11 (19700)
Predicting variants for PAS = C5AR1.1 (19800)
Predicting variants for PAS = C5orf49.2 (19900)
Predicting variants for PAS = C6orf120.8 (20000)
Predicting variants for PAS = C6orf62.3 (20100)
Predicting variants for PAS = C7orf49.15 (20200)
Predicting variants for PAS = C8orf33.12 (20300)
Predicting variants for PAS = C8orf88.10 (20400)
Predicting variants for PAS = C9orf43.2 (20500)
Predicting variants for PA

Predicting variants for PAS = CYBRD1.1 (36100)
Predicting variants for PAS = CYP11A1.3 (36200)
Predicting variants for PAS = CYP2C9.2 (36300)
Predicting variants for PAS = CYP4V2.10 (36400)
Predicting variants for PAS = CYTH3.7 (36500)
Predicting variants for PAS = DAB2IP.11 (36600)
Predicting variants for PAS = DAP3.7 (36700)
Predicting variants for PAS = DAZAP1.11 (36800)
Predicting variants for PAS = DBNL.23 (36900)
Predicting variants for PAS = DCAF12.5 (37000)
Predicting variants for PAS = DCAF6.16 (37100)
Predicting variants for PAS = DCBLD2.39 (37200)
Predicting variants for PAS = DCLRE1C.6 (37300)
Predicting variants for PAS = DCT.4 (37400)
Predicting variants for PAS = DCTN6.4 (37500)
Predicting variants for PAS = DCUN1D5.1 (37600)
Predicting variants for PAS = DDHD1.10 (37700)
Predicting variants for PAS = DDR1.6 (37800)
Predicting variants for PAS = DDX19A.10 (37900)
Predicting variants for PAS = DDX31.9 (38000)
Predicting variants for PAS = DDX47.6 (38100)
Predicting varian

Predicting variants for PAS = FNDC3B.10 (54600)
Predicting variants for PAS = FOLH1.7 (54700)
Predicting variants for PAS = FOXB1.1 (54800)
Predicting variants for PAS = FOXM1.6 (54900)
Predicting variants for PAS = FOXP1.10 (55000)
Predicting variants for PAS = FRA10AC1.6 (55100)
Predicting variants for PAS = FRG1BP.22 (55200)
Predicting variants for PAS = FRMD8.5 (55300)
Predicting variants for PAS = FSBP.2 (55400)
Predicting variants for PAS = FSTL4.4 (55500)
Predicting variants for PAS = FTO.5 (55600)
Predicting variants for PAS = FUNDC2.2 (55700)
Predicting variants for PAS = FUT2.2 (55800)
Predicting variants for PAS = FXR1.27 (55900)
Predicting variants for PAS = FYTTD1.25 (56000)
Predicting variants for PAS = FZD6.16 (56100)
Predicting variants for PAS = G3BP2.20 (56200)
Predicting variants for PAS = GABBR2.8 (56300)
Predicting variants for PAS = GABRB1.3 (56400)
Predicting variants for PAS = GABRQ.5 (56500)
Predicting variants for PAS = GALK2.10 (56600)
Predicting variants for

Predicting variants for PAS = KANSL1L.21 (72200)
Predicting variants for PAS = KAT7.9 (72300)
Predicting variants for PAS = KAZN.5 (72400)
Predicting variants for PAS = KCMF1.19 (72500)
Predicting variants for PAS = KCND1.1 (72600)
Predicting variants for PAS = KCNIP3.4 (72700)
Predicting variants for PAS = KCNJ9.3 (72800)
Predicting variants for PAS = KCNMA1.9 (72900)
Predicting variants for PAS = KCNQ3.11 (73000)
Predicting variants for PAS = KCTD1.22 (73100)
Predicting variants for PAS = KCTD20.15 (73200)
Predicting variants for PAS = KDELR2.11 (73300)
Predicting variants for PAS = KDM3A.11 (73400)
Predicting variants for PAS = KDM5A.31 (73500)
Predicting variants for PAS = KDSR.9 (73600)
Predicting variants for PAS = KHNYN.8 (73700)
Predicting variants for PAS = KIAA0355.4 (73800)
Predicting variants for PAS = KIAA0753.9 (73900)
Predicting variants for PAS = KIAA1147.10 (74000)
Predicting variants for PAS = KIAA1257.26 (74100)
Predicting variants for PAS = KIAA1468.19 (74200)
Predi

Predicting variants for PAS = MRPS7.2 (89700)
Predicting variants for PAS = MSANTD3-TMEFF1.8 (89800)
Predicting variants for PAS = MSH6.15 (89900)
Predicting variants for PAS = MSRA.3 (90000)
Predicting variants for PAS = MT1P3.1 (90100)
Predicting variants for PAS = MTATP6P2.1 (90200)
Predicting variants for PAS = MTCO1P40.3 (90300)
Predicting variants for PAS = MTF2.10 (90400)
Predicting variants for PAS = MTHFD1.8 (90500)
Predicting variants for PAS = MTHFD2L.30 (90600)
Predicting variants for PAS = MTMR10.12 (90700)
Predicting variants for PAS = MTMR7.13 (90800)
Predicting variants for PAS = MTND5P11.8 (90900)
Predicting variants for PAS = MTRF1L.12 (91000)
Predicting variants for PAS = MTSS1L.8 (91100)
Predicting variants for PAS = MUC20.2 (91200)
Predicting variants for PAS = MVD.4 (91300)
Predicting variants for PAS = MYBBP1A.11 (91400)
Predicting variants for PAS = MYEF2.6 (91500)
Predicting variants for PAS = MYH9.27 (91600)
Predicting variants for PAS = MYLK2.1 (91700)
Predic

Predicting variants for PAS = PHTF1.14 (107200)
Predicting variants for PAS = PHYKPL.18 (107300)
Predicting variants for PAS = PIAS1.31 (107400)
Predicting variants for PAS = PID1.5 (107500)
Predicting variants for PAS = PIGG.4 (107600)
Predicting variants for PAS = PIGN.9 (107700)
Predicting variants for PAS = PIGX.8 (107800)
Predicting variants for PAS = PIK3CA.11 (107900)
Predicting variants for PAS = PILRB.10 (108000)
Predicting variants for PAS = PIP4K2C.6 (108100)
Predicting variants for PAS = PITPNA.6 (108200)
Predicting variants for PAS = PIWIL2.3 (108300)
Predicting variants for PAS = PKIA.17 (108400)
Predicting variants for PAS = PKN3.2 (108500)
Predicting variants for PAS = PKP4.36 (108600)
Predicting variants for PAS = PLA2G7.5 (108700)
Predicting variants for PAS = PLAUR.13 (108800)
Predicting variants for PAS = PLCE1.10 (108900)
Predicting variants for PAS = PLD3.6 (109000)
Predicting variants for PAS = PLEKHA4.6 (109100)
Predicting variants for PAS = PLEKHB2.1 (109200)
P

Predicting variants for PAS = RPAP1.3 (124400)
Predicting variants for PAS = RPF2.4 (124500)
Predicting variants for PAS = RPL10A.6 (124600)
Predicting variants for PAS = RPL13A.16 (124700)
Predicting variants for PAS = RPL18P11.1 (124800)
Predicting variants for PAS = RPL23A.8 (124900)
Predicting variants for PAS = RPL27A.9 (125000)
Predicting variants for PAS = RPL31.32 (125100)
Predicting variants for PAS = RPL35A.1 (125200)
Predicting variants for PAS = RPL37A.21 (125300)
Predicting variants for PAS = RPL5.4 (125400)
Predicting variants for PAS = RPL8.2 (125500)
Predicting variants for PAS = RPP30.10 (125600)
Predicting variants for PAS = RPS11.2 (125700)
Predicting variants for PAS = RPS18P9.3 (125800)
Predicting variants for PAS = RPS24.8 (125900)
Predicting variants for PAS = RPS2P32.2 (126000)
Predicting variants for PAS = RPS4Y1.9 (126100)
Predicting variants for PAS = RPS6KA5.12 (126200)
Predicting variants for PAS = RPSA.7 (126300)
Predicting variants for PAS = RPUSD3.11 (12

Predicting variants for PAS = SRSF9.14 (141500)
Predicting variants for PAS = SSBP2.26 (141600)
Predicting variants for PAS = SSH3.1 (141700)
Predicting variants for PAS = SSR4.4 (141800)
Predicting variants for PAS = ST13.15 (141900)
Predicting variants for PAS = ST3GAL3.17 (142000)
Predicting variants for PAS = ST6GALNAC4.4 (142100)
Predicting variants for PAS = ST8SIA1.4 (142200)
Predicting variants for PAS = STAC.13 (142300)
Predicting variants for PAS = STAM2.15 (142400)
Predicting variants for PAS = STARD4.1 (142500)
Predicting variants for PAS = STAT2.6 (142600)
Predicting variants for PAS = STAU2.36 (142700)
Predicting variants for PAS = STEAP3.3 (142800)
Predicting variants for PAS = STK17A.15 (142900)
Predicting variants for PAS = STK32A.2 (143000)
Predicting variants for PAS = STK39.18 (143100)
Predicting variants for PAS = STMP1.7 (143200)
Predicting variants for PAS = STON2.11 (143300)
Predicting variants for PAS = STRIP1.1 (143400)
Predicting variants for PAS = STT3A.9 (1

Predicting variants for PAS = TXNDC17.10 (158600)
Predicting variants for PAS = TXNRD1.6 (158700)
Predicting variants for PAS = TYR.13 (158800)
Predicting variants for PAS = U2SURP.13 (158900)
Predicting variants for PAS = UBA5.13 (159000)
Predicting variants for PAS = UBALD2.3 (159100)
Predicting variants for PAS = UBE2B.22 (159200)
Predicting variants for PAS = UBE2E1.8 (159300)
Predicting variants for PAS = UBE2J1.1 (159400)
Predicting variants for PAS = UBE2O.2 (159500)
Predicting variants for PAS = UBE2W.17 (159600)
Predicting variants for PAS = UBE3C.35 (159700)
Predicting variants for PAS = UBL3.11 (159800)
Predicting variants for PAS = UBP1.15 (159900)
Predicting variants for PAS = UBR4.4 (160000)
Predicting variants for PAS = UBXN2A.7 (160100)
Predicting variants for PAS = UCHL5.3 (160200)
Predicting variants for PAS = UFD1.9 (160300)
Predicting variants for PAS = UGGT1.18 (160400)
Predicting variants for PAS = UGT3A2.3 (160500)
Predicting variants for PAS = UHRF2.3 (160600)
P

In [8]:
#Store variant prediction dataframe
#(tab-separated despite the .csv extension; the row index is written as the first column)

variant_df.to_csv(
    'aparent_resnet_variant_predictions_polyadb_no_sequences_no_cutoff.csv',
    sep='\t',
    index=True
)


In [9]:

#Show the variant prediction table as this cell's output
variant_df


Unnamed: 0,gene,gene_id,chrom,strand,site_type,native_usage,var_position,ref_nucleotide,var_nucleotide,delta_logodds_77_127,delta_logodds_0_205
0,A1BG,A1BG.1,chr19,-,Intron,0.06413,58859922,T,G,0.01185,0.00747
1,A1BG,A1BG.1,chr19,-,Intron,0.06413,58859922,T,C,-0.00505,-0.00600
2,A1BG,A1BG.1,chr19,-,Intron,0.06413,58859922,T,A,0.09807,0.09027
3,A1BG,A1BG.1,chr19,-,Intron,0.06413,58859921,A,T,-0.02839,-0.01798
4,A1BG,A1BG.1,chr19,-,Intron,0.06413,58859921,A,G,-0.02564,-0.02576
...,...,...,...,...,...,...,...,...,...,...,...
76847533,ZZZ3,ZZZ3.26,chr1,-,3_most_exon,0.00257,78022097,G,C,-0.04053,-0.01775
76847534,ZZZ3,ZZZ3.26,chr1,-,3_most_exon,0.00257,78022097,G,A,0.00094,0.00233
76847535,ZZZ3,ZZZ3.26,chr1,-,3_most_exon,0.00257,78022096,A,T,0.01273,0.01195
76847536,ZZZ3,ZZZ3.26,chr1,-,3_most_exon,0.00257,78022096,A,G,0.03467,0.01759
