In [1]:
from __future__ import print_function
import keras
from keras.models import Sequential, Model, load_model
from keras import backend as K

import tensorflow as tf

import os
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt
import matplotlib.cm as cm

import aparent.visualization as vis

from aparent.predictor import *


Using TensorFlow backend.


<h2>Load APARENT model</h2>
<br/>
-- Load APARENT, which has been trained to predict the isoform abundance and cut profile of a proximal PAS given a fixed background distal PAS (trained on random 3' UTR APA MPRA data).<br/>

In [2]:
#Load the pre-trained base APARENT network from disk

#The weights file is looked up in a 'saved_models' folder two levels above the current working directory
model_name = 'aparent_large_lessdropout_all_libs_no_sampleweights.h5'
save_dir = os.path.join(os.getcwd(), '../../saved_models')
model_path = os.path.join(save_dir, model_name)

aparent_model = load_model(model_path)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.




In [4]:
#Load the processed site table ('polyadb_processed.csv') and keep only sites of the targeted genes

genes = [
    'RUNX1', 'CEBPA', 'GATA2', 'ANKRD26', 'DDX41', 'ETV6',
    'PTEN', 'BRCA1', 'BRCA2', 'TP53', 'APC', 'ATM',
    'PALB2', 'MSH2', 'MLH1', 'MSH6', 'PMS2', 'MUTYH'
]

polyadb_df = pd.read_csv('polyadb_processed.csv', sep=',')

#Boolean mask over the rows whose 'gene' value is one of the targeted genes
in_target_genes = polyadb_df['gene'].isin(genes)
polyadb_df = polyadb_df.loc[in_target_genes].reset_index(drop=True).copy()

print('len(polyadb_df) = ' + str(len(polyadb_df)))


len(polyadb_df) = 294


In [5]:
#Number the PAS sites of each gene and derive a per-site identifier ("<gene>.<sitenum>")

def _number_sites_per_gene(strand_df, ascending) :
    """Sort the sites of one strand by 'pas_pos' and number them within each gene.

    strand_df : DataFrame with the sites of a single strand; needs 'gene' and 'pas_pos' columns.
    ascending : sort direction applied to 'pas_pos'.

    Returns a new DataFrame (fresh 0..n-1 index, rows in sorted order) with two added columns:
      'gene_id' : "<gene>.<sitenum>"
      'sitenum' : 1-based rank of the site among the sites of the same gene, in sorted order
    """
    ordered_df = strand_df.sort_values(by='pas_pos', ascending=ascending).reset_index(drop=True)

    #cumcount() numbers the rows of every gene 0, 1, 2, ... in their current (sorted) order
    site_numbers = ordered_df.groupby('gene').cumcount() + 1

    ordered_df['gene_id'] = ordered_df['gene'] + "." + site_numbers.astype(str)
    ordered_df['sitenum'] = site_numbers

    return ordered_df

#'+' sites are numbered by increasing coordinate, '-' sites by decreasing coordinate
#(presumably so that numbering follows the direction of transcription -- confirm)
polyadb_df_minus = _number_sites_per_gene(polyadb_df.query("strand == '-'"), ascending=False)
polyadb_df_plus = _number_sites_per_gene(polyadb_df.query("strand == '+'"), ascending=True)

polyadb_df = pd.concat([polyadb_df_plus, polyadb_df_minus])

polyadb_df = polyadb_df.sort_values(by=['gene', 'sitenum'], ascending=True).reset_index(drop=True).copy()

In [7]:
#Perform in-silico saturation mutagenesis: score every single-nucleotide substitution of every PAS sequence

aparent_encoder = get_aparent_encoder()

#Half-open range of cut-profile positions that is summed into the isoform score
isoform_start = 77
isoform_end = 77 + 40

#Base complements, used to report the alleles of non-'+' strand sites
complement_dict = {'A' : 'T', 'C' : 'G', 'G' : 'C', 'T' : 'A'}

#One list per output column; one entry is appended per scored variant
variant_dict = {
    'gene' : [],
    'gene_id' : [],
    'chrom' : [],
    'strand' : [],
    'site_type' : [],
    'native_usage' : [],
    'var_position' : [],
    'ref_nucleotide' : [],
    'var_nucleotide' : [],
    'delta_logodds_fixed_window' : []
}

#NOTE(review): every variant is scored with its own predict() call; if aparent_encoder accepts
#multi-sequence lists, batching all substitutions of a site into one call would likely be much faster -- confirm
for _, row in polyadb_df.iterrows() :
    gene = row['gene']
    gene_id = row['gene_id']

    print("Predicting variants for PAS = " + str(gene_id))

    #205-character slice of the stored wide sequence, starting at offset 175 - 70
    ref_seq = row['wide_seq_ext'][175-70:175-70+205]

    chrom = row['chrom']
    strand = row['strand']
    site_type = row['site_type']
    ref_usage = row['ratio']

    pas_pos = row['pas_pos']

    #Coordinate window assigned to ref_seq
    #NOTE(review): the offsets below encode the coordinate convention of 'pas_pos' -- confirm against the preprocessing
    if strand == '+' :
        seq_start = pas_pos - 70 + 1
    else :
        seq_start = pas_pos - (205 - 70)

    seq_end = seq_start + 205

    #Predict reference sequence with APARENT model (second model output is used as the cut profile)
    _, ref_cut_pred = aparent_model.predict(x=aparent_encoder([ref_seq]))

    ref_iso_pred_fixed_window = np.sum(ref_cut_pred[0, isoform_start:isoform_end])

    #Identical for all variants of this site, so it is computed once here
    ref_logodds_fixed_window = np.log(ref_iso_pred_fixed_window / (1. - ref_iso_pred_fixed_window))

    #Predict all variants
    for pos in range(205) :
        for base in ['A', 'C', 'G', 'T'] :
            var_seq = ref_seq[:pos] + base + ref_seq[pos+1:]
            if var_seq == ref_seq :
                #Substituting the reference base itself is not a variant
                continue

            #Predict variant sequence with APARENT model
            _, var_cut_pred = aparent_model.predict(x=aparent_encoder([var_seq]))

            var_iso_pred_fixed_window = np.sum(var_cut_pred[0, isoform_start:isoform_end])

            delta_logodds_fixed_window = np.log(var_iso_pred_fixed_window / (1. - var_iso_pred_fixed_window)) - ref_logodds_fixed_window

            if strand == '+' :
                #Sequence index 0 maps to seq_start and positions increase along the sequence
                var_position = seq_start + pos
                ref_nucleotide = ref_seq[pos]
                var_nucleotide = base
            else :
                #Sequence index 0 maps to seq_end and positions decrease along the sequence; alleles are complemented
                var_position = seq_end - pos
                #Symbols outside A/C/G/T (e.g. 'N') are passed through unchanged, as in the '+' branch,
                #instead of being silently reported as 'A'
                ref_nucleotide = complement_dict.get(ref_seq[pos], ref_seq[pos])
                var_nucleotide = complement_dict[base]

            variant_dict['gene'].append(gene)
            variant_dict['gene_id'].append(gene_id)
            variant_dict['chrom'].append(chrom)
            variant_dict['strand'].append(strand)
            variant_dict['site_type'].append(site_type)
            variant_dict['native_usage'].append(ref_usage)
            variant_dict['var_position'].append(var_position)
            variant_dict['ref_nucleotide'].append(ref_nucleotide)
            variant_dict['var_nucleotide'].append(var_nucleotide)
            variant_dict['delta_logodds_fixed_window'].append(delta_logodds_fixed_window)

#Assemble the table with a fixed column order (reference / variant sequences are not stored)
variant_df = pd.DataFrame(variant_dict)
variant_df = variant_df[['gene','gene_id','chrom','strand','site_type','native_usage','var_position','ref_nucleotide','var_nucleotide','delta_logodds_fixed_window']]


Predicting variants for PAS = ANKRD26.1
Predicting variants for PAS = ANKRD26.2
Predicting variants for PAS = ANKRD26.3
Predicting variants for PAS = ANKRD26.4
Predicting variants for PAS = ANKRD26.5
Predicting variants for PAS = ANKRD26.6
Predicting variants for PAS = ANKRD26.7
Predicting variants for PAS = ANKRD26.8
Predicting variants for PAS = ANKRD26.9
Predicting variants for PAS = ANKRD26.10
Predicting variants for PAS = ANKRD26.11
Predicting variants for PAS = ANKRD26.12
Predicting variants for PAS = ANKRD26.13
Predicting variants for PAS = ANKRD26.14
Predicting variants for PAS = ANKRD26.15
Predicting variants for PAS = ANKRD26.16
Predicting variants for PAS = ANKRD26.17
Predicting variants for PAS = ANKRD26.18
Predicting variants for PAS = ANKRD26.19
Predicting variants for PAS = ANKRD26.20
Predicting variants for PAS = ANKRD26.21
Predicting variants for PAS = ANKRD26.22
Predicting variants for PAS = ANKRD26.23
Predicting variants for PAS = ANKRD26.24
Predicting variants for P

Predicting variants for PAS = PTEN.21
Predicting variants for PAS = PTEN.22
Predicting variants for PAS = PTEN.23
Predicting variants for PAS = PTEN.24
Predicting variants for PAS = PTEN.25
Predicting variants for PAS = PTEN.26
Predicting variants for PAS = PTEN.27
Predicting variants for PAS = PTEN.28
Predicting variants for PAS = PTEN.29
Predicting variants for PAS = PTEN.30
Predicting variants for PAS = PTEN.31
Predicting variants for PAS = PTEN.32
Predicting variants for PAS = PTEN.33
Predicting variants for PAS = PTEN.34
Predicting variants for PAS = PTEN.35
Predicting variants for PAS = PTEN.36
Predicting variants for PAS = PTEN.37
Predicting variants for PAS = PTEN.38
Predicting variants for PAS = PTEN.39
Predicting variants for PAS = PTEN.40
Predicting variants for PAS = PTEN.41
Predicting variants for PAS = PTEN.42
Predicting variants for PAS = PTEN.43
Predicting variants for PAS = PTEN.44
Predicting variants for PAS = PTEN.45
Predicting variants for PAS = PTEN.46
Predicting v

In [8]:

#Project each variant's log-odds shift onto the native usage ('native_usage') of its PAS

#Odds of the native site, scaled by exp(delta log-odds), give the predicted odds of the variant
native_odds = variant_df['native_usage'] / (1. - variant_df['native_usage'])
variant_odds = native_odds * np.exp(variant_df['delta_logodds_fixed_window'])

#Convert the odds back to a proportion: p = 1 - 1 / (1 + odds)
variant_df['native_usage_pred_var'] = 1. - 1. / (1. + variant_odds)
variant_df['delta_usage'] = variant_df['native_usage_pred_var'] - variant_df['native_usage']


In [9]:
#Write the variant prediction table to disk
#(tab-separated despite the .csv extension; the DataFrame index is written as the first column)

variant_pred_path = 'variant_predictions_polyadb_no_sequences.csv'
variant_df.to_csv(variant_pred_path, sep='\t')


In [10]:
#Load ClinVar variant summary (tab-separated)

#low_memory=False makes pandas infer each column's dtype from the whole file instead of chunk by chunk.
#The recorded output of this cell is the tail of a warning raised by this read (presumably the
#mixed-dtype warning -- confirm); whole-file inference avoids columns holding a mix of ints and strings.
clinvar_df = pd.read_csv('clinvar_variant_summary_20200205.txt', sep='\t', low_memory=False)


  interactivity=interactivity, compiler=compiler, result=result)


In [11]:
#Keep only the single-nucleotide variants reported on the GRCh37 assembly
#NOTE(review): assumes the predicted variant positions are GRCh37 coordinates -- confirm
clinvar_snv_df = clinvar_df.query("Type == 'single nucleotide variant' and Assembly == 'GRCh37'")
clinvar_df = clinvar_snv_df.reset_index(drop=True).copy()

In [12]:
#Build a '<chrom>_<position>_<alternate allele>' key on both tables so that predictions can be matched to ClinVar records

def _flat_id(chrom, position, allele) :
    """Concatenate three columns, each converted to text, into one '<chrom>_<position>_<allele>' key per row."""
    return chrom.apply(str) + "_" + position.apply(str) + "_" + allele.apply(str)

variant_df['flat_id'] = _flat_id(variant_df['chrom'], variant_df['var_position'], variant_df['var_nucleotide'])

#ClinVar chromosome values get a 'chr' prefix (presumably to match the naming used in variant_df['chrom'] -- confirm)
clinvar_df['flat_id'] = _flat_id('chr' + clinvar_df['Chromosome'].apply(str), clinvar_df['Start'], clinvar_df['AlternateAllele'])

In [13]:
#Keep only the ClinVar columns needed for annotation and give them 'clinvar_'-prefixed names

clinvar_df = clinvar_df[['flat_id', 'Name', 'GeneSymbol', 'Chromosome', 'Start', 'Stop', 'ReferenceAllele', 'AlternateAllele', 'ClinicalSignificance']]

clinvar_df = clinvar_df.rename(columns={
    'Name' : 'clinvar_id',
    'GeneSymbol' : 'clinvar_gene',
    'Chromosome' : 'clinvar_chrom',
    'Start' : 'clinvar_start',
    #The end-coordinate column selected above is named 'Stop'; a key that matches no column
    #(such as 'End') is silently skipped by rename() and would leave the column un-renamed
    'Stop' : 'clinvar_end',
    'ReferenceAllele' : 'clinvar_ref_nucleotide',
    'AlternateAllele' : 'clinvar_var_nucleotide',
    'ClinicalSignificance' : 'clinvar_significance'
})

In [14]:

#Left join on flat_id: every predicted variant is kept, and the ClinVar columns are
#filled only where a ClinVar record has the same flat_id
clinvar_by_flat_id = clinvar_df.set_index('flat_id')
mapped_variant_df = variant_df.join(clinvar_by_flat_id, on='flat_id', how='left')


In [15]:
#Switch to the shorter column names used in the exported tables
export_column_names = {
    'gene_id' : 'pas_id',
    'var_position' : 'position',
    'ref_nucleotide' : 'ref_allele',
    'var_nucleotide' : 'var_allele',
    'delta_logodds_fixed_window' : 'delta_logodds'
}

mapped_variant_df = mapped_variant_df.rename(columns=export_column_names)

In [16]:
#Export the annotated predictions (tab-separated, no index column; sequence columns are not included)

export_columns = [
    'gene', 'chrom', 'strand', 'pas_id', 'site_type',
    'position', 'ref_allele', 'var_allele',
    'delta_logodds', 'native_usage', 'delta_usage',
    'clinvar_id', 'clinvar_significance'
]
mapped_variant_df = mapped_variant_df[export_columns]

#File 1: all predicted variants
mapped_variant_df.to_csv("all_variant_predictions_polyadb_no_sequences.csv", sep='\t', index=False)

#File 2: only the variants that were matched to a ClinVar record
has_clinvar_record = mapped_variant_df['clinvar_id'].notnull()
mapped_variant_df.loc[has_clinvar_record].to_csv("clinvar_variant_predictions_polyadb_no_sequences.csv", sep='\t', index=False)

In [17]:
#Re-export both tables ordered by delta_logodds (ascending, so the most negative shifts come first)
sorted_variant_df = mapped_variant_df.sort_values(by='delta_logodds', ascending=True)
mapped_variant_df = sorted_variant_df.copy().reset_index(drop=True)

#File 1: all predicted variants
mapped_variant_df.to_csv("all_variant_predictions_polyadb_no_sequences_sorted.csv", sep='\t', index=False)

#File 2: only the variants that were matched to a ClinVar record
has_clinvar_record = mapped_variant_df['clinvar_id'].notnull()
mapped_variant_df.loc[has_clinvar_record].to_csv("clinvar_variant_predictions_polyadb_no_sequences_sorted.csv", sep='\t', index=False)