In [1]:
from __future__ import print_function
import keras
from keras.models import Sequential, Model, load_model

import tensorflow as tf

import pandas as pd

import os
import pickle
import numpy as np

#Function to one-hot encode sequences
def one_hot_encode(seq) :
    """Encode a nucleotide sequence as a (len(seq), 4) float matrix.

    Columns are ordered A, C, G, T. An 'N' is spread evenly over all four
    columns (0.25 each); any other symbol (including lowercase letters)
    leaves its row at all zeros.
    """
    
    channel_of = {'A' : 0, 'C' : 1, 'G' : 2, 'T' : 3}
    
    encoded = np.zeros((len(seq), 4))
    
    for pos, base in enumerate(seq) :
        if base in channel_of :
            encoded[pos, channel_of[base]] = 1.
        elif base == 'N' :
            #Ambiguous base: uniform weight across the four channels
            encoded[pos, :] = 0.25
    
    return encoded


Using TensorFlow backend.


In [2]:
#Example variant data: one reference/variant sequence pair per gene

### Input format:
# ref_seq : reference PAS sequence, 205bp, with the core hexamer (e.g. AATAAA) starting at position 70 (0-indexed)
# var_seq : mutated PAS sequence, 205bp, with the core hexamer (e.g. AATAAA) starting at position 70 (0-indexed)

genes = [
    'PTEN',
    'TP53',
    'F2'
]

ref_seqs = [
    "AGTAGTTTCTGATCCCAGATGGTAATGTGTAGGTTCAAGGGTATTGTGTGTAGCAAGTGAAGATTGCAGAAATAAAACTTCAGTTCATGCTTGAAATTTAAGTATTGTTGTGATGCCAGAATTGCTGCTCACCGTTTTTAGGTTTCAGGTCCTCTGACACCTTTTGGTATCGTTAATTTTACTGATTTGTGTAGAATGTCAGTTG",
    "CATTTTCACCCCACCCTTCCCCTCCTTCTCCCTTTTTATATCCCATTTTTATATCGATCTCTTATTTTACAATAAAACTTTGCTGCCACCTGTGTGTCTGAGGGGTGAACGCCAGTGCAGGCTACTGGGGTCAGCAGGTGCAGGGGTGAGTGAGGAGGTGCTGGGAAGCAGCCACCTGAGTCTGCAATGAGTGTGGGCTGGGGGG",
    "CTCATATTCTGGGCTCCTGGAACCAATCCCGTGAAAGAATTATTTTTGTGTTTCTAAAACTATGGTTCCCGATAAAAGTGACTCTCAGCGAGCCTCAATGCTCCCAGTGCTATTCATGGGCAGCTCTCTGGGCTCAGGAAGAGCCAGTAATACTACTGGATAAAGAAGACTTAAGAATCCACCACCTGGTGCACGCTGGTAGTCC",
]

var_seqs = [
    "AGTAGTTTCTGATCCCAGATGGTAATGTGTAGGTTCAAGGGTATTGTGTGTAGCAAGTGAAGATTGCAGAAATAAAACTTCAGTTCATGCTTGAAATTTTAGTATTGTTGTGATGCCAGAATTGCTGCTCACCGTTTTTAGGTTTCAGGTCCTCTGACACCTTTTGGTATCGTTAATTTTACTGATTTGTGTAGAATGTCAGTTG",
    "CATTTTCACCCCACCCTTCCCCTCCTTCTCCCTTTTTATATCCCATTTTTATATCGATCTCTTATTTTACAATAAAACTTTGCTGCCACCTGTGTGTCTGAGTGGTGAACGCCAGTGCAGGCTACTGGGGTCAGCAGGTGCAGGGGTGAGTGAGGAGGTGCTGGGAAGCAGCCACCTGAGTCTGCAATGAGTGTGGGCTGGGGGG",
    "CTCATATTCTGGGCTCCTGGAACCAATCCCGTGAAAGAATTATTTTTGTGTTTCTAAAACTATGGTTCCCGATAGAAGTGACTCTCAGCGAGCCTCAATGCTCCCAGTGCTATTCATGGGCAGCTCTCTGGGCTCAGGAAGAGCCAGTAATACTACTGGATAAAGAAGACTTAAGAATCCACCACCTGGTGCACGCTGGTAGTCC",
]

#One row per gene; the three lists above are aligned by position
df = pd.DataFrame(
    list(zip(genes, ref_seqs, var_seqs)),
    columns=['gene', 'ref_seq', 'var_seq']
)


In [3]:
#Create data features

#Encode every sequence, stack along a new leading sample axis, then insert a singleton axis
#so that each array has shape (n_samples, 1, seq_len, 4)
ref_onehots = np.stack([one_hot_encode(seq) for seq in df['ref_seq']], axis=0)[:, None, :, :]
var_onehots = np.stack([one_hot_encode(seq) for seq in df['var_seq']], axis=0)[:, None, :, :]

#Sub-library indicator: always switch on index 11 of the 13 slots (training sub-library bias)
lib = np.zeros((df.shape[0], 13))
lib[:, 11] = 1.


In [4]:
#Load APARENT2 model

model_name = 'aparent_all_libs_resnet_no_clinvar_wt_ep_5_var_batch_size_inference_mode_no_drop'

#The saved model file is looked up at <current working dir>/../saved_models/<model_name>.h5
save_dir = os.path.join(os.getcwd(), '../saved_models')
model_path = os.path.join(save_dir, '{}.h5'.format(model_name))

aparent_model = load_model(model_path)


Instructions for updating:
Colocations handled automatically by placer.




In [5]:
#Predict

def _delta_isoform_logodds(ref_cut, var_cut, start, end) :
    """Variant-vs-reference change in isoform log odds for one cleavage window.

    ref_cut, var_cut : 2D arrays (samples x positions) of predicted cleavage values.
    start, end       : half-open column window [start, end) summed into the isoform proportion.

    Returns (ref_iso, var_iso, delta) where ref_iso / var_iso are the raw window sums and
    delta = logit(var_iso) - logit(ref_iso).
    """
    
    def _logit(p) :
        #A window sum that rounds to exactly 0 or 1 would give log(0) or a division by zero;
        #clip by one machine epsilon (in the array's own dtype) so the log odds stay finite
        eps = np.finfo(p.dtype).eps
        p = np.clip(p, eps, p.dtype.type(1.) - eps)
        return np.log(p / (1. - p))
    
    ref_iso = np.sum(ref_cut[:, start:end], axis=1)
    var_iso = np.sum(var_cut[:, start:end], axis=1)
    
    return ref_iso, var_iso, _logit(var_iso) - _logit(ref_iso)

#The first model output is discarded; only the second (per-position cleavage prediction) is used
_, ref_cut_pred = aparent_model.predict(x=[ref_onehots, lib], batch_size=32, verbose=True)
_, var_cut_pred = aparent_model.predict(x=[var_onehots, lib], batch_size=32, verbose=True)

#Calculate isoform log odds ratios (cleavage downstream of core hexamer)
isoform_start = 77
isoform_end = 127

ref_iso_pred_narrow, var_iso_pred_narrow, delta_logodds_narrow = _delta_isoform_logodds(
    ref_cut_pred, var_cut_pred, isoform_start, isoform_end
)

#Calculate isoform log odds ratios (cleavage anywhere in sequence)
#NOTE(review): presumably any column beyond index 204 holds non-cleavage/distal mass - confirm against the model
isoform_start = 0
isoform_end = 205

ref_iso_pred, var_iso_pred, delta_logodds = _delta_isoform_logodds(
    ref_cut_pred, var_cut_pred, isoform_start, isoform_end
)




In [6]:
#Copy the dataframe and append effect sizes

#Work on a fresh, re-indexed copy so the input dataframe is left untouched.
#The two assign calls are kept separate so the column order does not depend on keyword ordering.
pred_df = (
    df.copy()
    .reset_index(drop=True)
    .assign(delta_logodds_narrow=delta_logodds_narrow)
    .assign(delta_logodds=delta_logodds)
)


In [7]:

#Print the variant table together with the two appended effect-size columns
print(pred_df)


   gene                                            ref_seq  \
0  PTEN  AGTAGTTTCTGATCCCAGATGGTAATGTGTAGGTTCAAGGGTATTG...   
1  TP53  CATTTTCACCCCACCCTTCCCCTCCTTCTCCCTTTTTATATCCCAT...   
2    F2  CTCATATTCTGGGCTCCTGGAACCAATCCCGTGAAAGAATTATTTT...   

                                             var_seq  delta_logodds_narrow  \
0  AGTAGTTTCTGATCCCAGATGGTAATGTGTAGGTTCAAGGGTATTG...              0.299790   
1  CATTTTCACCCCACCCTTCCCCTCCTTCTCCCTTTTTATATCCCAT...              1.122925   
2  CTCATATTCTGGGCTCCTGGAACCAATCCCGTGAAAGAATTATTTT...             -3.744810   

   delta_logodds  
0       0.180913  
1       1.108848  
2      -1.107177  
