In [1]:
import keras
from keras.models import Sequential, Model, load_model

import os
import pickle
import numpy as np
import pandas as pd

import scipy.sparse as sp
import scipy.io as spio

import matplotlib.pyplot as plt

from scrambler.models.scrambler_models_w_diff import *
from scrambler.utils import OneHotEncoder, get_sequence_masks
from scrambler.visualizations import plot_dna_logo, plot_dna_importance_scores


Using TensorFlow backend.


In [2]:
#Variant data parameters

#Pseudo-count used when measured cut counts are converted into isoform proportions
isoform_pseudo_count = 1.0

#Bounds of the half-open cut-position window [start, end) that is summed as the proximal isoform
#(the original trailing comments recorded 80 and 80 + 30, presumably earlier settings)
proximal_start, proximal_end = 77, 127


In [3]:
#Load APARENT Resnet

#File stem of the saved Keras model; the '.h5' extension is appended when the path is built
model_name = 'aparent_all_libs_resnet_no_clinvar_wt_ep_5_var_batch_size_inference_mode_no_drop'

#The directory is resolved relative to the current working directory, so this cell depends on where the notebook is launched from
save_dir = os.path.join(os.getcwd(), '../../../../autoscrambler/analysis/apa/saved_models')
model_path = os.path.join(save_dir, model_name + '.h5')

aparent_model = load_model(model_path)

#Re-define model for variant lor prediction
#NOTE(review): Input is not imported by name in this notebook; it presumably arrives through the star import - confirm
#Reference and variant sequences get separate inputs but are fed through the same loaded network with a shared l_input
ref_input = Input(name="ref_input", shape=(1, 205, 4))
var_input = Input(name="var_input", shape=(1, 205, 4))
l_input = Input(name="l_input", shape=(13,))

#Element [1] of the loaded model's outputs is used as the cut prediction
#NOTE(review): the output ordering is defined by the saved model, not by this notebook - confirm
ref_cut = aparent_model([ref_input, l_input])[1]
var_cut = aparent_model([var_input, l_input])[1]

def _lor(inp, iso_start=proximal_start, iso_end=proximal_end) :
    """Log odds ratio (variant vs reference) of the cut signal summed over [iso_start, iso_end).

    inp is a (reference cuts, variant cuts) pair of backend tensors. Each tensor is sliced
    as tensor[:, iso_start:iso_end] and summed over its last axis; the sum is clipped to
    [K.epsilon(), 1 - K.epsilon()] so that the logit stays finite.

    Returns logit(variant) - logit(reference), keeping the trailing singleton axis added by [:, None].
    """

    def _window_logit(cuts) :
        #Summed signal inside the isoform window, kept as a column
        window_sum = K.sum(cuts[:, iso_start:iso_end], axis=-1)[:, None]
        window_sum = K.clip(window_sum, K.epsilon(), 1.0 - K.epsilon())
        return K.log(window_sum / (1. - window_sum))

    cuts_ref, cuts_var = inp

    logit_ref = _window_logit(cuts_ref)
    logit_var = _window_logit(cuts_var)

    return logit_var - logit_ref

#Wrap the log-odds-ratio computation as a layer over the (reference, variant) cut outputs
lor_pred = Lambda(_lor, name='lor')([ref_cut, var_cut])

#Three-input model (reference sequence, variant sequence, l_input) with the log odds ratio as its only output
predictor = Model(
    inputs=[ref_input, var_input, l_input],
    outputs=[lor_pred]
)

#In the visible cells the predictor is only used through predict(), so its weights are frozen
predictor.trainable = False

#NOTE(review): optimizer and loss look like placeholders needed only to compile the model - confirm they are never used for fitting
predictor.compile(
    optimizer=keras.optimizers.SGD(lr=0.1),
    loss='mean_squared_error'
)




In [3]:
import isolearn.io as isoio

def _dense_cut_window(cuts) :
    """Densify columns 180..384 of a sparse cut matrix and append its last column as one extra column."""
    window = np.array(cuts[:, 180 + 0: 180 + 205].todense())
    last_column = np.array(cuts[:, -1].todense()).reshape(-1, 1)
    return np.concatenate([window, last_column], axis=-1)# - 1

def _isoform_stats_from_counts(cut_counts, proximal_start, proximal_end, isoform_pseudo_count) :
    """Per-row proximal count, total count, pseudo-counted isoform proportion and its log odds."""
    proximal_count = np.sum(cut_counts[:, proximal_start:proximal_end], axis=-1)
    total_count = np.sum(cut_counts, axis=-1)
    iso = (proximal_count + isoform_pseudo_count) / (total_count + 2. * isoform_pseudo_count)
    logodds = np.log(iso / (1.0 - iso))
    return proximal_count, total_count, iso, logodds

def _isoform_stats_from_cut_probs(cut_probs, proximal_start, proximal_end) :
    """Per-row isoform proportion (clipped to [1e-6, 1 - 1e-6]) and log odds from predicted cut probabilities."""
    iso = np.clip(np.sum(cut_probs[:, proximal_start:proximal_end], axis=-1), 1e-6, 1. - 1e-6)
    logodds = np.log(iso / (1.0 - iso))
    return iso, logodds

def append_predictions(seq_df, seq_cuts, variant_df, variant_cuts_var, variant_cuts_ref, pred_df, cuts_pred, proximal_start=49, proximal_end=90, isoform_pseudo_count=1.0) :
    """Join measured sequences / variants with the prediction table and derive isoform log odds.

    Parameters
    ----------
    seq_df : DataFrame with a 'master_seq' column; row i corresponds to row i of seq_cuts.
    seq_cuts : sparse matrix (row-indexable, with .todense()) of measured cut counts.
    variant_df : DataFrame with 'master_seq' and 'wt_seq' columns; row i corresponds to row i
        of variant_cuts_var and variant_cuts_ref.
    variant_cuts_var, variant_cuts_ref : sparse measured cut counts of the variant / reference.
    pred_df : DataFrame with 'master_seq', 'iso_pred' and 'logodds_pred' columns.
    cuts_pred : sparse matrix of predicted cut probabilities aligned with pred_df rows, or None
        to skip every '*_from_cuts' column.
    proximal_start, proximal_end : half-open column window summed as the proximal isoform.
    isoform_pseudo_count : pseudo-count added to the proximal count (and twice to the total).

    Returns
    -------
    (seq_df, variant_df) : new inner-joined frames with the derived columns. When cuts_pred is
    given, variant 'delta_logodds_pred' is replaced by the value derived from the predicted cuts.

    Side effects
    ------------
    As in the original implementation, a 'row_index_true' column is written into the caller's
    seq_df and variant_df, and a 'row_index_pred' column into the caller's pred_df.
    """
    #Join dataframe with prediction table and calculate true cut probabilities

    #Builtin int replaces the np.int alias, which was removed in NumPy 1.24
    seq_df['row_index_true'] = np.arange(len(seq_df), dtype=int)
    pred_df['row_index_pred'] = np.arange(len(pred_df), dtype=int)

    seq_df = seq_df.join(pred_df.set_index('master_seq'), on='master_seq', how='inner').copy().reset_index(drop=True)

    #Re-align the measured cuts with the rows that survived the inner join
    seq_cuts = seq_cuts[np.ravel(seq_df['row_index_true'].values), :]
    cut_true = _dense_cut_window(seq_cuts)

    proximal_count, total_count, iso_true, logodds_true = _isoform_stats_from_counts(cut_true, proximal_start, proximal_end, isoform_pseudo_count)
    seq_df['proximal_count'] = proximal_count
    seq_df['total_count'] = total_count
    seq_df['iso_true'] = iso_true
    seq_df['logodds_true'] = logodds_true

    if cuts_pred is not None :
        cut_pred = np.array(cuts_pred[np.ravel(seq_df['row_index_pred'].values), :].todense())

        iso_pred_from_cuts, logodds_pred_from_cuts = _isoform_stats_from_cut_probs(cut_pred, proximal_start, proximal_end)
        seq_df['iso_pred_from_cuts'] = iso_pred_from_cuts
        seq_df['logodds_pred_from_cuts'] = logodds_pred_from_cuts

        seq_df['mean_logodds_pred'] = (seq_df['logodds_pred'] + seq_df['logodds_pred_from_cuts']) / 2.0

    #Join variant dataframe with prediction table and calculate true cut probabilities

    variant_df['row_index_true'] = np.arange(len(variant_df), dtype=int)

    #The prediction table is joined twice: on the variant sequence ('_var') and on its wildtype ('_ref')
    for suffix, key_column in (('var', 'master_seq'), ('ref', 'wt_seq')) :
        renamed_pred_df = pred_df.rename(columns={'iso_pred' : 'iso_pred_' + suffix, 'logodds_pred' : 'logodds_pred_' + suffix, 'row_index_pred' : 'row_index_pred_' + suffix})
        variant_df = variant_df.join(renamed_pred_df.set_index('master_seq'), on=key_column, how='inner').copy().reset_index(drop=True)

    variant_rows = np.ravel(variant_df['row_index_true'].values)
    variant_cuts_var = variant_cuts_var[variant_rows, :]
    variant_cuts_ref = variant_cuts_ref[variant_rows, :]

    cut_true_var = _dense_cut_window(variant_cuts_var)
    cut_true_ref = _dense_cut_window(variant_cuts_ref)

    for suffix, cut_true_allele in (('var', cut_true_var), ('ref', cut_true_ref)) :
        proximal_count, total_count, iso_true, logodds_true = _isoform_stats_from_counts(cut_true_allele, proximal_start, proximal_end, isoform_pseudo_count)
        variant_df['proximal_count_' + suffix] = proximal_count
        variant_df['total_count_' + suffix] = total_count
        variant_df['iso_true_' + suffix] = iso_true
        variant_df['logodds_true_' + suffix] = logodds_true

    variant_df['delta_logodds_true'] = variant_df['logodds_true_var'] - variant_df['logodds_true_ref']

    variant_df['delta_logodds_pred'] = variant_df['logodds_pred_var'] - variant_df['logodds_pred_ref']

    if cuts_pred is not None :
        cut_pred_var = np.array(cuts_pred[np.ravel(variant_df['row_index_pred_var'].values), :].todense())
        cut_pred_ref = np.array(cuts_pred[np.ravel(variant_df['row_index_pred_ref'].values), :].todense())

        iso_var, logodds_var = _isoform_stats_from_cut_probs(cut_pred_var, proximal_start, proximal_end)
        iso_ref, logodds_ref = _isoform_stats_from_cut_probs(cut_pred_ref, proximal_start, proximal_end)

        variant_df['iso_pred_from_cuts_var'] = iso_var
        variant_df['iso_pred_from_cuts_ref'] = iso_ref

        variant_df['logodds_pred_from_cuts_var'] = logodds_var
        variant_df['logodds_pred_from_cuts_ref'] = logodds_ref

        variant_df['delta_logodds_pred_from_cuts'] = variant_df['logodds_pred_from_cuts_var'] - variant_df['logodds_pred_from_cuts_ref']

        variant_df['mean_delta_logodds_pred'] = (variant_df['delta_logodds_pred'] + variant_df['delta_logodds_pred_from_cuts']) / 2.0

        #The cut-derived delta replaces the isoform-derived one as the reported prediction
        variant_df['delta_logodds_pred'] = variant_df['delta_logodds_pred_from_cuts']

    return seq_df, variant_df

#Load variant dataframe
seq_dict = isoio.load('../../../../autoscrambler/analysis/apa/apa_array_data_master_seq')
variant_dict = isoio.load('../../../../autoscrambler/analysis/apa/apa_variant_data_master_seq')
print("n (variants) = " + str(len(variant_dict['variant_df'])))

#Load predictions (Resnet)
#Note that this rebinds model_name to the stem of the stored prediction table
model_name = 'aparent_all_libs_resnet_no_clinvar_wt_ep_5'
pred_dict = isoio.load('../../../../autoscrambler/analysis/apa/' + model_name + '_predictions_master_seq')

#pred_dict['cut_prob'] = pred_dict['cut_prob'][:, 20:]

#Join mpra dataframes with prediction table and calculate cut probabilities
#Only the joined variant frame is kept; the joined sequence frame is discarded
_, data_df = append_predictions(
    seq_dict['array_df'],
    seq_dict['pooled_cuts'],
    variant_dict['variant_df'],
    variant_dict['pooled_cuts_var'],
    variant_dict['pooled_cuts_ref'],
    pred_dict['array_df'],
    pred_dict['cut_prob'],
    proximal_start=proximal_start,
    proximal_end=proximal_end,
    isoform_pseudo_count=isoform_pseudo_count
)

#Pad both the wildtype and the variant sequence with the same fixed 5' and 3' flanks
for padded_column in ['wt_seq', 'master_seq'] :
    data_df[padded_column] = "ACACGACGCTCTTCCGATCT" + data_df[padded_column] + "GGAGCAGATACTGGCTTAACT"


n (variants) = 21734


In [4]:
#Filter variant data
#NOTE: this cell rebinds data_df several times, so re-running it alone filters and shuffles the already-processed frame again

#Keep variants whose 6-mer at positions 70-75 of the padded variant sequence is one of the listed hexamers
data_df = data_df.loc[data_df['master_seq'].str.slice(70, 76).isin([
    'AATAAA',
    'ATTAAA',
    'TATAAA',
    'CATAAA',
    'GATAAA',
    'AGTAAA',
    'ACTAAA',
    'AATACA',
])].copy().reset_index(drop=True)

#Keep variants that leave that hexamer identical to the wildtype's
data_df = data_df.loc[data_df['master_seq'].str.slice(70, 76) == data_df['wt_seq'].str.slice(70, 76)].copy().reset_index(drop=True)

#Keep variants with a predicted effect larger than 0.3 in absolute log odds
data_df = data_df.query("delta_logodds_pred < -0.3 or delta_logodds_pred > 0.3").copy().reset_index(drop=True)

#Shuffle data

#Create new shuffle index
#shuffle_index = np.arange(len(data_df))
#np.random.shuffle(shuffle_index)
#np.save('apa_variants_resnet_mpra_shuffle_index', shuffle_index)

#Load existing shuffle index
shuffle_index = np.load('apa_variants_resnet_mpra_shuffle_index.npy')

data_df = data_df.iloc[shuffle_index].copy().reset_index(drop=True)

#Every variant with a clinvar id starts out as a clinvar training variant
data_df['clinvar_variant_train'] = (data_df['clinvar_id'] != 'Missing').astype(np.int64)
data_df['clinvar_variant_test'] = 0

print("len(data_df) = " + str(len(data_df)) + " (loaded)")

#Set selected variants as test
n_clinvar_test = 100

#Row labels are only meaningful for the stored shuffle index loaded above
sel_index = [3052, 3859, 1596, 3927, 3612, 731, 2968, 3840, 1763, 231, 1997, 1253, 2754, 2129, 370, 3227, 820, 562, 1666, 2708, 3544]

data_df.loc[sel_index, 'clinvar_variant_train'] = 0
data_df.loc[sel_index, 'clinvar_variant_test'] = 1

#Clamp at zero: a negative remainder would otherwise slice from the wrong end of the list
n_remaining_clinvar_test = max(0, n_clinvar_test - int(np.sum(data_df['clinvar_variant_test'].values)))

clinvar_train_index = data_df.query("clinvar_variant_train == 1").index.tolist()

#Fill up the test set from the tail of the remaining clinvar variants.
#A bare [-n:] slice returns the whole list when n is 0, so that case is handled explicitly.
sel_index = clinvar_train_index[-n_remaining_clinvar_test:] if n_remaining_clinvar_test > 0 else []

if len(sel_index) > 0 :
    data_df.loc[sel_index, 'clinvar_variant_train'] = 0
    data_df.loc[sel_index, 'clinvar_variant_test'] = 1

#Mark corresponding wildtype genes in test set
test_wt_seqs = data_df.query("clinvar_variant_test == 1")['wt_seq'].unique().tolist()
train_wt_seqs = data_df.query("clinvar_variant_train == 1")['wt_seq'].unique().tolist()

data_df['clinvar_gene_test'] = data_df['wt_seq'].isin(test_wt_seqs).astype(np.int64)
data_df['clinvar_gene_train'] = data_df['wt_seq'].isin(train_wt_seqs).astype(np.int64)

#Any variant sharing a wildtype with a test variant is removed from the training flags
data_df.loc[data_df['clinvar_gene_test'] == 1, 'clinvar_gene_train'] = 0
data_df.loc[data_df['clinvar_gene_test'] == 1, 'clinvar_variant_train'] = 0

print("n clinvar train = " + str(int(np.sum(data_df['clinvar_variant_train']))))
print("n clinvar test = " + str(int(np.sum(data_df['clinvar_variant_test'].values))))

sel_index = []


len(data_df) = 4040 (loaded)
n clinvar train = 60
n clinvar test = 100


In [6]:
#Store variant data
#The file is written tab-separated (sep='\t') despite its .csv extension; readers must pass the same separator

data_df.to_csv("apa_variant_data.csv", sep='\t')


In [7]:
#Generate training and test set indexes

test_set_size = 0.1

#Builtin int replaces the np.int alias, which was removed in NumPy 1.24
data_index = np.arange(len(data_df), dtype=int)

#The trailing fraction of the (previously shuffled) rows is held out as the test split.
#The split point is computed explicitly because slicing with [:-0] would give an empty training set.
n_test = int(len(data_df) * test_set_size)

train_index = data_index[:len(data_df) - n_test]
test_index = data_index[train_index.shape[0]:]

train_df = data_df.iloc[train_index].copy().reset_index(drop=True)
test_df = data_df.iloc[test_index].copy().reset_index(drop=True)

#Load data matrices
encoder = OneHotEncoder(seq_length=205, channel_map={'A' : 0, 'C' : 1, 'G' : 2, 'T' : 3})

def _encode_variant_split(split_df) :
    """Build the predictor inputs and labels for one split of the variant dataframe.

    Returns (x_ref, x_var, b, l, y_true, clinvar_variant, clinvar_gene). The clinvar flags are
    read from the 'clinvar_variant_test' / 'clinvar_gene_test' columns for every split, i.e.
    they mark held-out clinvar variants / genes regardless of which split the row is in.
    """
    n_rows = len(split_df)

    x_ref_split = np.concatenate([encoder(seq)[None, None, ...] for seq in split_df['wt_seq']], axis=0)
    x_var_split = np.concatenate([encoder(seq)[None, None, ...] for seq in split_df['master_seq']], axis=0)

    #Per-position maximum of (ref - var) over the channel axis, scaled by 50
    #(assuming 0/1 one-hot encodings this is 50 where the two sequences differ and 0 elsewhere - TODO confirm)
    b_split = np.max(x_ref_split - x_var_split, axis=-1)[..., None] * 50.

    #Constant 13-dimensional auxiliary input with only index 11 switched on
    l_split = np.zeros((x_ref_split.shape[0], 13))
    l_split[:, 11] = 1.

    y_true_split = split_df['delta_logodds_true'].values.reshape((n_rows, 1))

    clinvar_variant_split = split_df['clinvar_variant_test'].values.reshape((n_rows, 1))
    clinvar_gene_split = split_df['clinvar_gene_test'].values.reshape((n_rows, 1))

    return x_ref_split, x_var_split, b_split, l_split, y_true_split, clinvar_variant_split, clinvar_gene_split

x_ref_train, x_var_train, b_train, l_train, y_true_train, clinvar_variant_train, clinvar_gene_train = _encode_variant_split(train_df)
x_ref_test, x_var_test, b_test, l_test, y_true_test, clinvar_variant_test, clinvar_gene_test = _encode_variant_split(test_df)

#Train rows followed by test rows, which matches the row order of data_df
x_ref = np.concatenate([x_ref_train, x_ref_test], axis=0)
x_var = np.concatenate([x_var_train, x_var_test], axis=0)
b = np.concatenate([b_train, b_test], axis=0)
l = np.concatenate([l_train, l_test], axis=0)
clinvar_variant = np.concatenate([clinvar_variant_train, clinvar_variant_test], axis=0)
clinvar_gene = np.concatenate([clinvar_gene_train, clinvar_gene_test], axis=0)

print("x_ref_train.shape = " + str(x_ref_train.shape))
print("x_ref_test.shape = " + str(x_ref_test.shape))

print("b_train.shape = " + str(b_train.shape))
print("b_test.shape = " + str(b_test.shape))

print("l_train.shape = " + str(l_train.shape))
print("l_test.shape = " + str(l_test.shape))

print("y_true_train.shape = " + str(y_true_train.shape))
print("y_true_test.shape = " + str(y_true_test.shape))

print("clinvar_variant_train.shape = " + str(clinvar_variant_train.shape))
print("clinvar_variant_test.shape = " + str(clinvar_variant_test.shape))

#Predictions come from the re-defined resnet lor predictor.
#The 'delta_logodds_pred' values of the dataframe were previously loaded here and then immediately overwritten, so they are no longer read.

y_pred_train = predictor.predict(x=[x_ref_train, x_var_train, l_train], batch_size=32, verbose=True)
y_pred_test = predictor.predict(x=[x_ref_test, x_var_test, l_test], batch_size=32, verbose=True)

y_true = np.concatenate([y_true_train, y_true_test], axis=0)
y_pred = np.concatenate([y_pred_train, y_pred_test], axis=0)


x_ref_train.shape = (3636, 1, 205, 4)
x_ref_test.shape = (404, 1, 205, 4)
b_train.shape = (3636, 1, 205, 1)
b_test.shape = (404, 1, 205, 1)
l_train.shape = (3636, 13)
l_test.shape = (404, 13)
y_true_train.shape = (3636, 1)
y_true_test.shape = (404, 1)
clinvar_variant_train.shape = (3636, 1)
clinvar_variant_test.shape = (404, 1)


In [8]:
#Store processed examples
#np.savez appends the .npz extension to the given name; each keyword becomes the key of one stored array
#y_pred_* hold the predictor outputs computed in the preceding cell, y_true_* the measured delta log odds

np.savez(
    "apa_variant_data",
    x_ref_train=x_ref_train,
    x_var_train=x_var_train,
    b_train=b_train,
    l_train=l_train,
    y_pred_train=y_pred_train,
    y_true_train=y_true_train,
    clinvar_variant_train=clinvar_variant_train,
    clinvar_gene_train=clinvar_gene_train,
    
    x_ref_test=x_ref_test,
    x_var_test=x_var_test,
    b_test=b_test,
    l_test=l_test,
    y_pred_test=y_pred_test,
    y_true_test=y_true_test,
    clinvar_variant_test=clinvar_variant_test,
    clinvar_gene_test=clinvar_gene_test,
)


In [9]:
#Create more examples by sat mut of wt seqs

nts = ['A', 'C', 'G', 'T']

#Mutated positions: 20..69 and 76..183; positions 70..75 (the hexamer window filtered on earlier) are skipped
mut_range = list(range(20, 70)) + list(range(76, 184))

def _single_nt_mutants(template_seq, positions, excluded_seqs) :
    """All single-nucleotide substitutions of template_seq at the given positions, in position / nucleotide order, skipping any result found in excluded_seqs."""
    mutants = []
    for pos in positions :
        for nt in nts :
            candidate = template_seq[:pos] + nt + template_seq[pos+1:]
            if candidate not in excluded_seqs :
                mutants.append(candidate)
    return mutants

wt_seqs = data_df['wt_seq'].unique().tolist()

wt_seq_sat_mut_ref = []
wt_seq_sat_mut_var = []
for wt_seq in wt_seqs :
    #Only the unchanged wildtype itself is excluded
    wt_mutants = _single_nt_mutants(wt_seq, mut_range, (wt_seq,))
    wt_seq_sat_mut_ref.extend([wt_seq] * len(wt_mutants))
    wt_seq_sat_mut_var.extend(wt_mutants)

print("Sequences created by wt mutagenesis = " + str(len(wt_seq_sat_mut_var)))

#Create additional examples by sat mut of clinvar var seqs

#One clinvar variant (the first encountered) per gene
clinvar_per_gene_df = data_df.query("clinvar_id != 'Missing'").drop_duplicates(subset=['gene'])

var_seqs = clinvar_per_gene_df['master_seq'].values.tolist()
ref_seqs = clinvar_per_gene_df['wt_seq'].values.tolist()

var_seq_sat_mut_ref = []
var_seq_sat_mut_var = []
for var_seq, ref_seq in zip(var_seqs, ref_seqs) :
    #The variant acts as the reference here; mutants equal to it or to its wildtype are excluded
    var_mutants = _single_nt_mutants(var_seq, mut_range, (var_seq, ref_seq))
    var_seq_sat_mut_ref.extend([var_seq] * len(var_mutants))
    var_seq_sat_mut_var.extend(var_mutants)

print("Sequences created by var mutagenesis = " + str(len(var_seq_sat_mut_var)))

extra_ref_seqs = wt_seq_sat_mut_ref + var_seq_sat_mut_ref
extra_var_seqs = wt_seq_sat_mut_var + var_seq_sat_mut_var

encoder = OneHotEncoder(seq_length=205, channel_map={'A' : 0, 'C' : 1, 'G' : 2, 'T' : 3})

x_ref_train_extra = np.concatenate([encoder(seq)[None, None, ...] for seq in extra_ref_seqs], axis=0)
x_var_train_extra = np.concatenate([encoder(seq)[None, None, ...] for seq in extra_var_seqs], axis=0)

b_train_extra = np.max(x_ref_train_extra - x_var_train_extra, axis=-1)[..., None] * 50.

l_train_extra = np.zeros((x_ref_train_extra.shape[0], 13))
l_train_extra[:, 11] = 1.

#Column 0 of the predictor output is kept, giving a 1-D array of predicted log odds ratios
y_pred_train_extra = predictor.predict(x=[x_ref_train_extra, x_var_train_extra, l_train_extra], batch_size=32, verbose=True)[:, 0]


Sequences created by wt mutagenesis = 195762
Sequences created by var mutagenesis = 70951


In [10]:
#Mark clinvar variants/genes

#Variant and wildtype sequences of the clinvar variants held out for testing
clinvar_var_seqs = set(data_df.query("clinvar_variant_test == 1")['master_seq'].unique().tolist())
clinvar_wt_seqs = set(data_df.query("clinvar_variant_test == 1")['wt_seq'].unique().tolist())

extra_clinvar_variant = []
extra_clinvar_gene = []

for extra_ref_seq, extra_var_seq in zip(extra_ref_seqs, extra_var_seqs) :
    #Either side of the generated pair equals a held-out clinvar variant sequence
    touches_test_variant = extra_ref_seq in clinvar_var_seqs or extra_var_seq in clinvar_var_seqs
    #Either side of the generated pair equals the wildtype of a held-out clinvar variant
    touches_test_wildtype = extra_ref_seq in clinvar_wt_seqs or extra_var_seq in clinvar_wt_seqs

    extra_clinvar_variant.append(1 if touches_test_variant else 0)
    extra_clinvar_gene.append(1 if (touches_test_variant or touches_test_wildtype) else 0)

#Builtin int replaces the np.int alias, which was removed in NumPy 1.24
clinvar_variant_train_extra = np.array(extra_clinvar_variant, dtype=int)[:, None]
clinvar_gene_train_extra = np.array(extra_clinvar_gene, dtype=int)[:, None]


In [11]:
#Filter on high-impact mutations only

min_abs_delta_logodds = 0.5

#Rows whose predicted |log odds ratio| reaches the threshold
high_impact_mask = np.abs(y_pred_train_extra) >= min_abs_delta_logodds
keep_index = np.where(high_impact_mask)[0]

x_ref_train_extra = x_ref_train_extra[keep_index, ...]
x_var_train_extra = x_var_train_extra[keep_index, ...]

b_train_extra = b_train_extra[keep_index, ...]
l_train_extra = l_train_extra[keep_index, ...]

clinvar_variant_train_extra = clinvar_variant_train_extra[keep_index, ...]
clinvar_gene_train_extra = clinvar_gene_train_extra[keep_index, ...]

y_pred_train_extra = y_pred_train_extra[keep_index, ...]

#The generated examples have no measurements: their "true" label is a copy of the prediction
y_true_train_extra = np.copy(y_pred_train_extra)

for _array_name, _array in [
    ("x_ref_train_extra", x_ref_train_extra),
    ("x_var_train_extra", x_var_train_extra),
    ("b_train_extra", b_train_extra),
    ("l_train_extra", l_train_extra),
    ("clinvar_variant_train_extra", clinvar_variant_train_extra),
    ("clinvar_gene_train_extra", clinvar_gene_train_extra),
    ("y_pred_train_extra", y_pred_train_extra),
    ("y_true_train_extra", y_true_train_extra),
] :
    print(_array_name + ".shape = " + str(_array.shape))


x_ref_train_extra.shape = (10200, 1, 205, 4)
x_var_train_extra.shape = (10200, 1, 205, 4)
b_train_extra.shape = (10200, 1, 205, 1)
l_train_extra.shape = (10200, 13)
clinvar_variant_train_extra.shape = (10200, 1)
clinvar_gene_train_extra.shape = (10200, 1)
y_pred_train_extra.shape = (10200,)
y_true_train_extra.shape = (10200,)


In [12]:
#Store extra generated examples
#np.savez appends the .npz extension to the given name
#y_true_train_extra is a copy of y_pred_train_extra (model predictions), not a measurement

np.savez(
    "apa_variant_data_extra",
    x_ref_train_extra=x_ref_train_extra,
    x_var_train_extra=x_var_train_extra,
    b_train_extra=b_train_extra,
    l_train_extra=l_train_extra,
    y_pred_train_extra=y_pred_train_extra,
    y_true_train_extra=y_true_train_extra,
    clinvar_variant_train_extra=clinvar_variant_train_extra,
    clinvar_gene_train_extra=clinvar_gene_train_extra,
)


In [13]:
#Make selection of variants to visualize

#Each group is ordered by decreasing measured effect
patho_clinvar_ids = ['NM_001453.2(FOXC1):c.*734A>T', 'NM_000518.4(HBB):c.*113A>G', 'NM_000506.4(F2):c.*97G>A']
sel_df_patho = data_df.loc[data_df['clinvar_id'].isin(patho_clinvar_ids)].sort_values(by='delta_logodds_true', ascending=False)
sel_index_patho = sel_df_patho.index.tolist()

confl_clinvar_ids = ['NM_000030.2(AGXT):c.*289A>C', 'NM_178452.5(DNAAF1):c.*18T>C']
sel_df_confl = data_df.loc[data_df['clinvar_id'].isin(confl_clinvar_ids)].sort_values(by='delta_logodds_true', ascending=False)
sel_index_confl = sel_df_confl.index.tolist()

#Hand-picked row labels; they are only valid for the stored shuffle order of data_df
sel_index_vus = [731, 2968, 3840, 1763, 231, 1997, 1253, 2754, 2129, 370, 3227, 820, 562, 1666, 2708, 3544]
sel_df_vus = data_df.loc[sel_index_vus]

sel_df = pd.concat([sel_df_patho, sel_df_confl, sel_df_vus])
sel_index = sel_index_patho + sel_index_confl + sel_index_vus

#The combined arrays are indexed with data_df row labels, relying on both sharing the same row order
x_ref_sel, x_var_sel = x_ref[sel_index, ...], x_var[sel_index, ...]

b_sel, l_sel = b[sel_index, ...], l[sel_index, ...]

y_true_sel, y_pred_sel = y_true[sel_index, ...], y_pred[sel_index, ...]


In [14]:

#Preview the annotation, effect-size and test-flag columns of the selected variants
sel_df[['clinvar_id', 'gene', 'sitetype', 'snv_pos', 'significance', 'delta_logodds_true', 'delta_logodds_pred', 'clinvar_variant_test', 'clinvar_gene_test']]


Unnamed: 0,clinvar_id,gene,sitetype,snv_pos,significance,delta_logodds_true,delta_logodds_pred,clinvar_variant_test,clinvar_gene_test
3052,NM_001453.2(FOXC1):c.*734A>T,FOXC1.2,Missing,91,Pathogenic,0.899665,0.766821,1,1
3859,NM_000518.4(HBB):c.*113A>G,HBB.2,UTR3,136,Pathogenic,0.821801,0.807131,1,1
1596,NM_000506.4(F2):c.*97G>A,F2.1,Missing,69,Pathogenic,0.313912,0.375867,1,1
3927,NM_000030.2(AGXT):c.*289A>C,AGXT.1,UTR3,67,Conflicting,-0.61128,-0.363797,1,1
3612,NM_178452.5(DNAAF1):c.*18T>C,DNAAF1.1,UTR3,10,Conflicting,-0.620877,-0.433554,1,1
731,NM_002203.3(ITGA2):c.*4028A>C,MOCS2.2,Extension,35,Undetermined,1.32338,0.882017,1,1
2968,NM_000382.2(ALDH3A2):c.*1788C>G,ALDH3A2.6,UTR3,23,Undetermined,0.889245,0.70066,1,1
3840,NM_005566.3(LDHA):c.*586A>G,LDHA.4,UTR3,33,Undetermined,0.841781,1.356996,1,1
1763,NM_001031726.3(C19orf12):c.*718G>T,C19orf12.8,UTR3,64,Undetermined,0.790768,0.804468,1,1
231,NM_003640.4(ELP1):c.*502C>T,IKBKAP.3,UTR3,95,Undetermined,0.773178,0.552309,1,1


In [15]:
#Store variant data
#Selected variants only; written tab-separated (sep='\t') despite the .csv extension

sel_df.to_csv("apa_variant_data_sel.csv", sep='\t')


In [16]:
#Store selected variant examples
#np.savez appends the .npz extension to the given name; the arrays are in the same row order as sel_df

np.savez(
    "apa_variant_data_sel",
    x_ref_sel=x_ref_sel,
    x_var_sel=x_var_sel,
    b_sel=b_sel,
    l_sel=l_sel,
    y_pred_sel=y_pred_sel,
    y_true_sel=y_true_sel,
)


In [17]:
#Create dataframe of all clinvar variants
#NOTE: this rebinds sel_df, sel_index and the *_sel arrays that an earlier cell used for the hand-picked selection

sel_df = data_df.query("clinvar_id != 'Missing'").sort_values(
    by='delta_logodds_true', ascending=False
)

#Builtin int replaces the np.int alias, which was removed in NumPy 1.24
sel_index = np.array(sel_df.index.values, dtype=int).tolist()

#The combined arrays are indexed with data_df row labels, relying on both sharing the same row order
x_ref_sel = x_ref[sel_index, ...]
x_var_sel = x_var[sel_index, ...]

b_sel = b[sel_index, ...]
l_sel = l[sel_index, ...]

y_true_sel = y_true[sel_index, ...]
y_pred_sel = y_pred[sel_index, ...]


In [18]:

#Preview the annotation, effect-size and test-flag columns of all clinvar variants (sorted by decreasing measured effect)
sel_df[['clinvar_id', 'gene', 'sitetype', 'snv_pos', 'significance', 'delta_logodds_true', 'delta_logodds_pred', 'clinvar_variant_test', 'clinvar_gene_test']]


Unnamed: 0,clinvar_id,gene,sitetype,snv_pos,significance,delta_logodds_true,delta_logodds_pred,clinvar_variant_test,clinvar_gene_test
888,NM_207122.1(EXT2):c.*1153A>T,EXT2.2,UTR3,88,Likely benign,1.607199,0.538346,0,0
731,NM_002203.3(ITGA2):c.*4028A>C,MOCS2.2,Extension,35,Undetermined,1.323380,0.882017,1,1
726,NM_002203.3(ITGA2):c.*3986A>C,MOCS2.2,Extension,77,Benign,1.286626,0.677374,0,1
3564,NM_000601.5(HGF):c.*467A>G,HGF.1,UTR3,84,Likely benign,0.981624,0.632942,1,1
3052,NM_001453.2(FOXC1):c.*734A>T,FOXC1.2,Missing,91,Pathogenic,0.899665,0.766821,1,1
...,...,...,...,...,...,...,...,...,...
1791,NM_000021.3(PSEN1):c.*4367T>C,PSEN1.1,UTR3,35,Undetermined,-1.245211,-0.474321,1,1
298,NM_006017.2(PROM1):c.*1160A>G,PROM1.1,UTR3,67,Undetermined,-1.292480,-0.676032,0,0
1756,NM_138361.5(LRSAM1):c.*597T>G,LRSAM1.1,UTR3,67,Undetermined,-1.450223,-0.573967,1,1
2316,NM_000302.3(PLOD1):c.*721T>G,PLOD1.2,UTR3,61,Undetermined,-1.479988,-0.904134,1,1


In [19]:
#Store variant data
#All clinvar variants; written tab-separated (sep='\t') despite the .csv extension

sel_df.to_csv("apa_variant_data_clinvar.csv", sep='\t')


In [20]:
#Store clinvar variant examples
#np.savez appends the .npz extension to the given name; the keys reuse the *_sel names of the hand-picked selection file

np.savez(
    "apa_variant_data_clinvar",
    x_ref_sel=x_ref_sel,
    x_var_sel=x_var_sel,
    b_sel=b_sel,
    l_sel=l_sel,
    y_pred_sel=y_pred_sel,
    y_true_sel=y_true_sel,
)
