In [1]:
from __future__ import print_function
import keras
from keras.models import Sequential, Model, load_model
from keras import backend as K

import tensorflow as tf

import os
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt
import matplotlib.cm as cm

#import aparent.visualization as vis

#from aparent_predictor import *


Using TensorFlow backend.


In [2]:
#Load Hexamer weights and make lookup dictionary / predict functions

weight_df = pd.read_csv('apa_regression_6mer_v_pasaligned_margin_doubledope_weights.csv', sep='\t')

use_dict = {row['nmer'] : row['use'] for _, row in weight_df.iterrows()}
cse_dict = {row['nmer'] : row['cse'] for _, row in weight_df.iterrows()}
dse_dict = {row['nmer'] : row['dse'] for _, row in weight_df.iterrows()}

def _predict(ref_seq, var_seq, hexamer_dict) :
    
    #Score reference sequence
    
    ref_score = 0.
    for j in range(len(ref_seq) - 6 + 1) :
        ref_score += hexamer_dict[ref_seq[j:j+6]]
    
    var_score = 0.
    for j in range(len(var_seq) - 6 + 1) :
        var_score += hexamer_dict[var_seq[j:j+6]]

    return var_score - ref_score
    

In [3]:
#Load APADB Data and filter on targeted genes

#genes = ['RUNX1', 'CEBPA', 'GATA2', 'ANKRD26', 'DDX41', 'ETV6', 'PTEN', 'BRCA1', 'BRCA2', 'TP53', 'APC', 'ATM', 'PALB2', 'MSH2', 'MLH1', 'MSH6', 'PMS2', 'MUTYH']

polyadb_df = pd.read_csv('polyadb_processed.csv', sep=',')

#polyadb_df = polyadb_df.loc[polyadb_df['gene'].isin(genes)].reset_index(drop=True).copy()
polyadb_df = polyadb_df.loc[((~polyadb_df['gene'].isnull()) & (polyadb_df['gene'] != 'na')) & (polyadb_df['pas'] != -1)].reset_index(drop=True).copy()

print('len(polyadb_df) = ' + str(len(polyadb_df)))


len(polyadb_df) = 175451


In [4]:
polyadb_df_minus = polyadb_df.query("strand == '-'").copy().reset_index(drop=True)
polyadb_df_plus = polyadb_df.query("strand == '+'").copy().reset_index(drop=True)

polyadb_df_minus = polyadb_df_minus.sort_values(by='pas_pos', ascending=False).copy().reset_index(drop=True)
polyadb_df_plus = polyadb_df_plus.sort_values(by='pas_pos', ascending=True).copy().reset_index(drop=True)

new_gene_id_list_plus = []
sitenum_list_plus = []
gene_id_dict = {}
for _, row in polyadb_df_plus.iterrows() :

    gene = row['gene']

    if gene not in gene_id_dict :
        gene_id_dict[gene] = 0

    gene_id_dict[gene] += 1

    new_gene_id_list_plus.append(gene + "." + str(gene_id_dict[gene]))
    sitenum_list_plus.append(gene_id_dict[gene])

polyadb_df_plus['gene_id'] = new_gene_id_list_plus
polyadb_df_plus['sitenum'] = sitenum_list_plus

new_gene_id_list_minus = []
sitenum_list_minus = []
gene_id_dict = {}
for _, row in polyadb_df_minus.iterrows() :

    gene = row['gene']

    if gene not in gene_id_dict :
        gene_id_dict[gene] = 0

    gene_id_dict[gene] += 1

    new_gene_id_list_minus.append(gene + "." + str(gene_id_dict[gene]))
    sitenum_list_minus.append(gene_id_dict[gene])

polyadb_df_minus['gene_id'] = new_gene_id_list_minus
polyadb_df_minus['sitenum'] = sitenum_list_minus

polyadb_df = pd.concat([polyadb_df_plus, polyadb_df_minus])

polyadb_df = polyadb_df.sort_values(by=['gene', 'sitenum'], ascending=True).reset_index(drop=True).copy()

In [5]:

print('len(polyadb_df) = ' + str(len(polyadb_df)))


len(polyadb_df) = 175451


In [6]:
#Perform in-silico saturation mutagenesis

mutagenesis_end = 146

variant_dict = {
    'gene' : [],
    'gene_id' : [],
    #'ref_seq' : [],
    'chrom' : [],
    'strand' : [],
    'site_type' : [],
    'native_usage' : [],
    #'var_seq' : [],
    'var_position' : [],
    'ref_nucleotide' : [],
    'var_nucleotide' : [],
    'delta_logodds_lr' : []
}

for pd_index, row in polyadb_df.iterrows() :
    gene = row['gene']
    gene_id = row['gene_id']
    
    if pd_index % 100 == 0 :
        print("Predicting variants for PAS = " + str(gene_id) + " (" + str(pd_index) + ")")
        print(" - n variants recorded = " + str(len(variant_dict['gene'])))
    
    ref_seq = row['wide_seq_ext'][175-70:175-70+205]
    
    chrom = row['chrom']
    strand = row['strand']
    site_type = row['site_type']
    ref_usage = row['ratio']
    
    pas_pos = row['pas_pos']
    
    seq_start = 0
    if strand == '+' :
        seq_start = pas_pos - 70 + 1
    else :
        seq_start = pas_pos - (205 - 70)
    
    seq_end = seq_start + 205
    
    #Predict all variants
    ii = 0
    for pos in range(mutagenesis_end) :
        for base in ['A', 'C', 'G', 'T'] :
            var_seq = ref_seq[:pos] + base + ref_seq[pos+1:]
            if var_seq == ref_seq :
                continue
            
            ref_window = ref_seq[pos - 6 + 1:pos + 6]
            var_window = var_seq[pos - 6 + 1:pos + 6]
            
            hexamer_dict = None
            if pos < 70 :
                hexamer_dict = use_dict
            elif pos >= 70 and pos < 76 :
                hexamer_dict = cse_dict
            else :
                hexamer_dict = dse_dict
            
            delta_logodds_lr = _predict(ref_window, var_window, hexamer_dict)
            
            var_position = 0
            var_nucleotide = 'A'
            if strand == '+' :
                var_position = seq_start + pos
                var_nucleotide = base
            else :
                var_position = seq_end - pos
                if base == 'A' :
                    var_nucleotide = 'T'
                elif base == 'C' :
                    var_nucleotide = 'G'
                elif base == 'G' :
                    var_nucleotide = 'C'
                elif base == 'T' :
                    var_nucleotide = 'A'
            
            ref_nucleotide = 'A'
            if strand == '+' :
                ref_nucleotide = ref_seq[pos]
            else :
                if ref_seq[pos] == 'A' :
                    ref_nucleotide = 'T'
                elif ref_seq[pos] == 'C' :
                    ref_nucleotide = 'G'
                elif ref_seq[pos] == 'G' :
                    ref_nucleotide = 'C'
                elif ref_seq[pos] == 'T' :
                    ref_nucleotide = 'A'
            
            hash_key = chrom + "_" + gene_id + "_" + str(var_position) + "_" + ref_nucleotide + "_" + var_nucleotide
            
            if True :
                variant_dict['gene'].append(gene)
                variant_dict['gene_id'].append(gene_id)
                #variant_dict['ref_seq'].append(ref_seq)
                variant_dict['chrom'].append(chrom)
                variant_dict['strand'].append(strand)
                variant_dict['site_type'].append(site_type)
                variant_dict['native_usage'].append(np.round(ref_usage, 5))
                #variant_dict['var_seq'].append(var_seq)
                variant_dict['var_position'].append(var_position)
                variant_dict['ref_nucleotide'].append(ref_nucleotide)
                variant_dict['var_nucleotide'].append(var_nucleotide)
                variant_dict['delta_logodds_lr'].append(np.round(delta_logodds_lr, 5))
            
            ii += 1

variant_df = pd.DataFrame(variant_dict)
variant_df = variant_df[['gene','gene_id','chrom','strand','site_type','native_usage','var_position','ref_nucleotide','var_nucleotide','delta_logodds_lr']]


Predicting variants for PAS = A1BG.1 (0)
 - n variants recorded = 0
Predicting variants for PAS = AAK1.10 (100)
 - n variants recorded = 43800
Predicting variants for PAS = AASDH.14 (200)
 - n variants recorded = 87600
Predicting variants for PAS = ABCA13.8 (300)
 - n variants recorded = 131400
Predicting variants for PAS = ABCB7.2 (400)
 - n variants recorded = 175200
Predicting variants for PAS = ABCC4.4 (500)
 - n variants recorded = 219000
Predicting variants for PAS = ABCE1.10 (600)
 - n variants recorded = 262800
Predicting variants for PAS = ABHD12B.3 (700)
 - n variants recorded = 306600
Predicting variants for PAS = ABHD3.6 (800)
 - n variants recorded = 350400
Predicting variants for PAS = ABL1.8 (900)
 - n variants recorded = 394200
Predicting variants for PAS = ABR.22 (1000)
 - n variants recorded = 438000
Predicting variants for PAS = AC004453.1 (1100)
 - n variants recorded = 481800
Predicting variants for PAS = AC007663.1 (1200)
 - n variants recorded = 525600
Predicting

Predicting variants for PAS = ARHGEF40.7 (10500)
 - n variants recorded = 4599000
Predicting variants for PAS = ARID2.10 (10600)
 - n variants recorded = 4642800
Predicting variants for PAS = ARIH2.1 (10700)
 - n variants recorded = 4686600
Predicting variants for PAS = ARL17B.2 (10800)
 - n variants recorded = 4730400
Predicting variants for PAS = ARL5A.34 (10900)
 - n variants recorded = 4774200
Predicting variants for PAS = ARL8B.5 (11000)
 - n variants recorded = 4818000
Predicting variants for PAS = ARMC3.1 (11100)
 - n variants recorded = 4861800
Predicting variants for PAS = ARMCX2.5 (11200)
 - n variants recorded = 4905600
Predicting variants for PAS = ARNT.5 (11300)
 - n variants recorded = 4949400
Predicting variants for PAS = ARPC3.8 (11400)
 - n variants recorded = 4993200
Predicting variants for PAS = ARRDC1.6 (11500)
 - n variants recorded = 5037000
Predicting variants for PAS = ARSG.25 (11600)
 - n variants recorded = 5080800
Predicting variants for PAS = ASAH1.10 (11700

Predicting variants for PAS = CACNA1A.4 (20800)
 - n variants recorded = 9110400
Predicting variants for PAS = CACNA2D1.29 (20900)
 - n variants recorded = 9154200
Predicting variants for PAS = CACNG7.4 (21000)
 - n variants recorded = 9198000
Predicting variants for PAS = CADM2.14 (21100)
 - n variants recorded = 9241800
Predicting variants for PAS = CALB1.5 (21200)
 - n variants recorded = 9285600
Predicting variants for PAS = CALM1.2 (21300)
 - n variants recorded = 9329400
Predicting variants for PAS = CALR.12 (21400)
 - n variants recorded = 9373200
Predicting variants for PAS = CAMK2D.14 (21500)
 - n variants recorded = 9417000
Predicting variants for PAS = CAMKMT.18 (21600)
 - n variants recorded = 9460800
Predicting variants for PAS = CAMTA2.7 (21700)
 - n variants recorded = 9504600
Predicting variants for PAS = CAP2.3 (21800)
 - n variants recorded = 9548400
Predicting variants for PAS = CAPN6.1 (21900)
 - n variants recorded = 9592200
Predicting variants for PAS = CAPS2.18 (

Predicting variants for PAS = CNOT1.15 (31000)
 - n variants recorded = 13578000
Predicting variants for PAS = CNOT4.24 (31100)
 - n variants recorded = 13621800
Predicting variants for PAS = CNPPD1.4 (31200)
 - n variants recorded = 13665600
Predicting variants for PAS = CNTFR.5 (31300)
 - n variants recorded = 13709400
Predicting variants for PAS = CNTNAP2.11 (31400)
 - n variants recorded = 13753200
Predicting variants for PAS = COA1.50 (31500)
 - n variants recorded = 13797000
Predicting variants for PAS = COBLL1.9 (31600)
 - n variants recorded = 13840800
Predicting variants for PAS = COG6.4 (31700)
 - n variants recorded = 13884600
Predicting variants for PAS = COL15A1.9 (31800)
 - n variants recorded = 13928400
Predicting variants for PAS = COL25A1.11 (31900)
 - n variants recorded = 13972200
Predicting variants for PAS = COL4A2.30 (32000)
 - n variants recorded = 14016000
Predicting variants for PAS = COL5A1.15 (32100)
 - n variants recorded = 14059800
Predicting variants for P

Predicting variants for PAS = DNM1L.22 (41200)
 - n variants recorded = 18045600
Predicting variants for PAS = DNPH1.4 (41300)
 - n variants recorded = 18089400
Predicting variants for PAS = DOCK3.9 (41400)
 - n variants recorded = 18133200
Predicting variants for PAS = DOHH.2 (41500)
 - n variants recorded = 18177000
Predicting variants for PAS = DPAGT1.1 (41600)
 - n variants recorded = 18220800
Predicting variants for PAS = DPM1.2 (41700)
 - n variants recorded = 18264600
Predicting variants for PAS = DPPA4.3 (41800)
 - n variants recorded = 18308400
Predicting variants for PAS = DPY19L4.8 (41900)
 - n variants recorded = 18352200
Predicting variants for PAS = DR1.6 (42000)
 - n variants recorded = 18396000
Predicting variants for PAS = DRG1.3 (42100)
 - n variants recorded = 18439800
Predicting variants for PAS = DSCAM.8 (42200)
 - n variants recorded = 18483600
Predicting variants for PAS = DSEL.20 (42300)
 - n variants recorded = 18527400
Predicting variants for PAS = DSTN.18 (42

Predicting variants for PAS = FAM78B.1 (51400)
 - n variants recorded = 22513200
Predicting variants for PAS = FAM84B.2 (51500)
 - n variants recorded = 22557000
Predicting variants for PAS = FAM8A1.2 (51600)
 - n variants recorded = 22600800
Predicting variants for PAS = FAM98B.25 (51700)
 - n variants recorded = 22644600
Predicting variants for PAS = FANCF.2 (51800)
 - n variants recorded = 22688400
Predicting variants for PAS = FAR1.9 (51900)
 - n variants recorded = 22732200
Predicting variants for PAS = FARS2.9 (52000)
 - n variants recorded = 22776000
Predicting variants for PAS = FAT1.5 (52100)
 - n variants recorded = 22819800
Predicting variants for PAS = FBLN1.1 (52200)
 - n variants recorded = 22863600
Predicting variants for PAS = FBRSL1.9 (52300)
 - n variants recorded = 22907400
Predicting variants for PAS = FBXL17.45 (52400)
 - n variants recorded = 22951200
Predicting variants for PAS = FBXL3.3 (52500)
 - n variants recorded = 22995000
Predicting variants for PAS = FBXO

Predicting variants for PAS = GRIK3.3 (61600)
 - n variants recorded = 26980800
Predicting variants for PAS = GRK3.8 (61700)
 - n variants recorded = 27024600
Predicting variants for PAS = GRM7.5 (61800)
 - n variants recorded = 27068400
Predicting variants for PAS = GS1-124K5.6 (61900)
 - n variants recorded = 27112200
Predicting variants for PAS = GSK3B.21 (62000)
 - n variants recorded = 27156000
Predicting variants for PAS = GSTCD.8 (62100)
 - n variants recorded = 27199800
Predicting variants for PAS = GTDC1.8 (62200)
 - n variants recorded = 27243600
Predicting variants for PAS = GTF2F1.16 (62300)
 - n variants recorded = 27287400
Predicting variants for PAS = GTF2IRD1P1.2 (62400)
 - n variants recorded = 27331200
Predicting variants for PAS = GTPBP1.5 (62500)
 - n variants recorded = 27375000
Predicting variants for PAS = GTSE1.8 (62600)
 - n variants recorded = 27418800
Predicting variants for PAS = GUF1.13 (62700)
 - n variants recorded = 27462600
Predicting variants for PAS =

Predicting variants for PAS = JAM2.10 (71800)
 - n variants recorded = 31448400
Predicting variants for PAS = JMJD1C.4 (71900)
 - n variants recorded = 31492200
Predicting variants for PAS = JPH4.1 (72000)
 - n variants recorded = 31536000
Predicting variants for PAS = KAAG1.1 (72100)
 - n variants recorded = 31579800
Predicting variants for PAS = KANSL1L.21 (72200)
 - n variants recorded = 31623600
Predicting variants for PAS = KAT7.9 (72300)
 - n variants recorded = 31667400
Predicting variants for PAS = KAZN.5 (72400)
 - n variants recorded = 31711200
Predicting variants for PAS = KCMF1.19 (72500)
 - n variants recorded = 31755000
Predicting variants for PAS = KCND1.1 (72600)
 - n variants recorded = 31798800
Predicting variants for PAS = KCNIP3.4 (72700)
 - n variants recorded = 31842600
Predicting variants for PAS = KCNJ9.3 (72800)
 - n variants recorded = 31886400
Predicting variants for PAS = KCNMA1.9 (72900)
 - n variants recorded = 31930200
Predicting variants for PAS = KCNQ3.

Predicting variants for PAS = MAGI3.3 (82000)
 - n variants recorded = 35916000
Predicting variants for PAS = MAK16.2 (82100)
 - n variants recorded = 35959800
Predicting variants for PAS = MAMDC2.18 (82200)
 - n variants recorded = 36003600
Predicting variants for PAS = MAN1A2.32 (82300)
 - n variants recorded = 36047400
Predicting variants for PAS = MANBAL.4 (82400)
 - n variants recorded = 36091200
Predicting variants for PAS = MAP1LC3B.8 (82500)
 - n variants recorded = 36135000
Predicting variants for PAS = MAP2K6.12 (82600)
 - n variants recorded = 36178800
Predicting variants for PAS = MAP3K20.7 (82700)
 - n variants recorded = 36222600
Predicting variants for PAS = MAP3K8.6 (82800)
 - n variants recorded = 36266400
Predicting variants for PAS = MAP6.4 (82900)
 - n variants recorded = 36310200
Predicting variants for PAS = MAPK10.2 (83000)
 - n variants recorded = 36354000
Predicting variants for PAS = MAPK1IP1L.21 (83100)
 - n variants recorded = 36397800
Predicting variants fo

Predicting variants for PAS = MZF1.1 (92200)
 - n variants recorded = 40383600
Predicting variants for PAS = N4BP2L2.26 (92300)
 - n variants recorded = 40427400
Predicting variants for PAS = NAA25.11 (92400)
 - n variants recorded = 40471200
Predicting variants for PAS = NAA50.3 (92500)
 - n variants recorded = 40515000
Predicting variants for PAS = NAB1.2 (92600)
 - n variants recorded = 40558800
Predicting variants for PAS = NADK2.19 (92700)
 - n variants recorded = 40602600
Predicting variants for PAS = NALCN.23 (92800)
 - n variants recorded = 40646400
Predicting variants for PAS = NAP1L1.41 (92900)
 - n variants recorded = 40690200
Predicting variants for PAS = NARS.6 (93000)
 - n variants recorded = 40734000
Predicting variants for PAS = NAV1.12 (93100)
 - n variants recorded = 40777800
Predicting variants for PAS = NBDY.18 (93200)
 - n variants recorded = 40821600
Predicting variants for PAS = NBPF1.16 (93300)
 - n variants recorded = 40865400
Predicting variants for PAS = NBPF

Predicting variants for PAS = PAM.20 (102400)
 - n variants recorded = 44851200
Predicting variants for PAS = PANK3.27 (102500)
 - n variants recorded = 44895000
Predicting variants for PAS = PAPOLA.6 (102600)
 - n variants recorded = 44938800
Predicting variants for PAS = PAQR4.3 (102700)
 - n variants recorded = 44982600
Predicting variants for PAS = PARD6G.10 (102800)
 - n variants recorded = 45026400
Predicting variants for PAS = PARP15.2 (102900)
 - n variants recorded = 45070200
Predicting variants for PAS = PARVA.23 (103000)
 - n variants recorded = 45114000
Predicting variants for PAS = PATJ.47 (103100)
 - n variants recorded = 45157800
Predicting variants for PAS = PAX6.12 (103200)
 - n variants recorded = 45201600
Predicting variants for PAS = PBRM1.1 (103300)
 - n variants recorded = 45245400
Predicting variants for PAS = PC.5 (103400)
 - n variants recorded = 45289200
Predicting variants for PAS = PCBP2.27 (103500)
 - n variants recorded = 45333000
Predicting variants for P

Predicting variants for PAS = PPP2R5A.6 (112500)
 - n variants recorded = 49275000
Predicting variants for PAS = PPP3CB.8 (112600)
 - n variants recorded = 49318800
Predicting variants for PAS = PPP4R3B.10 (112700)
 - n variants recorded = 49362600
Predicting variants for PAS = PPP6R2.13 (112800)
 - n variants recorded = 49406400
Predicting variants for PAS = PQLC3.4 (112900)
 - n variants recorded = 49450200
Predicting variants for PAS = PRDM11.15 (113000)
 - n variants recorded = 49494000
Predicting variants for PAS = PRDX2.5 (113100)
 - n variants recorded = 49537800
Predicting variants for PAS = PREP.6 (113200)
 - n variants recorded = 49581600
Predicting variants for PAS = PRICKLE1.6 (113300)
 - n variants recorded = 49625400
Predicting variants for PAS = PRKAA2.14 (113400)
 - n variants recorded = 49669200
Predicting variants for PAS = PRKAR1A.9 (113500)
 - n variants recorded = 49713000
Predicting variants for PAS = PRKCA.20 (113600)
 - n variants recorded = 49756800
Predicting 

Predicting variants for PAS = RIC8B.3 (122600)
 - n variants recorded = 53698800
Predicting variants for PAS = RIMBP2.3 (122700)
 - n variants recorded = 53742600
Predicting variants for PAS = RIN2.2 (122800)
 - n variants recorded = 53786400
Predicting variants for PAS = RIPK1.15 (122900)
 - n variants recorded = 53830200
Predicting variants for PAS = RLIM.8 (123000)
 - n variants recorded = 53874000
Predicting variants for PAS = RMND1.8 (123100)
 - n variants recorded = 53917800
Predicting variants for PAS = RNASEK.3 (123200)
 - n variants recorded = 53961600
Predicting variants for PAS = RNF11.15 (123300)
 - n variants recorded = 54005400
Predicting variants for PAS = RNF135.7 (123400)
 - n variants recorded = 54049200
Predicting variants for PAS = RNF144B.4 (123500)
 - n variants recorded = 54093000
Predicting variants for PAS = RNF167.5 (123600)
 - n variants recorded = 54136800
Predicting variants for PAS = RNF19A.6 (123700)
 - n variants recorded = 54180600
Predicting variants f

Predicting variants for PAS = SIN3A.14 (132700)
 - n variants recorded = 58122600
Predicting variants for PAS = SIRPAP1.2 (132800)
 - n variants recorded = 58166400
Predicting variants for PAS = SIX4.9 (132900)
 - n variants recorded = 58210200
Predicting variants for PAS = SKIL.1 (133000)
 - n variants recorded = 58254000
Predicting variants for PAS = SLAIN2.12 (133100)
 - n variants recorded = 58297800
Predicting variants for PAS = SLC12A2.8 (133200)
 - n variants recorded = 58341600
Predicting variants for PAS = SLC15A2.4 (133300)
 - n variants recorded = 58385400
Predicting variants for PAS = SLC16A4.6 (133400)
 - n variants recorded = 58429200
Predicting variants for PAS = SLC19A1.9 (133500)
 - n variants recorded = 58473000
Predicting variants for PAS = SLC20A1.4 (133600)
 - n variants recorded = 58516800
Predicting variants for PAS = SLC22A3.2 (133700)
 - n variants recorded = 58560600
Predicting variants for PAS = SLC24A5.2 (133800)
 - n variants recorded = 58604400
Predicting 

Predicting variants for PAS = STAU2.36 (142700)
 - n variants recorded = 62502600
Predicting variants for PAS = STEAP3.3 (142800)
 - n variants recorded = 62546400
Predicting variants for PAS = STK17A.15 (142900)
 - n variants recorded = 62590200
Predicting variants for PAS = STK32A.2 (143000)
 - n variants recorded = 62634000
Predicting variants for PAS = STK39.18 (143100)
 - n variants recorded = 62677800
Predicting variants for PAS = STMP1.7 (143200)
 - n variants recorded = 62721600
Predicting variants for PAS = STON2.11 (143300)
 - n variants recorded = 62765400
Predicting variants for PAS = STRIP1.1 (143400)
 - n variants recorded = 62809200
Predicting variants for PAS = STT3A.9 (143500)
 - n variants recorded = 62853000
Predicting variants for PAS = STX17.18 (143600)
 - n variants recorded = 62896800
Predicting variants for PAS = STX6.28 (143700)
 - n variants recorded = 62940600
Predicting variants for PAS = STXBP3.12 (143800)
 - n variants recorded = 62984400
Predicting varian

Predicting variants for PAS = TMEM86A.1 (152700)
 - n variants recorded = 66882600
Predicting variants for PAS = TMEM94.4 (152800)
 - n variants recorded = 66926400
Predicting variants for PAS = TMOD2.3 (152900)
 - n variants recorded = 66970200
Predicting variants for PAS = TMPRSS15.5 (153000)
 - n variants recorded = 67014000
Predicting variants for PAS = TMTC2.10 (153100)
 - n variants recorded = 67057800
Predicting variants for PAS = TMX3.11 (153200)
 - n variants recorded = 67101600
Predicting variants for PAS = TNFAIP8L2-SCNM1.2 (153300)
 - n variants recorded = 67145400
Predicting variants for PAS = TNFRSF1A.6 (153400)
 - n variants recorded = 67189200
Predicting variants for PAS = TNIK.15 (153500)
 - n variants recorded = 67233000
Predicting variants for PAS = TNPO1.4 (153600)
 - n variants recorded = 67276800
Predicting variants for PAS = TNS1.9 (153700)
 - n variants recorded = 67320600
Predicting variants for PAS = TOLLIP.15 (153800)
 - n variants recorded = 67364400
Predict

Predicting variants for PAS = VIM.16 (162800)
 - n variants recorded = 71306400
Predicting variants for PAS = VLDLR.4 (162900)
 - n variants recorded = 71350200
Predicting variants for PAS = VOPP1.12 (163000)
 - n variants recorded = 71394000
Predicting variants for PAS = VPS13C.15 (163100)
 - n variants recorded = 71437800
Predicting variants for PAS = VPS33A.1 (163200)
 - n variants recorded = 71481600
Predicting variants for PAS = VPS37C.1 (163300)
 - n variants recorded = 71525400
Predicting variants for PAS = VPS4B.9 (163400)
 - n variants recorded = 71569200
Predicting variants for PAS = VPS54.15 (163500)
 - n variants recorded = 71613000
Predicting variants for PAS = VSIG10.19 (163600)
 - n variants recorded = 71656800
Predicting variants for PAS = VTCN1.9 (163700)
 - n variants recorded = 71700600
Predicting variants for PAS = VWA8.17 (163800)
 - n variants recorded = 71744400
Predicting variants for PAS = WASF1.1 (163900)
 - n variants recorded = 71788200
Predicting variants f

Predicting variants for PAS = ZNF608.4 (172800)
 - n variants recorded = 75686400
Predicting variants for PAS = ZNF620.6 (172900)
 - n variants recorded = 75730200
Predicting variants for PAS = ZNF638.23 (173000)
 - n variants recorded = 75774000
Predicting variants for PAS = ZNF653.2 (173100)
 - n variants recorded = 75817800
Predicting variants for PAS = ZNF664-RFLNA.25 (173200)
 - n variants recorded = 75861600
Predicting variants for PAS = ZNF678.18 (173300)
 - n variants recorded = 75905400
Predicting variants for PAS = ZNF696.5 (173400)
 - n variants recorded = 75949200
Predicting variants for PAS = ZNF704.17 (173500)
 - n variants recorded = 75993000
Predicting variants for PAS = ZNF713.1 (173600)
 - n variants recorded = 76036800
Predicting variants for PAS = ZNF720.4 (173700)
 - n variants recorded = 76080600
Predicting variants for PAS = ZNF732.4 (173800)
 - n variants recorded = 76124400
Predicting variants for PAS = ZNF747.3 (173900)
 - n variants recorded = 76168200
Predic

In [7]:
#Store variant prediction dataframe

variant_df.to_csv('hexamer_regression_variant_predictions_polyadb_no_sequences_no_cutoff.csv', sep='\t')


In [8]:

variant_df


Unnamed: 0,gene,gene_id,chrom,strand,site_type,native_usage,var_position,ref_nucleotide,var_nucleotide,delta_logodds_lr
0,A1BG,A1BG.1,chr19,-,Intron,0.06413,58859922,T,G,0.00000
1,A1BG,A1BG.1,chr19,-,Intron,0.06413,58859922,T,C,0.00000
2,A1BG,A1BG.1,chr19,-,Intron,0.06413,58859922,T,A,0.00000
3,A1BG,A1BG.1,chr19,-,Intron,0.06413,58859921,A,T,0.00000
4,A1BG,A1BG.1,chr19,-,Intron,0.06413,58859921,A,G,0.00000
...,...,...,...,...,...,...,...,...,...,...
76847533,ZZZ3,ZZZ3.26,chr1,-,3_most_exon,0.00257,78022097,G,C,-0.18728
76847534,ZZZ3,ZZZ3.26,chr1,-,3_most_exon,0.00257,78022097,G,A,-0.06474
76847535,ZZZ3,ZZZ3.26,chr1,-,3_most_exon,0.00257,78022096,A,T,-0.79502
76847536,ZZZ3,ZZZ3.26,chr1,-,3_most_exon,0.00257,78022096,A,G,-0.43024
