In [1]:
import pandas as pd
import xml.etree.ElementTree as et

import numpy as np
import scipy.sparse as sp
import scipy.io as spio

def reverse_complement(seq) :
    """Return the reverse complement of an upper-case DNA sequence.

    Only the characters 'A', 'C', 'G' and 'T' are translated. Any other
    character (lower-case bases, 'N', gap symbols, ...) is silently left
    out of the result, so the output can be shorter than the input.
    """
    pairing = {'A' : 'T', 'C' : 'G', 'G' : 'C', 'T' : 'A'}

    # Walk the input from its last position to its first and emit the
    # complement of every recognised base.
    return ''.join(
        pairing[seq[j]]
        for j in range(len(seq) - 1, -1, -1)
        if seq[j] in pairing
    )

In [2]:
# Input files, expected in the working directory (the original relative
# locations are kept in the trailing comments).
apadb_bed    = 'hg19.apadb_v2_final.bed'#'../../data/genome_hg19/features/hg19.apadb_v2_final.bed'

hg19_fai     = 'hg19.fa.fai'#'../../data/genome_hg19/hg19.fa.fai'
hg19_fa      = 'hg19.fa'#'../../data/genome_hg19/hg19.fa'

# The file has no header row; column 7 is skipped by `usecols`.
df_columns = ['chr', 'cut_start', 'cut_end', 'gene', 'reads', 'strand', 'feature', 'mode', 'miRNA']
df = pd.read_csv(apadb_bed, sep='\t', header=None, names=df_columns, usecols=[0,1,2,3,4,5,6,8,9])

# Keep only sites without a miRNA annotation. Depending on the pandas version,
# the literal text "None" is either kept as a string or parsed as a missing
# value by read_csv, so both representations are accepted here.
no_mirna = df['miRNA'].isna() | (df['miRNA'] == 'None')
df = df.loc[no_mirna]

# Keep only sites whose cleavage region spans at most 90 bp. The copy makes the
# filtered frame independent, so columns can be added to it later without
# chained-assignment warnings.
df = df.loc[df['cut_end'] - df['cut_start'] <= 90].copy()


In [3]:
# Preview the first rows of the filtered APADB site table.
print(df.head())

     chr  cut_start  cut_end       gene  reads strand feature    mode miRNA
0  chr17      62260    62312   RPH3AL.1    122      -    UTR3   62297  None
2  chr17     289758   289785  FAM101B.1     24      -    UTR3  289766  None
3  chr17     289935   289947  FAM101B.2    170      -    UTR3  289946  None
4  chr17     290016   290021  FAM101B.3     12      -    UTR3  290017  None
5  chr17     290068   290076  FAM101B.4      6      -    UTR3  290071  None


In [4]:



# Variant search window around each site's `mode` position: 75 bp before and
# 50 bp after it in genomic coordinates.
# NOTE(review): both strands get the same genomic offsets, so for '-' strand
# sites the 75 bp side lies downstream in transcript orientation -- confirm
# that this is intended.
for strand in ('+', '-') :
    on_strand = df['strand'] == strand
    df.loc[on_strand, 'start'] = df['mode'] - 75
    df.loc[on_strand, 'end'] = df['mode'] + 50

# Rows whose strand is neither '+' nor '-' would be left as NaN and make the
# integer cast fail, exactly as before.
df['start'] = df['start'].astype(int)
df['end'] = df['end'].astype(int)

print(df.head())
print('')

output_bed = 'apadb_shortsites.bed'
bed_columns = ['chr', 'start', 'end', 'gene', 'reads', 'strand']
df.to_csv(output_bed, sep='\t', header=False, columns=bed_columns, index=False)



# Sequence window of the APADB sites: 175 bp on each side of `mode`, i.e.
# 350 bp per site. This overwrites the search window stored in start/end above.
for strand in ('+', '-') :
    on_strand = df['strand'] == strand
    df.loc[on_strand, 'start'] = df['mode'] - 175
    df.loc[on_strand, 'end'] = df['mode'] + 175

df['start'] = df['start'].astype(int)
df['end'] = df['end'].astype(int)

# Site ids look like '<gene>.<n>'; the part before the first dot groups the
# sites of one gene.
df['genefam']     = df['gene'].str.split('\\.').str[0]
df['num_sites']   = df.groupby('genefam')['genefam'].transform('size')
df['total_reads'] = df.groupby('genefam')['reads'].transform('sum')
# Fraction of the gene's reads that fall on this site.
df['rel_use']     = (df['reads'] / df['total_reads']).round(3)

print(df.head())
print('')

# `output_bed` is rebound here; the sequence-extraction cell reads this value.
output_bed = 'apadb_sites.bed'
bed_columns = ['chr', 'start', 'end', 'gene', 'reads', 'strand']
df.to_csv(output_bed, sep='\t', header=False, columns=bed_columns, index=False)


     chr  cut_start  cut_end       gene  reads strand feature    mode miRNA  \
0  chr17      62260    62312   RPH3AL.1    122      -    UTR3   62297  None   
2  chr17     289758   289785  FAM101B.1     24      -    UTR3  289766  None   
3  chr17     289935   289947  FAM101B.2    170      -    UTR3  289946  None   
4  chr17     290016   290021  FAM101B.3     12      -    UTR3  290017  None   
5  chr17     290068   290076  FAM101B.4      6      -    UTR3  290071  None   

    start     end  
0   62222   62347  
2  289691  289816  
3  289871  289996  
4  289942  290067  
5  289996  290121  

     chr  cut_start  cut_end       gene  reads strand feature    mode miRNA  \
0  chr17      62260    62312   RPH3AL.1    122      -    UTR3   62297  None   
2  chr17     289758   289785  FAM101B.1     24      -    UTR3  289766  None   
3  chr17     289935   289947  FAM101B.2    170      -    UTR3  289946  None   
4  chr17     290016   290021  FAM101B.3     12      -    UTR3  290017  None   
5  chr17 

In [5]:
# Reference genome files (same names as assigned in the loading cell).
hg19_fai     = 'hg19.fa.fai'
hg19_fa      = 'hg19.fa'

# Extract one FASTA record per interval of `output_bed` from the hg19 genome.
# `-name` labels each record with the BED name column and `-s` makes the
# extraction strand-aware.
# NOTE(review): the header format produced by `-name` presumably depends on the
# bedtools version; later cells look records up by the bare site id -- confirm.
output_fa = 'apadb_seqs.fa'
#!bedtools getfasta -name -s -fi "$hg19_fa" -bed "$output_bed" -fo | cut -d : -f-4 > "$output_fa"
!bedtools getfasta -name -s -fi "$hg19_fa" -bed "$output_bed" -fo "$output_fa"
    
# Show the first lines of the BED input and of the FASTA output as a sanity check.
!head -5 "$output_bed" | column -t ; echo
!head -10 "$output_fa" ; echo




chr17  62122   62472   RPH3AL.1   122  -
chr17  289591  289941  FAM101B.1  24   -
chr17  289771  290121  FAM101B.2  170  -
chr17  289842  290192  FAM101B.3  12   -
chr17  289896  290246  FAM101B.4  6    -

>RPH3AL.1
CATCCTTCCTGCCTCCTGCCCCAAACCCCGGGTTCCTGGGTCTGGAAGGGCCTTCTCTCCAAGCTGGGAGCTCCTGGGCCCCCACCATTCACTTTTTGTCCTTGCTGCTGGCAAACAGTAAAGAAACTCACTTTCCCTGTGGCACGTTATGCTTCAGAATTAAAACAATGAAGATTAAAATTTGCACCGAGCCAGTGTGTTGATCGAAGACCACGATTGCCTGTGTTTCTGAGATGCGTCCATGGAAAAATGGAAAAAACTGTGGTGCGTTGACTTGCTGGAACCCTTCCTAAGCCGCAGTGAAAGGAGGGGCTAGATCTGTGTGTGTTCATGCAGCTCACACACCTGAT
>FAM101B.1
AAACTTAATTTGAGCGAGTACCTTTTCATTTGACACTTTTCCTGTTTCTAACCTTAGGAAACCAGAATAGCGTTTGGCAGACACGACGTTTTCAGTTTACCTTTGACACCTGCCCCACTCCATTTTGCTTTGTGATGTCTTCATTTAACAATAAATTATCTGAAAAAACAAAACTTAGAGAGATGCTTCTGTTTTTAAAGTAGAATTATGTTTGTTTACGCAAAATGAGAAAAACAGCTCCTCATCTTGAGAAATTTTAAGACGTGATTATATTTAACAGTATTAATCTACAAGTACAAGATTTTCCGAGTGTggctgggcatggtggctcacacctataatcccagcgc
>FAM101B.2
GGAAGCAGTGATTTTTAGGACCCACTGATTAAAAACAAACATTCCCAAGTGTCTCTGAGA

In [6]:

# Per-site annotation lookup keyed by APADB site id (e.g. 'FAM101B.3').
site_fields = ['chr', 'start', 'end', 'mode', 'feature', 'strand',
               'num_sites', 'reads', 'total_reads', 'rel_use']

apadb_dict = {}

for index, row in df.iterrows() :
    apadb_dict[row['gene']] = {field : row[field] for field in site_fields}


# Distance (in bp, between `mode` positions) to the neighbouring sites of the
# same gene. Neighbours are found through the numeric suffix of the site id:
# suffix + 1 is stored as 'up_site_dist', suffix - 1 as 'dn_site_dist'.
# 10000 is the placeholder used when no such neighbour exists.
for gene in apadb_dict :
    
    genefam = gene.split('.')[0]
    site = int(gene.split('.')[1])
    
    for dist_key, neighbour_num in (('up_site_dist', site + 1), ('dn_site_dist', site - 1)) :
        neighbour = genefam + '.' + str(neighbour_num)
        if neighbour in apadb_dict :
            apadb_dict[gene][dist_key] = np.abs(apadb_dict[neighbour]['mode'] - apadb_dict[gene]['mode'])
        else :
            apadb_dict[gene][dist_key] = 10000


# Attach the extracted sequence to every site. The FASTA file is expected to
# hold one header line ('>' + site id) followed by one sequence line.
with open('apadb_seqs.fa') as seq_f:
    seq_id = ''
    for line in seq_f:
        # Strip only the line terminator, so a final line without a newline
        # does not lose its last base.
        line = line.rstrip('\n')
        
        # A blank line must not overwrite the sequence of the current record.
        if not line :
            continue
        
        if line[0] == '>' :
            seq_id = line[1:]
        else :
            # Soft-masked (lower-case) bases are normalised to upper case.
            apadb_dict[seq_id]['seq'] = line.upper()
            
print(apadb_dict['FAM101B.3'])
        
        
        


{'total_reads': 212, 'chr': 'chr17', 'seq': 'GGAGAGAAACAGAACCCTTCAGGGTGGGTCAGAGGACGCCATCCACAGTGGATTCGTGTTCGTTTGCAGGTGGAAGCAGTGATTTTTAGGACCCACTGATTAAAAACAAACATTCCCAAGTGTCTCTGAGAGATGCTGTTTATTTGTTAATTAAAAAGCTTTTTTCTCTGTCTTTTAAATTATGGCTTTCATGTAATAAGGATATTTTTAGTGAAAAATTGTTTTCCTTTCAAATTACAGACCTTTTAAAAAAACTTAATTTGAGCGAGTACCTTTTCATTTGACACTTTTCCTGTTTCTAACCTTAGGAAACCAGAATAGCGTTTGGCAGACACGACGTTTTCAGTTTA', 'dn_site_dist': 71, 'start': 289842, 'mode': 290017, 'up_site_dist': 54, 'end': 290192, 'feature': 'UTR3', 'reads': 12, 'num_sites': 4, 'rel_use': 0.057, 'strand': '-'}


In [7]:

#Parse clinvar molecular conseq data

# HGVS expression -> molecular consequence label.
cons_dict = {}

# Distinct consequence labels seen in the file (dict used as an ordered set).
unique_consequences = {}

with open('molecular_consequences.txt') as f:
    for i, line in enumerate(f):
        
        # The first row is the column header.
        if i == 0 :
            continue
        
        # Blank lines (e.g. at the end of the file) carry no record.
        if not line.strip() :
            continue
        
        # Strip only the line terminator, so an unterminated final line keeps
        # its last character.
        lineparts = line.rstrip('\n').split('\t')
        
        # Column 0 holds the HGVS expression, column 2 the consequence label.
        hgvs = lineparts[0]
        cons = lineparts[2]
        
        cons_dict[hgvs] = cons
        unique_consequences[cons] = True


# Spot checks and summary.
print(cons_dict['AJ132917.1:c.*14G>A'])

print(cons_dict['NM_000047.2:c.1743G>A'])

print(len(cons_dict))

print(unique_consequences)

UTR-3
STOP-GAIN
129044
{'cds-indel': True, 'intergenic': True, 'STOP-LOSS': True, 'UTR-3': True, 'splice-5': True, 'ncRNA': True, 'STOP-GAIN': True, 'intron': True, 'UTR-5': True, 'missense': True, 'splice-3': True, 'frameshift': True, 'cds-synon': True, 'nearGene-3': True, 'nearGene-5': True}


In [8]:

#cons_dict = {}
# (cons_dict is deliberately not reset: entries found here are added to the
# mapping that already exists, and existing keys are left untouched.)

unique_consequences = {}

with open('ClinVarFullRelease_2017-09.xml') as f:
    
    # True from the moment a MolecularConsequence attribute has been read until
    # the next RefSeq XRef line has been consumed for it.
    awaiting_refseq_id = False
    
    consequence = ''
    cons_id = ''
    
    for i, line in enumerate(f):
        
        # Progress report every ten million lines.
        if i % 10000000 == 0 :
            print('Parsed ' + str(i) + ' rows.')
        
        if 'MolecularConsequence' in line :
            awaiting_refseq_id = True
            
            after_open_tag = line.split('<Attribute Type=\"MolecularConsequence\">')[1]
            consequence = after_open_tag.split('</Attribute>')[0]
            
            unique_consequences[consequence] = True
        
        # Checked on the same line as well, in case attribute and XRef share it.
        if awaiting_refseq_id and 'XRef ID' in line and 'RefSeq' in line :
            awaiting_refseq_id = False
            
            after_id_attr = line.split('<XRef ID=\"')[1]
            cons_id = after_id_attr.split('\" DB="RefSeq"/>')[0].replace('&gt;', '>')
            
            # The first consequence recorded for an id wins.
            cons_dict.setdefault(cons_id, consequence)
            
            consequence = ''
            cons_id = ''
                
print(unique_consequences)

print(len(cons_dict))


Parsed 0 rows.
Parsed 10000000 rows.
Parsed 20000000 rows.
Parsed 30000000 rows.
Parsed 40000000 rows.
Parsed 50000000 rows.
Parsed 60000000 rows.
Parsed 70000000 rows.
Parsed 80000000 rows.
Parsed 90000000 rows.
{'intron variant': True, '2KB upstream variant': True, 'splice acceptor variant': True, 'Splice Site donor': True, 'synonymous mutation': True, 'Read-through mutation': True, 'stop lost': True, 'splice donor variant': True, 'frameshift variant': True, 'synonymous variant': True, 'non-coding transcript variant': True, '500B downstream variant': True, 'nonsense': True, 'regulatory region ablation': True, 'missense variant': True, 'Splice Site acceptor': True, 'Silent': True, 'intergenic_variant': True, 'no-stop change?': True, 'Frameshift': True, 'frameshift mutation': True, '3 prime UTR variant': True, 'missense mutation': True, 'inframe_variant': True, 'Missense': True, 'Nonsense': True, '5 prime UTR variant': True, 'exon_loss': True}
433704


In [9]:
# Spot check: look up the consequence stored for one HGVS expression.
print(cons_dict['NM_020461.3:c.5458T>G'])

stop lost


In [None]:

#Manually identified consequences
# Hand-curated override for an HGVS expression, stored with the same labels as
# the parsed entries.
# NOTE(review): the saved notebook shows no execution count for this cell, so
# the stored outputs of later cells may not reflect this override -- confirm.
cons_dict['NG_017041.1:g.5147G>A'] = '2KB upstream variant'


In [16]:

# Molecular-consequence labels that exclude a variant from the dataset. The
# labels come in several vocabularies, so the same concept can appear under
# more than one spelling.
consequence_blacklist = [
    'cds-indel',
    'STOP-LOSS',
    'splice-5',
    'ncRNA',
    'STOP-GAIN',
    'intron',
    'missense',
    'splice-3',
    'frameshift',
    'cds-synon',
    'intron variant',
    '2KB upstream variant',
    'splice acceptor variant',
    'Splice Site donor',
    'Read-through mutation',
    'stop lost',
    'splice donor variant',
    'frameshift variant',
    'nonsense',
    # 'regulatory region ablation' is intentionally left out of the list.
    'missense variant',
    'Splice Site acceptor',
    'no-stop change?',
    'Frameshift',
    'frameshift mutation',
    'missense mutation',
    'inframe_variant',
    'Missense',
    'Nonsense',
    'exon_loss',
]


In [17]:

#Parse clinvar data
clinvar_summary_file = 'variant_summary.txt'

# One list per output column; all lists grow in lock-step, one entry per row.
variant_type = []
variant_gene = []
variant_clinsig = []
variant_assembly = []
variant_chrom = []
variant_start = []
variant_end = []
variant_refallele = []
variant_varallele = []
variant_key = []
variant_id = []

variant_cons = []

i = 0
with open(clinvar_summary_file) as f:
    for line in f:
        
        lineparts = line.split('\t')
        
        # Row 0 is the header. The column meanings below are inferred from the
        # variable names used here -- TODO confirm against the file header.
        if i > 0 :
            variant_type.append(lineparts[1])
            # Composite key '<col4>:chr<col18>:<col19>:<col20>:<col21>-><col22>';
            # despite the list name this is the variant key used for all joins.
            variant_gene.append(lineparts[4] + ':' + 'chr' + lineparts[18] + ':' + lineparts[19] + ':' + lineparts[20] + ':' + lineparts[21] + '->' + lineparts[22])
            variant_clinsig.append(lineparts[6])
            variant_assembly.append(lineparts[16])
            variant_chrom.append('chr' + lineparts[18])
            variant_start.append(int(lineparts[19]))
            variant_end.append(int(lineparts[20]))
            variant_refallele.append(lineparts[21])
            variant_varallele.append(lineparts[22])
            
            variant_id.append(lineparts[2])
            
            # Reduce the variant name to the form used as key in cons_dict:
            # drop everything after the first space, then remove a single
            # parenthesised part, e.g. 'X(Y):z' -> 'X:z'.
            hgvs = lineparts[2]
            if ' ' in hgvs :
                hgvs = hgvs.split(' ')[0]
            
            if len(hgvs.split('(')) == 2 :
                hgvs = hgvs.split('(')[0] + hgvs.split(')')[1]
            
            if hgvs in cons_dict:
                variant_cons.append(cons_dict[hgvs])
            else :
                variant_cons.append('undetermined')
        
        i += 1

clinvar_columns = ['chr', 'start', 'end', 'gene', 'refallele', 'varallele', 'assembly', 'significance', 'vartype', 'consequence', 'id']
clinvar_df = pd.DataFrame({'chr' : variant_chrom,
                    'start' : variant_start,
                    'end' : variant_end,
                    'gene' : variant_gene,
                    'refallele' : variant_refallele,
                    'varallele' : variant_varallele,
                    'assembly' : variant_assembly,
                    'significance' : variant_clinsig,
                    'vartype' : variant_type,
                    'consequence' : variant_cons,
                    'id' : variant_id
                }, columns=clinvar_columns)


print(clinvar_df.head())
print('')

# Keep GRCh37 records of the supported variant types that span at most 20 bp
# (end - start <= 20).
clinvar_df = clinvar_df.loc[clinvar_df['assembly'] == 'GRCh37']
kept_vartypes = ['insertion', 'deletion', 'indel', 'single nucleotide variant']
clinvar_df = clinvar_df.loc[clinvar_df['vartype'].isin(kept_vartypes)]
clinvar_df = clinvar_df.loc[clinvar_df['end'] - clinvar_df['start'] <= 20]

# Consequences are filtered with a blacklist; variants without a known
# consequence ('undetermined') therefore stay in. An earlier whitelist-based
# filter on the consequence column was disabled in favour of this approach.
clinvar_df = clinvar_df.loc[~clinvar_df['consequence'].isin(consequence_blacklist)]

print(clinvar_df.head())
print(len(clinvar_df))
#59422


# Variant lookup keyed by the composite variant key. Rows sharing a key
# overwrite each other, so the last one wins.
clinvar_dict = {}

for index, row in clinvar_df.iterrows() :
    clinvar_dict[row['gene']] = {
        'significance' : row['significance'],
        'vartype' : row['vartype'],
        'varcons' : row['consequence'],
        'refallele' : row['refallele'],
        'varallele' : row['varallele'],
        'id' : row['id'],
    }


     chr     start       end  \
0   chr7   4820844   4820847   
1   chr7   4781213   4781216   
2   chr7   4827366   4827379   
3   chr7   4787735   4787748   
4  chr15  85342440  85342440   

                                                gene       refallele  \
0  AP5Z1:chr7:4820844:4820847:GGAT->TGCTGTAAACTGT...            GGAT   
1  AP5Z1:chr7:4781213:4781216:GGAT->TGCTGTAAACTGT...            GGAT   
2       AP5Z1:chr7:4827366:4827379:GGACCTGCCCTGCT->-  GGACCTGCCCTGCT   
3       AP5Z1:chr7:4787735:4787748:GGACCTGCCCTGCT->-  GGACCTGCCCTGCT   
4                ZNF592:chr15:85342440:85342440:G->A               G   

                varallele assembly            significance  \
0  TGCTGTAAACTGTAACTGTAAA   GRCh37              Pathogenic   
1  TGCTGTAAACTGTAACTGTAAA   GRCh38              Pathogenic   
2                       -   GRCh37              Pathogenic   
3                       -   GRCh38              Pathogenic   
4                       A   GRCh37  Uncertain significance   

 

In [18]:
#Create bed file of the filtered ClinVar variants

# Four tab-separated columns without header: chr, start, end and the composite
# variant key (stored in the 'gene' column of clinvar_df).
# NOTE(review): start/end are written exactly as stored in clinvar_df; whether
# they follow the BED coordinate convention is not visible here -- confirm.
clinvar_bed = 'clinvar_snps.bed'
bed_columns = ['chr', 'start', 'end', 'gene']
clinvar_df.to_csv(clinvar_bed, sep='\t', header=False, columns=bed_columns, index=False)

In [19]:

#Intersect Clinvar snps against APADB bed
# -wa/-wb write, for every overlap, the full ClinVar record followed by the
# full APADB record. The APADB side is the short search-window file, not the
# 350 bp sequence-window file.

!bedtools intersect -wa -wb -a 'clinvar_snps.bed' -b 'apadb_shortsites.bed' > 'clinvar_apadb_intersection.bed'



In [20]:

# Load the overlap table. Columns 0-3 come from the ClinVar BED record
# (chr, start, end, variant key); column 7 is the name field of the APADB
# record that follows it on the same line.
intersect_columns = ['chr', 'snp_start', 'snp_end', 'snp_key', 'apadb_site']
intersect_df = pd.read_csv(
    'clinvar_apadb_intersection.bed',
    sep='\t',
    header=None,
    usecols=[0, 1, 2, 3, 7],
    names=intersect_columns,
)

# First rows, then the number of variant/site overlaps.
print(intersect_df.head(), len(intersect_df), sep='\n')


     chr  snp_start    snp_end                               snp_key  \
0  chr22   45691707   45691707    UPK3A:chr22:45691707:45691707:T->C   
1   chr7  128589427  128589427    IRF5:chr7:128589427:128589427:G->A   
2  chr12  102122887  102122887  SYCP3:chr12:102122887:102122887:A->G   
3   chr2  231050715  231050715   SP110:chr2:231050715:231050715:A->G   
4   chr1   24020362   24020362     RPL11:chr1:24020362:24020362:C->T   

  apadb_site  
0    UPK3A.1  
1     IRF5.2  
2    CHPT1.3  
3   SP110.10  
4    RPL11.4  
3483


In [21]:
# Spot check: list the overlaps recorded for one specific APADB site id.
print(intersect_df[intersect_df.apadb_site == 'TUBGCP6.2'])

Empty DataFrame
Columns: [chr, snp_start, snp_end, snp_key, apadb_site]
Index: []


In [22]:

def reverse_complement(seq) :
    """Reverse `seq` and complement each of its bases.

    'A', 'C', 'G' and 'T' are swapped for their partners; anything else
    (lower-case letters, 'N', placeholder text, ...) contributes nothing to
    the returned string.
    """
    partner = {'A' : 'T', 'C' : 'G', 'G' : 'C', 'T' : 'A'}
    return ''.join(partner.get(base, '') for base in seq[::-1])


#Generate APADB Clinvar variant dataset
# For every ClinVar variant that overlaps an APADB site, build the variant
# version of the site's reference sequence and collect it together with the
# site and variant annotations.

# Column accumulators for the output table; one entry is appended to each list
# per accepted variant/site pair.
l_gene = []
l_refseq = []
l_varseq = []
l_region = []
l_significance = []
l_vartype = []
l_varpos = []
l_reads = []
l_totalreads = []
l_reluse = []
l_numsites = []
l_upsitedist = []
l_dnsitedist = []
l_consequence = []
l_id = []

for index, row in intersect_df.iterrows() :
    site_id = row['apadb_site']
    
    # Site annotation; 'seq' is the extracted sequence of the site window.
    # site_end is read but not used further down in this loop.
    site_start = apadb_dict[site_id]['start']
    site_end = apadb_dict[site_id]['end']
    site_strand = apadb_dict[site_id]['strand']
    site_refseq = apadb_dict[site_id]['seq']
    
    site_num = apadb_dict[site_id]['num_sites']
    site_up_dist = apadb_dict[site_id]['up_site_dist']
    site_dn_dist = apadb_dict[site_id]['dn_site_dist']
    
    snp_start = row['snp_start']
    snp_end = row['snp_end']
    #snp_ref = row['snp_key'].split('->')[0][len(row['snp_key'].split('->')[0])-1]
    #snp_var = row['snp_key'].split('->')[1][0]
    
    # Alleles and annotation are looked up by the composite variant key.
    snp_ref = clinvar_dict[row['snp_key']]['refallele']
    snp_var = clinvar_dict[row['snp_key']]['varallele']
    
    snp_type = clinvar_dict[row['snp_key']]['vartype']
    
    snp_cons = clinvar_dict[row['snp_key']]['varcons']
    
    snp_id = clinvar_dict[row['snp_key']]['id']
    
    # '-' marks an empty allele (pure insertion or deletion).
    if snp_ref == '-' :
        snp_ref = ''
    if snp_var == '-' :
        snp_var = ''
    
    
    site_varseq = site_refseq
    relpos_start = -1
    relpos_end = -1
    
    if site_strand == '+' :
        # Inclusive index range of the variant inside site_refseq.
        # NOTE(review): the extra -1 presumably converts a 1-based variant
        # position against a 0-based window start -- confirm.
        relpos_start = snp_start - site_start - 1
        relpos_end = snp_start - site_start - 1 + (snp_end - snp_start)
        
        # Only the lower bound is checked; a range running past the end of the
        # sequence is not detected here.
        if relpos_start < 0 :
            print('WARNING (+)! Relpos out of range (' + str(relpos_start) + ')')
            continue
        # The reference allele is verified for deletions and indels only; other
        # variant types are substituted without this check.
        if site_refseq[relpos_start:relpos_end+1] != snp_ref and (snp_type == 'deletion' or snp_type == 'indel'):
            print('WARNING (+)! Ref base differs.')
            print(snp_type)
            print(relpos_start)
            print(site_refseq[relpos_start-5:relpos_end+1+5])
            print(site_refseq[relpos_start:relpos_end+1])
            print(snp_ref)
            continue
            
        # Replace the reference range with the variant allele.
        site_varseq = site_varseq[:relpos_start] + snp_var + site_varseq[relpos_end+1:]
    if site_strand == '-' :
        
        # Alleles are turned into the orientation of the '-' strand sequence.
        # reverse_complement drops anything that is not A/C/G/T.
        snp_ref = reverse_complement(snp_ref)
        snp_var = reverse_complement(snp_var)
        
        # Mirror the forward-strand index range within the window. 350 is the
        # hard-coded window length; it matches the mode-175 .. mode+175 window
        # set where the site BED file is written.
        relpos_end = 350 - (snp_start - site_start)
        relpos_start = 350 - (snp_start - site_start + (snp_end - snp_start))
        
        if relpos_start < 0 :
            print('WARNING (-)! Relpos out of range (' + str(relpos_start) + ')')
            continue
        # Same deletion/indel-only reference check as on the '+' strand, with a
        # more verbose diagnostic dump.
        if site_refseq[relpos_start:relpos_end+1] != snp_ref and (snp_type == 'deletion' or snp_type == 'indel'):
            print('WARNING (-)! Ref base differs.')
            print(snp_type)
            print(relpos_start)
            print(site_refseq)
            print(site_refseq[relpos_start-10:relpos_end+1+10])
            print(site_refseq[relpos_start:relpos_end+1])
            print(snp_ref)
            continue
            
        site_varseq = site_varseq[:relpos_start] + snp_var + site_varseq[relpos_end+1:]
        
    
    # Pad a shortened variant sequence back to 350 characters with 'X'.
    # Sequences that lost bases before index 150 are padded on the left; every
    # other case is padded on the right. When the variant sequence is longer
    # than 350, the repeat count is negative, nothing is added and the sequence
    # is not trimmed.
    # NOTE(review): the last two branches are identical; index 150 presumably
    # separates the region upstream of the site -- confirm.
    if relpos_start < 150 and snp_type == 'deletion' :
        site_varseq = ('X' * (350 - len(site_varseq))) + site_varseq
    elif relpos_start < 150 and snp_type == 'insertion' :
        site_varseq = site_varseq + ('X' * (350 - len(site_varseq)))
    elif relpos_start < 150 and snp_type == 'indel' and len(snp_ref) > len(snp_var) :
        site_varseq = ('X' * (350 - len(site_varseq))) + site_varseq
    elif relpos_start < 150 and snp_type == 'indel' and len(snp_ref) < len(snp_var) :
        site_varseq = site_varseq + ('X' * (350 - len(site_varseq)))
    elif relpos_start >= 150 :
        site_varseq = site_varseq + ('X' * (350 - len(site_varseq)))
    else :
        site_varseq = site_varseq + ('X' * (350 - len(site_varseq)))
    
    # A variant that leaves the sequence unchanged carries no information.
    if site_refseq == site_varseq :
        print('WARNING! Ref seq == Var seq.')
        continue
    
    # Record the accepted variant/site pair.
    l_gene.append(site_id)
    l_varpos.append(relpos_start)
    l_refseq.append(site_refseq)
    l_varseq.append(site_varseq)
    l_region.append(apadb_dict[site_id]['feature'])
    l_significance.append(clinvar_dict[row['snp_key']]['significance'])
    l_vartype.append(clinvar_dict[row['snp_key']]['vartype'])
        
    l_reads.append(apadb_dict[site_id]['reads'])
    l_totalreads.append(apadb_dict[site_id]['total_reads'])
    l_reluse.append(apadb_dict[site_id]['rel_use'])
    
    l_numsites.append(site_num)
    l_upsitedist.append(site_up_dist)
    l_dnsitedist.append(site_dn_dist)
    
    l_consequence.append(snp_cons)
    
    l_id.append(snp_id)
        
        
# Output columns in their final order, each paired with the list that was
# filled while the variant sequences were built.
snp_table = [
    ('gene', l_gene),
    ('var_pos', l_varpos),
    ('ref_seq', l_refseq),
    ('var_seq', l_varseq),
    ('region', l_region),
    ('significance', l_significance),
    ('vartype', l_vartype),
    ('num_sites', l_numsites),
    ('up_site_dist', l_upsitedist),
    ('dn_site_dist', l_dnsitedist),
    ('reads', l_reads),
    ('total_reads', l_totalreads),
    ('rel_use', l_reluse),
    ('consequence', l_consequence),
    ('clinvar_id', l_id),
]

snp_df = pd.DataFrame({name : values for name, values in snp_table})
snp_df = snp_df[[name for name, _ in snp_table]]
snp_df = snp_df.sort_values(by='gene')

print(snp_df.head())
print(len(snp_df))

# Tab-separated despite the .csv extension; the header row is included.
snp_df.to_csv('apadb_snps_combined_blacklist.csv', header=True, index=False, sep='\t')


indel
206
TGTGCGCGCGCGCACGCGCGAGTGTGCTGTATGGCCCAGGCAGCCTCAAGGCCCTCGGAGCTGGCTGTGCCTGCTTCTGTGTACCACTTCTGTGGGCATGGCCGCTTCTAGAGCCTCGACACCCCCCCAACCCCCGCACCAAGCAGACAAAGTCAATAAAAGAGCTGTCTGACTGCAATCTGTGCCTCTATGTCTGTGCACTGGGGTCAGGACTTTATTTATTTCACTGACAGGCAATACCGTCCAAGGCCAGTGCAGGAGGGAGGGCCCCGGCCTCACACAAACTCGGTGAAGTCCTCCACCGAGGAGATGAGGCGCTTCCGCTGGCCCACCTCATAGCCAGGTGTGGG
TGCACTGGGGTCAGGACTTTATT
TCA

indel
147
AGTCCTGACCCCA
TGA
na
deletion
176
CGCACAGCAACATTTTGAGAATAATCTTCTACTAATGAATGGTAGTGAGTTGAAAATAATGTACGACATTTTATAGTCTCAGCAAGTTCTTTAACAACTGCATTTGCTATTGCCGTCCCATCAAATGTTGCAGTACCTCTTCCTGTTAAAGTAAAATATGCATAAGGAAGTAACTCAAAGGAATTAAAACAAAAAAGGAATTAAAACAAAAATGCTAGGACAGAAAAGCAACATCGGTTAGTACATCCACGTCTAAAAGCATTCTATAAATAGGCCTTGTTTAGCTACACGAGTCTGCTTAGCAGCCCATGGGGAGTGAGGTTTCTTTTAAAGAAAATACAGTATCGCTC
GAAGTAACTCAAAGGAATTAAAACAAAAAAGGAATTA
AAAGGAATTAAAACAAA

indel
249
AAAATATTTGTCAAAATCTTAACTGAATGTTTACTGGAAGTACTTGAGATTCCATTTGAGAGTTGTATTGTTAATAATTTCATGTCAGTGAACTGATATCTGATGTTTATGATATGGTGTCTTTTTCTTGAAACAAGCTTCCAAGGGC