In [4]:
import pandas as pd
import xml.etree.ElementTree as et

import numpy as np
import scipy.sparse as sp
import scipy.io as spio

def reverse_complement(seq) :
    """Return the reverse complement of an upper-case DNA sequence.

    Only the bases 'A', 'C', 'G' and 'T' are translated. Every other
    character (for example 'N' or lower-case letters) is silently left out,
    so the result may be shorter than ``seq``.
    """
    pairing = {'A' : 'T', 'C' : 'G', 'G' : 'C', 'T' : 'A'}
    # Walk the input back-to-front and emit the partner of each known base.
    return ''.join(pairing[base] for base in reversed(seq) if base in pairing)

In [5]:
# Input files. The trailing comments keep the original locations inside the
# project data directory.
apadb_bed    = 'hg19.apadb_v2_final.bed'#'../../data/genome_hg19/features/hg19.apadb_v2_final.bed'

hg19_fai     = 'hg19.fa.fai'#'../../data/genome_hg19/hg19.fa.fai' 
hg19_fa      = 'hg19.fa'#'../../data/genome_hg19/hg19.fa'

# Columns 0-6, 8 and 9 of the APADB file are loaded under these names.
df_columns = ['chr', 'cut_start', 'cut_end', 'gene', 'reads', 'strand', 'feature', 'mode', 'miRNA']

# keep_default_na=False keeps the literal text 'None' in the miRNA column.
# Recent pandas releases treat 'None' as a missing value by default, which
# would make the miRNA filter below match nothing.
df = pd.read_csv(apadb_bed, sep='\t', header=None, names=df_columns, usecols=[0,1,2,3,4,5,6,8,9], keep_default_na=False)

# Keep sites whose miRNA field is the literal 'None' and whose cut region
# spans at most 90 bp. (.loc replaces the removed DataFrame.ix indexer.)
df = df.loc[df.miRNA == 'None']
# .copy() so that later cells can add columns without acting on a view.
df = df.loc[df.cut_end - df.cut_start <= 90].copy()


In [6]:
# Preview the first rows of the filtered APADB table.
print(df.head())

     chr  cut_start  cut_end       gene  reads strand feature    mode miRNA
0  chr17      62260    62312   RPH3AL.1    122      -    UTR3   62297  None
2  chr17     289758   289785  FAM101B.1     24      -    UTR3  289766  None
3  chr17     289935   289947  FAM101B.2    170      -    UTR3  289946  None
4  chr17     290016   290021  FAM101B.3     12      -    UTR3  290017  None
5  chr17     290068   290076  FAM101B.4      6      -    UTR3  290071  None


In [7]:



#SNP search range surrounding the PAS: from 75 bp below to 50 bp above the
#'mode' coordinate. Both strands receive the same genomic offsets.
# NOTE(review): the 300 bp window further down is mirrored for '-' strand sites
# but this one is not -- confirm that this is intended.
# (.loc replaces the removed DataFrame.ix indexer.)
df.loc[df.strand == '+', 'start'] = df['mode']-75#50#mode#cut_start
df.loc[df.strand == '+', 'end'] = df['mode']+50#25#mode#cut_start

df.loc[df.strand == '-', 'start'] = df['mode']-75#25#mode#cut_end
df.loc[df.strand == '-', 'end'] = df['mode']+50#50#mode#cut_end

# The masked assignments above leave NaN (float) for any row whose strand is
# neither '+' nor '-'; casting to int fails loudly in that case.
df['start'] = df['start'].astype(int)
df['end'] = df['end'].astype(int)

print(df.head())
print('')

# Short windows: used later as the SNP search regions.
output_bed = 'apadb_shortsites.bed'
bed_columns = ['chr', 'start', 'end', 'gene', 'reads', 'strand']
df.to_csv(output_bed, sep='\t', header=False, columns=bed_columns, index=False)



#Create bed file of apadb sites: 300 bp windows, 175 bp on one side of the
#mode and 125 bp on the other, mirrored between '+' and '-' strand sites.
df.loc[df.strand == '+', 'start'] = df['mode']-175
df.loc[df.strand == '+', 'end'] = df['mode']+125

df.loc[df.strand == '-', 'start'] = df['mode']-125
df.loc[df.strand == '-', 'end'] = df['mode']+175

df['start'] = df['start'].astype(int)
df['end'] = df['end'].astype(int)

# Per-gene statistics. The gene family is the site id up to its first '.'.
df['genefam']     = df['gene'].str.split('\\.').apply(lambda x: x[0])
df['num_sites']         = df.groupby('genefam')['genefam'].transform(len)
df['total_reads'] = df.groupby('genefam')['reads'].transform('sum')
# Relative usage: this site's reads as a fraction of the family total.
df['rel_use']     = (df['reads'] / df['total_reads']).round(3)

print(df.head())
print('')

# NOTE: output_bed is left pointing at the 300 bp file; the sequence
# extraction cell reads this variable.
output_bed = 'apadb_sites.bed'
bed_columns = ['chr', 'start', 'end', 'gene', 'reads', 'strand']
df.to_csv(output_bed, sep='\t', header=False, columns=bed_columns, index=False)


     chr  cut_start  cut_end       gene  reads strand feature    mode miRNA  \
0  chr17      62260    62312   RPH3AL.1    122      -    UTR3   62297  None   
2  chr17     289758   289785  FAM101B.1     24      -    UTR3  289766  None   
3  chr17     289935   289947  FAM101B.2    170      -    UTR3  289946  None   
4  chr17     290016   290021  FAM101B.3     12      -    UTR3  290017  None   
5  chr17     290068   290076  FAM101B.4      6      -    UTR3  290071  None   

    start     end  
0   62222   62347  
2  289691  289816  
3  289871  289996  
4  289942  290067  
5  289996  290121  

     chr  cut_start  cut_end       gene  reads strand feature    mode miRNA  \
0  chr17      62260    62312   RPH3AL.1    122      -    UTR3   62297  None   
2  chr17     289758   289785  FAM101B.1     24      -    UTR3  289766  None   
3  chr17     289935   289947  FAM101B.2    170      -    UTR3  289946  None   
4  chr17     290016   290021  FAM101B.3     12      -    UTR3  290017  None   
5  chr17 

In [10]:
# Genome index and FASTA paths (re-declared here; only hg19_fa is used below).
hg19_fai     = 'hg19.fa.fai'
hg19_fa      = 'hg19.fa'

# fasta
# Extract one sequence per interval of the BED file named by output_bed
# (set by an earlier cell) into output_fa. -name labels each record with the
# BED name column; -s requests strand-aware extraction (see bedtools docs).
output_fa = 'apadb_seqs.fa'
# Earlier variant that trimmed the header with cut; kept for reference.
#!bedtools getfasta -name -s -fi "$hg19_fa" -bed "$output_bed" -fo | cut -d : -f-4 > "$output_fa"
!bedtools getfasta -name -s -fi "$hg19_fa" -bed "$output_bed" -fo "$output_fa"
    
# file tops
# Show the first lines of both files as a quick sanity check.
!head -5 "$output_bed" | column -t ; echo
!head -10 "$output_fa" ; echo




chr17  62172   62472   RPH3AL.1   122  -
chr17  289641  289941  FAM101B.1  24   -
chr17  289821  290121  FAM101B.2  170  -
chr17  289892  290192  FAM101B.3  12   -
chr17  289946  290246  FAM101B.4  6    -

>RPH3AL.1
CATCCTTCCTGCCTCCTGCCCCAAACCCCGGGTTCCTGGGTCTGGAAGGGCCTTCTCTCCAAGCTGGGAGCTCCTGGGCCCCCACCATTCACTTTTTGTCCTTGCTGCTGGCAAACAGTAAAGAAACTCACTTTCCCTGTGGCACGTTATGCTTCAGAATTAAAACAATGAAGATTAAAATTTGCACCGAGCCAGTGTGTTGATCGAAGACCACGATTGCCTGTGTTTCTGAGATGCGTCCATGGAAAAATGGAAAAAACTGTGGTGCGTTGACTTGCTGGAACCCTTCCTAAGCCGCAG
>FAM101B.1
AAACTTAATTTGAGCGAGTACCTTTTCATTTGACACTTTTCCTGTTTCTAACCTTAGGAAACCAGAATAGCGTTTGGCAGACACGACGTTTTCAGTTTACCTTTGACACCTGCCCCACTCCATTTTGCTTTGTGATGTCTTCATTTAACAATAAATTATCTGAAAAAACAAAACTTAGAGAGATGCTTCTGTTTTTAAAGTAGAATTATGTTTGTTTACGCAAAATGAGAAAAACAGCTCCTCATCTTGAGAAATTTTAAGACGTGATTATATTTAACAGTATTAATCTACAAGTACAAG
>FAM101B.2
GGAAGCAGTGATTTTTAGGACCCACTGATTAAAAACAAACATTCCCAAGTGTCTCTGAGAGATGCTGTTTATTTGTTAATTAAAAAGCTTTTTTCTCTGTCTTTTAAATTATGGCTTTCATGTAATAAGGATATTTTTAGTGAAAAATTGTTTTCCTTTC

In [11]:

# Per-site annotation lookup keyed by APADB site id ('<gene family>.<site number>').
apadb_dict = {}

site_fields = ['chr', 'start', 'end', 'mode', 'feature', 'strand',
               'num_sites', 'reads', 'total_reads', 'rel_use']

for index, row in df.iterrows() :
    apadb_dict[row['gene']] = {field : row[field] for field in site_fields}


# Distance (in bp, between 'mode' coordinates) to the neighbouring sites of
# the same gene family; 10000 is the placeholder when no neighbour exists.
for gene in apadb_dict :
    
    genefam = gene.split('.')[0]
    site = int(gene.split('.')[1])
    
    #Upstream site if any (the site numbered one higher)
    up_gene = genefam + '.' + str(site + 1)
    if up_gene in apadb_dict :
        apadb_dict[gene]['up_site_dist'] = np.abs(apadb_dict[up_gene]['mode'] - apadb_dict[gene]['mode'])
    else :
        apadb_dict[gene]['up_site_dist'] = 10000
    
    #Downstream site if any (the site numbered one lower)
    dn_gene = genefam + '.' + str(site - 1)
    if dn_gene in apadb_dict :
        apadb_dict[gene]['dn_site_dist'] = np.abs(apadb_dict[dn_gene]['mode'] - apadb_dict[gene]['mode'])
    else :
        apadb_dict[gene]['dn_site_dist'] = 10000


# Attach the extracted sequence to every site. Expects one header line
# ('>id') followed by one sequence line per record.
with open('apadb_seqs.fa') as seq_f:
    seq_id = ''
    for line in seq_f:
        # Strip only the line terminator; slicing off the last character
        # would truncate a final line that has no trailing newline.
        line = line.rstrip('\r\n')
        if not line :
            # A blank line must not overwrite the previous record's sequence.
            continue
        if line[0] == '>' :
            seq_id = line[1:]
            if seq_id not in apadb_dict :
                # Some bedtools versions decorate the name as
                # 'id::chr:start-end' and/or append '(+)' / '(-)'; fall back
                # to the bare id in that case.
                seq_id = seq_id.split('::')[0]
                if seq_id not in apadb_dict and seq_id[-3:] in ('(+)', '(-)') :
                    seq_id = seq_id[:-3]
        else :
            apadb_dict[seq_id]['seq'] = line.upper()
            
print(apadb_dict['FAM101B.3'])
        
        
        


{'chr': 'chr17', 'dn_site_dist': 71, 'reads': 12, 'total_reads': 212, 'num_sites': 4, 'end': 290192, 'start': 289892, 'seq': 'GGAGAGAAACAGAACCCTTCAGGGTGGGTCAGAGGACGCCATCCACAGTGGATTCGTGTTCGTTTGCAGGTGGAAGCAGTGATTTTTAGGACCCACTGATTAAAAACAAACATTCCCAAGTGTCTCTGAGAGATGCTGTTTATTTGTTAATTAAAAAGCTTTTTTCTCTGTCTTTTAAATTATGGCTTTCATGTAATAAGGATATTTTTAGTGAAAAATTGTTTTCCTTTCAAATTACAGACCTTTTAAAAAAACTTAATTTGAGCGAGTACCTTTTCATTTGACACTTTTCCTGTTTCT', 'feature': 'UTR3', 'mode': 290017, 'strand': '-', 'rel_use': 0.057, 'up_site_dist': 54}


In [12]:

#Parse clinvar molecular conseq data
# Builds cons_dict (column 0 -> column 2 of the tab-separated file, used as
# HGVS name -> consequence label) and the set of distinct labels seen.

cons_dict = {}

unique_consequences = {}

i = 0
with open('molecular_consequences.txt') as f:
    for line in f:
        
        # Strip only the line terminator so a final line without a newline
        # is not truncated.
        lineparts = line.rstrip('\r\n').split('\t')
        
        # The first row is skipped (presumably a header). Rows with fewer
        # than three columns, e.g. blank lines, carry no consequence.
        if i > 0 and len(lineparts) >= 3 :
            hgvs = lineparts[0]
            cons = lineparts[2]
            
            cons_dict[hgvs] = cons
            
            if cons not in unique_consequences :
                unique_consequences[cons] = True
        
        i += 1


# Spot checks against two known entries.
print(cons_dict['AJ132917.1:c.*14G>A'])

print(cons_dict['NM_000047.2:c.1743G>A'])

print(len(cons_dict))

print(unique_consequences)

UTR-3
STOP-GAIN
129044
{'intergenic': True, 'splice-5': True, 'cds-synon': True, 'missense': True, 'nearGene-3': True, 'ncRNA': True, 'STOP-GAIN': True, 'frameshift': True, 'cds-indel': True, 'STOP-LOSS': True, 'intron': True, 'nearGene-5': True, 'UTR-3': True, 'splice-3': True, 'UTR-5': True}


In [13]:

# Scan the ClinVar full-release XML line by line and add
# (RefSeq XRef ID -> molecular consequence) pairs to cons_dict.
# cons_dict is deliberately NOT reset here (see the disabled line below), so
# this cell requires cons_dict to exist already and only adds missing keys.
#cons_dict = {}

# Distinct consequence labels found in the XML (dict used as a set).
unique_consequences = {}

i = 0
with open('ClinVarFullRelease_2017-09.xml') as f:
    
    # Scanner state: a consequence has been seen, and whether the RefSeq id
    # that follows it has been consumed yet.
    has_read_consequence = False
    has_read_id = False
    
    consequence = ''
    cons_id = ''
    
    for line in f:
        
        # Progress report every 10 million lines.
        if i % 10000000 == 0 :
            print('Parsed ' + str(i) + ' rows.')
        
        if 'MolecularConsequence' in line :
            has_read_consequence = True
            has_read_id = False
            
            # Text between the opening Attribute tag and its closing tag.
            # Raises IndexError if the line mentions 'MolecularConsequence'
            # without this exact opening tag.
            consequence = line.split('<Attribute Type=\"MolecularConsequence\">')[1].split('</Attribute>')[0]
            
            if consequence not in unique_consequences :
                unique_consequences[consequence] = True
        
        # Only the first RefSeq XRef after each consequence is paired with it.
        if has_read_consequence == True and has_read_id == False :
            if 'XRef ID' in line and 'RefSeq' in line :
                has_read_id = True
                
                # ID attribute value, with the XML entity '&gt;' decoded to '>'.
                cons_id = line.split('<XRef ID=\"')[1].split('\" DB="RefSeq"/>')[0].replace('&gt;', '>')
                
                # Existing entries win; the XML never overwrites them.
                if cons_id not in cons_dict :
                    cons_dict[cons_id] = consequence
                
                
                consequence = ''
                cons_id = ''
        i += 1
                
print(unique_consequences)

print(len(cons_dict))


Parsed 0 rows.
Parsed 10000000 rows.
Parsed 20000000 rows.
Parsed 30000000 rows.
Parsed 40000000 rows.
Parsed 50000000 rows.
Parsed 60000000 rows.
Parsed 70000000 rows.
Parsed 80000000 rows.
Parsed 90000000 rows.
{'stop lost': True, 'splice donor variant': True, 'no-stop change?': True, 'non-coding transcript variant': True, 'synonymous mutation': True, '500B downstream variant': True, 'frameshift variant': True, 'Silent': True, 'Read-through mutation': True, 'regulatory region ablation': True, 'Frameshift': True, 'intron variant': True, 'Missense': True, 'synonymous variant': True, '5 prime UTR variant': True, 'splice acceptor variant': True, 'frameshift mutation': True, 'Splice Site donor': True, '2KB upstream variant': True, 'nonsense': True, '3 prime UTR variant': True, 'missense mutation': True, 'intergenic_variant': True, 'Nonsense': True, 'Splice Site acceptor': True, 'missense variant': True, 'inframe_variant': True, 'exon_loss': True}
433704


In [14]:
# Spot-check a single HGVS key in the consequence table.
print(cons_dict['NM_020461.3:c.5458T>G'])

stop lost


In [32]:

#Parse clinvar data
# Reads the tab-separated ClinVar variant summary into parallel lists, looks
# up each variant's molecular consequence in cons_dict, and keeps GRCh37
# single nucleotide variants in clinvar_df.
clinvar_summary_file = 'variant_summary.txt'

variant_type = []
variant_gene = []
variant_clinsig = []
variant_assembly = []
variant_chrom = []
variant_start = []
variant_end = []
variant_refallele = []
variant_varallele = []
variant_key = []

variant_cons = []

i = 0
with open(clinvar_summary_file) as f:
    for line in f:
        
        lineparts = line.split('\t')
        
        # The first row is skipped (presumably the header).
        if i > 0 :
            variant_type.append(lineparts[1])
            # Composite key 'GENE:chrN:start:end:REF->VAR'; stored in the
            # 'gene' column and reused later as the BED name field.
            variant_gene.append(lineparts[4] + ':' + 'chr' + lineparts[18] + ':' + lineparts[19] + ':' + lineparts[20] + ':' + lineparts[21] + '->' + lineparts[22])
            variant_clinsig.append(lineparts[6])
            variant_assembly.append(lineparts[16])
            variant_chrom.append('chr' + lineparts[18])
            variant_start.append(int(lineparts[19]))
            variant_end.append(int(lineparts[20]))
            variant_refallele.append(lineparts[21])
            variant_varallele.append(lineparts[22])
            
            # Normalise the variant name to the key format of cons_dict:
            # drop anything after the first space, then remove a single
            # parenthesised part, e.g.
            # 'NM_000047.2(ARSE):c.1743G>A' -> 'NM_000047.2:c.1743G>A'.
            hgvs = lineparts[2]
            if ' ' in hgvs :
                hgvs = hgvs.split(' ')[0]
            
            if len(hgvs.split('(')) == 2 :
                hgvs = hgvs.split('(')[0] + hgvs.split(')')[1]
            
            
            # Sanity print for one known variant; guarded so that a missing
            # key cannot abort the whole parse.
            if 'NM_000047.2(ARSE):c.1743G>A' in lineparts[2] :
                print(cons_dict.get(hgvs, 'undetermined'))
            
            
            variant_cons.append(cons_dict.get(hgvs, 'undetermined'))
        
        i += 1

clinvar_df = pd.DataFrame({'chr' : variant_chrom,
                    'start' : variant_start,
                    'end' : variant_end,
                    'gene' : variant_gene,
                    'refallele' : variant_refallele,
                    'varallele' : variant_varallele,
                    'assembly' : variant_assembly,
                    'significance' : variant_clinsig,
                    'vartype' : variant_type,
                    'consequence' : variant_cons
                })

# Fix the column order.
clinvar_df = clinvar_df[['chr', 'start', 'end', 'gene', 'refallele', 'varallele', 'assembly', 'significance', 'vartype', 'consequence']]


print(clinvar_df.head())
print('')

# (.loc replaces the removed DataFrame.ix indexer.)
clinvar_df = clinvar_df.loc[clinvar_df.assembly == 'GRCh37']

print(len(clinvar_df))

clinvar_df = clinvar_df.loc[clinvar_df.vartype == 'single nucleotide variant']
#clinvar_df = clinvar_df.loc[(((clinvar_df.vartype == 'single nucleotide variant') | (clinvar_df.vartype == 'insertion')) | (clinvar_df.vartype == 'deletion')) | (clinvar_df.vartype == 'indel')]
#clinvar_df = clinvar_df.loc[clinvar_df.end - clinvar_df.start <= 8]

print(len(clinvar_df))

STOP-GAIN
STOP-GAIN
     chr     start       end  \
0   chr7   4820844   4820847   
1   chr7   4781213   4781216   
2   chr7   4827366   4827379   
3   chr7   4787735   4787748   
4  chr15  85342440  85342440   

                                                gene       refallele  \
0  AP5Z1:chr7:4820844:4820847:GGAT->TGCTGTAAACTGT...            GGAT   
1  AP5Z1:chr7:4781213:4781216:GGAT->TGCTGTAAACTGT...            GGAT   
2       AP5Z1:chr7:4827366:4827379:GGACCTGCCCTGCT->-  GGACCTGCCCTGCT   
3       AP5Z1:chr7:4787735:4787748:GGACCTGCCCTGCT->-  GGACCTGCCCTGCT   
4                ZNF592:chr15:85342440:85342440:G->A               G   

                varallele assembly            significance  \
0  TGCTGTAAACTGTAACTGTAAA   GRCh37              Pathogenic   
1  TGCTGTAAACTGTAACTGTAAA   GRCh38              Pathogenic   
2                       -   GRCh37              Pathogenic   
3                       -   GRCh38              Pathogenic   
4                       A   GRCh37  Uncertai

In [33]:

# Tally the ClinVar variants in clinvar_df by clinical-significance label.
# Only exact label matches are counted; combined labels such as
# 'Pathogenic/Likely pathogenic' are not included.

print('All undetermined')
print(len(clinvar_df[clinvar_df.significance.isin(['Uncertain significance'])]))

print('All determined')
print(len(clinvar_df[clinvar_df.significance.isin(
    ['Pathogenic', 'Likely pathogenic', 'Benign', 'Likely benign']
)]))

print('All pathogenic')
print(len(clinvar_df[clinvar_df.significance.isin(
    ['Pathogenic', 'Likely pathogenic']
)]))

print('All benign')
print(len(clinvar_df[clinvar_df.significance.isin(
    ['Benign', 'Likely benign']
)]))


All undetermined
111725
All determined
112896
All pathogenic
41811
All benign
71085


In [37]:
# Reference: the consequence vocabularies printed by the two parsing cells
# (ClinVar XML labels first, molecular_consequences.txt labels second).
'''
{'stop lost': True, 'splice donor variant': True, 'no-stop change?': True, 'non-coding transcript variant': True, 'synonymous mutation': True, '500B downstream variant': True, 'frameshift variant': True, 'Silent': True, 'Read-through mutation': True, 'regulatory region ablation': True, 'Frameshift': True, 'intron variant': True, 'Missense': True, 'synonymous variant': True, '5 prime UTR variant': True, 'splice acceptor variant': True, 'frameshift mutation': True, 'Splice Site donor': True, '2KB upstream variant': True, 'nonsense': True, '3 prime UTR variant': True, 'missense mutation': True, 'intergenic_variant': True, 'Nonsense': True, 'Splice Site acceptor': True, 'missense variant': True, 'inframe_variant': True, 'exon_loss': True}
{'intergenic': True, 'splice-5': True, 'cds-synon': True, 'missense': True, 'nearGene-3': True, 'ncRNA': True, 'STOP-GAIN': True, 'frameshift': True, 'cds-indel': True, 'STOP-LOSS': True, 'intron': True, 'nearGene-5': True, 'UTR-3': True, 'splice-3': True, 'UTR-5': True}
'''


# 3' UTR variants, under either vocabulary's label.
# (.loc replaces the removed DataFrame.ix indexer.)
clinvar_utr3 = clinvar_df.loc[clinvar_df.consequence.isin(['UTR-3', '3 prime UTR variant'])]
print(len(clinvar_utr3))

# NOTE: despite its name this selects only 'non-coding transcript variant'.
clinvar_utr3_and_noncoding = clinvar_df.loc[clinvar_df.consequence == 'non-coding transcript variant']
print(len(clinvar_utr3_and_noncoding))


# Splice-site and intronic variants, under either vocabulary's labels.
clinvar_splicing = clinvar_df.loc[clinvar_df.consequence.isin([
    'splice-5',
    'splice-3',
    'intron',
    'splice donor variant',
    'splice acceptor variant',
    'Splice Site donor',
    'Splice Site acceptor',
    'intron variant'
])]
print(len(clinvar_splicing))



28315
36
32279


In [45]:

# Significance breakdown of the splicing/intronic subset.
# Only the exact labels 'Pathogenic' and 'Benign' count as determined here;
# the 'Likely ...' labels are intentionally left out.

print(clinvar_splicing.significance.unique())

print('Splicing undetermined')
print(len(clinvar_splicing[clinvar_splicing.significance.isin(['Uncertain significance'])]))

print('Splicing determined')
print(len(clinvar_splicing[clinvar_splicing.significance.isin(['Pathogenic', 'Benign'])]))

print('Splicing pathogenic')
print(len(clinvar_splicing[clinvar_splicing.significance.isin(['Pathogenic'])]))

print('Splicing benign')
print(len(clinvar_splicing[clinvar_splicing.significance.isin(['Benign'])]))


['Benign' 'Pathogenic' 'Likely pathogenic, risk factor' 'drug response'
 'Conflicting interpretations of pathogenicity' 'association'
 'Uncertain significance' 'Pathogenic/Likely pathogenic' 'risk factor'
 'Pathogenic, risk factor' 'Benign/Likely benign, association' '-'
 'drug response, risk factor'
 'Conflicting interpretations of pathogenicity, protective' 'protective'
 'Pathogenic, other' 'Affects' 'other' 'Benign/Likely benign'
 'not provided' 'Likely pathogenic' 'Likely benign'
 'Benign/Likely benign, risk factor' 'Benign, other'
 'Benign/Likely benign, other' 'Pathogenic, protective'
 'Uncertain significance, drug response'
 'Conflicting interpretations of pathogenicity, other']
Splicing undetermined
6983
Splicing determined
9235
Splicing pathogenic
3548
Splicing benign
5687


In [44]:

# Significance breakdown of the 3' UTR subset.
# Only the exact labels 'Pathogenic' and 'Benign' count as determined here;
# the 'Likely ...' labels are intentionally left out.

print(clinvar_utr3.significance.unique())

print('UTR 3 undetermined')
print(len(clinvar_utr3[clinvar_utr3.significance.isin(['Uncertain significance'])]))

print('UTR 3 determined')
print(len(clinvar_utr3[clinvar_utr3.significance.isin(['Pathogenic', 'Benign'])]))

print('UTR 3 pathogenic')
print(len(clinvar_utr3[clinvar_utr3.significance.isin(['Pathogenic'])]))

print('UTR 3 benign')
print(len(clinvar_utr3[clinvar_utr3.significance.isin(['Benign'])]))


['Uncertain significance' 'Pathogenic'
 'Conflicting interpretations of pathogenicity'
 'Conflicting interpretations of pathogenicity, other' 'risk factor'
 'association' 'Pathogenic, risk factor' 'Benign, risk factor' 'Benign'
 'Benign/Likely benign' 'Likely benign' 'not provided'
 'Conflicting interpretations of pathogenicity, risk factor'
 'Benign, association' 'other' 'Benign, other' 'drug response']
UTR 3 undetermined
16849
UTR 3 determined
3219
UTR 3 pathogenic
18
UTR 3 benign
3201


In [25]:
# Restrict clinvar_df to 3' UTR variants (either vocabulary's label).
# A broader, now disabled, filter also admitted 'UTR-5', 'intron',
# 'undetermined', '5 prime UTR variant', 'non-coding transcript variant' and
# 'intron variant'.
# NOTE: this overwrites clinvar_df, so the cell is not idempotent with
# respect to cells that expect the unfiltered table.
# (.loc replaces the removed DataFrame.ix indexer.)
clinvar_df = clinvar_df.loc[clinvar_df.consequence.isin(['UTR-3', '3 prime UTR variant'])]

print(clinvar_df.head())
print(len(clinvar_df))


# Variant annotation lookup keyed by the composite variant key stored in the
# 'gene' column. Later rows overwrite earlier rows that share a key.
clinvar_dict = {}

for index, row in clinvar_df.iterrows() :
    clinvar_dict[row['gene']] = {
        'significance' : row['significance'],
        'vartype' : row['vartype'],
        'varcons' : row['consequence'],
        'refallele' : row['refallele'],
        'varallele' : row['varallele']
    }


        chr      start        end                                  gene  \
1625  chr22   45691707   45691707    UPK3A:chr22:45691707:45691707:T->C   
2886  chr13   84452863   84452863  SLITRK1:chr13:84452863:84452863:C->T   
3353   chr2   86444180   86444180     REEP1:chr2:86444180:86444180:C->A   
5493  chr22   51063477   51063477     ARSA:chr22:51063477:51063477:T->C   
6089   chr7  128589427  128589427    IRF5:chr7:128589427:128589427:G->A   

     refallele varallele assembly  \
1625         T         C   GRCh37   
2886         C         T   GRCh37   
3353         C         A   GRCh37   
5493         T         C   GRCh37   
6089         G         A   GRCh37   

                                           significance  \
1625                             Uncertain significance   
2886                                         Pathogenic   
3353       Conflicting interpretations of pathogenicity   
5493  Conflicting interpretations of pathogenicity, ...   
6089                           

In [26]:
#Create bed file of the filtered ClinVar variants
# Four columns: chr, start, end and the composite variant key ('gene').
# Coordinates are written exactly as stored in clinvar_df.

clinvar_bed = 'clinvar_snps.bed'
bed_columns = ['chr', 'start', 'end', 'gene']
clinvar_df[bed_columns].to_csv(clinvar_bed, sep='\t', index=False, header=False)

In [27]:

#Intersect Clinvar snps against APADB bed
# -wa -wb writes the ClinVar entry followed by the overlapping APADB entry on
# one line; the loading cell then reads columns 0-3 (SNP) and 7 (APADB site).

!bedtools intersect -wa -wb -a 'clinvar_snps.bed' -b 'apadb_shortsites.bed' > 'clinvar_apadb_intersection.bed'



In [28]:

# Load the bedtools intersection. Columns 0-3 describe the ClinVar SNP and
# column 7 holds the name of the overlapping APADB site.
intersect_columns = ['chr', 'snp_start', 'snp_end', 'snp_key', 'apadb_site']
intersect_df = pd.read_csv(
    'clinvar_apadb_intersection.bed',
    sep='\t',
    header=None,
    usecols=[0, 1, 2, 3, 7],
    names=intersect_columns
)

print(intersect_df.head())

print(intersect_df.shape[0])


     chr  snp_start    snp_end                             snp_key apadb_site
0  chr22   45691707   45691707  UPK3A:chr22:45691707:45691707:T->C    UPK3A.1
1   chr7  128589427  128589427  IRF5:chr7:128589427:128589427:G->A     IRF5.2
2  chr11   46761055   46761055     F2:chr11:46761055:46761055:G->A       F2.1
3  chr11    5246715    5246715      HBB:chr11:5246715:5246715:T->C      HBB.1
4  chr11    5246715    5246715      HBB:chr11:5246715:5246715:T->C      HBB.2
2338


In [29]:
# Show the intersection rows (if any) for APADB site 'TUBGCP6.2'.
print(intersect_df[intersect_df.apadb_site == 'TUBGCP6.2'])

Empty DataFrame
Columns: [chr, snp_start, snp_end, snp_key, apadb_site]
Index: []


In [30]:

#Generate APADB Clinvar variant dataset
# For every (SNP, APADB site) overlap, substitute the variant base into the
# site's reference sequence and record both sequences together with the site
# and variant annotations. The result is written to 'apadb_snps.csv'.

# Complement of a single upper-case base; any other symbol maps to ''.
base_complement = {'A' : 'T', 'C' : 'G', 'G' : 'C', 'T' : 'A'}

l_gene = []
l_refseq = []
l_varseq = []
l_region = []
l_significance = []
l_vartype = []
l_varpos = []
l_reads = []
l_totalreads = []
l_reluse = []
l_numsites = []
l_upsitedist = []
l_dnsitedist = []

for index, row in intersect_df.iterrows() :
    site_id = row['apadb_site']
    
    site_start = apadb_dict[site_id]['start']
    site_end = apadb_dict[site_id]['end']
    site_strand = apadb_dict[site_id]['strand']
    site_refseq = apadb_dict[site_id]['seq']
    
    site_num = apadb_dict[site_id]['num_sites']
    site_up_dist = apadb_dict[site_id]['up_site_dist']
    site_dn_dist = apadb_dict[site_id]['dn_site_dist']
    
    snp_start = row['snp_start']
    # snp_key ends in '...:REF->VAR': take the last character before '->'
    # and the first character after it.
    snp_ref = row['snp_key'].split('->')[0][-1]
    snp_var = row['snp_key'].split('->')[1][0]
    
    # Offset of the SNP within the stored site sequence, plus the reference
    # and variant bases as they appear in that sequence.
    # NOTE(review): the arithmetic assumes site_start/site_end are 0-based
    # half-open BED coordinates, snp_start is 1-based, and '-' strand
    # sequences are stored reverse-complemented -- confirm.
    if site_strand == '+' :
        relpos = snp_start - site_start - 1
        expected_ref = snp_ref
        var_base = snp_var
    elif site_strand == '-' :
        relpos = site_end - snp_start
        expected_ref = base_complement.get(snp_ref, '')
        var_base = base_complement.get(snp_var, '')
    else :
        # Unknown strand: no substitution is possible, so the variant
        # sequence would equal the reference.
        print('WARNING! Ref seq == Var seq.')
        continue
    
    # Reject offsets on either side of the sequence; an offset past the end
    # would otherwise raise an IndexError below.
    if relpos < 0 or relpos >= len(site_refseq) :
        print('WARNING (' + site_strand + ')! Relpos out of range (' + str(relpos) + ')')
        continue
    
    # A reference mismatch is reported but the substitution is still applied.
    if site_refseq[relpos] != expected_ref :
        print('WARNING (' + site_strand + ')! Ref base differs.')
        print(relpos)
        # max(0, ...) keeps the context slice from wrapping around when the
        # SNP lies within 5 bp of the sequence start.
        print(site_refseq[max(0, relpos-5):relpos+5])
        print(site_refseq[relpos])
        print(expected_ref)
    
    site_varseq = site_refseq[:relpos] + var_base + site_refseq[relpos+1:]
    
    if site_refseq == site_varseq :
        print('WARNING! Ref seq == Var seq.')
        continue
    
    l_gene.append(site_id)
    l_varpos.append(relpos)
    l_refseq.append(site_refseq)
    l_varseq.append(site_varseq)
    l_region.append(apadb_dict[site_id]['feature'])
    l_significance.append(clinvar_dict[row['snp_key']]['significance'])
    l_vartype.append(clinvar_dict[row['snp_key']]['vartype'])
        
    l_reads.append(apadb_dict[site_id]['reads'])
    l_totalreads.append(apadb_dict[site_id]['total_reads'])
    l_reluse.append(apadb_dict[site_id]['rel_use'])
    
    l_numsites.append(site_num)
    l_upsitedist.append(site_up_dist)
    l_dnsitedist.append(site_dn_dist)
        
        
snp_df = pd.DataFrame({'gene' : l_gene,
                    'var_pos' : l_varpos,
                    'ref_seq' : l_refseq,
                    'var_seq' : l_varseq,
                    'region' : l_region,
                    'significance' : l_significance,
                    'vartype' : l_vartype,
                    'num_sites' : l_numsites,
                    'up_site_dist' : l_upsitedist,
                    'dn_site_dist' : l_dnsitedist,
                    'reads' : l_reads,
                    'total_reads' : l_totalreads,
                    'rel_use' : l_reluse
                })

# Fix the column order, then sort by site id.
snp_df = snp_df[['gene', 'var_pos', 'ref_seq', 'var_seq', 'region', 'significance', 'vartype', 'num_sites', 'up_site_dist', 'dn_site_dist', 'reads', 'total_reads', 'rel_use']]
snp_df = snp_df.sort_values(by='gene')

print(snp_df.head())
print(len(snp_df))

# Tab-separated despite the .csv extension.
snp_df.to_csv('apadb_snps.csv', header=True, index=False, sep='\t')


         gene  var_pos                                            ref_seq  \
1784   AAAS.1      179  TGGGCGGGCCCAGGAACCCCCTGCTGGGGGTGGAGGCTCTATTCAT...   
2118   AARS.2      165  AATTTCTGCCCTGAGCCCTCCACGTCAGTGCCATCGGTCTAGAACC...   
984   AARS2.1      171  TGGAGGAAAGCCCTCCTTCCCTACCAGAATGAGTCCACCTCAGACA...   
861   AARS2.2      200  GTGAGGGTGTGACCCGTGTGGAAATCAGGGAAAAGCATTCCAGCCT...   
779   AARS2.2      173  GTGAGGGTGTGACCCGTGTGGAAATCAGGGAAAAGCATTCCAGCCT...   

                                                var_seq region  \
1784  TGGGCGGGCCCAGGAACCCCCTGCTGGGGGTGGAGGCTCTATTCAT...   UTR3   
2118  AATTTCTGCCCTGAGCCCTCCACGTCAGTGCCATCGGTCTAGAACC...   UTR3   
984   TGGAGGAAAGCCCTCCTTCCCTACCAGAATGAGTCCACCTCAGACA...   UTR3   
861   GTGAGGGTGTGACCCGTGTGGAAATCAGGGAAAAGCATTCCAGCCT...   UTR3   
779   GTGAGGGTGTGACCCGTGTGGAAATCAGGGAAAAGCATTCCAGCCT...   UTR3   

                significance                    vartype  num_sites  \
1784           Likely benign  single nucleotide variant          1   


In [46]:


# Significance breakdown of the SNPs that fall inside APADB site windows.
# Only the exact labels 'Pathogenic' and 'Benign' count as determined.

print('APA undetermined')
print(len(snp_df[snp_df.significance.isin(['Uncertain significance'])]))

print('APA determined')
print(len(snp_df[snp_df.significance.isin(['Pathogenic', 'Benign'])]))

print('APA pathogenic')
print(len(snp_df[snp_df.significance.isin(['Pathogenic'])]))

print('APA benign')
print(len(snp_df[snp_df.significance.isin(['Benign'])]))



APA undetermined
1442
APA determined
277
APA pathogenic
18
APA benign
259
