In [37]:
import cyvcf2
import sys
import exter ## exter.py file has to be in the working directory 
import re
import pandas as pd
import numpy as np
from collections import defaultdict
from scipy.stats import percentileofscore
import subprocess
from IPython.display import SVG
gff_path = "/Users/jasonkunisaki/git/quinlan_lab_rotation/reference/Homo_sapiens.GRCh37.82.chr20.gff3" 
wgs_vcf_path = "/Users/jasonkunisaki/git/quinlan_lab_rotation/reference/gnomad.genomes.r2.1.1.exome_calling_intervals.sites.vcf.bgz"
exome_vcf_path = "/Users/jasonkunisaki/git/quinlan_lab_rotation/reference/gnomad.exomes.r2.1.1.sites.20.vcf.bgz"
sift_polyphen_filter = True
sift_cutoff = 0.03
polyphen_cutoff = 0.6

In [38]:
def isfunctional(csq):
    return any(c in csq['Consequence'] for c in ('stop_gained', 'stop_lost', 'start_lost', 
                                                 'initiator_codon_variant', 'rare_amino_acid_variant', 
                                                 'missense_variant', 'protein_altering_variant', 
                                                 'frameshift_variant', 'inframe_insertion', 'inframe_deletion')) \
    or (('splice_donor_variant' in csq['Consequence'] or 'splice_acceptor_variant' in csq['Consequence']) \
        and 'coding_sequence_variant' in csq['Consequence'])

In [39]:
def processVCF(vcf_path, gff_path, sift_polyphen_filter, sift_cutoff, polyphen_cutoff, type="exome"):
    ## Read in the VCF file 
    vcf = cyvcf2.VCF(vcf_path)
    
    ## Get the VEP description fields into a 'list'
    kcsq = vcf["vep"]["Description"].split(":")[1].strip(' "').split("|")
    
    ## Read in the gff transcript file and subset calls from chr20
    transcripts = exter.read_gff(gff_path)
    transcripts20 = transcripts["20"] # get chr20

    ## Make an empty dictionary 
    local_by_gene = defaultdict(list)
    
    ## For each variant in the VCF file: 
    for variant in vcf:
        if variant.CHROM != "20": continue
        ## Remove variants that have "None, PASS, SEGDUP, or LCR" as the filter result
        ## NOTE: 'continue' will remove variants without these FILTER results
        if not (variant.FILTER is None or variant.FILTER in ["PASS", "SEGDUP", "LCR"]):
                continue
        ## Get the variant 'INFO' field from the VCF file for each variant
        info = variant.INFO
        ## Set the functional consequence of the variant to 'False'
        any_functional = False
        ## Merge the kcsq description keys with 
        csqs = [dict(zip(kcsq, c.split("|"))) for c in info['vep'].split(",")]

        ## For all variant/consequences that are within protein_coding regions
        for csq in (c for c in csqs if c['BIOTYPE'] == 'protein_coding'):
            ## NOTE: 'continue' will ignore variants that are intronic 
            if csq['Feature'] == '' or csq['EXON'] == '': 
                continue #or csq['cDNA_position'] == '': continue
            ## Check to see if the variant's consequence is functional (based on the mutation types listed above)
            ## NOTE: 'continue' will ignore variants that are not functional
            if not isfunctional(csq): 
                continue
            ## Assigns True to the variant as functional if it is NOT intronic and is deemed as functional by the 'isfunctional' function
            any_functional = True

            ## Get the exonic positions of the variants
            local = transcripts20.localize(variant.start, variant.end)

            ## Get the VAF of the variant
            vaf = variant.INFO.get('AF')

            ## Get the SIFT/PolyPhen results (IS THIS RIGHT)
            SIFT = csq["SIFT"]
            PolyPhen = csq["PolyPhen"]

            ## Get the reference allele 
            ref_allele = variant.REF

            ## Get string for the mutation 
            mut = []
            for alt in variant.ALT: 
                mut.append(ref_allele + ">" + alt) 

            ## Remove variants that do not meet a specific VAF cutoff 
            # if vaf < 0.0000001: continue

            ## Perform SIFT and Polyphen filters to remove the least delterious/damaging variant(s)
            ## Go through each pair of scores and check if the varaint has SIFT and PolyPhen scores
            sift_score = sift_cutoff + 0.1
            polyphen_score = polyphen_cutoff - 0.1
            if sift_polyphen_filter == True: 
                if re.findall("deleterious", SIFT):
                    sift_score = float(re.split('\)', re.split('\(', SIFT)[1])[0])
                if re.findall("damaging", PolyPhen):
                    polyphen_score = float(re.split('\)', re.split('\(', PolyPhen)[1])[0])
                ## SIFT score has to be below the sift_cutoff to pass 
                ## PolyPhen score has to be above the polyphen_cutoff to pass 
                if not sift_score < sift_cutoff and polyphen_score > polyphen_cutoff:
                    continue

            ## Remove variants that are found in duplicated segments 
            #if variant.INFO.get('segdup') == True or variant.INFO.get('lcr'): continue
            
            for l in local:
                l["chr"] = variant.CHROM ## Get the chromosome 
                l["chr_start"] = variant.start
                l["chr_stop"] = variant.end
                l["mutation"] = mut
                l["vaf"] = vaf
                l["SIFT"] = sift_score
                l["PolyPhen"] = polyphen_score
                l["type"] = type
                local_by_gene[l["gene"]].append(l)
            break
    return(local_by_gene)

In [42]:
wgs_local_by_gene = processVCF(vcf_path=wgs_vcf_path, gff_path=gff_path, sift_polyphen_filter=sift_polyphen_filter, sift_cutoff=sift_cutoff, polyphen_cutoff=polyphen_cutoff, type="whole genome sequencing")

In [43]:
wgs_local_by_gene["DEFB125"]

[{'gene': 'DEFB125',
  'start': 5,
  'stop': 6,
  'strand': '+',
  'exon': 0,
  'chr': '20',
  'chr_start': 68356,
  'chr_stop': 68357,
  'mutation': ['A>G'],
  'vaf': 3.18552010867279e-05,
  'SIFT': 0.13,
  'PolyPhen': 0.5,
  'type': 'whole genome sequencing'},
 {'gene': 'DEFB125',
  'start': 11,
  'stop': 12,
  'strand': '+',
  'exon': 0,
  'chr': '20',
  'chr_start': 68362,
  'chr_stop': 68363,
  'mutation': ['A>T'],
  'vaf': 0.00012747000437229872,
  'SIFT': 0.13,
  'PolyPhen': 0.5,
  'type': 'whole genome sequencing'},
 {'gene': 'DEFB125',
  'start': 22,
  'stop': 23,
  'strand': '+',
  'exon': 0,
  'chr': '20',
  'chr_start': 68373,
  'chr_stop': 68374,
  'mutation': ['C>A'],
  'vaf': 3.1859301088843495e-05,
  'SIFT': 0.13,
  'PolyPhen': 0.5,
  'type': 'whole genome sequencing'},
 {'gene': 'DEFB125',
  'start': 32,
  'stop': 33,
  'strand': '+',
  'exon': 0,
  'chr': '20',
  'chr_start': 68383,
  'chr_stop': 68384,
  'mutation': ['G>C'],
  'vaf': 3.18552010867279e-05,
  'SIFT': 0

In [65]:
exome_local_by_gene = processVCF(vcf_path=exome_vcf_path, gff_path=gff_path, sift_polyphen_filter=sift_polyphen_filter, sift_cutoff=sift_cutoff, polyphen_cutoff=polyphen_cutoff, type="exome sequencing")

In [66]:
exome_local_by_gene["DEFB125"]

[{'gene': 'DEFB125',
  'start': -1,
  'stop': 0,
  'strand': '+',
  'exon': 0,
  'chr': '20',
  'chr_start': 68350,
  'chr_stop': 68351,
  'mutation': ['A>G'],
  'vaf': 3.979720077040838e-06,
  'SIFT': 0.0,
  'PolyPhen': 0.5,
  'type': 'exome sequencing'},
 {'gene': 'DEFB125',
  'start': 11,
  'stop': 12,
  'strand': '+',
  'exon': 0,
  'chr': '20',
  'chr_start': 68362,
  'chr_stop': 68363,
  'mutation': ['A>C'],
  'vaf': 3.979179837187985e-06,
  'SIFT': 0.13,
  'PolyPhen': 0.5,
  'type': 'exome sequencing'},
 {'gene': 'DEFB125',
  'start': 11,
  'stop': 12,
  'strand': '+',
  'exon': 0,
  'chr': '20',
  'chr_start': 68362,
  'chr_stop': 68363,
  'mutation': ['A>T'],
  'vaf': 0.0001949800061993301,
  'SIFT': 0.13,
  'PolyPhen': 0.5,
  'type': 'exome sequencing'},
 {'gene': 'DEFB125',
  'start': 12,
  'stop': 13,
  'strand': '+',
  'exon': 0,
  'chr': '20',
  'chr_start': 68363,
  'chr_stop': 68364,
  'mutation': ['T>G'],
  'vaf': 3.9790897972125094e-06,
  'SIFT': 0.0,
  'PolyPhen': 0.

In [67]:
combined_wgs_exome_variants = defaultdict(list)
for key in wgs_local_by_gene.keys(): 
    if len(exome_local_by_gene[key]) == 0: 
        combined_wgs_exome_variants[key] = wgs_local_by_gene[key]
    else: 
        combined_wgs_exome_variants[key] = exome_local_by_gene[key]
        combined_wgs_exome_variants[key].append(wgs_local_by_gene[key])

In [68]:
combined_wgs_exome_variants["DEFB125"]

[{'gene': 'DEFB125',
  'start': -1,
  'stop': 0,
  'strand': '+',
  'exon': 0,
  'chr': '20',
  'chr_start': 68350,
  'chr_stop': 68351,
  'mutation': ['A>G'],
  'vaf': 3.979720077040838e-06,
  'SIFT': 0.0,
  'PolyPhen': 0.5,
  'type': 'exome sequencing'},
 {'gene': 'DEFB125',
  'start': 11,
  'stop': 12,
  'strand': '+',
  'exon': 0,
  'chr': '20',
  'chr_start': 68362,
  'chr_stop': 68363,
  'mutation': ['A>C'],
  'vaf': 3.979179837187985e-06,
  'SIFT': 0.13,
  'PolyPhen': 0.5,
  'type': 'exome sequencing'},
 {'gene': 'DEFB125',
  'start': 11,
  'stop': 12,
  'strand': '+',
  'exon': 0,
  'chr': '20',
  'chr_start': 68362,
  'chr_stop': 68363,
  'mutation': ['A>T'],
  'vaf': 0.0001949800061993301,
  'SIFT': 0.13,
  'PolyPhen': 0.5,
  'type': 'exome sequencing'},
 {'gene': 'DEFB125',
  'start': 12,
  'stop': 13,
  'strand': '+',
  'exon': 0,
  'chr': '20',
  'chr_start': 68363,
  'chr_stop': 68364,
  'mutation': ['T>G'],
  'vaf': 3.9790897972125094e-06,
  'SIFT': 0.0,
  'PolyPhen': 0.

In [None]:
## Read in the VCF file 
vcf = cyvcf2.VCF(vcf_path)

## Get the VEP description fields into a 'list'
kcsq = vcf["vep"]["Description"].split(":")[1].strip(' "').split("|")

## Read in the gff transcript file and subset calls from chr20
transcripts = exter.read_gff(gff_path)
transcripts20 = transcripts["20"] # get chr20

## Make an empty dictionary 
local_by_gene = defaultdict(list)

## For each variant in the VCF file: 
for variant in vcf:
    ## Remove variants that have "None, PASS, SEGDUP, or LCR" as the filter result
    ## NOTE: 'continue' will remove variants without these FILTER results
    if not (variant.FILTER is None or variant.FILTER in ["PASS", "SEGDUP", "LCR"]):
            continue
    ## Get the variant 'INFO' field from the VCF file for each variant
    info = variant.INFO
    ## Set the functional consequence of the variant to 'False'
    any_functional = False
    ## Merge the kcsq description keys with 
    csqs = [dict(zip(kcsq, c.split("|"))) for c in info['vep'].split(",")]

    ## For all variant/consequences that are within protein_coding regions
    for csq in (c for c in csqs if c['BIOTYPE'] == 'protein_coding'):
        ## NOTE: 'continue' will ignore variants that are intronic 
        if csq['Feature'] == '' or csq['EXON'] == '': 
            continue #or csq['cDNA_position'] == '': continue
        ## Check to see if the variant's consequence is functional (based on the mutation types listed above)
        ## NOTE: 'continue' will ignore variants that are not functional
        if not isfunctional(csq): 
            continue
        ## Assigns True to the variant as functional if it is NOT intronic and is deemed as functional by the 'isfunctional' function
        any_functional = True

        ## Get the exonic positions of the variants
        local = transcripts20.localize(variant.start, variant.end)

        ## Get the VAF of the variant
        vaf = variant.INFO.get('AF')

        ## Get the SIFT/PolyPhen results (IS THIS RIGHT)
        SIFT = csq["SIFT"]
        PolyPhen = csq["PolyPhen"]
                
        ## Get the reference allele 
        ref_allele = variant.REF

        ## Get string for the mutation 
        mut = []
        for alt in variant.ALT: 
            mut.append(ref_allele + ">" + alt) 
        
        ## Remove variants that do not meet a specific VAF cutoff 
        # if vaf < 0.0000001: continue
        
        ## Perform SIFT and Polyphen filters to remove the least delterious/damaging variant(s)
        ## Go through each pair of scores and check if the varaint has SIFT and PolyPhen scores
        sift_score = sift_cutoff + 0.1
        polyphen_score = polyphen_cutoff - 0.1
        if sift_polyphen_filter == True: 
            if re.findall("deleterious", SIFT):
                sift_score = float(re.split('\)', re.split('\(', SIFT)[1])[0])
            if re.findall("damaging", PolyPhen):
                polyphen_score = float(re.split('\)', re.split('\(', PolyPhen)[1])[0])
            ## SIFT score has to be below the sift_cutoff to pass 
            ## PolyPhen score has to be above the polyphen_cutoff to pass 
            if not sift_score < sift_cutoff and polyphen_score > polyphen_cutoff:
                continue

        ## Remove variants that are found in duplicated segments 
        #if variant.INFO.get('segdup') == True or variant.INFO.get('lcr'): continue

        ## Get the chromosome 
        chr = variant.CHROM

        for l in local:
            l["chr"] = chr
            l["chr_start"] = variant.start
            l["chr_stop"] = variant.end
            l["mutation"] = mut
            l["vaf"] = vaf
            l["SIFT"] = sift_score
            l["PolyPhen"] = polyphen_score
            local_by_gene[l["gene"]].append(l)
        break

In [None]:
local_by_gene[test_key]