In [None]:
## Open the VCF file for reading
vcf = cyvcf2.VCF(vcf_path)

## Build the 'list' of VEP field names from the 'vep' header entry:
## take the second ":"-separated piece of its Description, trim the
## surrounding spaces/double quotes, and split it on "|"
vep_description = vcf["vep"]["Description"]
vep_format = vep_description.split(":")[1]
kcsq = vep_format.strip(' "').split("|")

## Load the gff transcript file and keep the entry stored under "20" (chr20)
transcripts = exter.read_gff(gff_path)
transcripts20 = transcripts["20"]

## Start with an empty dictionary whose missing keys default to an empty list
local_by_gene = defaultdict(list)

## For each variant in the VCF file, find the first functional, exonic,
## protein-coding consequence that passes the (optional) SIFT/PolyPhen filter
## and store the variant's localized records in 'local_by_gene', keyed by gene.
for variant in vcf:
    ## Keep only variants whose FILTER is unset (None) or is exactly one of
    ## "PASS", "SEGDUP" or "LCR"; 'continue' skips every other variant
    if variant.FILTER is not None and variant.FILTER not in ("PASS", "SEGDUP", "LCR"):
        continue

    ## Get the variant 'INFO' field from the VCF file
    info = variant.INFO
    ## Becomes True once a functional exonic consequence is seen for this variant
    ## (assigned here but not read anywhere inside this loop)
    any_functional = False
    ## Pair the VEP field names (kcsq) with the "|"-separated values of each
    ## comma-separated consequence entry, giving one dict per consequence
    csqs = [dict(zip(kcsq, c.split("|"))) for c in info['vep'].split(",")]

    for csq in csqs:
        ## Only consider consequences annotated as protein_coding
        if csq['BIOTYPE'] != 'protein_coding':
            continue
        ## Skip consequences with an empty Feature or EXON field
        ## (presumably intronic / not assigned to a transcript exon)
        if csq['Feature'] == '' or csq['EXON'] == '':
            continue #or csq['cDNA_position'] == '': continue
        ## Skip consequences that 'isfunctional' does not accept
        if not isfunctional(csq):
            continue
        any_functional = True

        ## Get the exonic positions of the variant
        local = transcripts20.localize(variant.start, variant.end)

        ## Get the VAF of the variant
        vaf = info.get('AF')

        ## Raw SIFT/PolyPhen annotations of this consequence; the parsing below
        ## expects the form "<prediction>(<score>)"
        SIFT = csq["SIFT"]
        PolyPhen = csq["PolyPhen"]

        ## Get the reference allele
        ref_allele = variant.REF

        ## One "REF>ALT" string per alternate allele
        mut = [ref_allele + ">" + alt for alt in variant.ALT]

        ## Remove variants that do not meet a specific VAF cutoff (disabled)
        # if vaf < 0.0000001: continue

        ## SIFT/PolyPhen filter.
        ## The defaults sit just on the failing side of each cutoff, so a
        ## consequence without a deleterious/damaging prediction cannot pass.
        sift_score = sift_cutoff + 0.1
        polyphen_score = polyphen_cutoff - 0.1
        if sift_polyphen_filter:
            ## The score is the text between the first "(" and the next ")"
            if "deleterious" in SIFT:
                sift_score = float(SIFT.split("(")[1].split(")")[0])
            if "damaging" in PolyPhen:
                polyphen_score = float(PolyPhen.split("(")[1].split(")")[0])
            ## To pass, the SIFT score must be below sift_cutoff AND the
            ## PolyPhen score must be above polyphen_cutoff.
            ## The parentheses matter: 'not a and b' parses as '(not a) and b',
            ## which would keep consequences that fail both tests.
            if not (sift_score < sift_cutoff and polyphen_score > polyphen_cutoff):
                continue

        ## Remove variants that are found in duplicated segments (disabled)
        #if variant.INFO.get('segdup') == True or variant.INFO.get('lcr'): continue

        ## Get the chromosome
        ## NOTE(review): 'chr' shadows the builtin of the same name; the name is
        ## kept in case later cells read it - consider renaming
        chr = variant.CHROM

        ## Annotate every localized record and file it under its gene
        for l in local:
            l["chr"] = chr
            l["chr_start"] = variant.start
            l["chr_stop"] = variant.end
            l["mutation"] = mut
            l["vaf"] = vaf
            l["SIFT"] = sift_score
            l["PolyPhen"] = polyphen_score
            local_by_gene[l["gene"]].append(l)
        ## Only the first passing consequence is recorded for each variant
        break

In [None]:
## Look up the records stored under the key 'test_key' (defined elsewhere)
## NOTE: if local_by_gene is still the defaultdict(list) created earlier, a key
## that is absent gets inserted with an empty list as a side effect of this lookup
local_by_gene[test_key]