# Singleton Analysis Example

An example of using pyvariantfilter to find candidate variants in a singleton. We pretend that NA12878 does not have parental samples.


In [1]:
from pyvariantfilter.family import Family
from pyvariantfilter.family_member import FamilyMember
from pyvariantfilter.variant_set import VariantSet

## Create A Family

The first step in an analysis is to create a Family object which describes the relationships between your samples. In this case the proband is female and affected.

In [2]:
# A singleton analysis has a single family member: the proband.
# The FamilyMember arguments used here are the sample ID, the family ID,
# the sex (2, the proband is female) and the affected status (True).
my_family = Family('FAM001')

proband = FamilyMember('NA12878i', 'FAM001', 2, True)
my_family.add_family_member(proband)

# Mark the only member of the family as the proband.
my_family.set_proband(proband.get_id())

In [3]:
# Create a new VariantSet object; it is filled with variants from the VCF further down.
my_variant_set = VariantSet()

# Associate the my_family object (and with it the proband) with my_variant_set
# before any variants are read.
my_variant_set.add_family(my_family)

In [4]:
def passes_initial_filter(variant, proband_id, max_af=0.01, min_gq=20):
    """
    Decide whether a variant from the VCF should be imported.

    We import the variant if the proband carries it, it passes the genotype
    and variant level quality filters and it passes the gnomAD frequency
    cutoff (1% by default) AND

    a) it has a ClinVar annotation containing 'pathogenic' OR
    b) its worst consequence is one of the relevant consequences listed below.

    Parameters
    ----------
    variant
        Variant object providing has_alt(), passes_gt_filter(), passes_filter(),
        filter_on_numerical_info_annotation_lte(), get_genes() and
        get_worst_consequence().
    proband_id
        Sample ID of the proband.
    max_af
        Cutoff handed to the gnomAD frequency filter for every variant type.
        Defaults to 0.01.
    min_gq
        Minimum genotype quality handed to passes_gt_filter(). Defaults to 20.

    Returns
    -------
    bool
        True if the variant should be imported, otherwise False.
    """

    # Worst consequences that make a variant worth importing.
    relevant_consequences = {
        'transcript_ablation',
        'splice_acceptor_variant',
        'splice_donor_variant',
        'stop_gained',
        'frameshift_variant',
        'stop_lost',
        'start_lost',
        'transcript_amplification',
        'inframe_insertion',
        'inframe_deletion',
        'missense_variant',
        'protein_altering_variant',
        'splice_region_variant',
        'incomplete_terminal_codon_variant',
        'start_retained_variant',
        'stop_retained_variant',
    }

    # The proband must have the variant and we must pass the genotype and
    # variant level filters.
    if not (variant.has_alt(proband_id)
            and variant.passes_gt_filter(proband_id, min_gq=min_gq)
            and variant.passes_filter()):
        return False

    # The filter_on_numerical_info_annotation_lte() function allows us to filter
    # on a numerical INFO annotation. We can set different cutoffs for different
    # variant types. For example ad_het is variants in which the proband is
    # heterozygous on an autosome. Here every variant type gets the same cutoff
    # and the result is used as a single truthy/falsy value.
    # NOTE(review): zero_values presumably lists the values treated as a
    # frequency of zero - confirm against the pyvariantfilter documentation.
    freq_gnomad = variant.filter_on_numerical_info_annotation_lte(
        annotation_key='gnomad_popmax_af',
        ad_het=max_af,
        ad_hom_alt=max_af,
        x_male=max_af,
        x_female_het=max_af,
        x_female_hom=max_af,
        compound_het=max_af,
        y=max_af,
        mt=max_af,
        zero_values=['.', '', None, -1],
    )

    # Both import routes require the frequency filter to pass, so there is no
    # need to look at ClinVar or the consequence when it fails.
    if not freq_gnomad:
        return False

    # Coopt the get_genes() function to get the clinvar annotation VEP field.
    clinvar = variant.get_genes(feature_key='CLIN_SIG')

    # Case-insensitive substring match: besides 'pathogenic' this also matches
    # e.g. 'likely_pathogenic' and 'conflicting_interpretations_of_pathogenicity'.
    if any('pathogenic' in anno.lower() for anno in clinvar):
        return True

    # Not pathogenic in clinvar - import only if the consequence is relevant.
    return variant.get_worst_consequence() in relevant_consequences

In [5]:
# Read the annotated VCF using passes_initial_filter() as the import filter.
# NOTE(review): args presumably supplies the extra positional arguments for
# filter_func after the variant itself (here the proband ID) - confirm.
my_variant_set.read_variants_from_vcf(
    'test_data/test.norm.anno.vcf.gz',
    proband_variants_only=True,
    filter_func=passes_initial_filter,
    args=(proband.get_id(),),
)

In [6]:
# Report how many variants passed the initial filter.
print(f'{len(my_variant_set.variant_dict)} variants have been loaded into the variant set.')

389 variants have been loaded into the variant set.


## Get Compound Hets

Now that we have a VariantSet object loaded with variants, we can find compound hets. There are different methods for this depending on whether or not the proband has both parents. Since in this case we do not, we only look at variants in transcripts where the proband has two or more heterozygous variants.

In [7]:
# Create an attribute my_variant_set.candidate_compound_het_dict in which each transcript
# is a key and the variants within that transcript are the values.
my_variant_set.get_candidate_compound_hets()

# As we are pretending we do not have any parents we cannot phase the compound hets.
# NOTE(review): this call presumably populates my_variant_set.final_compound_hets,
# which is passed to the final filter below - confirm.
my_variant_set.get_unfiltered_compound_hets_as_dict()

## Apply Inheritance Filter

We want to find variants which match certain inheritance patterns. In this case:

autosomal_dominant, autosomal_reccessive, x_reccessive, x_dominant, de_novo, compound_het, y, mt

In [8]:
def passes_final_filter(variant, compound_het_dict):
    """
    Final filter applied to the variants already loaded into the variant set.

    A variant is kept when it passes the 1% gnomAD frequency cutoff and
    matches_inheritance_model() reports a match for the inheritance models
    listed below.

    Parameters
    ----------
    variant
        Variant object providing filter_on_numerical_info_annotation_lte()
        and matches_inheritance_model().
    compound_het_dict
        Compound het dictionary handed on to matches_inheritance_model().

    Returns
    -------
    bool
        True if the variant should be kept, otherwise False.
    """
    max_af = 0.01

    # Inheritance models we are interested in.
    inheritance_models = [
        'autosomal_dominant',
        'autosomal_reccessive',
        'x_reccessive',
        'x_dominant',
        'de_novo',
        'compound_het',
        'y',
        'mt',
    ]

    # The same frequency cutoff is used for every variant type.
    cutoffs = dict.fromkeys(
        ('ad_het', 'ad_hom_alt', 'x_male', 'x_female_het',
         'x_female_hom', 'compound_het', 'y', 'mt'),
        max_af,
    )

    rare_enough = variant.filter_on_numerical_info_annotation_lte(
        annotation_key='gnomad_popmax_af',
        zero_values=['.', '', None, -1],
        **cutoffs,
    )

    # Too common: reject without consulting the inheritance models.
    if not rare_enough:
        return False

    return bool(variant.matches_inheritance_model(inheritance_models, compound_het_dict))

In [9]:
# Apply the passes_final_filter() function to the variant set. The compound het
# dictionary is handed over through args.

my_variant_set.filter_variants(passes_final_filter, args=(my_variant_set.final_compound_hets,))

In [10]:
# Report how many variants remain after the final filter.
print(f'{len(my_variant_set.variant_dict)} variants left after filtering.')

385 variants left after filtering.


In [11]:
# Convert the variant set to a dataframe. VEP fields get 'csq_' as a prefix and each
# transcript that a variant is in gets its own row, so a variant can span several rows.

df = my_variant_set.to_df()

In [12]:
# Preview the most informative columns for the first few rows.
df[[
    'variant_id',
    'csq_SYMBOL',
    'csq_Feature',
    'worst_consequence',
    'inheritance_models',
    'NA12878i_GT',
]].head()

Unnamed: 0,variant_id,csq_SYMBOL,csq_Feature,worst_consequence,inheritance_models,NA12878i_GT
0,1:914414CGAA>C,PLEKHN1,NM_001160184.1,inframe_deletion,autosomal_dominant,CGAA/C
1,1:914414CGAA>C,PERM1,NM_001291366.1,inframe_deletion,autosomal_dominant,CGAA/C
2,1:914414CGAA>C,PERM1,NM_001291367.1,inframe_deletion,autosomal_dominant,CGAA/C
3,1:914414CGAA>C,PLEKHN1,NM_032129.2,inframe_deletion,autosomal_dominant,CGAA/C
4,1:914414CGAA>C,PERM1,NR_027693.1,inframe_deletion,autosomal_dominant,CGAA/C


In [13]:
# How many variants of each type do we have?
# Step 1: collapse the per-transcript rows to one row per
# (inheritance model, variant) combination.

grouped = (
    df.groupby(['inheritance_models', 'variant_id'])
    .count()
    .reset_index()
)

In [14]:
# Step 2: count the variants per inheritance model.
# The table is derived from df rather than from the previous value of `grouped`,
# so re-running this cell gives the same result instead of regrouping the
# already aggregated frame.
grouped = (
    df.groupby(['inheritance_models', 'variant_id'])
    .count()
    .reset_index()
    .groupby('inheritance_models')
    .count()
)

In [15]:
# Show only the variant_id column, which now holds the number of variants per inheritance model.
grouped[['variant_id']]

Unnamed: 0_level_0,variant_id
inheritance_models,Unnamed: 1_level_1
autosomal_dominant,303
autosomal_dominant|compound_het,64
autosomal_reccessive,5
x_dominant,10
x_dominant|compound_het,2
x_reccessive,1


In [16]:
# Convert the whole variant set to a plain dictionary.
test = my_variant_set.to_dict()

In [17]:
# Display the dictionary: a 'family' entry describing the family members and a
# 'variants' list with one entry per variant. Note that the full output is long.
test

{'family': {'family_id': 'FAM001',
  'family_members': [{'family_member_id': 'NA12878i',
    'family_id': 'FAM001',
    'sex': 2,
    'affected': True,
    'mum': None,
    'dad': None,
    'proband': True}]},
 'variants': [{'chromosome': '1',
   'position': 914414,
   'ref': 'CGAA',
   'alt': 'C',
   'filter_status': 'PASS',
   'worst_consequence': 'inframe_deletion',
   'all_genes': 'PLEKHN1||PERM1',
   'inheritance_models': 'autosomal_dominant',
   'info_annotations': {'MaxSpliceAI': -1.0,
    'gnomad_popmax_af': 0.0026740198954939842,
    'gnomad_nhomalt': 1.0,
    'gnomad_popmax_af_controls': 0.004065040033310652,
    'gnomad_nhomalt_controls': 0.0},
   'picked_transcript_annotations': [{'Allele': '-',
     'Consequence': 'inframe_deletion',
     'IMPACT': 'MODERATE',
     'SYMBOL': 'PERM1',
     'Gene': '84808',
     'Feature_type': 'Transcript',
     'Feature': 'NM_001291366.1',
     'BIOTYPE': 'protein_coding',
     'EXON': '2/4',
     'INTRON': '',
     'HGVSc': 'NM_001291366.