# VCF_functions.py

A framework for processing Variant Call Format (VCF) files and Mutation Annotation Format (MAF) files into pandas DataFrames. It can also retrieve the mutations for proteins of interest using their Ensembl transcript identifiers.

In [2]:
import pandas as pd
import VCF_functions as vcf

## Process a VCF file

### Without specifying a list of transcript ids

In [4]:
# Read and process the VCF file without a transcript id list.
# NOTE(review): the two positional strings presumably play the same role as the
# `consequence` / `variant_class` keywords of read_process_maf — confirm.
df_vcf = vcf.read_process_vcf(
    './data/vcf_data/a/a.vcf',
    'missense_variant',
    'SNV',
    return_case_id=False,
)

In [5]:
# Display the result; without an id list, INFO remains a single unexpanded column.
df_vcf

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,NORMAL,TUMOR
0,chr7,1111,.,X,Y,.,PASS,"{'ACGTNacgtnMINUS': '0', 'ACGTNacgtnPLUS': '0'...",X,Y,Z


### With a list of specified transcript ids, the information in the `INFO` column for that transcript is expanded and added to the dataframe

In [6]:
# Same VCF call as before, now passing one Ensembl transcript id via id_list.
ids = ['ENST00000318560']
df_vcf = vcf.read_process_vcf(
    './data/vcf_data/a/a.vcf',
    'missense_variant',
    'SNV',
    return_case_id=False,
    id_list=ids,
)

In [7]:
# Display the result, which now carries the extra columns expanded from INFO.
df_vcf

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,NORMAL,...,CLIN_SIG,SOMATIC,PHENO,PUBMED,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,ENTREZ,EVIDENCE
0,chr7,1111,.,X,Y,.,PASS,"{'ACGTNacgtnMINUS': '0', 'ACGTNacgtnPLUS': '0'...",X,Y,...,,,,,,,,,5243,


In [9]:
# Show a subset of the per-transcript annotation columns of the expanded frame.
df_vcf[[
    'Consequence',
    'IMPACT',
    'SYMBOL',
    'Gene',
    'Feature_type',
    'Feature',
    'BIOTYPE',
    'EXON',
    'INTRON',
    'HGVSc',
    'HGVSp',
    'cDNA_position',
    'CDS_position',
    'Protein_position',
    'Amino_acids',
]]

Unnamed: 0,Consequence,IMPACT,SYMBOL,Gene,Feature_type,Feature,BIOTYPE,EXON,INTRON,HGVSc,HGVSp,cDNA_position,CDS_position,Protein_position,Amino_acids
0,missense_variant,MODERATE,ABL1,ENSG00000085563,Transcript,ENST00000318560,protein_coding,1/1130,,ENST00000622132.3:c.2428A>G,ENSP00000478255.1:p.Thr810Ala,2753/4778,2428/3843,1/1130,T/A


## Process a MAF file

### Without specifying a list of transcript ids

In [36]:
# Read and process the MAF file without a transcript id list.
df_maf = vcf.read_process_maf(
    './data/maf_data/a.maf',
    variant='Missense_Mutation',
    consequence='missense_variant',
    variant_class='SNP',
)

In [37]:
# Preview only the first two rows of the processed MAF dataframe.
df_maf.head(2)

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Variant_Classification,Variant_Type,...,normal_bam_uuid,case_id,GDC_FILTER,COSMIC,hotspot,RNA_Support,RNA_depth,RNA_ref_count,RNA_alt_count,callers
1,ATPAF1,64756,WUGSC,GRCh38,chr1,46665282,46665282,+,Missense_Mutation,SNP,...,d4d55edf-e3ee-4b91-8b03-87afdf9bfb5a,bfd49783-1767-469b-9d79-7822301c5efc,,COSM426393,N,Unknown,,,,muse;mutect2;varscan2
2,RORC,6097,WUGSC,GRCh38,chr1,151814660,151814660,+,Missense_Mutation,SNP,...,d4d55edf-e3ee-4b91-8b03-87afdf9bfb5a,bfd49783-1767-469b-9d79-7822301c5efc,,COSM423839;COSM4813058,N,Unknown,,,,muse;mutect2;varscan2


The filtering is done on the `all_effects` column. Some variants are reported as Silent for the canonical transcript but are missense for an alternative transcript; those variants are kept.

### With a list of specified transcript ids, the information in the `all_effects` column for that transcript is expanded and added to the dataframe

In [39]:
# Same MAF call as before, now passing two Ensembl transcript ids via id_list.
ids = ['ENST00000574428', 'ENST00000318247']
df_maf = vcf.read_process_maf(
    './data/maf_data/a.maf',
    variant='Missense_Mutation',
    consequence='missense_variant',
    variant_class='SNP',
    id_list=ids,
)

In [40]:
# Display the result, which now includes the columns suffixed with _all_effects.
df_maf

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Variant_Classification,Variant_Type,...,Consequence_all_effects,HGVSp_Short_all_effects,Transcript_ID_all_effects,RefSeq_all_effects,HGVSc_all_effects,Impact_all_effects,Canonical_all_effects,Sift_all_effects,PolyPhen_all_effects,Strand_all_effects
0,ATPAF1,64756,WUGSC,GRCh38,chr1,46665282,46665282,+,Missense_Mutation,SNP,...,missense_variant,p.D117N,ENST00000574428,,c.349G>A,MODERATE,,deleterious(0.05),possibly_damaging(0.834),-1
1,RORC,6097,WUGSC,GRCh38,chr1,151814660,151814660,+,Missense_Mutation,SNP,...,missense_variant,p.E283K,ENST00000318247,NM_005060.4,c.847G>A,MODERATE,YES,deleterious(0),benign(0.036),-1


When a list of transcript IDs is provided, the data in the `all_effects` column is expanded and added to the dataframe. Every column derived from the `all_effects` column carries the suffix `_all_effects`.