## Vep results VCF details

![Vep results design](design.png)

In [5]:
import vcfpy
import json

def pretty_obj(d):
    """Print ``d`` as JSON, with keys sorted and a four-space indent."""
    rendered = json.dumps(d, indent=4, sort_keys=True)
    print(rendered)

[vcfpy](https://vcfpy.readthedocs.io/en/stable/index.html) is the python module used by variation to parse VCF files.

The main object used is [record](https://vcfpy.readthedocs.io/en/stable/api_record.html).

For variant we should use `record.ID`, which is an array of zero or more IDs. If there are no IDs we should show `.`. In the design we show RS IDs, but it has been confirmed that we should show whatever the user provided in their input VCF.


In [6]:
def open_vcf(vcf_file="vep-output-example-without-phase1-options.vcf"):
    """Open a VCF file and return a fresh ``vcfpy.Reader`` for it.

    A reader can only be iterated once, so call this again whenever a new
    pass over the records is needed.

    Args:
        vcf_file: Path of the VCF to open. Defaults to the example VEP output
            used throughout this notebook.

    Returns:
        The reader produced by ``vcfpy.Reader.from_path``.
    """
    return vcfpy.Reader.from_path(vcf_file)

In [7]:
# Show the ID, REF, location and ALT values for the first few records.
MAX_ROWS = 10

print("ID\tREF\tLocation\tAlt")

# `with` closes the underlying file once the preview is done.
with open_vcf() as records:
    # Pairing with range() stops after MAX_ROWS records, or earlier at end of file.
    for _, rec in zip(range(MAX_ROWS), records):
        # rec.ID is a list; VCF supports multiple semicolon delimited ids.
        # Named `variant_id` so the builtin `id` is not shadowed in the kernel.
        variant_id = ", ".join(rec.ID) if rec.ID else "."
        alt_alleles = ", ".join([alt.value for alt in rec.ALT])

        # NOTE(review): CHROM and begin are printed back to back with no
        # separator, and vcfpy documents `begin` as 0-based - confirm whether
        # the design should show `POS` with a separator instead.
        print(f"{variant_id}\t{rec.REF}\t{rec.CHROM}{rec.begin}\t{alt_alleles}")


ID	REF	Location	Alt
.	C	chr1982663	T
.	T	chr1982828	A
.	C	chr1982866	T
.	T	chr1983371	C
.	A	chr1983452	G
.	C	chr1984187	T
.	T	chr1984299	C
.	G	chr1984652	A
.	C	chr1985276	A
.	A	chr19243500	G


- Most of the data we want is found in the INFO column of the VCF in a value called CSQ.
- There will be a CSQ entry for every ALT allele in a given record.
- The CSQ is `|` pipe delimited with null values showing as empty `||`.
- Values can contain multiple results. They are delimited by an ampersand (`&`).



In [8]:
# Print the description of the CSQ INFO field from the VCF header.
# Only the header is needed here, so the reader is closed straight away.
with open_vcf() as reader:
    print(reader.header.get_info_field_info("CSQ").description)


Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|REF_ALLELE|UPLOADED_ALLELE|DISTANCE|STRAND|FLAGS|SYMBOL_SOURCE|HGNC_ID|MANE_SELECT|MANE_PLUS_CLINICAL|TSL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|UNIPROT_ISOFORM|SIFT|PolyPhen|HGVS_OFFSET|AF|AFR_AF|AMR_AF|EAS_AF|EUR_AF|SAS_AF|gnomADe_AF|gnomADe_AFR_AF|gnomADe_AMR_AF|gnomADe_ASJ_AF|gnomADe_EAS_AF|gnomADe_FIN_AF|gnomADe_NFE_AF|gnomADe_OTH_AF|gnomADe_SAS_AF|gnomADg_AF|gnomADg_AFR_AF|gnomADg_AMI_AF|gnomADg_AMR_AF|gnomADg_ASJ_AF|gnomADg_EAS_AF|gnomADg_FIN_AF|gnomADg_MID_AF|gnomADg_NFE_AF|gnomADg_OTH_AF|gnomADg_SAS_AF|CLIN_SIG|SOMATIC|PHENO|PUBMED|VAR_SYNONYMS|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|TRANSCRIPTION_FACTORS|pHaplo|pTriplo|OpenTargets_geneId|OpenTargets_l2g


In [9]:
# Derive the ordered list of CSQ field names from the header description.
# Only the header is needed, so the reader is closed as soon as it is read.
with open_vcf() as reader:
    csq_description = reader.header.get_info_field_info("CSQ").description

# The field list is whatever follows the last ":" in the description.
csq_header_str = csq_description.split(":")[-1].strip()
csq_headers = csq_header_str.split("|")
pretty_obj(csq_headers)

def csq_obj(headers, values):
    """Build a dict mapping each header name to the value at the same index.

    Raises ``IndexError`` if ``values`` has fewer items than ``headers``;
    any values beyond ``len(headers)`` are ignored.
    """
    return {name: values[pos] for pos, name in enumerate(headers)}

[
    "Allele",
    "Consequence",
    "IMPACT",
    "SYMBOL",
    "Gene",
    "Feature_type",
    "Feature",
    "BIOTYPE",
    "EXON",
    "INTRON",
    "HGVSc",
    "HGVSp",
    "cDNA_position",
    "CDS_position",
    "Protein_position",
    "Amino_acids",
    "Codons",
    "Existing_variation",
    "REF_ALLELE",
    "UPLOADED_ALLELE",
    "DISTANCE",
    "STRAND",
    "FLAGS",
    "SYMBOL_SOURCE",
    "HGNC_ID",
    "MANE_SELECT",
    "MANE_PLUS_CLINICAL",
    "TSL",
    "APPRIS",
    "CCDS",
    "ENSP",
    "SWISSPROT",
    "TREMBL",
    "UNIPARC",
    "UNIPROT_ISOFORM",
    "SIFT",
    "PolyPhen",
    "HGVS_OFFSET",
    "AF",
    "AFR_AF",
    "AMR_AF",
    "EAS_AF",
    "EUR_AF",
    "SAS_AF",
    "gnomADe_AF",
    "gnomADe_AFR_AF",
    "gnomADe_AMR_AF",
    "gnomADe_ASJ_AF",
    "gnomADe_EAS_AF",
    "gnomADe_FIN_AF",
    "gnomADe_NFE_AF",
    "gnomADe_OTH_AF",
    "gnomADe_SAS_AF",
    "gnomADg_AF",
    "gnomADg_AFR_AF",
    "gnomADg_AMI_AF",
    "gnomADg_AMR_AF",
    "gnomAD

The values found in the CSQ will change depending on the options selected during VEP. Some values will always be present. We should check this header to ensure we are looking in the correct index before pulling out values.

Predicted molecular consequence can be found in `Consequence`. This is a multi-value field, so you will need to split by `&` if present.

In [10]:
print("Predicted molecular consequence & alt allele frequency\n")

# Find the first CSQ entry in the file that carries a non-empty AF value.
# `with` closes the reader as soon as the search is over.
with open_vcf() as reader:
    # Lazily pair every record with each of its parsed CSQ entries; records
    # that have no CSQ key in INFO are skipped instead of raising KeyError.
    parsed_entries = (
        (rec, csq_obj(csq_headers, str_csq.split("|")))
        for rec in reader
        for str_csq in rec.INFO.get("CSQ", [])
    )
    first_hit = next(
        ((rec, csq) for rec, csq in parsed_entries if csq["AF"]),
        None,
    )

if first_hit is not None:
    rec, csq_dict = first_hit
    print(f"{rec.REF}\t{rec.CHROM}{rec.begin}\t{', '.join([alt.value for alt in rec.ALT])}")
    # Consequence is "&"-delimited when it holds more than one term.
    print(f"{', '.join(csq_dict['Consequence'].split('&'))}\t{csq_dict['AF']}")




Predicted molecular consequence & alt allele frequency

A	chr19243500	G
intron_variant, non_coding_transcript_variant	0.3730


In [11]:
# Pretty-print the first CSQ entry of the first record as a header -> value dict.
# Only one record is needed, so fetch it and close the reader immediately.
with open_vcf() as reader:
    first_rec = next(iter(reader), None)

# An empty VCF yields no record, in which case nothing is printed.
if first_rec is not None:
    csq_dict = csq_obj(csq_headers, first_rec.INFO["CSQ"][0].split("|"))
    pretty_obj(csq_dict)
        

{
    "AF": "",
    "AFR_AF": "",
    "AMR_AF": "",
    "APPRIS": "",
    "Allele": "T",
    "Amino_acids": "",
    "BIOTYPE": "lncRNA",
    "CCDS": "",
    "CDS_position": "",
    "CLIN_SIG": "",
    "Codons": "",
    "Consequence": "upstream_gene_variant",
    "DISTANCE": "4978",
    "EAS_AF": "",
    "ENSP": "",
    "EUR_AF": "",
    "EXON": "",
    "Existing_variation": "rs868831437",
    "FLAGS": "",
    "Feature": "ENST00000631376.1",
    "Feature_type": "Transcript",
    "Gene": "ENSG00000282591",
    "HGNC_ID": "HGNC:33581",
    "HGVS_OFFSET": "",
    "HGVSc": "",
    "HGVSp": "",
    "HIGH_INF_POS": "",
    "IMPACT": "MODIFIER",
    "INTRON": "",
    "MANE_PLUS_CLINICAL": "",
    "MANE_SELECT": "",
    "MOTIF_NAME": "",
    "MOTIF_POS": "",
    "MOTIF_SCORE_CHANGE": "",
    "OpenTargets_geneId": "",
    "OpenTargets_l2g": "",
    "PHENO": "",
    "PUBMED": "",
    "PolyPhen": "",
    "Protein_position": "",
    "REF_ALLELE": "C",
    "SAS_AF": "",
    "SIFT": "",
    "SOMATIC"

Gene & regulation features and transcripts

`Feature` and `Feature_type` hold details for both features and transcripts. It was agreed in the web variation catch up that anything with a `Feature_type` of `Transcript` will go in the transcript column for the design. Everything else should go in the features column.

For transcripts we also show the `BIOTYPE`

In [12]:
print("Transcripts")

# Find the first CSQ entry whose Feature is set and whose Feature_type is
# "Transcript". `with` closes the reader as soon as the search is over.
with open_vcf() as reader:
    # Lazily pair every record with each of its parsed CSQ entries; records
    # that have no CSQ key in INFO are skipped instead of raising KeyError.
    parsed_entries = (
        (rec, csq_obj(csq_headers, str_csq.split("|")))
        for rec in reader
        for str_csq in rec.INFO.get("CSQ", [])
    )
    first_transcript = next(
        (
            (rec, csq)
            for rec, csq in parsed_entries
            if csq["Feature"] and csq["Feature_type"] == "Transcript"
        ),
        None,
    )

if first_transcript is not None:
    rec, csq_dict = first_transcript
    print(f"{rec.REF}\t{rec.CHROM}{rec.begin}\t{', '.join([alt.value for alt in rec.ALT])}")
    print(f"{csq_dict['Feature']}\t{csq_dict['BIOTYPE']}")

Transcripts
C	chr1982663	T
ENST00000631376.1	lncRNA


For features we show `SYMBOL` if we have it, `Feature`, `BIOTYPE` if we have it and there is no gene, and finally `STRAND` provided it is either 1 (`forward strand`) or -1 (`reverse strand`).

In [13]:
print("Features")

# One example is printed per row below, the first time a non-transcript
# feature has a non-empty value in the "required" field.
# Each row is (required field, label text, field whose value is printed).
pending_examples = [
    ("Gene", "Gene:", "SYMBOL"),
    ("BIOTYPE", "Biotype:", "BIOTYPE"),
    ("STRAND", "Strand: ", "STRAND"),
]

# `with` closes the reader whether the loop ends early or reaches end of file.
with open_vcf() as reader:
    for rec in reader:
        # Records that have no CSQ key in INFO are skipped instead of raising KeyError.
        for str_csq in rec.INFO.get("CSQ", []):
            csq_dict = csq_obj(csq_headers, str_csq.split("|"))
            if not csq_dict['Feature'] or csq_dict['Feature_type'] == "Transcript":
                continue

            # Iterate over a copy because satisfied examples are removed as we go.
            for example in list(pending_examples):
                required_field, label, shown_field = example
                if csq_dict[required_field]:
                    print(f"{rec.REF}\t{rec.CHROM}{rec.begin}\t{', '.join([alt.value for alt in rec.ALT])}")
                    print(f"Feature:{csq_dict['Feature']} {label}{csq_dict[shown_field]}")
                    pending_examples.remove(example)

        # Checked once per record: stop when every example has been shown.
        if not pending_examples:
            break

Features
A	chr19243500	G
Feature:ENSR00001020616 Biotype:enhancer
C	chr19267212	T
Feature:ENSM00201890131 Strand: 1
