In [None]:
import csv
import json
import os
import re
import subprocess as sp
import sys
from collections import defaultdict
from glob import glob

import pandas as pd
import requests
from pysam import VariantFile, VariantRecord


In [None]:
"""
Download the latest files from the mutation catalogue github repo

The excel file contains the graded mutations and their confidence levels
The vcf file contains the genomic coordinates of the mutations linked to the graded mutations
"""

# NOTE(review): the local file names below do not match the remote ones
# (2023.5 vs 2023.6, 7Sep2023 vs 2Feb2024). They are kept as-is because later
# cells refer to these exact local paths.
excel_file_url = 'https://github.com/GTB-tbsequencing/mutation-catalogue-2023/raw/main/Final%20Result%20Files/WHO-UCN-TB-2023.6-eng.xlsx'
excel_file = 'WHO-UCN-TB-2023.5-eng.xlsx'
r = requests.get(excel_file_url, allow_redirects=True)
# Fail loudly on an HTTP error instead of saving an error page as the catalogue.
r.raise_for_status()
with open(excel_file, 'wb') as excel_out:
    excel_out.write(r.content)

vcf_file_url = 'https://github.com/GTB-tbsequencing/mutation-catalogue-2023/raw/main/Final%20Result%20Files/Genomic_coordinates_2Feb2024.vcf.gz'
r = requests.get(vcf_file_url, allow_redirects=True)
r.raise_for_status()
with open('Genomic_coordinates_7Sep2023.vcf.gz', 'wb') as vcf_out:
    vcf_out.write(r.content)

In [None]:
"""
snpEff doesn't use the right codon table for Mtb, so we need to modify the config file
"""
# The exact config line that selects the bacterial codon table for the Mtb genome.
codon_table_line = "Mycobacterium_tuberculosis_h37rv.Chromosome.codonTable : Bacterial_and_Plant_Plastid"

snpeff_dirs = glob(f'{sys.prefix}/share/snpeff*')
if len(snpeff_dirs)==1:
    snpeff_dir = snpeff_dirs[0]
    snpeff_config = f'{snpeff_dir}/snpEff.config'
    # Read and close the config before (possibly) re-opening it for appending.
    with open(snpeff_config) as config_in:
        codon_table_set = codon_table_line in config_in.read()
    if not codon_table_set:
        with open(snpeff_config, 'a') as config_out:
            config_out.write(f"\n{codon_table_line}\n")
    else:
        print("Right codon table already set")
else:
    print("Multiple or no snpeff installations found. Please remove all but one.")

In [None]:
"""
We annotate the vcf file with snpEff to make sure we are getting the same annotations as the graded mutations
"""
if os.path.isfile('ann.vcf'):
    print("Annotation file already exists")
else:
    annotate_cmd = "bcftools view Genomic_coordinates_7Sep2023.vcf.gz | sed 's/NC_000962.3/Chromosome/' | snpEff ann Mycobacterium_tuberculosis_h37rv  > ann.vcf"
    # With a shell pipeline the returned status is that of the last command (snpEff).
    return_code = sp.call(annotate_cmd, shell=True)
    if return_code != 0:
        # The `>` redirection creates ann.vcf even when the pipeline fails.
        # Remove the partial file so a re-run does not mistake it for a
        # finished annotation.
        if os.path.isfile('ann.vcf'):
            os.remove('ann.vcf')
        raise RuntimeError(f"snpEff annotation pipeline failed with exit code {return_code}")

In [None]:

"""
This function extracts both the graded mutation and the snpEff mutation from the vcf file
"""
def extract_mutation(var: "VariantRecord") -> dict:
    """
    Extract the graded (catalogue) mutation and the snpEff mutation from one VCF record.

    The record's ``graded_variant`` INFO value is split at its first underscore
    into a gene name and the catalogue mutation. The first ``ANN`` entry whose
    gene field equals that gene (ignoring ``downstream_gene_variant`` entries)
    supplies the snpEff mutation.

    Args:
        var: Record exposing ``info['graded_variant']`` (str) and ``info['ANN']``
            (iterable of ``|``-separated annotation strings).

    Returns:
        dict with keys ``gene``, ``catalogue_mutation``, ``snpEff_mutation`` and
        ``type`` (the effect field of the selected annotation).

    Raises:
        ValueError: If no annotation for the graded gene is found.
    """
    # Effects for which the nucleotide-level change (field 9) is reported
    # instead of the protein-level change (field 10).
    nucleotide_types = {
        'initiator_codon_variant',
        'synonymous_variant',
        'upstream_gene_variant',
        'splice_region_variant&stop_retained_variant',
        'splice_region_variant&non_coding_transcript_exon_variant',
        'non_coding_transcript_exon_variant',
    }
    graded_variant = var.info['graded_variant']
    # Only the first underscore separates gene from mutation; the mutation
    # itself may contain further underscores.
    gene, _, cat_mut = graded_variant.partition("_")

    annotations = []
    for raw_annotation in var.info['ANN']:
        fields = raw_annotation.split('|')
        # Exact comparison: `in ('downstream_gene_variant')` tests against a
        # plain string, i.e. a substring match, which would also skip effects
        # such as 'gene_variant'.
        if fields[1] == 'downstream_gene_variant':
            continue
        if fields[3] == gene:
            annotations.append(fields)

    if not annotations:
        raise ValueError(
            f"No snpEff annotation found for gene {gene!r} (graded variant {graded_variant!r})"
        )

    # Only the first matching annotation is used.
    fields = annotations[0]
    effect = fields[1]
    if effect in nucleotide_types:
        mutation = fields[9]
    elif effect == 'start_lost':
        mutation = 'p.Met1?'
    elif ter_match := re.match(r'p\.Ter(\d+)fs', fields[10]):
        # Rewrite a frameshift at the stop codon into the extension notation.
        mutation = f'p.Ter{ter_match.group(1)}ext*?'
    else:
        mutation = fields[10]
    return {
        'gene': gene,
        'catalogue_mutation': cat_mut,
        'snpEff_mutation': mutation,
        'type': effect,
    }


In [None]:
"""
Process the vcf file and compare the graded mutations with the snpEff mutations
Some of the mutations don't agree in terms of the assigned graded mutation and the mutation from snpEff so we dump those
"""
processed = []      # records whose catalogue mutation equals the snpEff mutation
dump = []           # (record, parsed dict) pairs where the two mutations disagree
unannotated = []    # (record, error) pairs for which extract_mutation() raised
vcf = VariantFile('ann.vcf')
for var in vcf:
    try:
        obj = extract_mutation(var)
    except Exception as err:
        # One unparseable record must not end the loop: a `break` here would
        # silently drop every remaining variant in the file.
        unannotated.append((var, err))
        continue
    if obj['catalogue_mutation']!=obj['snpEff_mutation']:
        dump.append((var,obj))
    else:
        processed.append(obj)


print('Number of validated mutations:', len(processed))
print('Number of mutations that could not be parsed:', len(dump))
print('Number of records without a usable snpEff annotation:', len(unannotated))





In [None]:
"""
Create a lookup table for the mutations graded mutation -> snpEff mutation
This is a one to many relationship. For example the graded mutation katG LoF is linked to many snpEff mutations (e.g. frameshifts)
"""
# Map (gene, catalogue mutation) -> set of (gene, snpEff mutation) pairs.
variant_lookup = defaultdict(set)
for var in processed:
    gene = var['gene']
    variant_lookup[(gene, var['catalogue_mutation'])].add((gene, var['snpEff_mutation']))

In [None]:
"""
Create function to generate the rows with different SO terms for LoF mutations
"""
def generate_lof_rows(gene: str, drug: str, confidence: str, comment: str, vtype: str) -> list:
    """
    Build one output row per SO term implied by a catalogue 'deletion' or 'LoF' entry.

    Args:
        gene: Gene name written to the ``Gene`` column.
        drug: Drug name written to the ``drug`` column.
        confidence: Confidence grading; 'Assoc w R' and 'Assoc w R - Interim'
            produce rows of type 'drug_resistance', anything else 'who_confidence'.
        comment: Free-text comment copied to every row.
        vtype: Either 'deletion' or 'LoF'; also stored as ``original_mutation``.

    Returns:
        List of row dicts, one per SO term, in a fixed order.

    Raises:
        ValueError: If ``vtype`` is neither 'deletion' nor 'LoF'.
    """
    so_terms_by_vtype = {
        'deletion': ['feature_ablation'],
        'LoF': ['transcript_ablation','feature_ablation','start_lost','stop_gained','frameshift_variant'],
    }
    if vtype not in so_terms_by_vtype:
        # Without this guard an unknown vtype fails later with an
        # UnboundLocalError that does not name the offending value.
        raise ValueError(f"Unsupported vtype {vtype!r}; expected one of {sorted(so_terms_by_vtype)}")

    # The row type depends only on the confidence, so compute it once.
    row_type = 'drug_resistance' if confidence in ('Assoc w R', 'Assoc w R - Interim') else 'who_confidence'
    return [
        {
            'Gene': gene,
            'Mutation': so_term,
            'type': row_type,
            'drug': drug,
            'original_mutation': vtype,
            'confidence': confidence,
            'source': 'WHO catalogue v2',
            'comment': comment,
        }
        for so_term in so_terms_by_vtype[vtype]
    ]

In [None]:
""""
Here we run through the excel file and lookup the graded mutations in the lookup table.
If the mutation is not found in the lookup table we dump it to a json file for checking.
If all goes well they should all be found in the lookup table.
"""

unparsed = []
lof_variants = ['deletion','LoF']
rows = []
gene_drug_associations = set()
catalogue_records = pd.read_excel(excel_file,sheet_name='Catalogue_master_file', skiprows=2).to_dict('records')
for row in catalogue_records:
    # dnaA mutations starting with 'c.-' are skipped entirely.
    if row['gene']=='dnaA' and row['mutation'].startswith('c.-'):
        continue
    drug = row['drug'].lower()
    key = (drug,row['gene'],row['mutation'])
    vkey = (row['gene'],row['mutation'])
    if row['mutation'] not in lof_variants and vkey not in variant_lookup:
        unparsed.append(key)
        continue
    gene_drug_associations.add((row['gene'], drug))
    # Drops the first three characters of the grading
    # (presumably a numeric prefix such as "1) " -- confirm against the spreadsheet).
    confidence = row['FINAL CONFIDENCE GRADING'][3:]
    row_type = 'drug_resistance' if confidence in ('Assoc w R', 'Assoc w R - Interim') else 'who_confidence'
    # .get() instead of indexing: variant_lookup is a defaultdict, so indexing a
    # missing LoF/deletion key would insert an empty entry into the lookup table.
    for v in variant_lookup.get(vkey, ()):
        rows.append({
            'Gene':v[0],
            'Mutation':v[1],
            'type': row_type,
            'drug': drug,
            'original_mutation': row["mutation"],
            'confidence': confidence,
            'source': 'WHO catalogue v2',
            'comment': row["Comment"]
        })
    if row['mutation'] in lof_variants:
        # Reuse the shared helper rather than duplicating the SO-term expansion.
        rows.extend(generate_lof_rows(row['gene'], drug, confidence, row["Comment"], row['mutation']))

print("Number of unparsed mutations:",len(unparsed))
print("Number of parsed mutations:",len(rows))

# Dump the catalogue entries that were not found in the lookup table for manual checking.
with open('who.json','w') as unparsed_out:
    json.dump(unparsed,unparsed_out,indent=4)

In [None]:
"""This adds pretomanid to the gene-drug associations based on the expert rules"""
# One set of LoF rows per gene covered by the pretomanid expert rule.
for gene in ["ddn", "fbiA", "fbiB", "fbiC", "fgd1", "Rv2983"]:
    rows.extend(
        generate_lof_rows(
            gene=gene,
            drug='pretomanid',
            confidence='Assoc w R - Interim',
            comment='Confers DLM-PMD cross-resistance',
            vtype='LoF',
        )
    )

In [None]:
"""
This adds the expert rule for Rifampicin/RRDR:
 - Non-silent variants in RRDR of rpoB
"""
# Mutation patterns covered by the RRDR expert rule.
rrdr_mutations = (
    'missense_variant_p.426_452',
    'conservative_inframe_deletion_c.1276_1356',
    'conservative_inframe_insertion_c.1276_1356',
)
# Fields shared by every RRDR row; 'Gene' and 'Mutation' are placed first
# below so the key order matches the other rows.
rrdr_shared_fields = {
    'type':'drug_resistance',
    'drug':'rifampicin',
    'original_mutation':'RRDR non-silent',
    'confidence':'Assoc w R - Interim',
    'source': 'WHO catalogue v2',
    'comment':'Expert rule: Non-silent variants in RRDR of rpoB',
}
for mutation in rrdr_mutations:
    rows.append({'Gene':'rpoB', 'Mutation':mutation, **rrdr_shared_fields})



In [None]:
"""
Create the mutations output file 
"""

# Collect every row into a single frame and write it without the index column.
df = pd.DataFrame(data=rows)
df.to_csv(path_or_buf='parsed_who_mutations.csv', index=False)

In [None]:
"""
Create a watchlist with all genes, regardless if they have a tier 1/2 mutation or not
"""

# sorted() gives a stable row order: iterating the set directly can produce a
# differently ordered genes.csv on each run.
watchlist_rows = [
    {
        'Gene':gene,
        'Type':'drug_resistance',
        'Drug':drug,
    }
    for gene, drug in sorted(gene_drug_associations)
]
# newline='' lets the csv module control line endings itself, as its
# documentation requires for files passed to a writer.
with open("genes.csv", "w", newline="") as genes_out:
    writer = csv.DictWriter(genes_out,fieldnames=['Gene','Type','Drug'])
    writer.writeheader()
    writer.writerows(watchlist_rows)


In [None]:
# Effect terms recognised in mutation strings. Each term is listed once, in
# order of first appearance in the previous version of this list.
supported_so_terms = [
    'coding_sequence_variant', 'chromosome', 'duplication', 'inversion',
    'inframe_insertion', 'disruptive_inframe_insertion', 'inframe_deletion', 'disruptive_inframe_deletion',
    'downstream_gene_variant', 'exon_variant', 'exon_loss_variant', 'frameshift_variant',
    'gene_variant', 'feature_ablation', 'gene_fusion', 'bidirectional_gene_fusion',
    'rearranged_at_DNA_level', 'intergenic_region', 'conserved_intergenic_variant', 'intragenic_variant',
    'intron_variant', 'conserved_intron_variant', 'miRNA', 'missense_variant',
    'initiator_codon_variant', 'stop_retained_variant', 'protein_protein_contact',
    'structural_interaction_variant', 'rare_amino_acid_variant', 'splice_acceptor_variant',
    'splice_donor_variant', 'splice_region_variant', 'stop_lost',
    # Previously split into the fragment '5_prime_UTR_premature_' plus
    # 'start_codon_gain_variant'; the latter is kept so existing substring
    # matches are unaffected.
    '5_prime_UTR_premature_start_codon_gain_variant', 'start_codon_gain_variant',
    'start_lost', 'stop_gained', 'synonymous_variant', 'start_retained',
    'transcript_variant', 'transcript_ablation', 'regulatory_region_variant', 'upstream_gene_variant',
    '3_prime_UTR_variant', '3_prime_UTR_truncation + exon_loss', '5_prime_UTR_variant',
    '5_prime_UTR_truncation + exon_loss_variant', 'sequence_feature + exon_loss_variant', 'functionally_normal',
    'conservative_inframe_deletion', 'conservative_inframe_insertion'
]

In [None]:
def so_term_in_mutation(mutation: str) -> bool:
    """Return True when any entry of ``supported_so_terms`` occurs as a substring of ``mutation``."""
    return any(so_term in mutation for so_term in supported_so_terms)

so_term_in_mutation('conservative_inframe_deletion_c.1276_1356')