In [None]:
import re
import xml.etree.ElementTree as ET

import hgvs.assemblymapper
import hgvs.dataproviders.uta
import hgvs.parser
import pandas as pd
import requests
from numpy import nan

# Take the table from Rafique et al., 2021 supplementary and annotate it 

## 1. Import and annotate the table from Rafique using Vcfanno
The table was slightly cleaned and converted from MS Word to .csv.

In [None]:
# MODY table
# Load the cleaned variant table. Column indices 0-10 are passed through `str`
# via `converters`, so those columns are read as text rather than inferred dtypes.
# NOTE(review): the file name contains a space after 'from_' - confirm it matches the file on disk.
mody_df = pd.read_csv('input/clean_variants_from_ Rafique.csv',
                      converters={i: str for i in range(11)}, low_memory=False)
mody_df

In [None]:
# initialize HGVS package
#   hp  - parser that turns HGVS strings into variant objects
#   hdp - UTA data provider connection (connect() is called with its defaults;
#         presumably this needs a reachable UTA database - verify in your environment)
#   am  - assembly mapper configured for GRCh38 using 'splign' alignments,
#         with replace_reference=True
hp = hgvs.parser.Parser()
hdp = hgvs.dataproviders.uta.connect()
am = hgvs.assemblymapper.AssemblyMapper(hdp, assembly_name='GRCh38', alt_aln_method='splign', replace_reference=True)

In [None]:
# get the NM id based on an NP ID
def query_nm_id(db_name, NP_id, timeout=30):
    """Look up the NM_ accession referenced by an NCBI record via E-utilities.

    The accession is first searched with ``esearch``; every record id returned
    is then fetched with ``efetch`` and scanned for the text
    ``accession "NM_..."``.

    Parameters
    ----------
    db_name : str
        Entrez database to query (the notebook uses ``"protein"``).
    NP_id : str
        Search term, expected to be an NP_ accession.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    str
        The NM_ accession found in the fetched record(s), or ``''`` when none
        of them mentions one. If several records contain one, the last wins.

    Raises
    ------
    requests.HTTPError
        If NCBI answers with an error status (e.g. rate limiting). Raising here
        prevents a failed request from being mistaken for "no NM_ accession".
    """
    eutils_base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

    # Let requests build and escape the query string instead of concatenating it.
    search_response = requests.get(eutils_base + "esearch.fcgi",
                                   params={"db": db_name, "term": NP_id},
                                   timeout=timeout)
    search_response.raise_for_status()
    root = ET.fromstring(search_response.content)

    NM_id = ''

    for child in root.iter('Id'):
        record_id = child.text
        if not record_id:
            continue

        record_response = requests.get(eutils_base + "efetch.fcgi",
                                       params={"db": db_name, "id": record_id},
                                       timeout=timeout)
        record_response.raise_for_status()
        record_text = record_response.text

        if 'accession "NM_' in record_text:
            NM_id = 'NM_' + record_text.split('accession "NM_', 1)[1].split('"', 1)[0]

    return NM_id

In [None]:
NP_NM_map = {}	# Mapping of NP to NM accessions


def _find_accession(prefix, text):
    """Return the first '<prefix>_<digits>[.<version>]' accession in text, or ''."""
    # \s* tolerates a stray space after the underscore (e.g. 'NM_ 000162.5').
    match = re.search(prefix + r'_\s*(\d+(?:\.\d+)?)', text)
    return prefix + '_' + match.group(1) if match else ''


# function to extract the NM accession from the 'Accession number' field, if present
def get_id_nm(acc_row):
    """Return the NM_ accession for one 'Accession number' cell.

    The cell is converted to text and scanned for an NM_ and an NP_ accession.
    Only the accession itself (digits plus optional version) is kept, so
    surrounding punctuation such as commas or brackets is not included.

    * NM_ present: it is returned; if an NP_ is present too, the pair is
      recorded in ``NP_NM_map``.
    * Only NP_ present: the NM_ is taken from ``NP_NM_map`` or, on a cache
      miss, fetched with ``query_nm_id("protein", NP_id)`` and cached
      (including an empty result, so the lookup is not repeated).
    * Neither present (including NaN / empty cells): ``''`` is returned.

    Parameters
    ----------
    acc_row : object
        Cell value; any type is accepted because it is passed through ``str``.

    Returns
    -------
    str
        The NM_ accession, or ``''`` if none could be determined.
    """
    text = str(acc_row)
    NM_id = _find_accession('NM', text)
    NP_id = _find_accession('NP', text)

    if NP_id:
        if NM_id:
            NP_NM_map[NP_id] = NM_id
        elif NP_id in NP_NM_map:
            NM_id = NP_NM_map[NP_id]
        else:
            NM_id = query_nm_id("protein", NP_id)
            NP_NM_map[NP_id] = NM_id

    return NM_id

In [None]:
# Add an 'NM_acc' column by running get_id_nm on every 'Accession number' value.
# get_id_nm may call query_nm_id for rows that only carry an NP_ accession.
mody_df['NM_acc'] = mody_df['Accession number'].apply(get_id_nm)

In [None]:
# get the genomic coordinates of the variant
def get_genomic_coords(row):
    """Convert a row's coding-level variant into a genomic variant.

    The HGVS string is built as ``<NM_acc>:<Nucleotide position>`` after
    stripping spaces and commas from the position, parsed with ``hp`` and
    projected with ``am.c_to_g``.

    Parameters
    ----------
    row : mapping
        Must provide the keys ``'NM_acc'`` and ``'Nucleotide position'``.

    Returns
    -------
    object or None
        The variant returned by ``am.c_to_g``, or ``None`` when either field
        is not text, the accession lacks ``'NM'``, the position lacks ``'c.'``,
        or parsing/mapping raises an exception.
    """
    nm_acc = row['NM_acc']
    c_pos = row['Nucleotide position']

    # Missing cells (e.g. NaN) cannot be turned into an HGVS expression.
    if not isinstance(nm_acc, str) or not isinstance(c_pos, str):
        return None

    c_pos = c_pos.strip().replace(',', '').replace(' ', '')

    if ('NM' not in nm_acc) or ('c.' not in c_pos):
        return None

    hgvs_c = nm_acc + ':' + c_pos

    try:
        var_c = hp.parse_hgvs_variant(hgvs_c)
        return am.c_to_g(var_c)
    except Exception:
        # Best effort: rows that cannot be parsed or mapped stay unannotated.
        # `Exception` (not a bare except / return-in-finally) keeps
        # KeyboardInterrupt working so a long run can still be aborted.
        return None

In [None]:
# Add a 'DNA_coords' column holding the result of get_genomic_coords for each row
# (None for rows that could not be converted).
mody_df['DNA_coords'] = mody_df.apply(get_genomic_coords, axis=1)

In [None]:
# Ensembl exon variants table: filtered, with every allele on its own row and a 'vf_allele' column added.
# Column indices 0-10 are passed through `str` via `converters`, so they are read as text.
Exon_var = pd.read_csv(
    'Ens_filtered_all_alleles_location_coord_no_duplicates.csv',
                      converters={i: str for i in range(11)}, low_memory=False)
Exon_var

In [None]:
# map the row in the MODY table to the ensembl table, if possible
def map_to_ensembl(row, ensembl_table=None):
    """Find the Ensembl variant id(s) matching a row's genomic variant.

    A table row matches when its ``'Location'`` text contains the variant's
    start position as a complete number and its ``'vf_allele'`` equals the
    variant's alternate allele.

    Parameters
    ----------
    row : mapping
        Must provide ``'DNA_coords'``: a variant object exposing
        ``posedit.pos.start.base`` and ``posedit.edit.alt``, or ``None``.
    ensembl_table : pandas.DataFrame, optional
        Table with ``'Location'``, ``'vf_allele'`` and ``'id'`` columns.
        Defaults to the notebook-level ``Exon_var``.

    Returns
    -------
    str
        The unique matching ids joined with ``';'``, or ``''`` when the row
        has no genomic variant, the position/allele cannot be read from it,
        or nothing matches.
    """
    var_g = row['DNA_coords']
    if var_g is None:
        return ''

    try:
        bp_pos = int(var_g.posedit.pos.start.base)
        alt_allele = var_g.posedit.edit.alt
    except Exception:
        # Without a position and allele there is nothing to look up; returning
        # here avoids searching with placeholder values.
        return ''

    if ensembl_table is None:
        ensembl_table = Exon_var

    # The digit look-arounds stop 44145 from matching inside 441450 or 144145.
    # NOTE(review): only the position is compared, not the chromosome - confirm
    # the 'Location' format and whether the table can span several chromosomes.
    # NOTE(review): edits without a plain alt allele (e.g. deletions) will only
    # match if 'vf_allele' uses the same representation - verify.
    position_pattern = r'(?<!\d)' + str(bp_pos) + r'(?!\d)'
    same_position = ensembl_table['Location'].str.contains(position_pattern, regex=True, na=False)
    same_allele = ensembl_table['vf_allele'] == alt_allele
    ensembl_candidates = ensembl_table[same_position & same_allele]

    # Joining an empty list yields '', the "no match" value.
    return ";".join(ensembl_candidates['id'].unique().tolist())

# Order matters: map_to_ensembl reads attributes of the variant objects in
# 'DNA_coords', so it must run before that column is converted to plain strings.
# Re-running this cell after the conversion would hand it strings instead.
mody_df['ensembl_id'] = mody_df.apply(map_to_ensembl, axis=1)
mody_df['DNA_coords'] = mody_df['DNA_coords'].apply(lambda x: str(x))

In [None]:
# Display the table with the added 'NM_acc', 'DNA_coords' and 'ensembl_id' columns
mody_df

In [None]:
# Write the annotated table to 'Rafique_with_rs.csv' (with header, without the index column)
mody_df.to_csv(
    'Rafique_with_rs.csv',
    header=True, index=False)

In [None]:
# Collect the individual rs identifiers assigned to the Rafique variants.
# 'ensembl_id' stores several ids as one ';'-joined string when a variant matched
# more than one Ensembl row; each entry is split so that the later
# `id in @ids_from_Rafique` filter can match every single id.
ids_from_Rafique = [
    rs_id
    for joined_ids in mody_df['ensembl_id']
    if isinstance(joined_ids, str)
    for rs_id in joined_ids.split(';')
    if rs_id
]
len(ids_from_Rafique)

In [None]:
# Keep only the de-duplicated Ensembl rows whose id appears in ids_from_Rafique
mapped_variants = (
    Exon_var
    .drop_duplicates()
    .loc[lambda table: table['id'].isin(ids_from_Rafique)]
    .reset_index(drop=True)
)
mapped_variants

In [None]:
# Write the mapped Ensembl rows to file (with header, without the index column)
mapped_variants.to_csv('Rafique_mapped_to_Ens_1st.csv',
    header=True, index=False)

## 2. Table with Rafique annotated variants, excluding BLK, KLF11 and PAX4

In [None]:
# Genes to keep in the reduced table; BLK, KLF11 and PAX4 are not in this list
genes = ['GCK', 'HNF1A', 'HNF4A', 'HNF1B', 'INS', 'ABCC8', 'PDX1',
       'NEUROD1', 'KCNJ11', 'APPL1', 'CEL']

In [None]:
# Keep only rows whose 'Gene' is in `genes`, renumber the index,
# and replace NaN values with empty strings
Rafique_excluded = mody_df.query('Gene in @genes').reset_index(drop=True)
Rafique_excluded = Rafique_excluded.replace(nan, '')
Rafique_excluded

In [None]:
# Creating a list of variants from Rafique that now have the rs identifiers
ids_from_Rafique_excluded = []
for number in Rafique_excluded['ensembl_id']:
    if number != '': ids_from_Rafique_excluded.append(number)
len(ids_from_Rafique_excluded)

In [None]:
# Keep only the de-duplicated Ensembl rows whose id appears in ids_from_Rafique_excluded
mapped_variants_excluded = (
    Exon_var
    .drop_duplicates()
    .loc[lambda table: table['id'].isin(ids_from_Rafique_excluded)]
    .reset_index(drop=True)
)
mapped_variants_excluded

In [None]:
# Write the 1st-stage annotation for the short version of the table
# (BLK, KLF11 and PAX4 excluded). The reduced frame is written here, not the
# full `mapped_variants`, so the file content matches its name.
mapped_variants_excluded.to_csv(
    'Rafique_mapped_to_Ens_1st_excluded_BLK_KLF11_PAX4.csv',
    header=True, index=False)