# Take ClinVar variants and map them to reference Ensembl table

In [None]:
import pandas as pd
import requests, sys
import time
import pprint
from numpy import nan, log

## 1. Export the variant tables associated with the phenotypes "MODY", "Monogenic diabetes" and "Neonatal diabetes" and map them to the Ensembl reference table by rs identifiers

In [None]:
# Read the three ClinVar exports (MODY, MD, ND) and stack them into one table
ClinVar_MODY = pd.read_table('input/clinvar_result_MODY.txt')
ClinVar_MD = pd.read_table('input/clinvar_result_MD.txt')
ClinVar_ND = pd.read_table('input/clinvar_result_ND.txt')
df = pd.concat([ClinVar_MODY, ClinVar_MD, ClinVar_ND])

# Give the two columns used for mapping shorter names, then collapse rows
# that are identical across the three exports
renamed = df.rename(columns={"dbSNP ID": "ID"}).rename(columns={"Canonical SPDI": "Canonical_SPDI"})
ClinVar = renamed.drop_duplicates().reset_index(drop=True)

# Missing values become empty strings so later cells can test `== ''`
ClinVar = ClinVar.replace(nan, '')
ClinVar

In [None]:
# Creating a "coordinates" column that will look like the coordinate in Ensembl

def _ensembl_style_coordinate(chromosome, spdi):
    """Return 'chrom:pos:ref>alt' built from a canonical SPDI string.

    chromosome: value of the GRCh38Chromosome column (float, int or str).
    spdi: value of the Canonical_SPDI column; '' when ClinVar has none.

    Returns '' when there is no SPDI. The position field of the SPDI is
    incremented by one (presumably converting the 0-based SPDI position to
    the 1-based position used in the Ensembl table - verify against Ens).
    """
    if spdi == '':
        return ''
    fields = spdi.split(':')  # sequence : position : deleted : inserted
    # Chromosomes read as floats (e.g. 11.0) are written as integers; any
    # other value (an int, or a string such as 'X') is used as it is.
    # isinstance also covers float subclasses such as numpy.float64.
    if isinstance(chromosome, float):
        chromosome_label = str(int(chromosome))
    else:
        chromosome_label = str(chromosome)
    return (chromosome_label + ':' + str(int(fields[1]) + 1) + ':'
            + fields[2] + '>' + fields[3])


coordinates = [
    _ensembl_style_coordinate(chromosome, spdi)
    for chromosome, spdi in zip(ClinVar['GRCh38Chromosome'], ClinVar['Canonical_SPDI'])
]

ClinVar['coordinates'] = coordinates
ClinVar

In [None]:
# Keep only the ClinVar identifiers that are dbSNP IDs (they start with 'rs')
ClinVar_list = [var for var in ClinVar['ID'] if str(var).startswith('rs')]
ClinVar_rs = list(set(ClinVar_list))  # each rs ID once
len(ClinVar_rs)

In [None]:
# Ensembl reference table; columns 0-10 are forced to be read as strings
Ens = pd.read_csv(
    'intermediate/Ens_filtered_all_alleles_location_coord_no_duplicates.csv',
    converters=dict.fromkeys(range(11), str),
    low_memory=False,
)

In [None]:
# 1st stage: Ensembl rows whose id is one of the ClinVar rs IDs
Ens_unique_rows = Ens.drop_duplicates()
ClinVar_mapped = Ens_unique_rows.query('id in @ClinVar_rs').reset_index(drop=True)
ClinVar_mapped

In [None]:
# Number of distinct rs IDs that were found in the Ensembl table
ClinVar_ID_list = pd.unique(ClinVar_mapped['id']).tolist()
len(ClinVar_ID_list)

In [None]:
# Save the result of the 1st stage (mapping by rs ID)
ClinVar_mapped.to_csv('ClinVar_mapped_to_Ens_1st.csv', header=True, index=False)

## 2. Take the variants that did not map with rs and try to map them by coordinates

In [None]:
# SPDIs of the ClinVar variants that have no rs ID but do have a canonical SPDI
ClinVar_nonrs_list = [
    spdi
    for variant_id, spdi in zip(ClinVar['ID'], ClinVar['Canonical_SPDI'])
    if not str(variant_id).startswith('rs') and not spdi == ''
]
len(ClinVar_nonrs_list)

In [None]:
# Names of the ClinVar entries that come without a canonical SPDI
strange_list = [
    str(name)
    for name, spdi in zip(ClinVar['Name'], ClinVar['Canonical_SPDI'])
    if spdi == ''
]
len(strange_list)

In [None]:
# ClinVar rows that could not be matched by rs ID (selected through their SPDI)
ClinVar_unique_rows = ClinVar.drop_duplicates()
ClinVar_unmapped = ClinVar_unique_rows.query('Canonical_SPDI in @ClinVar_nonrs_list').reset_index(drop=True)
ClinVar_unmapped

In [None]:
# Save every ClinVar variant that has no rs ID
ClinVar_unmapped.to_csv(
    'ClinVar_all_unmapped.csv',
    header=True,
    index=False,
)

In [None]:
# Number of distinct accessions in the ClinVar_unmapped table
len(pd.unique(ClinVar_unmapped['Accession']))

In [None]:
# Ensembl-style coordinates of the variants that have no rs ID
unmapped_coord = ClinVar_unmapped.loc[:, 'coordinates'].tolist()

In [None]:
# 2nd stage: look the remaining variants up in Ensembl by coordinate
Ens_coordinate_hits = Ens.query('coordinate in @unmapped_coord')
ClinVar_mapped_coord = Ens_coordinate_hits.reset_index(drop=True)
ClinVar_mapped_coord

In [None]:
# Save the result of the 2nd stage (mapping by coordinate)
ClinVar_mapped_coord.to_csv('ClinVar_mapped_to_Ens_2nd.csv', header=True, index=False)

In [None]:
# Number of distinct coordinates that were found in Ensembl
len(pd.unique(ClinVar_mapped_coord['coordinate']))

In [None]:
# List of the variants that mapped neither by rs ID nor by coordinates
ClinVar_mapped_coord_list = ClinVar_mapped_coord['coordinate'].tolist()
# Membership is tested once per unmapped variant; a set avoids rescanning
# the whole list of mapped coordinates every time
mapped_coord_lookup = set(ClinVar_mapped_coord_list)
leftovers = [
    var for var in ClinVar_unmapped['coordinates']
    if var not in mapped_coord_lookup
]
len(leftovers)

In [None]:
# Restrict ClinVar_unmapped to the variants that are still unmapped
still_unmapped_rows = ClinVar_unmapped.query('coordinates in @leftovers')
ClinVar_rs_coord_unmapped = still_unmapped_rows.reset_index(drop=True)
ClinVar_rs_coord_unmapped

In [None]:
# Save the variants that mapped neither by rs ID nor by coordinate
ClinVar_rs_coord_unmapped.to_csv('ClinVar_unmapped_by_rs_coord.csv', header=True, index=False)

It would now be useful to know which of the remaining ClinVar variants are coding, or at least lie in exons. For this we can use the Ensembl REST endpoint "Fetch variant consequences based on a HGVS notation": https://rest.ensembl.org/documentation/info/vep_hgvs_get

## 3. Fetch the variant consequence type using coordinates with Ensembl API

In [None]:
# creating a proper genomic coordinate ('chrom:g.posREF>ALT') for the API search
coordinates = []
for chromosome, spdi in zip(ClinVar_rs_coord_unmapped['GRCh38Chromosome'],
                            ClinVar_rs_coord_unmapped['Canonical_SPDI']):
    spdi_fields = spdi.split(':')  # sequence : position : deleted : inserted
    # Same chromosome handling as for the 'coordinates' column: floats such
    # as 11.0 are written as integers, anything else (e.g. 'X') is kept as
    # it is instead of failing in int()
    if isinstance(chromosome, float):
        chromosome_label = str(int(chromosome))
    else:
        chromosome_label = str(chromosome)
    coordinates.append(chromosome_label
                       + ':g.' + str(int(spdi_fields[1]) + 1)
                       + spdi_fields[2] + '>' + spdi_fields[3])
ClinVar_rs_coord_unmapped['DNA_coordinate'] = coordinates
ClinVar_rs_coord_unmapped

In [None]:
# Accumulators filled by the VEP fetch loop: fetched results, accessions
# already handled, and coordinates whose request failed
variants, passed_vars, indels = [], [], []

In [None]:
server = "https://rest.ensembl.org"

# Ask the Ensembl VEP endpoint for the consequences of every variant that
# mapped neither by rs ID nor by coordinate. Each accession is requested
# once: it is recorded in `passed_vars` as soon as it is picked up, so
# repeated rows (and re-runs of this cell) skip it.
count = 0
for index, row in ClinVar_rs_coord_unmapped.iterrows():
    variant_id = row['Accession']
    if variant_id in passed_vars:
        continue
    passed_vars.append(variant_id)

    ext = "/vep/human/hgvs/" + row['DNA_coordinate'] + "?"
    r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})
    # Pause after every request, failed ones included, so a run of
    # rejected coordinates does not hit the server back-to-back
    time.sleep(1)
    if not r.ok:
        # NOTE(review): every non-OK response is collected here and later
        # treated as an indel; rate-limit or server errors would land in
        # this list too - check r.status_code if that matters
        print(row['DNA_coordinate'])
        indels.append(row['DNA_coordinate'])
        continue

    variants.append({'variant_id': variant_id, 'variant': r.json()})
    count = count + 1
    print(str(count), ' -----> ', ext)

In [None]:
# Number of coordinates whose VEP request did not succeed
len(indels)

In [None]:
# Number of variants for which a VEP response was stored
len(variants)

## 4. Working with the ones that have fetched

Create a dataframe with variants, coordinates and their fetched consequences

In [None]:
# One record per entry of each VEP response: the entry's 'id', its most
# severe consequence and the ClinVar accession the request was made for
coord_cons = [
    {'coordinate': hit['id'],
     'consequence': hit['most_severe_consequence'],
     'accession': fetched['variant_id']}
    for fetched in variants
    for hit in fetched['variant']
]
coord_cons_df = pd.DataFrame(coord_cons)
coord_cons_df

In [None]:
# Save the fetched consequences before any filtering
coord_cons_df.to_csv('ClinVar_unmapped_fetched_unfiltered.csv', header=True, index=False)

In [None]:
# Which consequence types were returned?
pd.unique(coord_cons_df['consequence'])

The consequence types decided from the Pathogenicity check are the following:
    
'missense variant',
'frameshift variant',
'splice donor variant',
'splice acceptor variant',
'nonsense (stop gained)',
'stop lost',
'nc transcript variant' 

for the categories from ClinVar and the following for the Ensembl categories:

'missense_variant',
'protein_altering_variant', 
'coding_sequence_variant', 
'frameshift_variant', 
'splice_donor_variant', 
'splice_acceptor_variant', 
'splice_donor_5th_base_variant', 
'start_lost', 
'stop_gained', 
'stop_lost', 
'inframe_deletion', 
'inframe_insertion'

In [None]:
# Ensembl consequence types that we decided to keep in the table
consequence_type_list = [
    'missense_variant', 'protein_altering_variant', 'coding_sequence_variant',
    'frameshift_variant',
    'splice_donor_variant', 'splice_acceptor_variant', 'splice_donor_5th_base_variant',
    'start_lost', 'stop_gained', 'stop_lost',
    'inframe_deletion', 'inframe_insertion',
]

In [None]:
# Keep only the fetched variants with one of the selected consequence types
selected_consequence_rows = coord_cons_df.query('consequence in @consequence_type_list')
ClinVar_unmapped_SNV_filtered = selected_consequence_rows.reset_index(drop=True)
ClinVar_unmapped_SNV_filtered

In [None]:
# Number of distinct coordinates left after the consequence filter
len(ClinVar_unmapped_SNV_filtered['coordinate'].unique())

In [None]:
# Split each 'chrom:g.posREF>ALT' coordinate into the fields needed for a VCF
ClinVar_unmapped_SNV_filtered_listdict = []
for coordinate, accession in zip(ClinVar_unmapped_SNV_filtered['coordinate'],
                                 ClinVar_unmapped_SNV_filtered['accession']):
    colon_at = coordinate.find(':')
    dot_at = coordinate.find('.')
    arrow_at = coordinate.find('>')
    record = {
        'chrom': coordinate[:colon_at],
        # digits between 'g.' and the single reference base
        'pos': coordinate[dot_at + 1:arrow_at - 1],
        # the one character right before '>'
        'ref': coordinate[arrow_at - 1],
        'alt': coordinate[arrow_at + 1:],
        'accession': accession,
    }
    ClinVar_unmapped_SNV_filtered_listdict.append(record)
ClinVar_unmapped_SNV_filtered_df = pd.DataFrame(ClinVar_unmapped_SNV_filtered_listdict)
ClinVar_unmapped_SNV_filtered_df

In [None]:
# Save the VCF-ready SNV table
ClinVar_unmapped_SNV_filtered_df.to_csv('ClinVar_unmapped_filtered_SNV_for_VCF.csv', header=True, index=False)

## 5. Working with indels

The variants that could not be fetched are all indels. They are in the list 'indels' created in section 3.

In [None]:
# Show the DNA coordinates whose VEP request did not succeed
indels

In [None]:
# Rows of the unmapped ClinVar table whose DNA coordinate is in `indels`
indel_rows = ClinVar_rs_coord_unmapped.query('DNA_coordinate in @indels')
indel_df = indel_rows.reset_index(drop=True)
indel_df

In [None]:
#Make them pretty for the inclusion into VCF
df_dict_list = []
for chromosome, location, spdi, accession in zip(indel_df['GRCh38Chromosome'],
                                                 indel_df['GRCh38Location'],
                                                 indel_df['Canonical_SPDI'],
                                                 indel_df['Accession']):
    spdi_fields = spdi.split(':')  # sequence : position : deleted : inserted
    # Floats such as 11.0 are written as integers; other values (e.g. 'X')
    # are kept as they are instead of failing in int(), matching how the
    # 'coordinates' column is built
    if isinstance(chromosome, float):
        chromosome_label = str(int(chromosome))
    else:
        chromosome_label = str(chromosome)
    df_dict_list.append({'chrom': chromosome_label,
                         # text before ' -' in the location, i.e. the first
                         # position when a range is given
                         'pos': location.split(' -')[0],
                         'ref': spdi_fields[2],
                         'alt': spdi_fields[3],
                         'accession': accession})
ClinVar_indels = pd.DataFrame(df_dict_list)
ClinVar_indels

In [None]:
# Save the VCF-ready indel table
ClinVar_indels.to_csv('ClinVar_indels.csv', header=True, index=False)

## 6. Repeat the same but with the table filtered to just 'pathogenic' and 'likely pathogenic' variants

## Here we also need to remove BLK, KLF11 and PAX4

In [None]:
# Give the clinical significance column a name that can be used in code
significance_rename = {"Clinical significance (Last reviewed)": "clinical_significance"}
ClinVar_clinical = ClinVar.rename(columns=significance_rename)
ClinVar_clinical

In [None]:
#Filtering to just pathogenic variants and removing the 3 genes

# The classification is the text before the first '(' of the field.
# NOTE(review): combined labels such as 'Pathogenic/Likely pathogenic' do
# not match either value and are left out - confirm this is intended.
wanted_significance = ('Pathogenic', 'Likely pathogenic')
pathogenic_vars = [
    accession
    for accession, significance in zip(ClinVar_clinical['Accession'],
                                       ClinVar_clinical['clinical_significance'])
    if significance.split('(')[0] in wanted_significance
]
ClinPath = ClinVar_clinical.query('Accession in @pathogenic_vars').reset_index(drop=True)

# All three genes are excluded, not only PAX4. The comparison is an exact
# match on the whole 'Gene(s)' value, as before.
excluded_genes = ['BLK', 'KLF11', 'PAX4']
pathogenic = ClinPath[~ClinPath['Gene(s)'].isin(excluded_genes)]
pathogenic


In [None]:
# The genes left in the table: BLK, KLF11 and PAX4 should not be listed
pd.unique(pathogenic['Gene(s)'])

In [None]:
# Save the pathogenic / likely pathogenic subset
pathogenic.to_csv('ClinVar_MD_pathogenic.csv', header=True, index=False)

In [None]:
# Sections 1-3 repeated for the pathogenic subset.
# NOTE: this cell reuses (and overwrites) the names ClinVar_nonrs_list,
# unmapped_coord, leftovers and coordinates from the earlier sections.

# choosing the variants from ClinVar that are in dbSNP
ClinVar_pat_list = [var for var in pathogenic['ID'] if str(var).startswith('rs')]
ClinVar_pat_rs = list(set(ClinVar_pat_list)) #getting rid of duplicates
ClinVar_pat_mapped = Ens.drop_duplicates().query('id in @ClinVar_pat_rs').reset_index(drop=True)
ClinVar_pat_mapped.to_csv(
    'ClinVar_pathogenic_mapped_1st.csv',
    header=True, index=False)
print('created the 1st stage of mapping')

# choosing the variants from ClinVar that are not in dbSNP but have an SPDI
ClinVar_nonrs_list = [
    spdi
    for variant_id, spdi in zip(pathogenic['ID'], pathogenic['Canonical_SPDI'])
    if not str(variant_id).startswith('rs') and spdi != ''
]
ClinVar_pat_unmapped = pathogenic.query('Canonical_SPDI in @ClinVar_nonrs_list').reset_index(drop=True)
ClinVar_pat_unmapped.to_csv(
    'ClinVar_pat_unmapped.csv', header=True, index=False)
print('saving the table with unmapped')

#list of coordinates of the variants that did not map with rs ID to Ensembl
unmapped_coord = ClinVar_pat_unmapped['coordinates'].tolist()

#Mapping them to Ensembl by coordinates
ClinVar_pat_mapped_coord = Ens.query('coordinate in @unmapped_coord').reset_index(drop=True)
#This was a 2nd stage mapping
ClinVar_pat_mapped_coord.to_csv(
    'ClinVar_pathogenic_mapped_to_Ens_2nd.csv',
    header=True, index=False)
print('creating 2nd stage of mapping')

#List of the variants that mapped neither by rs ID nor by coordinates
ClinVar_pat_mapped_coord_list = ClinVar_pat_mapped_coord['coordinate'].tolist()
# A set keeps each membership test from rescanning the whole list
mapped_pat_coord_lookup = set(ClinVar_pat_mapped_coord_list)
leftovers = [
    var for var in ClinVar_pat_unmapped['coordinates']
    if var not in mapped_pat_coord_lookup
]

#Filtering the ClinVar_pat_unmapped table to only leftover variants
ClinVar_pat_rs_coord_unmapped = ClinVar_pat_unmapped.query('coordinates in @leftovers').reset_index(drop=True)

# creating a proper genomic coordinate ('chrom:g.posREF>ALT') for the API search
coordinates = []
for chromosome, spdi in zip(ClinVar_pat_rs_coord_unmapped['GRCh38Chromosome'],
                            ClinVar_pat_rs_coord_unmapped['Canonical_SPDI']):
    spdi_fields = spdi.split(':')  # sequence : position : deleted : inserted
    # Floats such as 11.0 are written as integers; other values (e.g. 'X')
    # are kept as they are instead of failing in int(), matching how the
    # 'coordinates' column is built
    if isinstance(chromosome, float):
        chromosome_label = str(int(chromosome))
    else:
        chromosome_label = str(chromosome)
    coordinates.append(chromosome_label
                       + ':g.' + str(int(spdi_fields[1]) + 1)
                       + spdi_fields[2] + '>' + spdi_fields[3])
ClinVar_pat_rs_coord_unmapped['DNA_coordinate'] = coordinates
ClinVar_pat_rs_coord_unmapped

The 1st stage table includes 314 variants and the 2nd stage table includes 36 variants

In [None]:
# Accumulators for the pathogenic VEP fetch loop: fetched results,
# accessions already handled, and coordinates whose request failed
variants_pat, passed_vars_pat, indels_pat = [], [], []

In [None]:
server = "https://rest.ensembl.org"

# Ask the Ensembl VEP endpoint for the consequences of every pathogenic
# variant that mapped neither by rs ID nor by coordinate. Each accession is
# requested once: it is recorded in `passed_vars_pat` (this run's own list,
# not `passed_vars` from section 3) as soon as it is picked up.
count = 0
for index, row in ClinVar_pat_rs_coord_unmapped.iterrows():
    variant_id = row['Accession']
    if variant_id in passed_vars_pat:
        continue
    passed_vars_pat.append(variant_id)

    ext = "/vep/human/hgvs/" + row['DNA_coordinate'] + "?"
    r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})
    # Pause after every request, failed ones included, so a run of
    # rejected coordinates does not hit the server back-to-back
    time.sleep(1)
    if not r.ok:
        # NOTE(review): every non-OK response is collected here and later
        # treated as an indel; rate-limit or server errors would land in
        # this list too - check r.status_code if that matters
        print(row['DNA_coordinate'])
        indels_pat.append(row['DNA_coordinate'])
        continue

    variants_pat.append({'variant_id': variant_id, 'variant': r.json()})
    count = count + 1
    print(str(count), ' -----> ', ext)

In [None]:
# Sections 4-5 repeated for the pathogenic subset.

# One record per entry of each VEP response: the entry's 'id', its most
# severe consequence and the ClinVar accession the request was made for
coord_cons = [
    {'coordinate': hit['id'],
     'consequence': hit['most_severe_consequence'],
     'accession': fetched['variant_id']}
    for fetched in variants_pat
    for hit in fetched['variant']
]
pat_coord_cons_df = pd.DataFrame(coord_cons)
pat_coord_cons_df.to_csv(
    'ClinVar_pathogenic_unmapped_fetched_unfiltered.csv',
    header=True, index=False)

#The list of consequence types we decided to leave in the table
consequence_type_list = ['missense_variant',
                         'protein_altering_variant',
                         'coding_sequence_variant',
                         'frameshift_variant',
                         'splice_donor_variant',
                         'splice_acceptor_variant',
                         'splice_donor_5th_base_variant',
                         'start_lost',
                         'stop_gained',
                         'stop_lost',
                         'inframe_deletion',
                         'inframe_insertion']

ClinVar_pat_unmapped_SNV_filtered = pat_coord_cons_df.query(
    'consequence in @consequence_type_list').reset_index(drop=True)

#Make them pretty for the inclusion into VCF: split 'chrom:g.posREF>ALT'
ClinVar_pat_unmapped_SNV_filtered_listdict = []
for index, row in ClinVar_pat_unmapped_SNV_filtered.iterrows():
    coordinate = row['coordinate']
    colon_at = coordinate.find(':')
    dot_at = coordinate.find('.')
    arrow_at = coordinate.find('>')
    ClinVar_pat_unmapped_SNV_filtered_listdict.append({
        'chrom': coordinate[:colon_at],
        # digits between 'g.' and the single reference base
        'pos': coordinate[dot_at + 1:arrow_at - 1],
        # the one character right before '>'
        'ref': coordinate[arrow_at - 1],
        'alt': coordinate[arrow_at + 1:],
        'accession': row['accession']})
ClinVar_pat_unmapped_SNV_filtered_df = pd.DataFrame(ClinVar_pat_unmapped_SNV_filtered_listdict)
ClinVar_pat_unmapped_SNV_filtered_df.to_csv(
    'ClinVar_pat_unmapped_filtered_SNV_for_VCF.csv',
    header=True, index=False)

#filter the unmapped ClinVar table to contain just indels
indel_pat_df = ClinVar_pat_rs_coord_unmapped.query('DNA_coordinate in @indels_pat').reset_index(drop=True)
#Make them pretty for the inclusion into VCF
df_dict_list = []
for index, row in indel_pat_df.iterrows():
    spdi_fields = row['Canonical_SPDI'].split(':')  # sequence : position : deleted : inserted
    chromosome = row['GRCh38Chromosome']
    # Floats such as 11.0 are written as integers; other values (e.g. 'X')
    # are kept as they are instead of failing in int(), matching how the
    # 'coordinates' column is built
    if isinstance(chromosome, float):
        chromosome_label = str(int(chromosome))
    else:
        chromosome_label = str(chromosome)
    df_dict_list.append({'chrom': chromosome_label,
                         # text before ' -' in the location, i.e. the first
                         # position when a range is given
                         'pos': row['GRCh38Location'].split(' -')[0],
                         'ref': spdi_fields[2],
                         'alt': spdi_fields[3],
                         'accession': row['Accession']})
ClinVar_pat_indels = pd.DataFrame(df_dict_list)
ClinVar_pat_indels.to_csv(
    'ClinVar_pathogenic_indels.csv',
    header=True, index=False)