# Take ClinVar variants and map them to reference Ensembl table

In [1]:
import pandas as pd
import requests, sys
import time
import pprint
from numpy import nan, log

## 1. Export variant table associated with phenotypes "MODY", "Monogenic diabetes" and "Neonathal diabetes" and map them to Ensembl reference table by rs identifiers 

In [2]:
#combine these tables together
ClinVar_MODY = pd.read_table('input/clinvar_result_MODY.txt')
ClinVar_MD = pd.read_table('input/clinvar_result_MD.txt')
ClinVar_ND = pd.read_table('input/clinvar_result_ND.txt')
df = pd.concat([ClinVar_MODY, ClinVar_MD, ClinVar_ND])
ClinVar = df.rename(columns={"dbSNP ID": "ID"}).rename(columns={"Canonical SPDI": "Canonical_SPDI"}).drop_duplicates().reset_index(drop=True)
ClinVar = ClinVar.replace(nan, '')
ClinVar

Unnamed: 0,Name,Gene(s),Protein change,Condition(s),Clinical significance (Last reviewed),Review status,Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,AlleleID(s),ID,Canonical_SPDI,Unnamed: 15
0,NM_000352.6(ABCC8):c.1630+5G>A,ABCC8,,Transitory neonatal diabetes mellitus|Maturity...,Uncertain significance,"criteria provided, single submitter",VCV001697233,11.0,17464262,11.0,17442715,1697233,1689628,rs1956364777,NC_000011.10:17442714:C:T,
1,NM_000352.6(ABCC8):c.1716G>A (p.Ser572_Pro573=),ABCC8,,Transitory neonatal diabetes mellitus|Maturity...,Uncertain significance,"criteria provided, single submitter",VCV001697229,11.0,17452462,11.0,17430915,1697229,1689624,rs750619309,NC_000011.10:17430914:C:T,
2,NM_000352.6(ABCC8):c.1833C>T (p.Ser611_Glu612=),ABCC8,,Transitory neonatal diabetes mellitus|Maturity...,Uncertain significance,"criteria provided, single submitter",VCV001697228,11.0,17450202,11.0,17428655,1697228,1689623,rs763273046,NC_000011.10:17428654:G:A,
3,NM_000352.6(ABCC8):c.1801G>A (p.Val601Ile),ABCC8,"V600I, V601I",not specified,"Uncertain significance(Last reviewed: May 5, 2...","criteria provided, single submitter",VCV001696124,11.0,17452377,11.0,17430830,1696124,1688532,,NC_000011.10:17430829:C:T,
4,NM_000352.6(ABCC8):c.890G>T (p.Arg297Met),ABCC8,"R296M, R297M",Maturity onset diabetes mellitus in young|Tran...,Uncertain significance,"criteria provided, single submitter",VCV001693120,11.0,17482156,11.0,17460609,1693120,1685560,rs371802112,NC_000011.10:17460608:C:A,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4454,NM_001109809.5(ZFP57):c.743G>A (p.Arg248His),ZFP57,"R248H, R176H","Diabetes mellitus, transient neonatal, 1","Pathogenic(Last reviewed: Aug 1, 2008)",no assertion criteria provided,VCV000000719,6,29641145,6,29673368,719,15758,rs77625743,NC_000006.12:29673367:C:T,
4455,NM_001109809.5(ZFP57):c.1372C>G (p.His458Asp),ZFP57,"H458D, H386D","Diabetes mellitus, transient neonatal, 1","Pathogenic(Last reviewed: Aug 1, 2008)",no assertion criteria provided,VCV000000718,6,29640516,6,29672739,718,15757,rs79020217,NC_000006.12:29672738:G:C,
4456,NM_001109809.5(ZFP57):c.1383del (p.Tyr462fs),ZFP57,"Y390fs, Y462fs","Diabetes mellitus, transient neonatal, 1","Pathogenic(Last reviewed: Aug 1, 2008)",no assertion criteria provided,VCV000000717,6,29640505,6,29672728,717,15756,rs606231122,NC_000006.12:29672727:G:,
4457,NM_001109809.5(ZFP57):c.317_318del (p.Glu106fs),ZFP57,"E34fs, E106fs","Diabetes mellitus, transient neonatal, 1","Pathogenic(Last reviewed: Aug 1, 2008)",no assertion criteria provided,VCV000000716,6,29643197 - 29643198,6,29675420 - 29675421,716,15755,rs606231121,NC_000006.12:29675419:CTCTCTC:CTCTC,


In [3]:
# Creating a "coordinate" column that will look like the coordinate in Ensembl

coordinates = []
for index, row in ClinVar.iterrows():
    
    if row['Canonical_SPDI'] == '':
        coordinates.append('')

    elif type(row['GRCh38Chromosome']) == float:
        coordinates.append(str(int(row['GRCh38Chromosome'])) + ':' + 
              (str(int(row['Canonical_SPDI'].split(':')[1]) + 1)) + ':' + row['Canonical_SPDI'].split(':')[2] + 
              '>' + row['Canonical_SPDI'].split(':')[3])

    else: 
        coordinates.append(str(row['GRCh38Chromosome']) + ':' + 
              (str(int(row['Canonical_SPDI'].split(':')[1]) + 1)) + ':' + row['Canonical_SPDI'].split(':')[2] + 
              '>' + row['Canonical_SPDI'].split(':')[3])
        
ClinVar['coordinates'] = coordinates
ClinVar
#coordinates

Unnamed: 0,Name,Gene(s),Protein change,Condition(s),Clinical significance (Last reviewed),Review status,Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,AlleleID(s),ID,Canonical_SPDI,Unnamed: 15,coordinates
0,NM_000352.6(ABCC8):c.1630+5G>A,ABCC8,,Transitory neonatal diabetes mellitus|Maturity...,Uncertain significance,"criteria provided, single submitter",VCV001697233,11.0,17464262,11.0,17442715,1697233,1689628,rs1956364777,NC_000011.10:17442714:C:T,,11:17442715:C>T
1,NM_000352.6(ABCC8):c.1716G>A (p.Ser572_Pro573=),ABCC8,,Transitory neonatal diabetes mellitus|Maturity...,Uncertain significance,"criteria provided, single submitter",VCV001697229,11.0,17452462,11.0,17430915,1697229,1689624,rs750619309,NC_000011.10:17430914:C:T,,11:17430915:C>T
2,NM_000352.6(ABCC8):c.1833C>T (p.Ser611_Glu612=),ABCC8,,Transitory neonatal diabetes mellitus|Maturity...,Uncertain significance,"criteria provided, single submitter",VCV001697228,11.0,17450202,11.0,17428655,1697228,1689623,rs763273046,NC_000011.10:17428654:G:A,,11:17428655:G>A
3,NM_000352.6(ABCC8):c.1801G>A (p.Val601Ile),ABCC8,"V600I, V601I",not specified,"Uncertain significance(Last reviewed: May 5, 2...","criteria provided, single submitter",VCV001696124,11.0,17452377,11.0,17430830,1696124,1688532,,NC_000011.10:17430829:C:T,,11:17430830:C>T
4,NM_000352.6(ABCC8):c.890G>T (p.Arg297Met),ABCC8,"R296M, R297M",Maturity onset diabetes mellitus in young|Tran...,Uncertain significance,"criteria provided, single submitter",VCV001693120,11.0,17482156,11.0,17460609,1693120,1685560,rs371802112,NC_000011.10:17460608:C:A,,11:17460609:C>A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4454,NM_001109809.5(ZFP57):c.743G>A (p.Arg248His),ZFP57,"R248H, R176H","Diabetes mellitus, transient neonatal, 1","Pathogenic(Last reviewed: Aug 1, 2008)",no assertion criteria provided,VCV000000719,6,29641145,6,29673368,719,15758,rs77625743,NC_000006.12:29673367:C:T,,6:29673368:C>T
4455,NM_001109809.5(ZFP57):c.1372C>G (p.His458Asp),ZFP57,"H458D, H386D","Diabetes mellitus, transient neonatal, 1","Pathogenic(Last reviewed: Aug 1, 2008)",no assertion criteria provided,VCV000000718,6,29640516,6,29672739,718,15757,rs79020217,NC_000006.12:29672738:G:C,,6:29672739:G>C
4456,NM_001109809.5(ZFP57):c.1383del (p.Tyr462fs),ZFP57,"Y390fs, Y462fs","Diabetes mellitus, transient neonatal, 1","Pathogenic(Last reviewed: Aug 1, 2008)",no assertion criteria provided,VCV000000717,6,29640505,6,29672728,717,15756,rs606231122,NC_000006.12:29672727:G:,,6:29672728:G>
4457,NM_001109809.5(ZFP57):c.317_318del (p.Glu106fs),ZFP57,"E34fs, E106fs","Diabetes mellitus, transient neonatal, 1","Pathogenic(Last reviewed: Aug 1, 2008)",no assertion criteria provided,VCV000000716,6,29643197 - 29643198,6,29675420 - 29675421,716,15755,rs606231121,NC_000006.12:29675419:CTCTCTC:CTCTC,,6:29675420:CTCTCTC>CTCTC


In [4]:
# choosing the variants from ClinVar that are in dbSNP

ClinVar_list = []
for var in ClinVar['ID']:
    if str(var).startswith('rs'):
        ClinVar_list.append(var)
ClinVar_rs = list(set(ClinVar_list)) #getting rid of duplicates
len(ClinVar_rs)

2705

In [5]:
Ens = pd.read_csv(
    'Ens_filtered_all_alleles_location_coord_no_duplicates.csv',
                      converters={i: str for i in range(11)}, low_memory=False)

In [6]:
ClinVar_mapped = Ens.drop_duplicates().query('id in @ClinVar_rs').reset_index(drop=True)
ClinVar_mapped

Unnamed: 0,id,seq_region_name,start,end,strand,vf_allele,Location,coordinate,Gene,Transcript,Exon
0,rs796065047,3,57238111.0,57238111.0,1.0,A,3:57238111,3:57238111:G>A,ENSG00000157500,ENST00000650354,ENSE00003522948
1,rs869320673,3,57260016.0,57260016.0,1.0,A,3:57260016,3:57260016:T>A,ENSG00000157500,ENST00000650354,ENSE00003516737
2,rs796065047,3,57238111.0,57238111.0,1.0,A,3:57238111,3:57238111:G>A,ENSG00000157500,ENST00000482800,ENSE00003527210
3,rs869320673,3,57260016.0,57260016.0,1.0,A,3:57260016,3:57260016:T>A,ENSG00000157500,ENST00000482800,ENSE00003585026
4,rs796065047,3,57238111.0,57238111.0,1.0,A,3:57238111,3:57238111:G>A,ENSG00000157500,ENST00000468342,ENSE00001849559
...,...,...,...,...,...,...,...,...,...,...,...
8709,rs886054724,2,10048394.0,10048394.0,1.0,T,2:10048394,2:10048394:C>T,ENSG00000172059,ENST00000305883,ENSE00001171571
8710,rs563767876,2,10048506.0,10048506.0,1.0,T,2:10048506,2:10048506:G>T,ENSG00000172059,ENST00000305883,ENSE00001171571
8711,rs757875185,2,10052364.0,10052364.0,1.0,T,2:10052364,2:10052364:A>T,ENSG00000172059,ENST00000305883,ENSE00001337294
8712,rs750063435,2,10052418.0,10052418.0,1.0,T,2:10052418,2:10052418:G>T,ENSG00000172059,ENST00000305883,ENSE00001337294


In [7]:
#How many variants are in this table?
ClinVar_ID_list = ClinVar_mapped['id'].unique().tolist()
len(ClinVar_ID_list)

1588

In [8]:
#This was a 1st stage mapping
ClinVar_mapped.to_csv(
    'ClinVar_mapped_to_Ens_1st.csv',
    header=True, index=False)

## 2. Take the variants that did not map with rs and try to map them by coordinates

In [9]:
# choosing the variants from ClinVar that are not in dbSNP

ClinVar_nonrs_list = []
for index,row in ClinVar.iterrows():
    if str(row['ID']).startswith('rs'):
        continue
    if row['Canonical_SPDI'] == '':
        continue
    else: ClinVar_nonrs_list.append(row['Canonical_SPDI'])
len(ClinVar_nonrs_list)

693

In [10]:
#there seem to be some lacking ['Canonical SPDI']
strange_list = []
for index,row in ClinVar.iterrows():
    if row['Canonical_SPDI'] == '':
        strange_list.append(str(row['Name']))
len(strange_list)

34

In [11]:
ClinVar_unmapped = ClinVar.drop_duplicates().query('Canonical_SPDI in @ClinVar_nonrs_list').reset_index(drop=True)
ClinVar_unmapped

Unnamed: 0,Name,Gene(s),Protein change,Condition(s),Clinical significance (Last reviewed),Review status,Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,AlleleID(s),ID,Canonical_SPDI,Unnamed: 15,coordinates
0,NM_000352.6(ABCC8):c.1801G>A (p.Val601Ile),ABCC8,"V600I, V601I",not specified,"Uncertain significance(Last reviewed: May 5, 2...","criteria provided, single submitter",VCV001696124,11.0,17452377,11.0,17430830,1696124,1688532,,NC_000011.10:17430829:C:T,,11:17430830:C>T
1,NM_000352.6(ABCC8):c.2912A>T (p.Glu971Val),ABCC8,"E970V, E971V, E972V, E993V",Maturity onset diabetes mellitus in young,Uncertain significance,"criteria provided, single submitter",VCV001525997,11.0,17428909,11.0,17407362,1525997,1517402,,NC_000011.10:17407361:T:A,,11:17407362:T>A
2,NM_000352.6(ABCC8):c.4544C>T (p.Thr1515Met),ABCC8,"T1514M, T1515M, T1516M, T1537M",not provided|Diabetes mellitus,"Pathogenic(Last reviewed: Sep 2, 2021)","criteria provided, single submitter",VCV001506182,11.0,17415814,11.0,17394267,1506182,1339911,,NC_000011.10:17394266:G:A,,11:17394267:G>A
3,NM_000352.6(ABCC8):c.2975G>C (p.Arg992Pro),ABCC8,"R1014P, R991P, R992P, R993P",not specified,"Uncertain significance(Last reviewed: Nov 22, ...","criteria provided, single submitter",VCV001338672,11.0,17428622,11.0,17407075,1338672,1329678,,NC_000011.10:17407074:C:G,,11:17407075:C>G
4,NM_012096.3(APPL1):c.1926A>G (p.Ile642Met),APPL1|ASB14,I642M,Maturity-onset diabetes of the young type 14,"Likely benign(Last reviewed: Feb 26, 2021)","criteria provided, single submitter",VCV001678586,3.0,57302458,3.0,57268430,1678586,1670335,,NC_000003.12:57268429:A:G,,3:57268430:A>G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688,NM_001109809.5(ZFP57):c.820C>T (p.Leu274Phe),ZFP57,"L274F, L202F",not provided,"Uncertain significance(Last reviewed: Aug 3, 2...","criteria provided, single submitter",VCV001449415,6,29641068,6,29673291,1449415,1416471,,NC_000006.12:29673290:G:A,,6:29673291:G>A
689,NM_001109809.5(ZFP57):c.448C>T (p.Gln150Ter),ZFP57,"Q150*, Q78*","Diabetes mellitus, transient neonatal, 1","Likely pathogenic(Last reviewed: Aug 24, 2021)",no assertion criteria provided,VCV001325371,6,29641440,6,29673663,1325371,1315627,,NC_000006.12:29673662:G:A,,6:29673663:G>A
690,NM_001109809.5(ZFP57):c.711dup (p.Lys238fs),ZFP57,"K166fs, K238fs","Diabetes mellitus, transient neonatal, 1","Likely pathogenic(Last reviewed: Aug 17, 2021)",no assertion criteria provided,VCV001325370,6,29641176 - 29641177,6,29673399 - 29673400,1325370,1315626,,NC_000006.12:29673399:G:GG,,6:29673400:G>GG
691,NM_001109809.5(ZFP57):c.133del (p.Thr45fs),ZFP57,T45fs,"Diabetes mellitus, transient neonatal, 1","Pathogenic(Last reviewed: Jun 11, 2020)",no assertion criteria provided,VCV001323781,6,29643827,6,29676050,1323781,1314062,,NC_000006.12:29676049:T:,,6:29676050:T>


In [12]:
ClinVar_unmapped.to_csv(
    'ClinVar_all_unmapped.csv', header=True, index=False)

In [13]:
#How many variants in the ClinVar_unmapped table
len(ClinVar_unmapped['Accession'].unique())

550

In [14]:
#list of coordinates of the variants that did not map with rs ID to Ensembl
unmapped_coord = ClinVar_unmapped['coordinates'].tolist()

In [15]:
#Mapping them to Ensembl by coordinates
ClinVar_mapped_coord = Ens.query('coordinate in @unmapped_coord').reset_index(drop=True)
ClinVar_mapped_coord

Unnamed: 0,id,seq_region_name,start,end,strand,vf_allele,Location,coordinate,Gene,Transcript,Exon
0,rs183787750,3,57268430.0,57268430.0,1.0,G,3:57268430,3:57268430:A>G,ENSG00000157500,ENST00000650354,ENSE00001032946
1,rs754470733,12,120978644.0,120978650.0,1.0,GGGGGG,12:120978644,12:120978644:GGGGGGG>GGGGGG,ENSG00000135100,ENST00000257555,ENSE00001893688
2,rs1275805852,12,120978802.0,120978802.0,1.0,T,12:120978802,12:120978802:C>T,ENSG00000135100,ENST00000257555,ENSE00001893688
3,rs1001854457,12,120978866.0,120978866.0,1.0,G,12:120978866,12:120978866:C>G,ENSG00000135100,ENST00000257555,ENSE00001893688
4,rs373180062,12,120978907.0,120978907.0,1.0,A,12:120978907,12:120978907:G>A,ENSG00000135100,ENST00000257555,ENSE00001893688
...,...,...,...,...,...,...,...,...,...,...,...
428,rs201499958,11,17407075.0,17407075.0,1.0,G,11:17407075,11:17407075:C>G,ENSG00000006071,ENST00000527905,ENSE00003483011
429,rs201499958,11,17407075.0,17407075.0,1.0,G,11:17407075,11:17407075:C>G,ENSG00000006071,ENST00000644542,ENSE00003817955
430,rs372930264,11,17461599.0,17461599.0,1.0,T,11:17461599,11:17461599:G>T,ENSG00000006071,ENST00000683253,ENSE00003917065
431,rs372930264,11,17461599.0,17461599.0,1.0,T,11:17461599,11:17461599:G>T,ENSG00000006071,ENST00000646737,ENSE00003816790


In [16]:
#This was a 2nd stage mapping
ClinVar_mapped_coord.to_csv(
    'ClinVar_mapped_to_Ens_2nd.csv',
    header=True, index=False)

In [17]:
#How many variants have mapped
len(ClinVar_mapped_coord['coordinate'].unique())

126

In [18]:
#List of the variants that did not map neither with rs nor with coordinates
leftovers = []
ClinVar_mapped_coord_list = ClinVar_mapped_coord['coordinate'].tolist()
for var in ClinVar_unmapped['coordinates']:
    if var not in ClinVar_mapped_coord_list:
        leftovers.append(var)
len(leftovers)

546

In [19]:
#Filtering the ClinVar_unmapped table to only leftover variants
ClinVar_rs_coord_unmapped = ClinVar_unmapped.query('coordinates in @leftovers').reset_index(drop=True)
ClinVar_rs_coord_unmapped

Unnamed: 0,Name,Gene(s),Protein change,Condition(s),Clinical significance (Last reviewed),Review status,Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,AlleleID(s),ID,Canonical_SPDI,Unnamed: 15,coordinates
0,NM_000352.6(ABCC8):c.1801G>A (p.Val601Ile),ABCC8,"V600I, V601I",not specified,"Uncertain significance(Last reviewed: May 5, 2...","criteria provided, single submitter",VCV001696124,11.0,17452377,11.0,17430830,1696124,1688532,,NC_000011.10:17430829:C:T,,11:17430830:C>T
1,NM_012096.3(APPL1):c.69A>G (p.Leu23=),APPL1,,Maturity-onset diabetes of the young type 14|n...,"Benign(Last reviewed: Nov 16, 2021)","criteria provided, multiple submitters, no con...",VCV001170466,3.0,57269608,3.0,57235580,1170466,1154533,,NC_000003.12:57235579:A:G,,3:57235580:A>G
2,NM_000545.8(HNF1A):c.1786del (p.Val596fs),C12orf43|HNF1A,"V596fs, V603fs",Monogenic diabetes,"Likely pathogenic(Last reviewed: Jul 1, 2022)",reviewed by expert panel,VCV001700003,12.0,121438884,12.0,121001081,1700003,1692403,,NC_000012.12:121001080:GG:G,,12:121001081:GG>G
3,NM_000545.8(HNF1A):c.1873C>T (p.Gln625Ter),C12orf43|HNF1A,"Q625*, Q632*",Monogenic diabetes,"Uncertain significance(Last reviewed: Apr 17, ...",reviewed by expert panel,VCV001687090,12.0,121438972,12.0,121001169,1687090,1679255,,NC_000012.12:121001168:C:T,,12:121001169:C>T
4,NM_000545.8(HNF1A):c.1840_1841del (p.Asn614fs),C12orf43|HNF1A,"N614fs, N621fs",Monogenic diabetes,"Likely pathogenic(Last reviewed: May 9, 2022)",reviewed by expert panel,VCV001687087,12.0,121438939 - 121438940,12.0,121001136 - 121001137,1687087,1679252,,NC_000012.12:121001135:AA:,,12:121001136:AA>
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
541,NM_173560.4(RFX6):c.1679-34dup,RFX6,,Hypoplastic pancreas-intestinal atresia-hypopl...,"Benign(Last reviewed: Nov 7, 2021)","criteria provided, single submitter",VCV001327955,6,117246576 - 117246577,6,116925413 - 116925414,1327955,1318574,,NC_000006.12:116925413:AAAAAA:AAAAAAA,,6:116925414:AAAAAA>AAAAAAA
542,NM_001109809.5(ZFP57):c.820C>T (p.Leu274Phe),ZFP57,"L274F, L202F",not provided,"Uncertain significance(Last reviewed: Aug 3, 2...","criteria provided, single submitter",VCV001449415,6,29641068,6,29673291,1449415,1416471,,NC_000006.12:29673290:G:A,,6:29673291:G>A
543,NM_001109809.5(ZFP57):c.711dup (p.Lys238fs),ZFP57,"K166fs, K238fs","Diabetes mellitus, transient neonatal, 1","Likely pathogenic(Last reviewed: Aug 17, 2021)",no assertion criteria provided,VCV001325370,6,29641176 - 29641177,6,29673399 - 29673400,1325370,1315626,,NC_000006.12:29673399:G:GG,,6:29673400:G>GG
544,NM_001109809.5(ZFP57):c.133del (p.Thr45fs),ZFP57,T45fs,"Diabetes mellitus, transient neonatal, 1","Pathogenic(Last reviewed: Jun 11, 2020)",no assertion criteria provided,VCV001323781,6,29643827,6,29676050,1323781,1314062,,NC_000006.12:29676049:T:,,6:29676050:T>


In [20]:
#This was a 2nd stage mapping
ClinVar_rs_coord_unmapped.to_csv(
    'ClinVar_unmapped_by_rs_coord.csv',
    header=True, index=False)

So that would be nice now to know which one of the ClinVar variants are the coding ones or at least lay in exons. For this we can do the "Fetch variant consequences based on a HGVS notation" from here https://rest.ensembl.org/documentation/info/vep_hgvs_get

## 3. Fetch the variant consequence type using coordinates with Ensembl API

In [21]:
# creating a proper genomic coordinate for the API search
coordinates = []
for index,row in ClinVar_rs_coord_unmapped.iterrows():
    coordinates.append(str(int(row['GRCh38Chromosome']))
          + ':g.' + str(int(row['Canonical_SPDI'].split(':')[1])+1)
          + row['Canonical_SPDI'].split(':')[2] + '>' + row['Canonical_SPDI'].split(':')[3])
ClinVar_rs_coord_unmapped['DNA_coordinate'] = coordinates
ClinVar_rs_coord_unmapped

Unnamed: 0,Name,Gene(s),Protein change,Condition(s),Clinical significance (Last reviewed),Review status,Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,AlleleID(s),ID,Canonical_SPDI,Unnamed: 15,coordinates,DNA_coordinate
0,NM_000352.6(ABCC8):c.1801G>A (p.Val601Ile),ABCC8,"V600I, V601I",not specified,"Uncertain significance(Last reviewed: May 5, 2...","criteria provided, single submitter",VCV001696124,11.0,17452377,11.0,17430830,1696124,1688532,,NC_000011.10:17430829:C:T,,11:17430830:C>T,11:g.17430830C>T
1,NM_012096.3(APPL1):c.69A>G (p.Leu23=),APPL1,,Maturity-onset diabetes of the young type 14|n...,"Benign(Last reviewed: Nov 16, 2021)","criteria provided, multiple submitters, no con...",VCV001170466,3.0,57269608,3.0,57235580,1170466,1154533,,NC_000003.12:57235579:A:G,,3:57235580:A>G,3:g.57235580A>G
2,NM_000545.8(HNF1A):c.1786del (p.Val596fs),C12orf43|HNF1A,"V596fs, V603fs",Monogenic diabetes,"Likely pathogenic(Last reviewed: Jul 1, 2022)",reviewed by expert panel,VCV001700003,12.0,121438884,12.0,121001081,1700003,1692403,,NC_000012.12:121001080:GG:G,,12:121001081:GG>G,12:g.121001081GG>G
3,NM_000545.8(HNF1A):c.1873C>T (p.Gln625Ter),C12orf43|HNF1A,"Q625*, Q632*",Monogenic diabetes,"Uncertain significance(Last reviewed: Apr 17, ...",reviewed by expert panel,VCV001687090,12.0,121438972,12.0,121001169,1687090,1679255,,NC_000012.12:121001168:C:T,,12:121001169:C>T,12:g.121001169C>T
4,NM_000545.8(HNF1A):c.1840_1841del (p.Asn614fs),C12orf43|HNF1A,"N614fs, N621fs",Monogenic diabetes,"Likely pathogenic(Last reviewed: May 9, 2022)",reviewed by expert panel,VCV001687087,12.0,121438939 - 121438940,12.0,121001136 - 121001137,1687087,1679252,,NC_000012.12:121001135:AA:,,12:121001136:AA>,12:g.121001136AA>
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
541,NM_173560.4(RFX6):c.1679-34dup,RFX6,,Hypoplastic pancreas-intestinal atresia-hypopl...,"Benign(Last reviewed: Nov 7, 2021)","criteria provided, single submitter",VCV001327955,6,117246576 - 117246577,6,116925413 - 116925414,1327955,1318574,,NC_000006.12:116925413:AAAAAA:AAAAAAA,,6:116925414:AAAAAA>AAAAAAA,6:g.116925414AAAAAA>AAAAAAA
542,NM_001109809.5(ZFP57):c.820C>T (p.Leu274Phe),ZFP57,"L274F, L202F",not provided,"Uncertain significance(Last reviewed: Aug 3, 2...","criteria provided, single submitter",VCV001449415,6,29641068,6,29673291,1449415,1416471,,NC_000006.12:29673290:G:A,,6:29673291:G>A,6:g.29673291G>A
543,NM_001109809.5(ZFP57):c.711dup (p.Lys238fs),ZFP57,"K166fs, K238fs","Diabetes mellitus, transient neonatal, 1","Likely pathogenic(Last reviewed: Aug 17, 2021)",no assertion criteria provided,VCV001325370,6,29641176 - 29641177,6,29673399 - 29673400,1325370,1315626,,NC_000006.12:29673399:G:GG,,6:29673400:G>GG,6:g.29673400G>GG
544,NM_001109809.5(ZFP57):c.133del (p.Thr45fs),ZFP57,T45fs,"Diabetes mellitus, transient neonatal, 1","Pathogenic(Last reviewed: Jun 11, 2020)",no assertion criteria provided,VCV001323781,6,29643827,6,29676050,1323781,1314062,,NC_000006.12:29676049:T:,,6:29676050:T>,6:g.29676050T>


In [22]:
variants = []
passed_vars = []
indels = []

In [23]:
server = "https://rest.ensembl.org"

count = 0
for index, row in ClinVar_rs_coord_unmapped.iterrows():
    if row['Accession'] not in passed_vars:
        variant_id = row['Accession']
        ext = "/vep/human/hgvs/" + row['DNA_coordinate'] + "?"
        r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})
        if not r.ok:
            print(row['DNA_coordinate'])
            indels.append(row['DNA_coordinate'])
            continue
 
        variants.append({'variant_id': variant_id, 'variant': r.json()})
        time.sleep(1)
        count = count + 1
        print(str(count), ' -----> ', ext)

1  ----->  /vep/human/hgvs/11:g.17430830C>T?
2  ----->  /vep/human/hgvs/3:g.57235580A>G?
12:g.121001081GG>G
3  ----->  /vep/human/hgvs/12:g.121001169C>T?
12:g.121001136AA>
12:g.121001116AGAG>AG
12:g.121001098C>
4  ----->  /vep/human/hgvs/12:g.121001099A>AA?
5  ----->  /vep/human/hgvs/12:g.121001064G>A?
12:g.121001159CATCTCCACCCAGATGGCCTCTTCCTCC>CATCTCCACCCAGATGGCCTCTTCCTCCATCTCCACCCAGATGGCCTCTTCCTCC
12:g.121001068CC>
6  ----->  /vep/human/hgvs/12:g.121001077G>T?
7  ----->  /vep/human/hgvs/7:g.44149798T>TT?
8  ----->  /vep/human/hgvs/7:g.44150047C>G?
9  ----->  /vep/human/hgvs/7:g.44146565A>T?
10  ----->  /vep/human/hgvs/7:g.44149969C>A?
11  ----->  /vep/human/hgvs/7:g.44150024C>G?
12  ----->  /vep/human/hgvs/7:g.44153299A>G?
13  ----->  /vep/human/hgvs/7:g.44153301C>T?
14  ----->  /vep/human/hgvs/7:g.44153407C>T?
15  ----->  /vep/human/hgvs/7:g.44145222A>C?
16  ----->  /vep/human/hgvs/7:g.44145671G>T?
17  ----->  /vep/human/hgvs/7:g.44146541A>C?
18  ----->  /vep/human/hgvs/7:g.44147742

151  ----->  /vep/human/hgvs/17:g.37739578G>A?
152  ----->  /vep/human/hgvs/17:g.37739550A>T?
153  ----->  /vep/human/hgvs/17:g.37739548T>C?
154  ----->  /vep/human/hgvs/17:g.37739541G>A?
155  ----->  /vep/human/hgvs/17:g.37739533A>G?
156  ----->  /vep/human/hgvs/17:g.37739532G>C?
157  ----->  /vep/human/hgvs/17:g.37739527G>T?
158  ----->  /vep/human/hgvs/17:g.37739524G>A?
159  ----->  /vep/human/hgvs/17:g.37739518T>C?
17:g.37739512>GGGCTGCA
160  ----->  /vep/human/hgvs/17:g.37739508G>A?
17:g.37739513G>
161  ----->  /vep/human/hgvs/17:g.37739506T>C?
17:g.37739500T>
17:g.37739497GGG>GG
162  ----->  /vep/human/hgvs/17:g.37739494T>G?
163  ----->  /vep/human/hgvs/17:g.37739491G>A?
164  ----->  /vep/human/hgvs/17:g.37739490C>G?
165  ----->  /vep/human/hgvs/17:g.37739479A>G?
166  ----->  /vep/human/hgvs/17:g.37739471C>G?
17:g.37739459CTTTCT>CT
167  ----->  /vep/human/hgvs/17:g.37739458G>A?
17:g.37739450C>
168  ----->  /vep/human/hgvs/17:g.37739439C>G?
169  ----->  /vep/human/hgvs/17:g.377394

300  ----->  /vep/human/hgvs/12:g.120978841G>C?
12:g.120978837AGAG>AG
301  ----->  /vep/human/hgvs/12:g.120978790C>A?
302  ----->  /vep/human/hgvs/12:g.120978817C>G?
303  ----->  /vep/human/hgvs/12:g.120978815T>C?
304  ----->  /vep/human/hgvs/12:g.120978814C>G?
12:g.120978810GGCCCTGCTC>TG
305  ----->  /vep/human/hgvs/12:g.120978809C>T?
306  ----->  /vep/human/hgvs/12:g.120978803T>G?
307  ----->  /vep/human/hgvs/12:g.120978803T>C?
308  ----->  /vep/human/hgvs/12:g.120978803T>A?
309  ----->  /vep/human/hgvs/12:g.120994183G>A?
310  ----->  /vep/human/hgvs/12:g.120988903G>A?
311  ----->  /vep/human/hgvs/12:g.120978794A>C?
312  ----->  /vep/human/hgvs/12:g.120996601G>C?
313  ----->  /vep/human/hgvs/12:g.120988848T>TT?
314  ----->  /vep/human/hgvs/12:g.120978850C>T?
315  ----->  /vep/human/hgvs/12:g.120978785G>A?
316  ----->  /vep/human/hgvs/12:g.120978784A>T?
317  ----->  /vep/human/hgvs/12:g.120978770T>C?
318  ----->  /vep/human/hgvs/12:g.120978769A>C?
319  ----->  /vep/human/hgvs/12:g.120

In [24]:
len(indels)

115

In [25]:
len(variants)

431

## 4. Working with the ones that have fetched

Create a dataframe with variants, coordinates and their fetched consequences

In [26]:
coord_cons = []
for var in variants:
    var_id = var['variant_id']
    for trans in var['variant']:
        coord = trans['id']
        consequence = trans['most_severe_consequence']
        coord_cons.append({'coordinate': coord, 'consequence': consequence, 'accession': var_id})
coord_cons_df = pd.DataFrame(coord_cons)
coord_cons_df

Unnamed: 0,coordinate,consequence,accession
0,11:g.17430830C>T,missense_variant,VCV001696124
1,3:g.57235580A>G,synonymous_variant,VCV001170466
2,12:g.121001169C>T,stop_gained,VCV001687090
3,12:g.121001099A>AA,frameshift_variant,VCV001687083
4,12:g.121001064G>A,splice_acceptor_variant,VCV001687082
...,...,...,...
426,11:g.17387536G>C,missense_variant,VCV001338466
427,11:g.17414389G>A,intron_variant,VCV001177449
428,6:g.29673291G>A,missense_variant,VCV001449415
429,6:g.29673400G>GG,frameshift_variant,VCV001325370


In [27]:
coord_cons_df.to_csv(
    'ClinVar_unmapped_fetched_unfiltered.csv',
    header=True, index=False)

In [29]:
#check which consequence types are there
coord_cons_df['consequence'].unique()

array(['missense_variant', 'synonymous_variant', 'stop_gained',
       'frameshift_variant', 'splice_acceptor_variant',
       'splice_region_variant', 'splice_donor_variant',
       'splice_donor_5th_base_variant', 'intron_variant',
       '5_prime_UTR_variant', 'start_lost', 'upstream_gene_variant',
       '3_prime_UTR_variant', 'splice_polypyrimidine_tract_variant',
       'non_coding_transcript_exon_variant',
       'splice_donor_region_variant'], dtype=object)

The consequence types decided from the Pathogenicity check are the following:
    
'missense variant',
'frameshift variant',
'splice donor variant',
'splice acceptor variant',
'nonsense (stop gained)',
'stop lost',
'nc transcript variant' 

for the categories from ClinVar and the following for the Ensembl categories:

'missense_variant',
'protein_altering_variant', 
'coding_sequence_variant', 
'frameshift_variant', 
'splice_donor_variant', 
'splice_acceptor_variant', 
'splice_donor_5th_base_variant', 
'start_lost', 
'stop_gained', 
'stop_lost', 
'inframe_deletion', 
'inframe_insertion'

In [30]:
#The list of consequence types we desided to leave in the table

consequence_type_list = ['missense_variant',
                         'protein_altering_variant',
                         'coding_sequence_variant',
                         'frameshift_variant',
                         'splice_donor_variant',
                         'splice_acceptor_variant',
                         'splice_donor_5th_base_variant',
                         'start_lost',
                         'stop_gained',
                         'stop_lost',
                         'inframe_deletion',
                         'inframe_insertion']

In [31]:
ClinVar_unmapped_SNV_filtered = coord_cons_df.query('consequence in @consequence_type_list').reset_index(drop=True)
ClinVar_unmapped_SNV_filtered

Unnamed: 0,coordinate,consequence,accession
0,11:g.17430830C>T,missense_variant,VCV001696124
1,12:g.121001169C>T,stop_gained,VCV001687090
2,12:g.121001099A>AA,frameshift_variant,VCV001687083
3,12:g.121001064G>A,splice_acceptor_variant,VCV001687082
4,12:g.121001077G>T,missense_variant,VCV001315998
...,...,...,...
370,11:g.17387338C>A,missense_variant,VCV001525999
371,11:g.17387902C>T,missense_variant,VCV001525998
372,11:g.17387536G>C,missense_variant,VCV001338466
373,6:g.29673291G>A,missense_variant,VCV001449415


In [32]:
len(ClinVar_unmapped_SNV_filtered['coordinate'].unique())

282

In [33]:
#Make them pretty for the inclusion into VCF
ClinVar_unmapped_SNV_filtered_listdict = []
for index,row in ClinVar_unmapped_SNV_filtered.iterrows():
    chrom = row['coordinate'][:row['coordinate'].find(':')]
    pos = row['coordinate'][row['coordinate'].find('.')+1:row['coordinate'].find('>')-1]
    ref = row['coordinate'][row['coordinate'].find('>')-1]
    alt = row['coordinate'][row['coordinate'].find('>')+1:]
    accession = row['accession']
    ClinVar_unmapped_SNV_filtered_listdict.append({'chrom': chrom, 'pos': pos,
                                             'ref': ref, 'alt': alt, 'accession': accession})
ClinVar_unmapped_SNV_filtered_df = pd.DataFrame(ClinVar_unmapped_SNV_filtered_listdict)   
ClinVar_unmapped_SNV_filtered_df

Unnamed: 0,chrom,pos,ref,alt,accession
0,11,17430830,C,T,VCV001696124
1,12,121001169,C,T,VCV001687090
2,12,121001099,A,AA,VCV001687083
3,12,121001064,G,A,VCV001687082
4,12,121001077,G,T,VCV001315998
...,...,...,...,...,...
370,11,17387338,C,A,VCV001525999
371,11,17387902,C,T,VCV001525998
372,11,17387536,G,C,VCV001338466
373,6,29673291,G,A,VCV001449415


In [34]:
ClinVar_unmapped_SNV_filtered_df.to_csv(
    'ClinVar_unmapped_filtered_SNV_for_VCF.csv',
    header=True, index=False)

## 5. Working with indels

The ones that did not fetch are all indels. They are in the list 'indels' created in the p. 3

In [35]:
indels

['12:g.121001081GG>G',
 '12:g.121001136AA>',
 '12:g.121001116AGAG>AG',
 '12:g.121001098C>',
 '12:g.121001159CATCTCCACCCAGATGGCCTCTTCCTCC>CATCTCCACCCAGATGGCCTCTTCCTCCATCTCCACCCAGATGGCCTCTTCCTCC',
 '12:g.121001068CC>',
 '7:g.44145499GGGG>GGG',
 '7:g.44145579TGATGA>TGA',
 '7:g.44145255CGC>C',
 '7:g.44153361GGG>GG',
 '7:g.44149981GATAG>GATAGATAG',
 '12:g.120988910A>',
 '12:g.120988883ACAACA>ACA',
 '12:g.120999577CC>C',
 '12:g.120994316CC>T',
 '12:g.120978897CC>A',
 '12:g.120978906GGGGGAGTCCTGCGGCGG>G',
 '12:g.120994312GGG>GGGG',
 '12:g.120994260CC>C',
 '12:g.120978483TC>',
 '12:g.120978971GGGG>GGG',
 '12:g.120978960GGGGG>GGGGGG',
 '12:g.120978952AA>A',
 '12:g.120978929GAGGGGAG>GAG',
 '12:g.120978810GGCCCTGCTC>TG',
 '12:g.120978847AT>CC',
 '12:g.120978788AGCTGCAGACGGAGCT>AGCTGCAGACGGAGCTGCAGACGGAGCT',
 '12:g.120988931CCC>CC',
 '12:g.120997532GACCACCCTGCAGCCCG>GACCACCCTGCAGCCCGACCACCCTGCAGCCCG',
 '12:g.120997510>G',
 '12:g.120997495AGAG>AG',
 '12:g.120993575ACCAACCAA>ACCAA',
 '12:g.120994324

In [36]:
#filter the unmapped ClinVar table to contain just indels
indel_df = ClinVar_rs_coord_unmapped.query('DNA_coordinate in @indels').reset_index(drop=True)
indel_df

Unnamed: 0,Name,Gene(s),Protein change,Condition(s),Clinical significance (Last reviewed),Review status,Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,AlleleID(s),ID,Canonical_SPDI,Unnamed: 15,coordinates,DNA_coordinate
0,NM_000545.8(HNF1A):c.1786del (p.Val596fs),C12orf43|HNF1A,"V596fs, V603fs",Monogenic diabetes,"Likely pathogenic(Last reviewed: Jul 1, 2022)",reviewed by expert panel,VCV001700003,12.0,121438884,12.0,121001081,1700003,1692403,,NC_000012.12:121001080:GG:G,,12:121001081:GG>G,12:g.121001081GG>G
1,NM_000545.8(HNF1A):c.1840_1841del (p.Asn614fs),C12orf43|HNF1A,"N614fs, N621fs",Monogenic diabetes,"Likely pathogenic(Last reviewed: May 9, 2022)",reviewed by expert panel,VCV001687087,12.0,121438939 - 121438940,12.0,121001136 - 121001137,1687087,1679252,,NC_000012.12:121001135:AA:,,12:121001136:AA>,12:g.121001136AA>
2,NM_000545.8(HNF1A):c.1822_1823del (p.Ser608fs),C12orf43|HNF1A,"S608fs, S615fs",Monogenic diabetes,"Pathogenic(Last reviewed: May 9, 2022)",reviewed by expert panel,VCV001687086,12.0,121438919 - 121438920,12.0,121001116 - 121001117,1687086,1679251,,NC_000012.12:121001115:AGAG:AG,,12:121001116:AGAG>AG,12:g.121001116AGAG>AG
3,NM_000545.8(HNF1A):c.1802del (p.Ser600_Ser601i...,C12orf43|HNF1A,,Monogenic diabetes,"Likely pathogenic(Last reviewed: May 4, 2022)",reviewed by expert panel,VCV001687084,12.0,121438901,12.0,121001098,1687084,1679249,,NC_000012.12:121001097:C:,,12:121001098:C>,12:g.121001098C>
4,NM_000545.8(HNF1A):c.1864_1890dup (p.Ile622_Se...,C12orf43|HNF1A,,Monogenic diabetes,"Uncertain significance(Last reviewed: Apr 24, ...",reviewed by expert panel,VCV001687077,12.0,121438961 - 121438962,12.0,121001158 - 121001159,1687077,1679242,,NC_000012.12:121001158:CATCTCCACCCAGATGGCCTCTT...,,12:121001159:CATCTCCACCCAGATGGCCTCTTCCTCC>CATC...,12:g.121001159CATCTCCACCCAGATGGCCTCTTCCTCC>CAT...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,NM_004836.7(EIF2AK3):c.1936_1958del (p.Leu646fs),EIF2AK3,"L495fs, L646fs",Wolcott-Rallison dysplasia,"Pathogenic(Last reviewed: Jun 30, 2021)","criteria provided, single submitter",VCV001179141,2,88876150 - 88876172,2,88576632 - 88576654,1179141,1168625,,NC_000002.12:88576631:CTAACAATGCCCGGGTGTTCAAGC...,,2:88576632:CTAACAATGCCCGGGTGTTCAAGCT>CT,2:g.88576632CTAACAATGCCCGGGTGTTCAAGCT>CT
111,NM_004836.7(EIF2AK3):c.12del (p.Ile5fs),EIF2AK3,I5fs,Wolcott-Rallison dysplasia,"Pathogenic(Last reviewed: Feb 28, 2020)","criteria provided, single submitter",VCV001175193,2,88926781,2,88627263,1175193,1164624,,NC_000002.12:88627262:GG:G,,2:88627263:GG>G,2:g.88627263GG>G
112,NM_005742.4(PDIA6):c.547del (p.Val183fs),PDIA6,"V180fs, V183fs, V188fs, V231fs, V235fs",not provided,"Uncertain significance(Last reviewed: Jan 18, ...",no assertion criteria provided,VCV001334421,2,10931958,2,10791832,1334421,1325315,,NC_000002.12:10791831:CC:C,,2:10791832:CC>C,2:g.10791832CC>C
113,NM_173560.4(RFX6):c.1679-34dup,RFX6,,Hypoplastic pancreas-intestinal atresia-hypopl...,"Benign(Last reviewed: Nov 7, 2021)","criteria provided, single submitter",VCV001327955,6,117246576 - 117246577,6,116925413 - 116925414,1327955,1318574,,NC_000006.12:116925413:AAAAAA:AAAAAAA,,6:116925414:AAAAAA>AAAAAAA,6:g.116925414AAAAAA>AAAAAAA


In [37]:
#Make them pretty for the inclusion into VCF
df_dict_list = []
for index,row in indel_df.iterrows():
    df_dict_list.append({'chrom' : str(int(row['GRCh38Chromosome'])), 
     'pos' : row['GRCh38Location'].split(' -')[0], 
     'ref' : row['Canonical_SPDI'].split(':')[2], 
     'alt' : row['Canonical_SPDI'].split(':')[3],
                         'accession': row['Accession']})
ClinVar_indels = pd.DataFrame(df_dict_list)
ClinVar_indels

Unnamed: 0,chrom,pos,ref,alt,accession
0,12,121001081,GG,G,VCV001700003
1,12,121001136,AA,,VCV001687087
2,12,121001116,AGAG,AG,VCV001687086
3,12,121001098,C,,VCV001687084
4,12,121001158,CATCTCCACCCAGATGGCCTCTTCCTCC,CATCTCCACCCAGATGGCCTCTTCCTCCATCTCCACCCAGATGGCC...,VCV001687077
...,...,...,...,...,...
110,2,88576632,CTAACAATGCCCGGGTGTTCAAGCT,CT,VCV001179141
111,2,88627263,GG,G,VCV001175193
112,2,10791832,CC,C,VCV001334421
113,6,116925413,AAAAAA,AAAAAAA,VCV001327955


In [38]:
ClinVar_indels.to_csv(
    'ClinVar_indels.csv',
    header=True, index=False)

## 6. Repeat the same but with the table filtered to just 'pathogenic' and 'likely pathogenic' variants

In [39]:
ClinVar_clinical = ClinVar.rename(columns={"Clinical significance (Last reviewed)": "clinical_significance"})
ClinVar_clinical

Unnamed: 0,Name,Gene(s),Protein change,Condition(s),clinical_significance,Review status,Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,AlleleID(s),ID,Canonical_SPDI,Unnamed: 15,coordinates
0,NM_000352.6(ABCC8):c.1630+5G>A,ABCC8,,Transitory neonatal diabetes mellitus|Maturity...,Uncertain significance,"criteria provided, single submitter",VCV001697233,11.0,17464262,11.0,17442715,1697233,1689628,rs1956364777,NC_000011.10:17442714:C:T,,11:17442715:C>T
1,NM_000352.6(ABCC8):c.1716G>A (p.Ser572_Pro573=),ABCC8,,Transitory neonatal diabetes mellitus|Maturity...,Uncertain significance,"criteria provided, single submitter",VCV001697229,11.0,17452462,11.0,17430915,1697229,1689624,rs750619309,NC_000011.10:17430914:C:T,,11:17430915:C>T
2,NM_000352.6(ABCC8):c.1833C>T (p.Ser611_Glu612=),ABCC8,,Transitory neonatal diabetes mellitus|Maturity...,Uncertain significance,"criteria provided, single submitter",VCV001697228,11.0,17450202,11.0,17428655,1697228,1689623,rs763273046,NC_000011.10:17428654:G:A,,11:17428655:G>A
3,NM_000352.6(ABCC8):c.1801G>A (p.Val601Ile),ABCC8,"V600I, V601I",not specified,"Uncertain significance(Last reviewed: May 5, 2...","criteria provided, single submitter",VCV001696124,11.0,17452377,11.0,17430830,1696124,1688532,,NC_000011.10:17430829:C:T,,11:17430830:C>T
4,NM_000352.6(ABCC8):c.890G>T (p.Arg297Met),ABCC8,"R296M, R297M",Maturity onset diabetes mellitus in young|Tran...,Uncertain significance,"criteria provided, single submitter",VCV001693120,11.0,17482156,11.0,17460609,1693120,1685560,rs371802112,NC_000011.10:17460608:C:A,,11:17460609:C>A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4454,NM_001109809.5(ZFP57):c.743G>A (p.Arg248His),ZFP57,"R248H, R176H","Diabetes mellitus, transient neonatal, 1","Pathogenic(Last reviewed: Aug 1, 2008)",no assertion criteria provided,VCV000000719,6,29641145,6,29673368,719,15758,rs77625743,NC_000006.12:29673367:C:T,,6:29673368:C>T
4455,NM_001109809.5(ZFP57):c.1372C>G (p.His458Asp),ZFP57,"H458D, H386D","Diabetes mellitus, transient neonatal, 1","Pathogenic(Last reviewed: Aug 1, 2008)",no assertion criteria provided,VCV000000718,6,29640516,6,29672739,718,15757,rs79020217,NC_000006.12:29672738:G:C,,6:29672739:G>C
4456,NM_001109809.5(ZFP57):c.1383del (p.Tyr462fs),ZFP57,"Y390fs, Y462fs","Diabetes mellitus, transient neonatal, 1","Pathogenic(Last reviewed: Aug 1, 2008)",no assertion criteria provided,VCV000000717,6,29640505,6,29672728,717,15756,rs606231122,NC_000006.12:29672727:G:,,6:29672728:G>
4457,NM_001109809.5(ZFP57):c.317_318del (p.Glu106fs),ZFP57,"E34fs, E106fs","Diabetes mellitus, transient neonatal, 1","Pathogenic(Last reviewed: Aug 1, 2008)",no assertion criteria provided,VCV000000716,6,29643197 - 29643198,6,29675420 - 29675421,716,15755,rs606231121,NC_000006.12:29675419:CTCTCTC:CTCTC,,6:29675420:CTCTCTC>CTCTC


In [40]:
pathogenic_vars = []
for index, row in ClinVar_clinical.iterrows():
    if row['clinical_significance'].split('(')[0] == 'Pathogenic' or row['clinical_significance'].split('(')[0] == 'Likely pathogenic':
        pathogenic_vars.append(row['Accession'])
pathogenic = ClinVar_clinical.query('Accession in @pathogenic_vars').reset_index(drop=True)
pathogenic       

Unnamed: 0,Name,Gene(s),Protein change,Condition(s),clinical_significance,Review status,Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,AlleleID(s),ID,Canonical_SPDI,Unnamed: 15,coordinates
0,NM_000352.6(ABCC8):c.4544C>T (p.Thr1515Met),ABCC8,"T1514M, T1515M, T1516M, T1537M",not provided|Diabetes mellitus,"Pathogenic(Last reviewed: Sep 2, 2021)","criteria provided, single submitter",VCV001506182,11.0,17415814,11.0,17394267,1506182,1339911,,NC_000011.10:17394266:G:A,,11:17394267:G>A
1,NM_000352.6(ABCC8):c.4516G>A (p.Glu1506Lys),ABCC8,"E1506K, E1507K, E1505K, E1528K","not provided|Hyperinsulinemic hypoglycemia, fa...","Pathogenic(Last reviewed: Oct 8, 2020)","criteria provided, multiple submitters, no con...",VCV000009097,11.0,17415842,11.0,17394295,9097,24136,rs137852671,NC_000011.10:17394294:C:T,,11:17394295:C>T
2,NM_012096.3(APPL1):c.280G>A (p.Asp94Asn),APPL1,D94N,Maturity-onset diabetes of the young type 14,"Pathogenic(Last reviewed: Jul 2, 2015)",no assertion criteria provided,VCV000208075,3.0,57272139,3.0,57238111,208075,204354,rs796065047,NC_000003.12:57238110:G:A,,3:57238111:G>A
3,NM_012096.3(APPL1):c.1655T>A (p.Leu552Ter),APPL1,L552*,Maturity-onset diabetes of the young type 14,"Pathogenic(Last reviewed: Jul 2, 2015)",no assertion criteria provided,VCV000208074,3.0,57294044,3.0,57260016,208074,204353,rs869320673,NC_000003.12:57260015:T:A,,3:57260016:T>A
4,NM_000545.8(HNF1A):c.1786del (p.Val596fs),C12orf43|HNF1A,"V596fs, V603fs",Monogenic diabetes,"Likely pathogenic(Last reviewed: Jul 1, 2022)",reviewed by expert panel,VCV001700003,12.0,121438884,12.0,121001081,1700003,1692403,,NC_000012.12:121001080:GG:G,,12:121001081:GG>G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
877,NM_001109809.5(ZFP57):c.743G>A (p.Arg248His),ZFP57,"R248H, R176H","Diabetes mellitus, transient neonatal, 1","Pathogenic(Last reviewed: Aug 1, 2008)",no assertion criteria provided,VCV000000719,6,29641145,6,29673368,719,15758,rs77625743,NC_000006.12:29673367:C:T,,6:29673368:C>T
878,NM_001109809.5(ZFP57):c.1372C>G (p.His458Asp),ZFP57,"H458D, H386D","Diabetes mellitus, transient neonatal, 1","Pathogenic(Last reviewed: Aug 1, 2008)",no assertion criteria provided,VCV000000718,6,29640516,6,29672739,718,15757,rs79020217,NC_000006.12:29672738:G:C,,6:29672739:G>C
879,NM_001109809.5(ZFP57):c.1383del (p.Tyr462fs),ZFP57,"Y390fs, Y462fs","Diabetes mellitus, transient neonatal, 1","Pathogenic(Last reviewed: Aug 1, 2008)",no assertion criteria provided,VCV000000717,6,29640505,6,29672728,717,15756,rs606231122,NC_000006.12:29672727:G:,,6:29672728:G>
880,NM_001109809.5(ZFP57):c.317_318del (p.Glu106fs),ZFP57,"E34fs, E106fs","Diabetes mellitus, transient neonatal, 1","Pathogenic(Last reviewed: Aug 1, 2008)",no assertion criteria provided,VCV000000716,6,29643197 - 29643198,6,29675420 - 29675421,716,15755,rs606231121,NC_000006.12:29675419:CTCTCTC:CTCTC,,6:29675420:CTCTCTC>CTCTC


In [41]:
# Checking that there are no NLK, KLF11, PAX4
pathogenic['Gene(s)'].unique()

array(['ABCC8', 'APPL1', 'C12orf43|HNF1A', 'CEL', 'GCK', 'HNF1A', 'HNF1B',
       'HNF4A', 'HNF4A|R3HDML-AS1', 'IGF2|INS|INS-IGF2', 'INS|INS-IGF2',
       'INS-IGF2|INS', 'KCNJ11', 'MLKL', 'NEUROD1', 'PAX4', 'PDX1',
       'PLIN1', 'BSCL2|HNRNPUL2-BSCL2', 'LEP', 'ABCC8|LOC110121471',
       'EIF2AK3', 'EIF2AK3|LOC101928371', 'FBN1', 'FOXP3', 'GLIS3',
       'GPR161',
       'HYMAI|LOC113146422|LOC123864090|LOC123864091|LTV1|PHACTR2|PLAGL1|SF3B5|SNORA98|STX11|TRL-TAA1-1|UTRN|ZC2HC1B',
       'INS', 'KCNQ1', 'KCNQ2', 'LOC101928371|EIF2AK3',
       'LOC102724058|SCN1A', 'LZTR1', 'MAGEL2', 'MECP2', 'PAX6', 'PTF1A',
       'PURA', 'RFX6', 'SHANK3', 'SLC2A2', 'TRIP11', 'ZFP57'],
      dtype=object)

In [42]:
pathogenic.to_csv(
    'ClinVar_MD_pathogenic.csv',
    header=True, index=False)

In [47]:
# choosing the variants from ClinVar that are in dbSNP
ClinVar_pat_list = []
for var in pathogenic['ID']:
    if str(var).startswith('rs'):
        ClinVar_pat_list.append(var)
ClinVar_pat_rs = list(set(ClinVar_pat_list)) #getting rid of duplicates
ClinVar_pat_mapped = Ens.drop_duplicates().query('id in @ClinVar_pat_rs').reset_index(drop=True)
ClinVar_pat_mapped.to_csv(
    'ClinVar_pathogenic_mapped_1st.csv',
    header=True, index=False)
print('created the 1st stage of mapping')

# choosing the variants from ClinVar that are not in dbSNP
ClinVar_nonrs_list = []
for index,row in pathogenic.iterrows():
    if str(row['ID']).startswith('rs'):
        continue
    if row['Canonical_SPDI'] == '':
        continue
    else: ClinVar_nonrs_list.append(row['Canonical_SPDI'])
len(ClinVar_nonrs_list)
ClinVar_pat_unmapped = pathogenic.query('Canonical_SPDI in @ClinVar_nonrs_list').reset_index(drop=True)
ClinVar_pat_unmapped.to_csv(
    'ClinVar_pat_unmapped.csv', header=True, index=False)
print('saving the table with unmapped')

#list of coordinates of the variants that did not map with rs ID to Ensembl
unmapped_coord = ClinVar_pat_unmapped['coordinates'].tolist()

#Mapping them to Ensembl by coordinates
ClinVar_pat_mapped_coord = Ens.query('coordinate in @unmapped_coord').reset_index(drop=True)
#This was a 2nd stage mapping
ClinVar_pat_mapped_coord.to_csv(
    'ClinVar_pathogenic_mapped_to_Ens_2nd.csv',
    header=True, index=False)
print('creating 2nd stage of mapping')

#List of the variants that did not map neither with rs nor with coordinates
leftovers = []
ClinVar_pat_mapped_coord_list = ClinVar_pat_mapped_coord['coordinate'].tolist()
for var in ClinVar_pat_unmapped['coordinates']:
    if var not in ClinVar_pat_mapped_coord_list:
        leftovers.append(var)

#Filtering the ClinVar_unmapped table to only leftover variants
ClinVar_pat_rs_coord_unmapped = ClinVar_pat_unmapped.query('coordinates in @leftovers').reset_index(drop=True)

# creating a proper genomic coordinate for the API search
coordinates = []
for index,row in ClinVar_pat_rs_coord_unmapped.iterrows():
    coordinates.append(str(int(row['GRCh38Chromosome']))
          + ':g.' + str(int(row['Canonical_SPDI'].split(':')[1])+1)
          + row['Canonical_SPDI'].split(':')[2] + '>' + row['Canonical_SPDI'].split(':')[3])
ClinVar_pat_rs_coord_unmapped['DNA_coordinate'] = coordinates
ClinVar_pat_rs_coord_unmapped

created the 1st stage of mapping
saving the table with unmapped
creating 2nd stage of mapping


Unnamed: 0,Name,Gene(s),Protein change,Condition(s),clinical_significance,Review status,Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,AlleleID(s),ID,Canonical_SPDI,Unnamed: 15,coordinates,DNA_coordinate
0,NM_000545.8(HNF1A):c.1786del (p.Val596fs),C12orf43|HNF1A,"V596fs, V603fs",Monogenic diabetes,"Likely pathogenic(Last reviewed: Jul 1, 2022)",reviewed by expert panel,VCV001700003,12.0,121438884,12.0,121001081,1700003,1692403,,NC_000012.12:121001080:GG:G,,12:121001081:GG>G,12:g.121001081GG>G
1,NM_000545.8(HNF1A):c.1840_1841del (p.Asn614fs),C12orf43|HNF1A,"N614fs, N621fs",Monogenic diabetes,"Likely pathogenic(Last reviewed: May 9, 2022)",reviewed by expert panel,VCV001687087,12.0,121438939 - 121438940,12.0,121001136 - 121001137,1687087,1679252,,NC_000012.12:121001135:AA:,,12:121001136:AA>,12:g.121001136AA>
2,NM_000545.8(HNF1A):c.1822_1823del (p.Ser608fs),C12orf43|HNF1A,"S608fs, S615fs",Monogenic diabetes,"Pathogenic(Last reviewed: May 9, 2022)",reviewed by expert panel,VCV001687086,12.0,121438919 - 121438920,12.0,121001116 - 121001117,1687086,1679251,,NC_000012.12:121001115:AGAG:AG,,12:121001116:AGAG>AG,12:g.121001116AGAG>AG
3,NM_000545.8(HNF1A):c.1802del (p.Ser600_Ser601i...,C12orf43|HNF1A,,Monogenic diabetes,"Likely pathogenic(Last reviewed: May 4, 2022)",reviewed by expert panel,VCV001687084,12.0,121438901,12.0,121001098,1687084,1679249,,NC_000012.12:121001097:C:,,12:121001098:C>,12:g.121001098C>
4,NM_000545.8(HNF1A):c.1772_1773del (p.Ser591fs),C12orf43|HNF1A,"S591fs, S598fs",Monogenic diabetes,"Likely pathogenic(Last reviewed: Apr 17, 2022)",reviewed by expert panel,VCV001687070,12.0,121438871 - 121438872,12.0,121001068 - 121001069,1687070,1679235,,NC_000012.12:121001067:CC:,,12:121001068:CC>,12:g.121001068CC>
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
358,NM_000525.4(KCNJ11):c.754G>T (p.Val252Leu),KCNJ11,"V165L, V252L",Neonatal diabetes mellitus,Likely pathogenic,"criteria provided, single submitter",VCV001525999,11,17408885,11,17387338,1525999,1517404,,NC_000011.10:17387337:C:A,,11:17387338:C>A,11:g.17387338C>A
359,NM_000525.4(KCNJ11):c.190G>A (p.Val64Met),KCNJ11,V64M,Neonatal diabetes mellitus,Likely pathogenic,"criteria provided, single submitter",VCV001525998,11,17409449,11,17387902,1525998,1517403,,NC_000011.10:17387901:C:T,,11:17387902:C>T,11:g.17387902C>T
360,NM_000525.4(KCNJ11):c.556C>G (p.His186Asp),KCNJ11,"H186D, H99D",not provided,"Likely pathogenic(Last reviewed: Aug 28, 2021)","criteria provided, multiple submitters, no con...",VCV001338466,11,17409083,11,17387536,1338466,1329474,,NC_000011.10:17387535:G:C,,11:17387536:G>C,11:g.17387536G>C
361,NM_001109809.5(ZFP57):c.711dup (p.Lys238fs),ZFP57,"K166fs, K238fs","Diabetes mellitus, transient neonatal, 1","Likely pathogenic(Last reviewed: Aug 17, 2021)",no assertion criteria provided,VCV001325370,6,29641176 - 29641177,6,29673399 - 29673400,1325370,1315626,,NC_000006.12:29673399:G:GG,,6:29673400:G>GG,6:g.29673400G>GG


The 1st stage table includes 314 variants and the 2nd stage table includes 36 variants

In [44]:
variants_pat = []
passed_vars_pat = []
indels_pat = []

In [45]:
server = "https://rest.ensembl.org"

count = 0
for index, row in ClinVar_pat_rs_coord_unmapped.iterrows():
    if row['Accession'] not in passed_vars:
        variant_id = row['Accession']
        ext = "/vep/human/hgvs/" + row['DNA_coordinate'] + "?"
        r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})
        if not r.ok:
            print(row['DNA_coordinate'])
            indels_pat.append(row['DNA_coordinate'])
            continue
 
        variants_pat.append({'variant_id': variant_id, 'variant': r.json()})
        time.sleep(1)
        count = count + 1
        print(str(count), ' -----> ', ext)

12:g.121001081GG>G
12:g.121001136AA>
12:g.121001116AGAG>AG
12:g.121001098C>
12:g.121001068CC>
1  ----->  /vep/human/hgvs/12:g.121001077G>T?
2  ----->  /vep/human/hgvs/7:g.44149798T>TT?
3  ----->  /vep/human/hgvs/7:g.44150047C>G?
4  ----->  /vep/human/hgvs/7:g.44146565A>T?
5  ----->  /vep/human/hgvs/7:g.44149969C>A?
6  ----->  /vep/human/hgvs/7:g.44150024C>G?
7  ----->  /vep/human/hgvs/7:g.44153299A>G?
8  ----->  /vep/human/hgvs/7:g.44153301C>T?
9  ----->  /vep/human/hgvs/7:g.44153407C>T?
10  ----->  /vep/human/hgvs/7:g.44145222A>C?
11  ----->  /vep/human/hgvs/7:g.44145671G>T?
12  ----->  /vep/human/hgvs/7:g.44146541A>C?
13  ----->  /vep/human/hgvs/7:g.44147742C>T?
14  ----->  /vep/human/hgvs/7:g.44149825T>A?
15  ----->  /vep/human/hgvs/7:g.44149825T>C?
16  ----->  /vep/human/hgvs/7:g.44153457G>A?
17  ----->  /vep/human/hgvs/7:g.44147762T>C?
7:g.44145499GGGG>GGG
18  ----->  /vep/human/hgvs/7:g.44145630C>G?
19  ----->  /vep/human/hgvs/7:g.44145210C>A?
20  ----->  /vep/human/hgvs/7:g.4415

146  ----->  /vep/human/hgvs/17:g.37710661C>CC?
147  ----->  /vep/human/hgvs/17:g.37710654T>C?
17:g.37710610TGATTG>TG
148  ----->  /vep/human/hgvs/17:g.37710573G>T?
17:g.37710571CC>C
149  ----->  /vep/human/hgvs/17:g.37710565G>A?
17:g.37710562GCTGTAAAACCGACTGGCTGGTCACCATGG>
150  ----->  /vep/human/hgvs/17:g.37710502C>T?
151  ----->  /vep/human/hgvs/17:g.37710502C>G?
152  ----->  /vep/human/hgvs/17:g.37705003T>G?
153  ----->  /vep/human/hgvs/17:g.37705003T>A?
17:g.37704957G>
17:g.37704954G>
154  ----->  /vep/human/hgvs/17:g.37704946G>A?
155  ----->  /vep/human/hgvs/17:g.37704916C>T?
156  ----->  /vep/human/hgvs/17:g.37701166A>T?
157  ----->  /vep/human/hgvs/17:g.37701157G>A?
17:g.37701156TGTG>TG
17:g.37701153CTCT>CT
158  ----->  /vep/human/hgvs/17:g.37701109G>A?
159  ----->  /vep/human/hgvs/17:g.37701088G>A?
17:g.37701000T>
160  ----->  /vep/human/hgvs/17:g.37699191T>C?
161  ----->  /vep/human/hgvs/17:g.37687394T>A?
162  ----->  /vep/human/hgvs/17:g.37739471C>T?
17:g.37733577GG>G
17:g.3

In [46]:
coord_cons = []
for var in variants_pat:
    var_id = var['variant_id']
    for trans in var['variant']:
        coord = trans['id']
        consequence = trans['most_severe_consequence']
        coord_cons.append({'coordinate': coord, 'consequence': consequence, 'accession': var_id})
pat_coord_cons_df = pd.DataFrame(coord_cons)
pat_coord_cons_df.to_csv(
    'ClinVar_pathogenic_unmapped_fetched_unfiltered.csv',
    header=True, index=False)

#The list of consequence types we desided to leave in the table
consequence_type_list = ['missense_variant',
                         'protein_altering_variant',
                         'coding_sequence_variant',
                         'frameshift_variant',
                         'splice_donor_variant',
                         'splice_acceptor_variant',
                         'splice_donor_5th_base_variant',
                         'start_lost',
                         'stop_gained',
                         'stop_lost',
                         'inframe_deletion',
                         'inframe_insertion']

ClinVar_pat_unmapped_SNV_filtered = pat_coord_cons_df.query(
    'consequence in @consequence_type_list').reset_index(drop=True)

#Make them pretty for the inclusion into VCF
ClinVar_pat_unmapped_SNV_filtered_listdict = []
for index,row in ClinVar_pat_unmapped_SNV_filtered.iterrows():
    chrom = row['coordinate'][:row['coordinate'].find(':')]
    pos = row['coordinate'][row['coordinate'].find('.')+1:row['coordinate'].find('>')-1]
    ref = row['coordinate'][row['coordinate'].find('>')-1]
    alt = row['coordinate'][row['coordinate'].find('>')+1:]
    accession = row['accession']
    ClinVar_pat_unmapped_SNV_filtered_listdict.append({'chrom': chrom, 'pos': pos,
                                             'ref': ref, 'alt': alt, 'accession': accession})
ClinVar_pat_unmapped_SNV_filtered_df = pd.DataFrame(ClinVar_pat_unmapped_SNV_filtered_listdict)   
ClinVar_pat_unmapped_SNV_filtered_df.to_csv(
    'ClinVar_pat_unmapped_filtered_SNV_for_VCF.csv',
    header=True, index=False)

#filter the unmapped ClinVar table to contain just indels
indel_pat_df = ClinVar_pat_rs_coord_unmapped.query('DNA_coordinate in @indels_pat').reset_index(drop=True)
#Make them pretty for the inclusion into VCF
df_dict_list = []
for index,row in indel_pat_df.iterrows():
    df_dict_list.append({'chrom' : str(int(row['GRCh38Chromosome'])), 
     'pos' : row['GRCh38Location'].split(' -')[0], 
     'ref' : row['Canonical_SPDI'].split(':')[2], 
     'alt' : row['Canonical_SPDI'].split(':')[3],
                         'accession': row['Accession']})
ClinVar_pat_indels = pd.DataFrame(df_dict_list)
ClinVar_pat_indels.to_csv(
    'ClinVar_pathogenic_indels.csv',
    header=True, index=False)