In [1]:
# Code to get a gene position from dbSNP ID
# Example of the webpage to access;
# https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=snp&id=2230288&report=XML
# RefSeq accession.version of GRCh37 can be seen in the following link
# https://www.ncbi.nlm.nih.gov/grc/human/data?asm=GRCh37
# (https://www.ncbi.nlm.nih.gov/grc/human/data?asm=GRCh38)

from bs4 import BeautifulSoup, SoupStrainer
import requests
import re
import time
import pandas as pd

In [2]:
response = requests.get('https://www.ncbi.nlm.nih.gov/grc/human/data?asm=GRCh37')
html_str = response.text
bs = BeautifulSoup(html_str, "html5lib")
# print(bs.prettify())
ACVER=bs.find_all(text = re.compile("NC_000"))
ACVER[0:3]

['NC_000001.10', 'NC_000002.11', 'NC_000003.11']

In [3]:
ACVERS = "|".join(ACVER)
print(ACVERS)

NC_000001.10|NC_000002.11|NC_000003.11|NC_000004.11|NC_000005.9|NC_000006.11|NC_000007.13|NC_000008.10|NC_000009.11|NC_000010.10|NC_000011.9|NC_000012.11|NC_000013.10|NC_000014.8|NC_000015.9|NC_000016.9|NC_000017.10|NC_000018.9|NC_000019.9|NC_000020.10|NC_000021.8|NC_000022.10|NC_000023.10|NC_000024.9


In [4]:
response = requests.get('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=snp&id=2230288&report=XML')
html_str = response.text
bs = BeautifulSoup(html_str, "html5lib")
bs.find(text = re.compile(ACVERS))

'NC_000001.10:g.155206167C>T'

In [5]:
IDs = ["2619363", "2619364", "2230288"]
POSs = []
for ID in IDs:
    response = requests.get('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=snp&id=' + ID + '&report=XML') 
    html_str = response.text 
    bs = BeautifulSoup(html_str, "html5lib") 
    POSs.append(bs.find(text = re.compile(ACVERS)))
    time.sleep(1/3) # three requests per second (Guideline)
print(POSs)

['NC_000004.11:g.90759047G>T', 'NC_000004.11:g.90759887A>G', 'NC_000001.10:g.155206167C>T']


In [6]:
df = pd.read_csv('S4SNPs.csv')
print(df[0:5]) 

           Gene       RS ID Alternate RS ID  \
0  LRRK2 G2019S  rs34637584             NaN   
1  LRRK2 R1441G  rs33939927             NaN   
2  LRRK2 R1441C  rs33939927             NaN   
3  LRRK2 Y1699C  rs35801418             NaN   
4  LRRK2 G2385R  rs34778348             NaN   

  Region BRIT IS THIS COLUMN NEEDED OR CAN I DELETE IT?  
0                                                NaN     
1                                                NaN     
2                                                NaN     
3                                                NaN     
4                                                NaN     


In [7]:
IDs = [i[2:] for i in df.iloc[:, 1] if "rs" in str(i)]
IDs[:5]

['34637584', '33939927', '33939927', '35801418', '34778348']

In [8]:
POSs = []
for ID in IDs:
    response = requests.get('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=snp&id=' + ID + '&report=XML') 
    html_str = response.text 
    bs = BeautifulSoup(html_str, "html5lib") 
    POSs.append(bs.find(text = re.compile(ACVERS)))
    time.sleep(1/3) # three requests per second (Guideline)
print(POSs[:10])

['NC_000012.11:g.40734202G>A', 'NC_000012.11:g.40704236C>A', 'NC_000012.11:g.40704236C>A', 'NC_000012.11:g.40714916A>G', 'NC_000012.11:g.40757328G>A', 'NC_000012.11:g.40713845G>A', 'NC_000001.10:g.155205634T>C', 'NC_000001.10:g.155205043A>C', 'NC_000001.10:g.155210420C>A', 'NC_000001.10:g.155210451_155210452insC']


In [9]:
rsIDs = ["rs" + ID for ID in IDs]
CHRs = [POS[7:9] for POS in POSs]
LOCs = [i[1+i.rfind("."):-3] for i in POSs]
RES = pd.DataFrame({"dbSNP": rsIDs,
                   "RefSeq_Accession_GRCh37": POSs,
                   "Chr": CHRs,
                   "Loc": LOCs})
RES[:10]

Unnamed: 0,dbSNP,RefSeq_Accession_GRCh37,Chr,Loc
0,rs34637584,NC_000012.11:g.40734202G>A,12,40734202
1,rs33939927,NC_000012.11:g.40704236C>A,12,40704236
2,rs33939927,NC_000012.11:g.40704236C>A,12,40704236
3,rs35801418,NC_000012.11:g.40714916A>G,12,40714916
4,rs34778348,NC_000012.11:g.40757328G>A,12,40757328
5,rs33949390,NC_000012.11:g.40713845G>A,12,40713845
6,rs76763715,NC_000001.10:g.155205634T>C,1,155205634
7,rs35095275,NC_000001.10:g.155205043A>C,1,155205043
8,rs104886460,NC_000001.10:g.155210420C>A,1,155210420
9,rs387906315,NC_000001.10:g.155210451_155210452insC,1,155210451_155210452i


In [10]:
RES.to_csv("S4SNP_POS.csv", index = False)