In [37]:
# Jupyter Notebook Scratch Work for NCBI Protein Search
import requests
import json
import xmltodict

In [38]:
# Protein accession to look up, plus the file names derived from it
PROTEIN = "NP_001116538"  # Microtubule-associated protein tau
PROTEIN_XML = f"{PROTEIN}.xml"    # XML file name for this accession
PROTEIN_JSON = f"{PROTEIN}.json"  # JSON file name for this accession

In [39]:
# Perform API call to NCBI eSummary for the protein and write the raw XML to disk
url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=protein&id=" + PROTEIN
# A timeout keeps a stalled connection from hanging the kernel indefinitely.
resp = requests.get(url, timeout=30)
# Stop here on an HTTP error rather than saving an error body as if it were the record.
resp.raise_for_status()

# Save the response body as bytes, exactly as received:
with open(PROTEIN_XML, "wb") as f:
    f.write(resp.content)

In [40]:
# Read the saved protein XML file and convert it to a dictionary.
# The file was written as raw bytes, so it is read back as bytes too: the XML
# parser then handles decoding, instead of the platform's default text encoding
# being applied first.
with open(PROTEIN_XML, "rb") as xmlfile:
    data_dict = xmltodict.parse(xmlfile.read())

In [41]:
# Serialize the parsed dictionary to a JSON string, then store it on disk
json_data = json.dumps(data_dict)

with open(PROTEIN_JSON, mode="w") as json_out:
    json_out.write(json_data)

In [42]:
# Load the saved JSON file back into a plain Python dictionary
with open(PROTEIN_JSON, mode="r") as saved_json:
    protein_data = json.loads(saved_json.read())

# Display the loaded record as the cell output
protein_data

{'eSummaryResult': {'DocSum': {'Id': '294862258',
   'Item': [{'@Name': 'Caption', '@Type': 'String', '#text': 'NP_001116538'},
    {'@Name': 'Title',
     '@Type': 'String',
     '#text': 'microtubule-associated protein tau isoform 6 [Homo sapiens]'},
    {'@Name': 'Extra',
     '@Type': 'String',
     '#text': 'gi|294862258|ref|NP_001116538.2|[294862258]'},
    {'@Name': 'Gi', '@Type': 'Integer', '#text': '294862258'},
    {'@Name': 'CreateDate', '@Type': 'String', '#text': '2008/04/09'},
    {'@Name': 'UpdateDate', '@Type': 'String', '#text': '2021/08/22'},
    {'@Name': 'Flags', '@Type': 'Integer', '#text': '512'},
    {'@Name': 'TaxId', '@Type': 'Integer', '#text': '9606'},
    {'@Name': 'Length', '@Type': 'Integer', '#text': '776'},
    {'@Name': 'Status', '@Type': 'String', '#text': 'live'},
    {'@Name': 'ReplacedBy', '@Type': 'String'},
    {'@Name': 'Comment', '@Type': 'String'},
    {'@Name': 'AccessionVersion',
     '@Type': 'String',
     '#text': 'NP_001116538.2'}]}}}

In [43]:
# Take the record Id (GID) from the eSummary result and make a second API call
# (eFetch) to obtain the sequence as FASTA text
gid = protein_data["eSummaryResult"]["DocSum"]["Id"]
gid_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=sequences&id=" + gid + "&rettype=fasta&retmode=text"
# A timeout keeps a stalled connection from hanging the kernel indefinitely.
gid_resp = requests.get(gid_url, timeout=30)
# Stop here on an HTTP error so an error body is never parsed as FASTA downstream.
gid_resp.raise_for_status()

In [44]:
# Turn the raw response bytes into a UTF-8 decoded text string
gid_resp_decode = str(gid_resp.content, encoding="utf-8")

In [45]:
# Get sequence: the first line of the response is skipped (the FASTA header for
# the rettype=fasta request) and the remaining lines are joined into one string.
seq_arr = gid_resp_decode.split("\n")
# Join once instead of concatenating in a loop; strip each line so a stray "\r"
# (CRLF line endings) or trailing whitespace never ends up inside the sequence.
seq = "".join(line.strip() for line in seq_arr[1:])
seq

'MAEPRQEFEVMEDHAGTYGLGDRKDQGGYTMHQDQEGDTDAGLKESPLQTPTEDGSEEPGSETSDAKSTPTAEDVTAPLVDEGAPGKQAAAQPHTEIPEGTTAEEAGIGDTPSLEDEAAGHVTQEPESGKVVQEGFLREPGPPGLSHQLMSGMPGAPLLPEGPREATRQPSGTGPEDTEGGRHAPELLKHQLLGDLHQEGPPLKGAGGKERPGSKEEVDEDRDVDESSPQDSPPSKASPAQDGRPPQTAAREATSIPGFPAEGAIPLPVDFLSKVSTEIPASEPDGPSVGRAKGQDAPLEFTFHVEITPNVQKEQAHSEEHLGRAAFPGAPGEGPEARGPSLGEDTKEADLPEPSEKQPAAAPRGKPVSRVPQLKARMVSKSKDGTGSDDKKAKTSTRSSAKTLKNRPCLSPKHPTPGSSDPLIQPSSPAVCPEPPSSPKYVSSVTSRTGSSGAKEMKLKGADGKTKIATPRGAAPPGQKGQANATRIPAKTPPAPKTPPSSATKQVQRRPPPAGPRSERGEPPKSGDRSGYSSPGSPGTPGSRSRTPSLPTPPTREPKKVAVVRTPPKSPSSAKSRLQTAPVPMPDLKNVKSKIGSTENLKHQPGGGKVQIINKKLDLSNVQSKCGSKDNIKHVPGGGSVQIVYKPVDLSKVTSKCGSLGNIHHKPGGGQVEVKSEKLDFKDRVQSKIGSLDNITHVPGGGNKKIETHKLTFRENAKAKTDHGAEIVYKSPVVSGDTSPRHLSNVSSTGSIDMVDSPQLATLADEVSASLAKQGL'