In [6]:
import requests
import json
import xmltodict
import os
import time

In [3]:
# Function to perform API call and return Python Dictionary containing data
def api_call(url):
    # Define tempory XML and JSON files
    xml_file = "data.xml"
    json_file = "data.json"
    # Perform API call
    resp = requests.get(url)
    
    # Save XML result to temporary file
    with open(xml_file, "wb") as f:
        f.write(resp.content)

    # Read XML file and convert to OrderedDict using xmltodict
    with open(xml_file, "r") as f:
        data_dict = xmltodict.parse(f.read())
    
    # Write OrderedDict to JSON file
    json_data = json.dumps(data_dict)
    with open(json_file, "w") as f:
        f.write(json_data)
    
    # Read in JSON file into regular Python dictionary
    with open(json_file, "r") as f:
        data = json.load(f)
    
    # Clean up temporary files
    os.remove(xml_file)
    os.remove(json_file)

    return data

In [4]:
# Read protein accession numbers from proteins.txt
infile = os.path.join("Resources", "proteins.txt")
with open(infile, "r") as f:
    lines = f.readlines()
proteins = [line.replace("\n", "") for line in lines]
proteins[:5]

['NP_001116538', 'Q5YCV9', 'XP_012352933', 'XP_002800600', 'XP_003913279']

In [8]:
# Perform API Call to NCBI to get GIDs for each Protein
url_base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=protein&id="
gids = []
err_proteins = []  # list to hold proteins throwing error
# Be nice: no more than 3 calls per second -> every three calls wait 1 second
cnt = 0
for protein in proteins:
    print(cnt)
    if cnt == 3:
        time.sleep(1)
        cnt = 0
    result = api_call(url_base + protein)
    try:
        gid = result["eSummaryResult"]["DocSum"]["Id"]
        gids.append(gid)
    except KeyError:
        err_proteins.append(protein)
    cnt += 1
gids[0:5]

0
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3


['294862258', '59798492', '821025767', '297273333', '1777289710']

In [10]:
print("Total Number of Proteins:", len(proteins))
print("Number of Entries Found:", len(gids))
print("Proteins Without Entries (%s):" % len(err_proteins))
for protein in err_proteins:
    print(protein)

Total Number of Proteins: 88
Number of Entries Found: 79
Proteins Without Entries (9):
scaffold11486
JL1528
GL477576
CT004140
scaffold43622
BAHO01035973
KE993814
NW_003943621
XP_01266736


In [13]:
api_call("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=nuccore&id=LN638503")  # scaffold 11486
# i.e need to check on above proteins to get proper accession number, search in correct db

{'eSummaryResult': {'DocSum': {'Id': '727378838',
   'Item': [{'@Name': 'Caption', '@Type': 'String', '#text': 'LN638503'},
    {'@Name': 'Title',
     '@Type': 'String',
     '#text': 'Fasciola hepatica genome assembly Fhepatica_v1, scaffold scaffold11486, whole genome shotgun sequence'},
    {'@Name': 'Extra',
     '@Type': 'String',
     '#text': 'gi|727378838|emb|LN638503.1|[727378838]'},
    {'@Name': 'Gi', '@Type': 'Integer', '#text': '727378838'},
    {'@Name': 'CreateDate', '@Type': 'String', '#text': '2014/11/08'},
    {'@Name': 'UpdateDate', '@Type': 'String', '#text': '2014/11/08'},
    {'@Name': 'Flags', '@Type': 'Integer', '#text': '32'},
    {'@Name': 'TaxId', '@Type': 'Integer', '#text': '6192'},
    {'@Name': 'Length', '@Type': 'Integer', '#text': '7454'},
    {'@Name': 'Status', '@Type': 'String', '#text': 'live'},
    {'@Name': 'ReplacedBy', '@Type': 'String'},
    {'@Name': 'Comment', '@Type': 'String'},
    {'@Name': 'AccessionVersion', '@Type': 'String', '#text': '

In [11]:
gids

['294862258',
 '59798492',
 '821025767',
 '297273333',
 '1777289710',
 '675658919',
 '1984072572',
 '556777384',
 '927194489',
 '560953646',
 '554527651',
 '281350103',
 '634829804',
 '344285203',
 '562861703',
 '655872369',
 '568972037',
 '884895226',
 '395826136',
 '611992639',
 '312836787',
 '909794529',
 '641794377',
 '565322635',
 '637314516',
 '1785342033',
 '556973267',
 '632985757',
 '907837202',
 '573901255',
 '767918175',
 '821032855',
 '1622851286',
 '675665508',
 '640782340',
 '395823520',
 '562820368',
 '655829092',
 '884928294',
 '568905885',
 '558099716',
 '1126296301',
 '478506527',
 '560965823',
 '927211211',
 '465967067',
 '344268583',
 '821476924',
 '529449556',
 '513192801',
 '530566428',
 '637250248',
 '847166186',
 '573897481',
 '907664691',
 '632964270',
 '47519639',
 '627761090',
 '635118487',
 '627761102',
 '675727252',
 '640824856',
 '545862088',
 '466078457',
 '281343610',
 '731469454',
 '655834258',
 '884935809',
 '328927075',
 '821469839',
 '591384013',
 '5

In [15]:
api_call("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=protein&id=JL1528")

{'eSummaryResult': {'ERROR': 'Invalid uid JL1528 at position=0'}}

In [16]:
api_call("https://www.uniprot.org/uniprot/JL1528.xml")

{'html': {'@xmlns': 'http://www.w3.org/1999/xhtml',
  '@lang': 'en',
  '@xml:lang': 'en',
  'head': {'title': 'Error',
   'meta': [{'@content': 'IE=edge', '@http-equiv': 'X-UA-Compatible'},
    {'@content': 'text/html; charset=UTF-8', '@http-equiv': 'Content-Type'},
    {'@content': 'width=device-width, initial-scale=1', '@name': 'viewport'},
    {'@content': 'nositelinkssearchbox', '@name': 'google'}],
   'link': [{'@href': '/', '@rel': 'home'},
    {'@href': 'https://creativecommons.org/licenses/by/4.0/',
     '@rel': 'license'},
    {'@type': 'image/vnd.microsoft.icon',
     '@href': '/favicon.ico',
     '@rel': 'shortcut icon'},
    {'@href': '/uniprot.min.css2021_03',
     '@type': 'text/css',
     '@rel': 'stylesheet'},
    {'@href': '/tippy.css', '@type': 'text/css', '@rel': 'stylesheet'}],
   'script': [{'@type': 'text/javascript', '#text': "var BASE = '/';"},
    {'@src': '/js-compr.js2021_03', '@type': 'text/javascript'},
    {'@type': 'text/javascript',
     '#text': "unipro