In [23]:
import requests
import json
import xmltodict
import os
import time
import numpy as np
import pandas as pd

In [24]:
# Function to perform API call and return Python Dictionary containing data
def api_call(db, protein):
    """
    Performs an ESummary API call to NCBI for the specified database and accession.

    The XML response is parsed in memory; no temporary files are created, so
    nothing in the working directory is overwritten or deleted.

    Parameters:
    -----------
    db : str
        NCBI database to search for, i.e. 'protein', 'nuccore', etc.
    protein : str
        Protein accession number to search for

    Returns:
    --------
    dict
        The parsed response as plain Python dicts/lists/strings. The top-level
        key is 'eSummaryResult'; when the accession is not found its first key
        is 'ERROR' rather than 'DocSum'.

    Raises:
    -------
    requests.exceptions.RequestException
        If the request cannot be completed (connection failure or timeout).
    """
    # Let requests build and escape the query string instead of concatenating
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    query = {"db": db, "id": protein}

    # A timeout keeps one stalled connection from hanging the whole notebook
    resp = requests.get(url, params=query, timeout=30)

    # Parse the raw bytes so the XML parser honors the document's own encoding
    data_dict = xmltodict.parse(resp.content)

    # Round-trip through JSON so the result is built from plain dicts and lists
    return json.loads(json.dumps(data_dict))

In [25]:
# Example lookup in nuccore: LN638503 is scaffold 11486, though many different
# accession numbers are associated with scaffold 11486
api_call(db="nuccore", protein="LN638503")

{'eSummaryResult': {'DocSum': {'Id': '727378838',
   'Item': [{'@Name': 'Caption', '@Type': 'String', '#text': 'LN638503'},
    {'@Name': 'Title',
     '@Type': 'String',
     '#text': 'Fasciola hepatica genome assembly Fhepatica_v1, scaffold scaffold11486, whole genome shotgun sequence'},
    {'@Name': 'Extra',
     '@Type': 'String',
     '#text': 'gi|727378838|emb|LN638503.1|[727378838]'},
    {'@Name': 'Gi', '@Type': 'Integer', '#text': '727378838'},
    {'@Name': 'CreateDate', '@Type': 'String', '#text': '2014/11/08'},
    {'@Name': 'UpdateDate', '@Type': 'String', '#text': '2014/11/08'},
    {'@Name': 'Flags', '@Type': 'Integer', '#text': '32'},
    {'@Name': 'TaxId', '@Type': 'Integer', '#text': '6192'},
    {'@Name': 'Length', '@Type': 'Integer', '#text': '7454'},
    {'@Name': 'Status', '@Type': 'String', '#text': 'live'},
    {'@Name': 'ReplacedBy', '@Type': 'String'},
    {'@Name': 'Comment', '@Type': 'String'},
    {'@Name': 'AccessionVersion', '@Type': 'String', '#text': '

In [26]:
# Example call where the accession does not exist in the queried database:
# the first key of the result is 'ERROR' instead of 'DocSum'
test = api_call("protein", "LN638503")
list(test["eSummaryResult"].keys())[0]

'ERROR'

In [27]:
# Read protein accession numbers from proteins.txt (one accession per line)
infile = os.path.join("Resources", "proteins.txt")
with open(infile, "r") as f:
    lines = f.readlines()
# Strip surrounding whitespace and skip blank lines so that a trailing newline
# or stray space never turns into an empty/invalid accession sent to the API
proteins = [line.strip() for line in lines if line.strip()]
proteins[:5]

['NP_001116538', 'Q5YCV9', 'XP_012352933', 'XP_002800600', 'XP_003913279']

In [28]:
# Perform API Call to NCBI to get GIDs for each Protein
gids = []
err_proteins = []  # list to hold proteins not found

# Be nice: no more than 3 calls per second -> every three calls wait 1 second.
# The counter tracks actual requests (including nuccore fallbacks), not
# proteins, so a run of fallbacks cannot exceed the limit.
n_calls = 0
for idx, protein in enumerate(proteins, start=1):
    print("Protein %s of %s" % (idx, len(proteins)), end="\r")

    # Search the protein database first; on an ERROR result search nuccore instead
    for db in ("protein", "nuccore"):
        if (n_calls % 3) == 0:
            time.sleep(1)
        result = api_call(db, protein)
        n_calls += 1
        if list(result["eSummaryResult"].items())[0][0] != "ERROR":
            break

    try:
        gids.append(result["eSummaryResult"]["DocSum"]["Id"])
    except KeyError:
        # No DocSum in either database: record the accession and keep gids
        # aligned with proteins by storing NaN (np.nan; the np.NaN alias was
        # removed in NumPy 2.0)
        err_proteins.append(protein)
        gids.append(np.nan)
gids[0:5]

Protein 88 of 88

['294862258', '59798492', '821025767', '297273333', '1777289710']

In [29]:
# Report how many accessions were queried and list those with no NCBI entry
n_missing = len(err_proteins)
print("Total Number of Proteins:", len(proteins))
print("Proteins Without Entries (%s):" % n_missing)
if err_proteins:
    # One accession per line
    print(*err_proteins, sep="\n")

Total Number of Proteins: 88
Proteins Without Entries (4):
scaffold11486
JL1528
scaffold43622
XP_01266736


In [30]:
# Collect accession numbers and their GIDs into a two-column DataFrame
protein_df = pd.DataFrame(
    data={
        "Protein_Accession_Number": proteins,
        "GID": gids,
    }
)
protein_df.head()

Unnamed: 0,Protein_Accession_Number,GID
0,NP_001116538,294862258
1,Q5YCV9,59798492
2,XP_012352933,821025767
3,XP_002800600,297273333
4,XP_003913279,1777289710


In [31]:
# Show every row containing a missing value; these should match the
# accessions reported as not found by the API loop
is_NaN = protein_df.isna()
row_has_NaN = is_NaN.any(axis="columns")
protein_NaN = protein_df.loc[row_has_NaN]
protein_NaN

Unnamed: 0,Protein_Accession_Number,GID
22,scaffold11486,
31,JL1528,
54,scaffold43622,
70,XP_01266736,


In [32]:
# Need to output sequences to FASTA (.faa) file for alignment before passing to RAxML to generate
# Phylogenetic tree
# FASTA Format:
#>SEQUENCE_1
#...
#>SEQUENCE_2
#...
#>...

In [43]:
# Get Sequences from GID numbers and write to FASTA file:
seqs = []
protein_fasta_file = "proteins.faa"
efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

# Throttle on actual requests: every three calls wait 1 second
n_calls = 0
# Write sequence to FASTA file
with open(protein_fasta_file, "w") as f:
    for idx, gid in enumerate(gids, start=1):
        print("Protein %s of %s" % (idx, len(gids)), end="\r")

        # Missing GIDs are stored as NaN; test by value rather than identity
        # with the np.NaN alias (removed in NumPy 2.0). No request is made,
        # but a placeholder keeps seqs aligned with gids.
        if pd.isna(gid):
            seqs.append(np.nan)
            continue

        if (n_calls % 3) == 0:
            time.sleep(1)
        gid_resp = requests.get(
            efetch_url,
            params={"db": "sequences", "id": gid, "rettype": "fasta", "retmode": "text"},
            timeout=30,
        )
        n_calls += 1
        # Fail loudly rather than writing an HTTP error body into the FASTA file
        gid_resp.raise_for_status()
        gid_resp_decode = gid_resp.content.decode("utf-8")

        # Get sequence: the first line is the FASTA header, the rest is residues
        seq_arr = gid_resp_decode.split("\n")
        seqs.append("".join(seq_arr[1:]))

        # Normalize to exactly one trailing newline so consecutive records are
        # neither separated by blank lines nor glued onto the previous sequence
        f.write(gid_resp_decode.rstrip("\n") + "\n")

Protein 88 of 88

In [34]:
# Add sequences to protein_df
# `seqs` was built with one entry per element of `gids` (NaN where the GID is
# missing), so it lines up row-for-row with the frame created from `gids`.
# NOTE: this mutates protein_df in place, adding the "Sequence" column.
protein_df["Sequence"] = seqs
protein_df.head()

Unnamed: 0,Protein_Accession_Number,GID,Sequence
0,NP_001116538,294862258,MAEPRQEFEVMEDHAGTYGLGDRKDQGGYTMHQDQEGDTDAGLKES...
1,Q5YCV9,59798492,MAEPRQEFDVMEDHAGTYGLGDRKDQGGYTMLQDQEGDTDAGLKES...
2,XP_012352933,821025767,MAEPRQEYDVMEDHAGTYGLGDRKDQGGYTMLQDQEGDTDAGLKES...
3,XP_002800600,297273333,MAEPRQEFDVMEDHAGTYGLGDRKDQEGYTMLQDQEGDTDAGLKES...
4,XP_003913279,1777289710,MAEPRQEFDVMEDHAGTYGLGDRKDQEGYTMLQDQEGDTDAGLKES...


In [35]:
# Confirm consistent NaN's: rows lacking a GID should also lack a Sequence
is_NaN = protein_df.isna()
row_has_NaN = is_NaN.any(axis="columns")
protein_NaN = protein_df.loc[row_has_NaN]
protein_NaN

Unnamed: 0,Protein_Accession_Number,GID,Sequence
22,scaffold11486,,
31,JL1528,,
54,scaffold43622,,
70,XP_01266736,,
