In [79]:
import requests
import json
import xmltodict
import os
import time
import numpy as np
import pandas as pd

In [80]:
# Function to perform API call and return Python Dictionary containing data
def api_call(db, protein):
    """
    Performs an ESummary API call to NCBI for the specified database and protein
    and returns the parsed response.

    Parameters:
    -----------
    db : str
        NCBI database to search for, i.e. 'protein', 'nuccore', etc.
    protein : str
        Protein accession number to search for

    Returns:
    --------
    dict
        The XML response parsed with xmltodict and converted to plain Python
        dictionaries/lists through a JSON round trip.

    Raises:
    -------
    requests.HTTPError
        If the server answers with a 4xx/5xx HTTP status.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"

    # Let requests build the query string so the db/id values are URL-encoded
    # instead of being concatenated into the URL by hand.
    resp = requests.get(url, params={"db": db, "id": protein}, timeout=30)

    # Fail loudly on an HTTP error rather than trying to parse an error page
    # as if it were a summary document.
    resp.raise_for_status()

    # Parse the XML straight from memory. This avoids the fixed-name temporary
    # files in the working directory and avoids re-reading the bytes with the
    # platform's default text encoding.
    data_dict = xmltodict.parse(resp.content)

    # The JSON round trip turns xmltodict's mapping type into regular dicts,
    # which is what the temporary JSON file used to achieve.
    return json.loads(json.dumps(data_dict))

In [81]:
# Read protein accession numbers from proteins.txt (one accession per line)
infile = os.path.join("Resources", "proteins.txt")
with open(infile, "r") as f:
    lines = f.readlines()
# strip() removes the trailing "\n" as well as "\r" and stray spaces; blank
# lines are skipped so no empty accession is ever sent to the API.
proteins = [line.strip() for line in lines if line.strip()]
proteins[:5]

['NP_001116538', 'Q5YCV9', 'XP_012352933', 'XP_002800600', 'XP_003913279']

In [82]:
# Perform API Call to NCBI to get GIDs for each Protein
gids = []
err_proteins = []  # list to hold proteins not found
protein_proteins = []  # list to hold proteins found in protein db
nuccore_proteins = []  # list to hold proteins found in nuccore db

# Be nice: no more than 3 calls per second. The pause is applied before EVERY
# request (not once per protein) because a protein that falls back to nuccore
# costs two requests.
MIN_SECONDS_BETWEEN_CALLS = 0.34


def _lookup_gid(db, accession):
    """Return the GID for `accession` in NCBI database `db`, or None if the
    response carries no eSummaryResult/DocSum/Id entry."""
    time.sleep(MIN_SECONDS_BETWEEN_CALLS)
    result = api_call(db, accession)
    try:
        return result["eSummaryResult"]["DocSum"]["Id"]
    except KeyError:
        return None


cnt = 0
for protein in proteins:
    print("Protein %s of %s" % (cnt + 1, len(proteins)), end="\r")

    # Search protein db first
    gid = _lookup_gid("protein", protein)
    if gid is not None:
        protein_proteins.append(protein)
    else:
        # Not in protein db, search nuccore
        gid = _lookup_gid("nuccore", protein)
        if gid is not None:
            nuccore_proteins.append(protein)
        else:
            # Still not found: record the accession and store NaN as its GID.
            # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
            err_proteins.append(protein)
            gid = np.nan
    gids.append(gid)
    cnt += 1
gids[0:5]

Protein 88 of 88

['294862258', '59798492', '821025767', '297273333', '1777289710']

In [83]:
# Sanity check of the "try one lookup, fall back to a second" pattern:
# the first accession is bogus, the second is a real protein accession.
# (Swap the second entry for "blah2" to exercise the failure path.)
for accession in ("blah", "NP_001116538"):
    result = api_call("protein", accession)
    try:
        result["eSummaryResult"]["DocSum"]["Id"]
    except KeyError:
        # No Id in this response; move on to the next candidate
        continue
    print("Success")
    break
else:
    # Neither lookup produced an Id
    print("Fail")

Success


In [86]:
def _print_group(header, accessions):
    """Print `header` filled in with the group size, then one accession per
    line, then a blank separator."""
    print(header % len(accessions))
    for accession in accessions:
        print(accession)
    print("\n")


# Overall counts first, then the accessions found only in nuccore and the
# accessions that were not found at all.
print("Total Number of Proteins in Input File:", len(proteins))
print("\n")
print("Number of Proteins From db=protein:", len(protein_proteins))
_print_group("Proteins From db=nuccore (%s):", nuccore_proteins)
_print_group("Proteins Not Found (%s):", err_proteins)

Total Number of Proteins in Input File: 88


Number of Proteins From db=protein: 79
Proteins From db=nuccore (5):
GL477576
CT004140
BAHO01035973
KE993814
NW_003943621


Proteins Not Found (4):
scaffold11486
JL1528
scaffold43622
XP_01266736




In [87]:
# Collect accession numbers and their GIDs side by side in a DataFrame
protein_columns = {
    "Protein_Accession_Number": proteins,
    "GID": gids,
}
protein_df = pd.DataFrame(protein_columns)
protein_df.head()

Unnamed: 0,Protein_Accession_Number,GID
0,NP_001116538,294862258
1,Q5YCV9,59798492
2,XP_012352933,821025767
3,XP_002800600,297273333
4,XP_003913279,1777289710


In [88]:
# Rows with a missing GID should match the "not found" list from the API loop
is_NaN = protein_df.isna()
row_has_NaN = is_NaN.any(axis="columns")
protein_NaN = protein_df.loc[row_has_NaN]
protein_NaN

Unnamed: 0,Protein_Accession_Number,GID
22,scaffold11486,
31,JL1528,
54,scaffold43622,
70,XP_01266736,


In [89]:
# Need to output sequences to FASTA (.faa) file for alignment before passing to RAxML to generate
# Phylogenetic tree
# FASTA Format:
#>SEQUENCE_1
#...
#>SEQUENCE_2
#...
#>...

In [90]:
# Get the FASTA record for each GID (missing GIDs stay NaN so that `seqs`
# lines up with `gids` row for row). Nothing is written to disk in this cell.
EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

cnt = 0
seqs = []
for gid in gids:
    print("Protein %s of %s" % (cnt + 1, len(gids)), end="\r")
    # Be nice: pause one second before every third request
    if (cnt % 3) == 0:
        time.sleep(1)
    # pd.isna() tests the value itself. The previous identity test against
    # np.NaN depended on object identity and on an alias that was removed in
    # NumPy 2.0.
    if pd.isna(gid):
        seqs.append(np.nan)
    else:
        gid_resp = requests.get(
            EFETCH_URL,
            params={
                "db": "sequences",
                "id": gid,
                "rettype": "fasta",
                "retmode": "text",
            },
            timeout=60,
        )
        # Stop on an HTTP error instead of storing an error body as a sequence
        gid_resp.raise_for_status()
        seq_fasta = gid_resp.content.decode("utf-8")
        seqs.append(seq_fasta)
    cnt += 1

Protein 88 of 88

In [91]:
# Attach the fetched FASTA records as a new "Sequence" column
protein_df = protein_df.assign(Sequence=seqs)
protein_df.head()

Unnamed: 0,Protein_Accession_Number,GID,Sequence
0,NP_001116538,294862258,>NP_001116538.2 microtubule-associated protein...
1,Q5YCV9,59798492,>sp|Q5YCV9.4|TAU_HYLLA RecName: Full=Microtubu...
2,XP_012352933,821025767,>XP_012352933.1 PREDICTED: microtubule-associa...
3,XP_002800600,297273333,>XP_002800600.1 PREDICTED: microtubule-associa...
4,XP_003913279,1777289710,>XP_003913279.2 microtubule-associated protein...


In [92]:
# Spot-check the FASTA record stored in the row labelled 0
first_record = protein_df.loc[0, "Sequence"]
print(first_record)

>NP_001116538.2 microtubule-associated protein tau isoform 6 [Homo sapiens]
MAEPRQEFEVMEDHAGTYGLGDRKDQGGYTMHQDQEGDTDAGLKESPLQTPTEDGSEEPGSETSDAKSTP
TAEDVTAPLVDEGAPGKQAAAQPHTEIPEGTTAEEAGIGDTPSLEDEAAGHVTQEPESGKVVQEGFLREP
GPPGLSHQLMSGMPGAPLLPEGPREATRQPSGTGPEDTEGGRHAPELLKHQLLGDLHQEGPPLKGAGGKE
RPGSKEEVDEDRDVDESSPQDSPPSKASPAQDGRPPQTAAREATSIPGFPAEGAIPLPVDFLSKVSTEIP
ASEPDGPSVGRAKGQDAPLEFTFHVEITPNVQKEQAHSEEHLGRAAFPGAPGEGPEARGPSLGEDTKEAD
LPEPSEKQPAAAPRGKPVSRVPQLKARMVSKSKDGTGSDDKKAKTSTRSSAKTLKNRPCLSPKHPTPGSS
DPLIQPSSPAVCPEPPSSPKYVSSVTSRTGSSGAKEMKLKGADGKTKIATPRGAAPPGQKGQANATRIPA
KTPPAPKTPPSSATKQVQRRPPPAGPRSERGEPPKSGDRSGYSSPGSPGTPGSRSRTPSLPTPPTREPKK
VAVVRTPPKSPSSAKSRLQTAPVPMPDLKNVKSKIGSTENLKHQPGGGKVQIINKKLDLSNVQSKCGSKD
NIKHVPGGGSVQIVYKPVDLSKVTSKCGSLGNIHHKPGGGQVEVKSEKLDFKDRVQSKIGSLDNITHVPG
GGNKKIETHKLTFRENAKAKTDHGAEIVYKSPVVSGDTSPRHLSNVSSTGSIDMVDSPQLATLADEVSAS
LAKQGL




In [93]:
# Rows with any missing value (GID or Sequence) should be the same rows as before
is_NaN = protein_df.isna()
row_has_NaN = is_NaN.any(axis="columns")
protein_NaN = protein_df.loc[row_has_NaN]
protein_NaN

Unnamed: 0,Protein_Accession_Number,GID,Sequence
22,scaffold11486,,
31,JL1528,,
54,scaffold43622,,
70,XP_01266736,,


In [94]:
# Three accessions return sequences that cause issues with the alignment:
# 1) GL477576 (sea lamprey) -> scaffold1248
# 2) KE993814 (arctic lamprey) -> scaffold00143
# 3) NW_003943621 (Bolivian squirrel monkey) -> scaffold00018
# All of them return extremely long "scaffold" sequences.
scaffold_proteins = ["GL477576", "KE993814", "NW_003943621"]

# For now, leave out the 4 NaN rows and the 3 "scaffold" accessions above:
# drop every row containing a NaN, then keep only rows whose accession is
# not in the scaffold list.
protein_df_filt = protein_df.dropna().loc[
    lambda frame: ~frame["Protein_Accession_Number"].isin(scaffold_proteins)
]
protein_df_filt.head()

Unnamed: 0,Protein_Accession_Number,GID,Sequence
0,NP_001116538,294862258,>NP_001116538.2 microtubule-associated protein...
1,Q5YCV9,59798492,>sp|Q5YCV9.4|TAU_HYLLA RecName: Full=Microtubu...
2,XP_012352933,821025767,>XP_012352933.1 PREDICTED: microtubule-associa...
3,XP_002800600,297273333,>XP_002800600.1 PREDICTED: microtubule-associa...
4,XP_003913279,1777289710,>XP_003913279.2 microtubule-associated protein...


In [95]:
# Each scaffold accession should now select an empty frame
for accession in scaffold_proteins:
    accession_mask = protein_df_filt["Protein_Accession_Number"] == accession
    print(protein_df_filt.loc[accession_mask])

Empty DataFrame
Columns: [Protein_Accession_Number, GID, Sequence]
Index: []
Empty DataFrame
Columns: [Protein_Accession_Number, GID, Sequence]
Index: []
Empty DataFrame
Columns: [Protein_Accession_Number, GID, Sequence]
Index: []


In [96]:
# Raw (repr) view of the record in the row labelled 0, showing its trailing newlines
protein_df_filt.loc[0, "Sequence"]

'>NP_001116538.2 microtubule-associated protein tau isoform 6 [Homo sapiens]\nMAEPRQEFEVMEDHAGTYGLGDRKDQGGYTMHQDQEGDTDAGLKESPLQTPTEDGSEEPGSETSDAKSTP\nTAEDVTAPLVDEGAPGKQAAAQPHTEIPEGTTAEEAGIGDTPSLEDEAAGHVTQEPESGKVVQEGFLREP\nGPPGLSHQLMSGMPGAPLLPEGPREATRQPSGTGPEDTEGGRHAPELLKHQLLGDLHQEGPPLKGAGGKE\nRPGSKEEVDEDRDVDESSPQDSPPSKASPAQDGRPPQTAAREATSIPGFPAEGAIPLPVDFLSKVSTEIP\nASEPDGPSVGRAKGQDAPLEFTFHVEITPNVQKEQAHSEEHLGRAAFPGAPGEGPEARGPSLGEDTKEAD\nLPEPSEKQPAAAPRGKPVSRVPQLKARMVSKSKDGTGSDDKKAKTSTRSSAKTLKNRPCLSPKHPTPGSS\nDPLIQPSSPAVCPEPPSSPKYVSSVTSRTGSSGAKEMKLKGADGKTKIATPRGAAPPGQKGQANATRIPA\nKTPPAPKTPPSSATKQVQRRPPPAGPRSERGEPPKSGDRSGYSSPGSPGTPGSRSRTPSLPTPPTREPKK\nVAVVRTPPKSPSSAKSRLQTAPVPMPDLKNVKSKIGSTENLKHQPGGGKVQIINKKLDLSNVQSKCGSKD\nNIKHVPGGGSVQIVYKPVDLSKVTSKCGSLGNIHHKPGGGQVEVKSEKLDFKDRVQSKIGSLDNITHVPG\nGGNKKIETHKLTFRENAKAKTDHGAEIVYKSPVVSGDTSPRHLSNVSSTGSIDMVDSPQLATLADEVSAS\nLAKQGL\n\n'

In [97]:
# Write cleaned sequence series to file
protein_fasta_file = "proteins.faa"
with open(protein_fasta_file, "w") as f:
    for sequence in protein_df_filt["Sequence"]:
        # Normalise every record to end in exactly one newline. Dropping only
        # the last character assumed two trailing newlines; a record ending in
        # a single newline would have been glued to the next ">" header.
        f.write(sequence.rstrip() + "\n")

In [98]:
# Show the record returned for one of the "scaffold" accessions.
# The row is selected by accession and taken positionally, so the cell does not
# depend on where that accession sits in the input file (previously the row
# label 32 was hard-coded).
scaffold_record = protein_df.loc[
    protein_df["Protein_Accession_Number"] == "GL477576", "Sequence"
]
print(scaffold_record.iloc[0])

>GL477576.1 Petromyzon marinus unplaced genomic scaffold scaffold_1248, whole genome shotgun sequence
TGTTAAGTCACCCTGGGGGGTTAATCTAGAACGTAGTTAATCCGATACAAGACGTAAGATAAATAACTTG
ATGTATTATTTACAGCCCTTTGTCAAAACACTGCTGGATGAAGGCCTCCCCGTGCCGTATTAGTCGAGGG
GGTTGTTTTTGACATACTTCACCACTCTGGTCAGTGCAGATTTCTGATTGGGAAGGCCCAGGACGGGTTC
AGATATTACTCACCTCTGATGTTGGCCATAGCTGGGTCTTGAAAGCAGGTCACCAGGTTCAGAGCGCAGT
GCAAATATCATGCGATCATATATTACATTATCGAGTGGTCAGCGCTTCCCAGATGCAACTTATTCAAGTG
CACAGCTTTGATCGTCTATGGAGGTGAAGGGGTTCTGCTCAATGTAACTGCTCGAGAGAGAGAGAGATAG
GGAGAGCTGTTCAACAACACGTAGCCCTCCATTCCTGGTGCAGGATCAGCCCCTTTGTTAGTCCAAGCAA
TCTGAACGGCGACTGCGACGAAATGCATCGTCAAGCACGCCCCAACTCACGGCGCGAGTTTATGGTGGAA
CAGGGCGTCAATGGAAGAGGACATTTTGGCACAAATTACGTGGCATGGTTGACAACAATTGACACCAAAA
TGTAACCTTTCACCTCCCGCAATTGGACAATTAACTGATATTACACTTGTAGAGCCATATATATATACAC
GAGGCATGGTCTTCATACTATATACGGTCTATATTCATATATTGCAGGATCAATACATAGTTGCAGTTGT
TCACGCTCTGCGTTATGGACCCAGCGACCCGAGTTCAATTCCCAGCCGAGGCTTGGGTCAGCGGCGGGCG
ACATCTGAACCAGTCCTCTGCCTGCCGAGTCTTCAGCAACCTGCGC