In [1]:
import requests
import os
import time
import numpy as np
import pandas as pd
from helper_functions import api_call, show_NaN_rows, get_gids_sequences, get_sequence_and_count

In [2]:
# Read protein accession numbers (and their expected amino-acid counts) from proteins.csv
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which suggests the CSV
# was saved together with its index — confirm whether index_col=0 is wanted here.
infile = os.path.join("Resources", "proteins.csv")
protein_df = pd.read_csv(infile)
protein_df

Unnamed: 0,accession_num,aa_cnt
0,NP_001116538,776
1,Q5YCV9,776
2,XP_012352933,732
3,XP_002800600,776
4,XP_003913279,776
...,...,...
83,XP_008116759,1176
84,NP_001119982,1164
85,XP_006634278,629
86,XP_006003167,1753


In [3]:
# Run the get_gids_sequences helper (imported from helper_functions) over the input frame.
# The frame displayed afterwards carries DB and GID columns, so the helper presumably
# adds them (and, going by its name, the sequences) — TODO confirm against helper_functions.
# NOTE: this rebinds protein_df to the helper's result, so re-running the cell feeds the
# already-processed frame back into the helper.
protein_df = get_gids_sequences(protein_df)

Protein 88 of 88

In [6]:
# Show only the accessions that were resolved through the nuccore database
protein_df.loc[protein_df["DB"].eq("nuccore")]

Unnamed: 0,accession_num,aa_cnt,DB,GID
32,GL477576,798,nuccore,308150460
33,CT004140,243,nuccore,68298023
58,BAHO01035973,2118,nuccore,405900247
62,KE993814,196,nuccore,543413231
67,NW_003943621,1122,nuccore,395725070


In [None]:
# Perform API Call to NCBI to get GIDs for each Protein.
# Each accession is looked up in the protein db first and, only if that yields no
# DocSum Id, in the nuccore db. gids ends up with exactly one entry per row of
# protein_df (NaN when neither database knows the accession).
gids = []
err_proteins = []  # list to hold proteins not found
protein_proteins = []  # list to hold proteins found in protein db
nuccore_proteins = []  # list to hold proteins found in nuccore db

for i, protein in enumerate(protein_df["accession_num"]):
    print("Protein %s of %s" % (i + 1, len(protein_df)), end="\r")

    # Be nice: no more than 3 calls per second -> every three proteins wait 1 second.
    # NOTE: a nuccore fallback adds a second call for that protein, so this pacing is
    # only approximate.
    if (i % 3) == 0:
        time.sleep(1)

    # Search protein db
    result = api_call("protein", protein)
    try:
        gid = result["eSummaryResult"]["DocSum"]["Id"]
    except KeyError:
        # No hit in the protein db: search nuccore
        try:
            result = api_call("nuccore", protein)
            gid = result["eSummaryResult"]["DocSum"]["Id"]
        except KeyError:
            # Still nothing: record the miss and keep gids aligned with protein_df.
            # np.nan (lowercase) is used because the np.NaN alias was removed in NumPy 2.0.
            err_proteins.append(protein)
            gids.append(np.nan)
        else:
            nuccore_proteins.append(protein)
            gids.append(gid)
    else:
        protein_proteins.append(protein)
        gids.append(gid)
gids[0:5]

In [None]:
# Report how many accessions each database resolved and list the exceptions by name
print("Total Number of Proteins in Input File:", len(protein_df))
print("Number of Proteins From db=protein:", len(protein_proteins))

print("Proteins From db=nuccore (%s):" % len(nuccore_proteins))
if nuccore_proteins:
    # one accession per line, same as printing them one by one
    print("\n".join(map(str, nuccore_proteins)))
print("\n")

print("Proteins Not Found (%s):" % len(err_proteins))
if err_proteins:
    print("\n".join(map(str, err_proteins)))

In [None]:
# Attach the looked-up GIDs as a column (one entry per row, NaN where not found)
protein_df = protein_df.assign(GID=gids)
protein_df.head()

In [None]:
# Confirm missing GIDs consistent with API result:
# show_NaN_rows is imported from helper_functions; its result is concatenated as a
# DataFrame further down, so it presumably returns the rows that contain NaN — TODO confirm.
show_NaN_rows(protein_df)

In [None]:
# Need to output sequences to a FASTA (.faa) file for alignment before passing them to RAxML
# to generate the phylogenetic tree
# FASTA Format:
#>SEQUENCE_1
#...
#>SEQUENCE_2
#...
#>...

In [None]:
# Fetch the FASTA record for every GID from NCBI efetch. Nothing is written to disk
# here; the FASTA file is produced by a later cell. Rows without a GID get NaN so
# that seqs stays aligned with gids / protein_df.
efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
seqs = []
for cnt, gid in enumerate(gids):
    print("Protein %s of %s" % (cnt + 1, len(protein_df)), end="\r")
    # Be nice: pause one second before every third entry
    if (cnt % 3) == 0:
        time.sleep(1)

    # pd.isna replaces the `is not np.NaN` identity test: identity only holds for the
    # very same NaN object, and the np.NaN alias no longer exists in NumPy 2.0.
    if pd.isna(gid):
        seqs.append(np.nan)
        continue

    gid_resp = requests.get(
        efetch_url,
        params={"db": "sequences", "id": gid, "rettype": "fasta", "retmode": "text"},
        timeout=30,  # do not hang the notebook forever on a stalled connection
    )
    # Fail loudly on an HTTP error instead of storing the error body as a "sequence"
    gid_resp.raise_for_status()
    seq_fasta = gid_resp.content.decode("utf-8")
    seqs.append(seq_fasta)

In [None]:
# Attach the fetched FASTA records as a column (NaN where no GID was available)
protein_df = protein_df.assign(Sequence=seqs)
protein_df.head()

In [None]:
# Check a sequence
# Later cells take element [0] of this helper's result as the amino-acid count; the
# remainder of the return value is presumably the sequence itself — TODO confirm.
print(get_sequence_and_count(protein_df, "NP_001116538"))  # Homo sapiens (human tau), correct count

In [None]:
# Confirm consistent NaN's
# The fetch loop stores NaN as the sequence exactly where the GID is NaN, so the rows
# listed here should be the same ones that were missing a GID.
show_NaN_rows(protein_df)

In [None]:
# Example nuccore sequence
# Indexing relies on nuccore_proteins keeping lookup order, so element 0 is expected
# to be GL477576.
sea_lamprey = nuccore_proteins[0]  # GL477576
print(nuccore_proteins)
get_sequence_and_count(protein_df, sea_lamprey)
# Returned: nucleotide sequence (210407 nucleotides) for Petromyzon marinus (correct organism)
# Expected: a protein with 846aa
# NOTE(review): the nuccore table displayed earlier lists aa_cnt 798 for GL477576 —
# confirm which count is right.

In [None]:
# Second nuccore hit (element 1 is expected to be CT004140)
hagfish = nuccore_proteins[1]  # CT004140
print(nuccore_proteins)
get_sequence_and_count(protein_df, hagfish)
# Returned: mRNA sequence (837 nucleotides) for Homo sapiens, i.e. not the intended organism
# Expected: a 243aa protein from the Japanese inshore hagfish

In [None]:
# Third nuccore hit (element 2 is expected to be BAHO01035973)
coelacanth = nuccore_proteins[2]  # BAHO01035973
print(nuccore_proteins)
get_sequence_and_count(protein_df, coelacanth)
# Returned: nucleotide sequence (15919 nucleotides) for Latimeria chalumnae (correct organism)
# Expected: a protein with 2118 aa

In [None]:
# Fourth nuccore hit (element 3 is expected to be KE993814)
arctic_lamprey = nuccore_proteins[3]  # KE993814
print(nuccore_proteins)
get_sequence_and_count(protein_df, arctic_lamprey)
# Returned: nucleotide sequence (1564372 nucleotides) for Lethenteron camtschaticum (correct organism)
# Expected: a protein with 196aa

In [None]:
# Fifth nuccore hit (element 4 is expected to be NW_003943621)
squirrel_monkey = nuccore_proteins[4]  # NW_003943621
print(nuccore_proteins)
# Only element [0] of the helper's result (the count) is kept here, so the sequence
# itself is not displayed.
cnt = get_sequence_and_count(protein_df, squirrel_monkey)[0]
print(cnt)
# Returned: nucleotide sequence (30262601 nucleotides) for Saimiri boliviensis (correct organism)
# Expected: a protein with 1122 aa

In [None]:
# Checking proteins whose sequence lengths do not match the aa counts in the input file
# Most obvious Mismatches:
# nucleotide sequences for correct protein (GL477576, BAHO01035973, KE993814, NW_003943621)
# mRNA sequence for incorrect protein (CT004140)

# Build one returned-count entry per row; rows without a GID have no sequence to
# measure and get NaN.
ret_cnts = []
for protein, gid in zip(protein_df["accession_num"], protein_df["GID"]):
    # pd.isna instead of `is not np.NaN`: a NaN read back out of a DataFrame is not
    # guaranteed to be the same object as np.nan, so an identity test can miss it
    # (and the np.NaN alias was removed in NumPy 2.0).
    if pd.isna(gid):
        ret_cnts.append(np.nan)
    else:
        # element [0] of the helper's result is the count for the returned sequence
        ret_cnts.append(get_sequence_and_count(protein_df, protein)[0])

protein_df["api_returned_aa_count"] = ret_cnts
protein_df

In [None]:
# Rows whose returned count differs from the count in the input file.
# NaN never compares equal, so rows without a returned count are listed here too.
count_differs = protein_df["aa_cnt"].ne(protein_df["api_returned_aa_count"])
aa_mismatch_df = protein_df.loc[count_differs]
aa_mismatch_df  # 66 of 88 proteins have this issue, cannot drop them all!

In [None]:
# Collect every row slated for removal: rows reported by show_NaN_rows plus rows
# that were only resolved through nuccore
protein_NaNs = show_NaN_rows(protein_df)
is_nuccore = protein_df["accession_num"].isin(nuccore_proteins)
nuccore_df = protein_df.loc[is_nuccore]
protein_drop_df = pd.concat([protein_NaNs, nuccore_df])
protein_drop_df

In [None]:
# For now, remove the 9 protein sequences listed in protein_drop_df.
# These are either:
#   - NaNs, i.e. no returned sequence
#   - nucleotide sequences for the correct protein (GL477576, BAHO01035973, KE993814, NW_003943621) (from nuccore)
#   - an mRNA sequence for the incorrect protein (CT004140) (from nuccore)

# Anti-join: left-merge protein_df with its subset protein_drop_df. The `indicator`
# column (_merge) marks each row as left_only, both, or right_only; rows marked
# left_only are the ones that are NOT in protein_drop_df.
merged_df = protein_df.merge(protein_drop_df, how="left", indicator=True)
is_left_only = merged_df["_merge"] == "left_only"
protein_filt_df = merged_df.loc[is_left_only].drop(columns="_merge")
protein_filt_df

In [None]:
# Check NaN and nuccore sequences were removed: every lookup below should print an
# empty frame (not-found accessions first, then the nuccore ones)
for protein in err_proteins + nuccore_proteins:
    print(protein_filt_df.loc[protein_filt_df["accession_num"].eq(protein)])

In [None]:
# Write cleaned sequence series to file in FASTA format
protein_fasta_file = os.path.join("Output", "proteins.faa")
with open(protein_fasta_file, "w") as f:
    for sequence in protein_filt_df["Sequence"]:
        # Normalise each record to end in exactly one newline. The previous
        # `sequence[:-1]` removed exactly one character, which is only right when the
        # record ends in a blank line; with a single trailing newline it would glue the
        # next ">" header onto the last residue line.
        f.write(sequence.rstrip("\n") + "\n")

In [None]:
# Write protein_filt_df to csv for use elsewhere
# index=False keeps the DataFrame's row index out of the written file.
protein_csv_file = os.path.join("Output", "proteins.csv")
protein_filt_df.to_csv(protein_csv_file, index=False)

In [None]:
# Write nuccore sequences to csv for nucleotide translation script.
# dropna() discards every row of protein_drop_df that still contains a NaN.
nucleotide_csv_file = os.path.join("Output", "nucleotides.csv")
protein_drop_df.dropna().to_csv(nucleotide_csv_file, index=False)

In [None]:
# Checking MAPT proteins only

In [None]:
# still removing nuccore proteins (GL477576, CT004140)
# and those where sequences not found (scaffold11486, JL1528)
# Load the MAPT-only accession list: one accession per line of mapt_only.txt
infile = os.path.join("Resources", "mapt_only.txt")
with open(infile, "r") as f:
    lines = list(f)
# drop the line terminator from each entry
mapt_only = [line.rstrip("\n") for line in lines]
len(mapt_only)

In [None]:
# Keep only the rows whose accession appears in the MAPT-only list.
# The accession column is named "accession_num" throughout this notebook; the former
# "Protein_Accession_Number" key does not exist in protein_df and raised a KeyError.
# NOTE(review): this filters the unfiltered protein_df, so NaN / nuccore rows are
# excluded only if mapt_only.txt itself omits them — confirm whether protein_filt_df
# should be used instead.
is_mapt = protein_df["accession_num"].isin(mapt_only)
mapt_df = protein_df[is_mapt]
mapt_df

In [None]:
# Write mapt sequence series to file in FASTA format
mapt_fasta_file = "mapt_only.faa"
with open(mapt_fasta_file, "w") as f:
    # mapt_df is derived from the unfiltered protein_df, so it may still contain rows
    # with no sequence (NaN); slicing those raised a TypeError. Skip them, in line with
    # the stated intent of leaving out accessions whose sequences were not found.
    for sequence in mapt_df["Sequence"].dropna():
        # Normalise each record to end in exactly one newline (a bare `[:-1]` removes
        # the only newline when the record does not end in a blank line).
        f.write(sequence.rstrip("\n") + "\n")