In [1]:
import requests
import os
import time
import numpy as np
import pandas as pd
from helper_functions import api_call, show_NaN_rows, get_gids_sequences, get_sequence_and_count

In [2]:
# Read protein accession numbers and their expected amino-acid counts from Resources/proteins.csv
infile = os.path.join("Resources", "proteins.csv")
protein_df = pd.read_csv(infile)
protein_df

Unnamed: 0,accession_num,aa_cnt
0,NP_001116538,776
1,Q5YCV9,776
2,XP_012352933,732
3,XP_002800600,776
4,XP_003913279,776
...,...,...
83,XP_008116759,1176
84,NP_001119982,1164
85,XP_006634278,629
86,XP_006003167,1753


In [3]:
# Resolve every accession through the get_gids_sequences helper. Per the output below,
# this adds the DB, GID and Sequence columns; accessions with no hit keep None/NaN there.
# NOTE(review): protein_df is overwritten with the helper's result, so re-running this
# cell repeats the lookups (presumably remote API calls — confirm in helper_functions).
protein_df = get_gids_sequences(protein_df)
protein_df

Protein 88 of 88

Unnamed: 0,accession_num,aa_cnt,DB,GID,Sequence
0,NP_001116538,776,protein,294862258,>NP_001116538.2 microtubule-associated protein...
1,Q5YCV9,776,protein,59798492,>sp|Q5YCV9.4|TAU_HYLLA RecName: Full=Microtubu...
2,XP_012352933,732,protein,821025767,>XP_012352933.1 PREDICTED: microtubule-associa...
3,XP_002800600,776,protein,297273333,>XP_002800600.1 PREDICTED: microtubule-associa...
4,XP_003913279,776,protein,1777289710,>XP_003913279.2 microtubule-associated protein...
...,...,...,...,...,...
83,XP_008116759,1176,protein,637342532,>XP_008116759.1 PREDICTED: serine/arginine rep...
84,NP_001119982,1164,protein,187607752,>NP_001119982.1 microtubule-associated protein...
85,XP_006634278,629,protein,573893049,>XP_006634278.1 PREDICTED: microtubule-associa...
86,XP_006003167,1753,protein,557001117,>XP_006003167.1 PREDICTED: microtubule-associa...


In [12]:
# Rows whose sequence was retrieved from the nuccore (nucleotide) database
protein_df.loc[protein_df["DB"].eq("nuccore")]

Unnamed: 0,accession_num,aa_cnt,DB,GID,Sequence
32,GL477576,798,nuccore,308150460,>GL477576.1 Petromyzon marinus unplaced genomi...
33,CT004140,243,nuccore,68298023,>CT004140.1 CT004140 RZPD no.9017 Homo sapiens...
58,BAHO01035973,2118,nuccore,405900247,">BAHO01035973.1 Latimeria chalumnae DNA, conti..."
62,KE993814,196,nuccore,543413231,>KE993814.1 Lethenteron camtschaticum unplaced...
67,NW_003943621,1122,nuccore,395725070,>NW_003943621.1 Saimiri boliviensis boliviensi...


In [14]:
# Summarise where each accession was resolved: protein DB, nuccore DB, or not found.
from_protein = protein_df.loc[protein_df["DB"].eq("protein")]
from_nuccore = protein_df.loc[protein_df["DB"].eq("nuccore")]

print("Total Number of Proteins in Input File:", len(protein_df))
print("Number of Proteins From db=protein:", from_protein.shape[0])
print("Proteins From db=nuccore (%s):" % from_nuccore.shape[0])
print(from_nuccore)
print("\n")
print("Proteins Not Found (%s):" % len(show_NaN_rows(protein_df)))
print(show_NaN_rows(protein_df))

Total Number of Proteins in Input File: 88
Number of Proteins From db=protein: 79
Proteins From db=nuccore (5):
   accession_num  aa_cnt       DB        GID  \
32      GL477576     798  nuccore  308150460   
33      CT004140     243  nuccore   68298023   
58  BAHO01035973    2118  nuccore  405900247   
62      KE993814     196  nuccore  543413231   
67  NW_003943621    1122  nuccore  395725070   

                                             Sequence  
32  >GL477576.1 Petromyzon marinus unplaced genomi...  
33  >CT004140.1 CT004140 RZPD no.9017 Homo sapiens...  
58  >BAHO01035973.1 Latimeria chalumnae DNA, conti...  
62  >KE993814.1 Lethenteron camtschaticum unplaced...  
67  >NW_003943621.1 Saimiri boliviensis boliviensi...  


Proteins Not Found (4):
    accession_num  aa_cnt    DB  GID Sequence
22  scaffold11486     761  None  NaN      NaN
31         JL1528     864  None  NaN      NaN
54  scaffold43622    1947  None  NaN      NaN
70    XP_01266736    1119  None  NaN      NaN


In [15]:
# Need to output sequences to a FASTA (.faa) file for alignment before passing them to RAxML
# to generate the phylogenetic tree
# FASTA Format:
#>SEQUENCE_1
#...
#>SEQUENCE_2
#...
#>...

In [16]:
# Example sequence from DB=protein
# As the output below shows, get_sequence_and_count returns a tuple that starts with the
# residue count, followed by the full FASTA record (header line plus wrapped sequence).
print(get_sequence_and_count(protein_df, "NP_001116538"))  # Homo sapiens (human tau), correct count

(776, '>NP_001116538.2 microtubule-associated protein tau isoform 6 [Homo sapiens]\nMAEPRQEFEVMEDHAGTYGLGDRKDQGGYTMHQDQEGDTDAGLKESPLQTPTEDGSEEPGSETSDAKSTP\nTAEDVTAPLVDEGAPGKQAAAQPHTEIPEGTTAEEAGIGDTPSLEDEAAGHVTQEPESGKVVQEGFLREP\nGPPGLSHQLMSGMPGAPLLPEGPREATRQPSGTGPEDTEGGRHAPELLKHQLLGDLHQEGPPLKGAGGKE\nRPGSKEEVDEDRDVDESSPQDSPPSKASPAQDGRPPQTAAREATSIPGFPAEGAIPLPVDFLSKVSTEIP\nASEPDGPSVGRAKGQDAPLEFTFHVEITPNVQKEQAHSEEHLGRAAFPGAPGEGPEARGPSLGEDTKEAD\nLPEPSEKQPAAAPRGKPVSRVPQLKARMVSKSKDGTGSDDKKAKTSTRSSAKTLKNRPCLSPKHPTPGSS\nDPLIQPSSPAVCPEPPSSPKYVSSVTSRTGSSGAKEMKLKGADGKTKIATPRGAAPPGQKGQANATRIPA\nKTPPAPKTPPSSATKQVQRRPPPAGPRSERGEPPKSGDRSGYSSPGSPGTPGSRSRTPSLPTPPTREPKK\nVAVVRTPPKSPSSAKSRLQTAPVPMPDLKNVKSKIGSTENLKHQPGGGKVQIINKKLDLSNVQSKCGSKD\nNIKHVPGGGSVQIVYKPVDLSKVTSKCGSLGNIHHKPGGGQVEVKSEKLDFKDRVQSKIGSLDNITHVPG\nGGNKKIETHKLTFRENAKAKTDHGAEIVYKSPVVSGDTSPRHLSNVSSTGSIDMVDSPQLATLADEVSAS\nLAKQGL\n\n', 'MAEPRQEFEVMEDHAGTYGLGDRKDQGGYTMHQDQEGDTDAGLKESPLQTPTEDGSEEPGSETSDAKSTPTAEDVTAPLVDEGAPGKQAAAQPHTEIPEGTTAEEAGIGD

In [18]:
# Example sequence from DB=nuccore
# For nuccore hits the returned record is a nucleotide scaffold, so the count is a
# nucleotide count, not an amino-acid count.
sea_lamprey = "GL477576"
get_sequence_and_count(protein_df, sea_lamprey)
# nucleotide sequence (210407 nucleotides) for Petromyzon marinus (correct)
# Should have 846aa
# NOTE(review): the input file lists aa_cnt = 798 for GL477576, not 846 — confirm which is right.

(210407,
 '>GL477576.1 Petromyzon marinus unplaced genomic scaffold scaffold_1248, whole genome shotgun sequence\nTGTTAAGTCACCCTGGGGGGTTAATCTAGAACGTAGTTAATCCGATACAAGACGTAAGATAAATAACTTG\nATGTATTATTTACAGCCCTTTGTCAAAACACTGCTGGATGAAGGCCTCCCCGTGCCGTATTAGTCGAGGG\nGGTTGTTTTTGACATACTTCACCACTCTGGTCAGTGCAGATTTCTGATTGGGAAGGCCCAGGACGGGTTC\nAGATATTACTCACCTCTGATGTTGGCCATAGCTGGGTCTTGAAAGCAGGTCACCAGGTTCAGAGCGCAGT\nGCAAATATCATGCGATCATATATTACATTATCGAGTGGTCAGCGCTTCCCAGATGCAACTTATTCAAGTG\nCACAGCTTTGATCGTCTATGGAGGTGAAGGGGTTCTGCTCAATGTAACTGCTCGAGAGAGAGAGAGATAG\nGGAGAGCTGTTCAACAACACGTAGCCCTCCATTCCTGGTGCAGGATCAGCCCCTTTGTTAGTCCAAGCAA\nTCTGAACGGCGACTGCGACGAAATGCATCGTCAAGCACGCCCCAACTCACGGCGCGAGTTTATGGTGGAA\nCAGGGCGTCAATGGAAGAGGACATTTTGGCACAAATTACGTGGCATGGTTGACAACAATTGACACCAAAA\nTGTAACCTTTCACCTCCCGCAATTGGACAATTAACTGATATTACACTTGTAGAGCCATATATATATACAC\nGAGGCATGGTCTTCATACTATATACGGTCTATATTCATATATTGCAGGATCAATACATAGTTGCAGTTGT\nTCACGCTCTGCGTTATGGACCCAGCGACCCGAGTTCAATTCCCAGCCGAGGCTTGGGTCAGCGGCGGGCG\nACATCTGAACCAGTCCTCTGCC

In [26]:
# Checking proteins whose sequence lengths do not match the aa counts in the input file
# Most obvious Mismatches:
# nucleotide sequences for correct protein (GL477576, BAHO01035973, KE993814, NW_003943621)
# mRNA sequence for incorrect protein (CT004140)

# Build one returned-length value per row, in row order, then attach it as a new column.
ret_cnts = []
for protein, gid in zip(protein_df["accession_num"], protein_df["GID"]):
    # pd.isna() tests the value itself. The previous `gid is not np.NaN` identity test
    # only works when the cell holds the very same NaN object, and the `np.NaN` alias
    # no longer exists in NumPy 2.0.
    if pd.isna(gid):
        # No record was retrieved for this accession, so there is no length to report.
        ret_cnts.append(np.nan)
    else:
        # First element of the helper's result is the length of the returned sequence.
        ret_cnts.append(get_sequence_and_count(protein_df, protein)[0])

protein_df["api_returned_aa_count"] = ret_cnts
protein_df

Unnamed: 0,accession_num,aa_cnt,DB,GID,Sequence,api_returned_aa_count
0,NP_001116538,776,protein,294862258,>NP_001116538.2 microtubule-associated protein...,776.0
1,Q5YCV9,776,protein,59798492,>sp|Q5YCV9.4|TAU_HYLLA RecName: Full=Microtubu...,776.0
2,XP_012352933,732,protein,821025767,>XP_012352933.1 PREDICTED: microtubule-associa...,776.0
3,XP_002800600,776,protein,297273333,>XP_002800600.1 PREDICTED: microtubule-associa...,776.0
4,XP_003913279,776,protein,1777289710,>XP_003913279.2 microtubule-associated protein...,776.0
...,...,...,...,...,...,...
83,XP_008116759,1176,protein,637342532,>XP_008116759.1 PREDICTED: serine/arginine rep...,2539.0
84,NP_001119982,1164,protein,187607752,>NP_001119982.1 microtubule-associated protein...,93.0
85,XP_006634278,629,protein,573893049,>XP_006634278.1 PREDICTED: microtubule-associa...,643.0
86,XP_006003167,1753,protein,557001117,>XP_006003167.1 PREDICTED: microtubule-associa...,1753.0


In [27]:
# Rows where the length returned by the API differs from the count in the input file.
# Rows without a returned sequence (NaN count) also show up here, since NaN never equals a number.
count_differs = protein_df["aa_cnt"].ne(protein_df["api_returned_aa_count"])
aa_mismatch_df = protein_df.loc[count_differs]
aa_mismatch_df  # 66 of 88 proteins have this issue, cannot drop them all!

Unnamed: 0,accession_num,aa_cnt,DB,GID,Sequence,api_returned_aa_count
2,XP_012352933,732,protein,821025767,>XP_012352933.1 PREDICTED: microtubule-associa...,776.0
5,XP_008995083,772,protein,675658919,>XP_008995083.1 PREDICTED: microtubule-associa...,497.0
6,XP_010328565,748,protein,1984072572,>XP_010328565.2 microtubule-associated protein...,852.0
7,XP_005983781,758,protein,556777384,>XP_005983781.1 PREDICTED: microtubule-associa...,778.0
8,XP_013845380,784,protein,927194489,>XP_013845380.1 PREDICTED: microtubule-associa...,451.0
...,...,...,...,...,...,...
82,XP_418480,1079,protein,513167116,>XP_418480.4 PREDICTED: microtubule-associated...,1080.0
83,XP_008116759,1176,protein,637342532,>XP_008116759.1 PREDICTED: serine/arginine rep...,2539.0
84,NP_001119982,1164,protein,187607752,>NP_001119982.1 microtubule-associated protein...,93.0
85,XP_006634278,629,protein,573893049,>XP_006634278.1 PREDICTED: microtubule-associa...,643.0


In [28]:
# Summary of proteins to be removed: accessions with no returned sequence,
# followed by accessions that resolved to nuccore (nucleotide) records.
protein_NaNs = show_NaN_rows(protein_df)
nuccore_df = protein_df.loc[protein_df["DB"].eq("nuccore")]
protein_drop_df = pd.concat((protein_NaNs, nuccore_df))
protein_drop_df

Unnamed: 0,accession_num,aa_cnt,DB,GID,Sequence,api_returned_aa_count
22,scaffold11486,761,,,,
31,JL1528,864,,,,
54,scaffold43622,1947,,,,
70,XP_01266736,1119,,,,
32,GL477576,798,nuccore,308150460.0,>GL477576.1 Petromyzon marinus unplaced genomi...,210407.0
33,CT004140,243,nuccore,68298023.0,>CT004140.1 CT004140 RZPD no.9017 Homo sapiens...,837.0
58,BAHO01035973,2118,nuccore,405900247.0,">BAHO01035973.1 Latimeria chalumnae DNA, conti...",15919.0
62,KE993814,196,nuccore,543413231.0,>KE993814.1 Lethenteron camtschaticum unplaced...,1564372.0
67,NW_003943621,1122,nuccore,395725070.0,>NW_003943621.1 Saimiri boliviensis boliviensi...,30262601.0


In [31]:
# For now, remove the above 9 protein sequences. These are either:
#   - NaNs, i.e. no returned sequence
#   - nucleotide sequences for the correct protein (GL477576, BAHO01035973, KE993814, NW_003943621) (from nuccore)
#   - an mRNA sequence for an incorrect protein (CT004140) (from nuccore)

# Anti-join: left-merge protein_df with protein_drop_df (a subset of it) using
# `indicator=True`, which adds a `_merge` column marking each row as left_only, both,
# or right_only. Rows marked left_only are the ones that are NOT in the drop list.
merged_df = pd.merge(protein_df, protein_drop_df, how="left", indicator=True)
keep_mask = merged_df["_merge"] == "left_only"
protein_filt_df = (
    merged_df.loc[keep_mask]
    .drop(columns="_merge")
    # every remaining row has a returned sequence, so the count can be a plain integer
    .astype({"api_returned_aa_count": "int"})
)
protein_filt_df

Unnamed: 0,accession_num,aa_cnt,DB,GID,Sequence,api_returned_aa_count
0,NP_001116538,776,protein,294862258,>NP_001116538.2 microtubule-associated protein...,776
1,Q5YCV9,776,protein,59798492,>sp|Q5YCV9.4|TAU_HYLLA RecName: Full=Microtubu...,776
2,XP_012352933,732,protein,821025767,>XP_012352933.1 PREDICTED: microtubule-associa...,776
3,XP_002800600,776,protein,297273333,>XP_002800600.1 PREDICTED: microtubule-associa...,776
4,XP_003913279,776,protein,1777289710,>XP_003913279.2 microtubule-associated protein...,776
...,...,...,...,...,...,...
83,XP_008116759,1176,protein,637342532,>XP_008116759.1 PREDICTED: serine/arginine rep...,2539
84,NP_001119982,1164,protein,187607752,>NP_001119982.1 microtubule-associated protein...,93
85,XP_006634278,629,protein,573893049,>XP_006634278.1 PREDICTED: microtubule-associa...,643
86,XP_006003167,1753,protein,557001117,>XP_006003167.1 PREDICTED: microtubule-associa...,1753


In [32]:
# Sanity check: the filtered frame should contain no rows with missing values
# (the output below is an empty frame).
show_NaN_rows(protein_filt_df)

Unnamed: 0,accession_num,aa_cnt,DB,GID,Sequence,api_returned_aa_count


In [33]:
# Write cleaned sequence series to file in FASTA format
# Make sure the output directory exists so a fresh checkout does not fail on open().
os.makedirs("Output", exist_ok=True)
protein_fasta_file = os.path.join("Output", "proteins.faa")
with open(protein_fasta_file, "w") as f:
    for sequence in protein_filt_df["Sequence"]:
        # Normalise each record to end in exactly one newline. Slicing off a single
        # character (`[:-1]`) only works when the record ends in "\n\n"; with any other
        # ending it either fuses the next header onto this record or drops a residue.
        f.write(sequence.rstrip("\n") + "\n")

In [34]:
# Write protein_filt_df to csv for use elsewhere
# index=False keeps the DataFrame index out of the file, so no extra unnamed column
# appears when it is read back.
protein_csv_file = os.path.join("Output", "proteins.csv")
protein_filt_df.to_csv(protein_csv_file, index=False)

In [35]:
# Save the nuccore-derived rows separately; they are the input for the nucleotide translation script
nuccore_only = protein_df.loc[protein_df["DB"].eq("nuccore")]
nuccore_only.to_csv(os.path.join("Output", "nucleotides.csv"), index=False)

In [None]:
# Checking MAPT proteins only

In [None]:
# still removing nuccore proteins (GL477576, CT004140)
# and those where sequences not found (scaffold11486, JL1528)
# Read the MAPT-only accession list; each non-empty line is treated as one accession.
infile = os.path.join("Resources", "mapt_only.txt")
with open(infile, "r") as f:
    lines = f.readlines()
# Strip surrounding whitespace and skip blank lines, so a trailing empty line or stray
# spaces cannot inflate the count or yield accessions that never match in isin().
mapt_only = [line.strip() for line in lines if line.strip()]
len(mapt_only)

In [None]:
# Keep only the accessions listed in mapt_only.
# The accession column is named "accession_num"; "Protein_Accession_Number" does not
# exist in this frame and raised a KeyError.
# Select from protein_filt_df rather than protein_df so that nuccore hits and accessions
# with no returned sequence stay excluded, as intended for the MAPT-only set. Every
# remaining row therefore has a real protein Sequence string.
is_mapt = protein_filt_df["accession_num"].isin(mapt_only)
mapt_df = protein_filt_df[is_mapt]
mapt_df

In [None]:
# Write mapt sequence series to file in FASTA format
mapt_fasta_file = "mapt_only.faa"
with open(mapt_fasta_file, "w") as f:
    for sequence in mapt_df["Sequence"]:
        # Normalise each record to end in exactly one newline. Slicing off a single
        # character (`[:-1]`) only works when the record ends in "\n\n"; with any other
        # ending it either fuses the next header onto this record or drops a residue.
        f.write(sequence.rstrip("\n") + "\n")