In [1]:
import requests
import os
import time
import numpy as np
import pandas as pd
import helper_functions as hf

In [2]:
# Read the protein table (accession number, amino-acid count, species)
# from Input/proteins.csv
infile = os.path.join("Input", "proteins.csv")
protein_df = pd.read_csv(infile)
protein_df

Unnamed: 0,accession_num,aa_cnt,species
0,NP_001116538,776,human
1,Q5YCV9,776,common gibbon
2,XP_012352933,732,white-cheeked gibbon
3,XP_002800600,776,Chinese Rhesus monkey
4,XP_003913279,776,olive baboon
...,...,...,...
83,XP_008116759,1176,green anole
84,NP_001119982,1164,western clawed frog
85,XP_006634278,629,spotted gar
86,XP_006003167,1753,African coelacanth


In [3]:
# Look up every accession through hf.get_gids_sequences. Judging by the output
# displayed below, the returned frame carries three extra columns (DB, GID,
# Sequence), and accessions that cannot be resolved are reported as
# "Invalid uid" errors.
# NOTE(review): protein_df is rebound to the helper's result, so re-running this
# cell passes the already-augmented frame back in — confirm the helper tolerates that.
protein_df = hf.get_gids_sequences(protein_df)
protein_df

scaffold11486 88
{'eSummaryResult': {'ERROR': 'Invalid uid scaffold11486 at position=0'}}
JL1528n 32 of 88
{'eSummaryResult': {'ERROR': 'Invalid uid JL1528 at position=0'}}
scaffold43622 88
{'eSummaryResult': {'ERROR': 'Invalid uid scaffold43622 at position=0'}}
XP_01266736of 88
{'eSummaryResult': {'ERROR': 'Invalid uid XP_01266736 at position=0'}}
Protein 88 of 88

Unnamed: 0,accession_num,aa_cnt,species,DB,GID,Sequence
0,NP_001116538,776,human,protein,294862258,>NP_001116538.2 microtubule-associated protein...
1,Q5YCV9,776,common gibbon,protein,59798492,>sp|Q5YCV9.4|TAU_HYLLA RecName: Full=Microtubu...
2,XP_012352933,732,white-cheeked gibbon,protein,821025767,>XP_012352933.1 PREDICTED: microtubule-associa...
3,XP_002800600,776,Chinese Rhesus monkey,protein,297273333,>XP_002800600.1 PREDICTED: microtubule-associa...
4,XP_003913279,776,olive baboon,protein,1777289710,>XP_003913279.2 microtubule-associated protein...
...,...,...,...,...,...,...
83,XP_008116759,1176,green anole,protein,637342532,>XP_008116759.1 PREDICTED: serine/arginine rep...
84,NP_001119982,1164,western clawed frog,protein,187607752,>NP_001119982.1 microtubule-associated protein...
85,XP_006634278,629,spotted gar,protein,573893049,>XP_006634278.1 PREDICTED: microtubule-associa...
86,XP_006003167,1753,African coelacanth,protein,557001117,>XP_006003167.1 PREDICTED: microtubule-associa...


In [4]:
# Rows whose record was retrieved from the nuccore database rather than protein
protein_df.loc[protein_df["DB"].eq("nuccore")]

Unnamed: 0,accession_num,aa_cnt,species,DB,GID,Sequence
32,GL477576,798,sea lamprey,nuccore,308150460,>GL477576.1 Petromyzon marinus unplaced genomi...
33,CT004140,243,Japanese inshore hagfish,nuccore,68298023,>CT004140.1 CT004140 RZPD no.9017 Homo sapiens...
58,BAHO01035973,2118,African coelacanth,nuccore,405900247,">BAHO01035973.1 Latimeria chalumnae DNA, conti..."
62,KE993814,196,arctic lamprey,nuccore,543413231,>KE993814.1 Lethenteron camtschaticum unplaced...
67,NW_003943621,1122,Bolivian squirrel monkey,nuccore,395725070,>NW_003943621.1 Saimiri boliviensis boliviensi...


In [5]:
# Summarize the lookup: totals per source database, then the rows that came
# from nuccore and the rows for which no record was returned.
protein_rows = protein_df[protein_df["DB"] == "protein"]
nuccore_rows = protein_df[protein_df["DB"] == "nuccore"]

print("Total Number of Distinct Proteins in Input File:", len(protein_df))
print("Number of Proteins From db=protein:", len(protein_rows))
print("Proteins From db=nuccore (%s):" % len(nuccore_rows))
print(nuccore_rows)
print("\n")
print("Proteins Not Found (%s):" % len(hf.show_NaN_rows(protein_df)))
print(hf.show_NaN_rows(protein_df))

Total Number of Distinct Proteins in Input File: 88
Number of Proteins From db=protein: 79
Proteins From db=nuccore (5):
   accession_num  aa_cnt                   species       DB        GID  \
32      GL477576     798               sea lamprey  nuccore  308150460   
33      CT004140     243  Japanese inshore hagfish  nuccore   68298023   
58  BAHO01035973    2118        African coelacanth  nuccore  405900247   
62      KE993814     196            arctic lamprey  nuccore  543413231   
67  NW_003943621    1122  Bolivian squirrel monkey  nuccore  395725070   

                                             Sequence  
32  >GL477576.1 Petromyzon marinus unplaced genomi...  
33  >CT004140.1 CT004140 RZPD no.9017 Homo sapiens...  
58  >BAHO01035973.1 Latimeria chalumnae DNA, conti...  
62  >KE993814.1 Lethenteron camtschaticum unplaced...  
67  >NW_003943621.1 Saimiri boliviensis boliviensi...  


Proteins Not Found (4):
    accession_num  aa_cnt                         species    DB  GID Seq

In [6]:
# Need to output sequences to a FASTA (.faa) file for alignment before passing to RAxML to generate
# the phylogenetic tree
# FASTA Format:
#>SEQUENCE_1
#...
#>SEQUENCE_2
#...
#>...

In [9]:
# Sanity check on a DB=protein record: human tau (Homo sapiens).
# The length of the retrieved sequence should equal the aa count from the input file.
human_tau = "NP_001116538"
fasta_seq = hf.get_fasta_from_df(protein_df, human_tau)
seq = hf.convert_fasta_to_str(fasta_seq)
print("NP_001116538 (human tau) Sequence:", seq, sep="\n")
print("AA Count:", len(seq))

NP_001116538 (human tau) Sequence:
MAEPRQEFEVMEDHAGTYGLGDRKDQGGYTMHQDQEGDTDAGLKESPLQTPTEDGSEEPGSETSDAKSTPTAEDVTAPLVDEGAPGKQAAAQPHTEIPEGTTAEEAGIGDTPSLEDEAAGHVTQEPESGKVVQEGFLREPGPPGLSHQLMSGMPGAPLLPEGPREATRQPSGTGPEDTEGGRHAPELLKHQLLGDLHQEGPPLKGAGGKERPGSKEEVDEDRDVDESSPQDSPPSKASPAQDGRPPQTAAREATSIPGFPAEGAIPLPVDFLSKVSTEIPASEPDGPSVGRAKGQDAPLEFTFHVEITPNVQKEQAHSEEHLGRAAFPGAPGEGPEARGPSLGEDTKEADLPEPSEKQPAAAPRGKPVSRVPQLKARMVSKSKDGTGSDDKKAKTSTRSSAKTLKNRPCLSPKHPTPGSSDPLIQPSSPAVCPEPPSSPKYVSSVTSRTGSSGAKEMKLKGADGKTKIATPRGAAPPGQKGQANATRIPAKTPPAPKTPPSSATKQVQRRPPPAGPRSERGEPPKSGDRSGYSSPGSPGTPGSRSRTPSLPTPPTREPKKVAVVRTPPKSPSSAKSRLQTAPVPMPDLKNVKSKIGSTENLKHQPGGGKVQIINKKLDLSNVQSKCGSKDNIKHVPGGGSVQIVYKPVDLSKVTSKCGSLGNIHHKPGGGQVEVKSEKLDFKDRVQSKIGSLDNITHVPGGGNKKIETHKLTFRENAKAKTDHGAEIVYKSPVVSGDTSPRHLSNVSSTGSIDMVDSPQLATLADEVSASLAKQGL
AA Count: 776


In [10]:
# Spot check of a second DB=protein record: NP_001119982
# (listed as western clawed frog in the input table)
clawed_frog = "NP_001119982"
fasta_seq = hf.get_fasta_from_df(protein_df, clawed_frog)
seq = hf.convert_fasta_to_str(fasta_seq)
print(seq)

MADLGQNFSLQDALTDGPAEIESEVKQDFITSLENEKFEDEVGETCDKSSYVPLLDDDDVKEPKNKSERSAAPHDSIMANGEHNLGENEVTET


In [11]:
# Example sequence from DB=nuccore
sea_lamprey = "GL477576"
fasta_seq = hf.get_fasta_from_df(protein_df, sea_lamprey)
seq = hf.convert_fasta_to_str(fasta_seq)
print("%s (Sea Lamprey) Sequence:" % sea_lamprey)
print(seq)
print("AA Count:", len(seq))
# nucleotide sequence (210407 nucleotides) for Petromyzon marinus (correct)
# Should have 846aa

GL477576 (Sea Lamprey) Sequence:
TGTTAAGTCACCCTGGGGGGTTAATCTAGAACGTAGTTAATCCGATACAAGACGTAAGATAAATAACTTGATGTATTATTTACAGCCCTTTGTCAAAACACTGCTGGATGAAGGCCTCCCCGTGCCGTATTAGTCGAGGGGGTTGTTTTTGACATACTTCACCACTCTGGTCAGTGCAGATTTCTGATTGGGAAGGCCCAGGACGGGTTCAGATATTACTCACCTCTGATGTTGGCCATAGCTGGGTCTTGAAAGCAGGTCACCAGGTTCAGAGCGCAGTGCAAATATCATGCGATCATATATTACATTATCGAGTGGTCAGCGCTTCCCAGATGCAACTTATTCAAGTGCACAGCTTTGATCGTCTATGGAGGTGAAGGGGTTCTGCTCAATGTAACTGCTCGAGAGAGAGAGAGATAGGGAGAGCTGTTCAACAACACGTAGCCCTCCATTCCTGGTGCAGGATCAGCCCCTTTGTTAGTCCAAGCAATCTGAACGGCGACTGCGACGAAATGCATCGTCAAGCACGCCCCAACTCACGGCGCGAGTTTATGGTGGAACAGGGCGTCAATGGAAGAGGACATTTTGGCACAAATTACGTGGCATGGTTGACAACAATTGACACCAAAATGTAACCTTTCACCTCCCGCAATTGGACAATTAACTGATATTACACTTGTAGAGCCATATATATATACACGAGGCATGGTCTTCATACTATATACGGTCTATATTCATATATTGCAGGATCAATACATAGTTGCAGTTGTTCACGCTCTGCGTTATGGACCCAGCGACCCGAGTTCAATTCCCAGCCGAGGCTTGGGTCAGCGGCGGGCGACATCTGAACCAGTCCTCTGCCTGCCGAGTCTTCAGCAACCTGCGCTGACCAGCGCGGTGAAGTATGGCCAAACAGCAAAGCGGCGTGGGGAGACCTTCATACAGCAGTGGATTGACAAAGGGCGGC

In [12]:
# Checking proteins whose sequence lengths do not match the aa counts in the input file.
# Most obvious mismatches:
#   - nucleotide sequences for the correct protein (GL477576, BAHO01035973, KE993814, NW_003943621)
#   - mRNA sequence for an incorrect protein (CT004140)
#
# For every row, record the length of the sequence that was actually retrieved;
# rows without a GID (nothing was returned for that accession) get NaN.
ret_cnts = []
for protein, gid in zip(protein_df["accession_num"], protein_df["GID"]):
    # pd.isna is a value test. The previous `gid is not np.NaN` was an identity
    # test that only held when the missing value happened to be the very same
    # float object, and `np.NaN` no longer exists in NumPy >= 2.0.
    if pd.isna(gid):
        ret_cnts.append(np.nan)
        continue
    fasta_seq = hf.get_fasta_from_df(protein_df, protein)
    seq = hf.convert_fasta_to_str(fasta_seq)
    ret_cnts.append(len(seq))

protein_df["api_returned_aa_count"] = ret_cnts
protein_df

Unnamed: 0,accession_num,aa_cnt,species,DB,GID,Sequence,api_returned_aa_count
0,NP_001116538,776,human,protein,294862258,>NP_001116538.2 microtubule-associated protein...,776.0
1,Q5YCV9,776,common gibbon,protein,59798492,>sp|Q5YCV9.4|TAU_HYLLA RecName: Full=Microtubu...,776.0
2,XP_012352933,732,white-cheeked gibbon,protein,821025767,>XP_012352933.1 PREDICTED: microtubule-associa...,776.0
3,XP_002800600,776,Chinese Rhesus monkey,protein,297273333,>XP_002800600.1 PREDICTED: microtubule-associa...,776.0
4,XP_003913279,776,olive baboon,protein,1777289710,>XP_003913279.2 microtubule-associated protein...,776.0
...,...,...,...,...,...,...,...
83,XP_008116759,1176,green anole,protein,637342532,>XP_008116759.1 PREDICTED: serine/arginine rep...,2539.0
84,NP_001119982,1164,western clawed frog,protein,187607752,>NP_001119982.1 microtubule-associated protein...,93.0
85,XP_006634278,629,spotted gar,protein,573893049,>XP_006634278.1 PREDICTED: microtubule-associa...,643.0
86,XP_006003167,1753,African coelacanth,protein,557001117,>XP_006003167.1 PREDICTED: microtubule-associa...,1753.0


In [13]:
# Rows whose retrieved sequence length differs from the aa count in the input file
length_differs = protein_df["aa_cnt"].ne(protein_df["api_returned_aa_count"])
aa_mismatch_df = protein_df[length_differs]
aa_mismatch_df  # 66 of 88 proteins have this issue, cannot drop them all!

Unnamed: 0,accession_num,aa_cnt,species,DB,GID,Sequence,api_returned_aa_count
2,XP_012352933,732,white-cheeked gibbon,protein,821025767,>XP_012352933.1 PREDICTED: microtubule-associa...,776.0
5,XP_008995083,772,white-tufted-ear marmoset,protein,675658919,>XP_008995083.1 PREDICTED: microtubule-associa...,497.0
6,XP_010328565,748,Bolivian squirrel monkey,protein,1984072572,>XP_010328565.2 microtubule-associated protein...,852.0
7,XP_005983781,758,Tibetan antelope,protein,556777384,>XP_005983781.1 PREDICTED: microtubule-associa...,778.0
8,XP_013845380,784,domestic pig,protein,927194489,>XP_013845380.1 PREDICTED: microtubule-associa...,451.0
...,...,...,...,...,...,...,...
82,XP_418480,1079,chicken,protein,513167116,>XP_418480.4 PREDICTED: microtubule-associated...,1080.0
83,XP_008116759,1176,green anole,protein,637342532,>XP_008116759.1 PREDICTED: serine/arginine rep...,2539.0
84,NP_001119982,1164,western clawed frog,protein,187607752,>NP_001119982.1 microtubule-associated protein...,93.0
85,XP_006634278,629,spotted gar,protein,573893049,>XP_006634278.1 PREDICTED: microtubule-associa...,643.0


In [14]:
# Assemble the rows that will be removed from protein_df, in three groups:
#   1. rows reported by hf.show_NaN_rows (no record was returned for them)
protein_NaNs = hf.show_NaN_rows(protein_df)

#   2. rows whose record came from nuccore
nuccore_df = protein_df.loc[protein_df["DB"].eq("nuccore")]

#   3. repeated species — keep="first" retains the first occurrence of each
#      species, so only the later repeats are selected here
is_repeat_species = protein_df["species"].duplicated(keep="first")
duplicate_species_df = protein_df.loc[is_repeat_species]

protein_drop_df = pd.concat([protein_NaNs, nuccore_df, duplicate_species_df])
protein_drop_df

Unnamed: 0,accession_num,aa_cnt,species,DB,GID,Sequence,api_returned_aa_count
22,scaffold11486,761,Austalian saltwater crocodile,,,,
31,JL1528,864,Japanese arctic lamprey,,,,
54,scaffold43622,1947,Australian saltwater crocodile,,,,
70,XP_01266736,1119,small-eared galago,,,,
32,GL477576,798,sea lamprey,nuccore,308150460.0,>GL477576.1 Petromyzon marinus unplaced genomi...,210407.0
33,CT004140,243,Japanese inshore hagfish,nuccore,68298023.0,>CT004140.1 CT004140 RZPD no.9017 Homo sapiens...,837.0
58,BAHO01035973,2118,African coelacanth,nuccore,405900247.0,">BAHO01035973.1 Latimeria chalumnae DNA, conti...",15919.0
62,KE993814,196,arctic lamprey,nuccore,543413231.0,>KE993814.1 Lethenteron camtschaticum unplaced...,1564372.0
67,NW_003943621,1122,Bolivian squirrel monkey,nuccore,395725070.0,>NW_003943621.1 Saimiri boliviensis boliviensi...,30262601.0
34,XP_011509496,1915,human,protein,767918175.0,>XP_011509496.1 PREDICTED: microtubule-associa...,1915.0


In [18]:
# For now, drop the protein sequences collected in protein_drop_df. They are one of:
#   - NaNs, i.e. no sequence was returned
#   - nucleotide sequences for the correct protein
#     (GL477576, BAHO01035973, KE993814, NW_003943621) (from nuccore)
#   - an mRNA sequence for an incorrect protein (CT004140) (from nuccore)
#   - an alternative sequence for the same species — currently only the MAPT version is kept
#
# protein_drop_df is a subset of protein_df. Per the original note, the helper removes
# it by merging the two frames with `indicator`, which marks every row as
# left_only, both, or right_only.
protein_filt_df = hf.remove_subset_from_df(protein_df, protein_drop_df).astype(
    {"api_returned_aa_count": "int"}
)
print(protein_filt_df)
print("Number of Proteins:", len(protein_filt_df))

   accession_num  aa_cnt                    species       DB         GID  \
0   NP_001116538     776                      human  protein   294862258   
1         Q5YCV9     776              common gibbon  protein    59798492   
2   XP_012352933     732       white-cheeked gibbon  protein   821025767   
3   XP_002800600     776      Chinese Rhesus monkey  protein   297273333   
4   XP_003913279     776               olive baboon  protein  1777289710   
5   XP_008995083     772  white-tufted-ear marmoset  protein   675658919   
6   XP_010328565     748   Bolivian squirrel monkey  protein  1984072572   
7   XP_005983781     758           Tibetan antelope  protein   556777384   
8   XP_013845380     784               domestic pig  protein   927194489   
9   XP_006199331     764                     alpaca  protein   560953646   
10  XP_005858648     803               Brandt's bat  protein   554527651   
11      EFB25687     776                giant panda  protein   281350103   
12  XP_00795

In [20]:
# Confirm that the filtered frame no longer contains NaN rows (expected: an empty frame)
hf.show_NaN_rows(protein_filt_df)

Unnamed: 0,accession_num,aa_cnt,species,DB,GID,Sequence,api_returned_aa_count


In [21]:
# Dump the cleaned sequences into a single FASTA file.
# Each record is written without its final character
# (presumably a trailing newline in the stored FASTA text — TODO confirm).
protein_fasta_file = os.path.join("Output", "proteins_unique.faa")
with open(protein_fasta_file, "w") as f:
    f.writelines(sequence[:-1] for sequence in protein_filt_df["Sequence"])

In [17]:
# Write protein_filt_df to Output/proteins.csv (index column omitted) for use elsewhere
protein_csv_file = os.path.join("Output", "proteins.csv")
protein_filt_df.to_csv(protein_csv_file, index=False)

In [18]:
# Save the nuccore rows to their own CSV (index column omitted) as input for
# the nucleotide translation script
nucleotide_csv_file = os.path.join("Output", "nucleotides.csv")
nuccore_rows = protein_df[protein_df["DB"] == "nuccore"]
nuccore_rows.to_csv(nucleotide_csv_file, index=False)

In [None]:
# Checking MAPT proteins only

In [None]:
# Load the list of MAPT-only accessions, one per line, with newlines stripped.
# Still removing nuccore proteins (GL477576, CT004140)
# and those where sequences were not found (scaffold11486, JL1528).
infile = os.path.join("Resources", "mapt_only.txt")
with open(infile, "r") as f:
    mapt_only = [line.replace("\n", "") for line in f]
len(mapt_only)

In [None]:
# Keep only the rows whose accession number appears in mapt_only.
# The accession column of protein_df is named "accession_num" (see the frames
# displayed above); the previous "Protein_Accession_Number" key does not exist
# in this frame and raised a KeyError.
is_mapt = protein_df["accession_num"].isin(mapt_only)
mapt_df = protein_df[is_mapt]
mapt_df

In [None]:
# Dump the MAPT-only sequences into a single FASTA file in the working directory.
# Each record is written without its final character
# (presumably a trailing newline in the stored FASTA text — TODO confirm).
mapt_fasta_file = "mapt_only.faa"
with open(mapt_fasta_file, "w") as f:
    f.writelines(sequence[:-1] for sequence in mapt_df["Sequence"])