In [1]:
import os
from Bio import Entrez
from Bio import SeqIO

In [4]:
UWnet_ID = "keku" # replace this with your own UW NetID; it is used to build the contact e-mail for Entrez

In [5]:
# Contact address for the Entrez requests made below (per the Biopython docs it is
# sent along with each request so NCBI can identify the caller).
Entrez.email = f'{UWnet_ID}@uw.edu'

In [6]:
# Search the NCBI protein database for human VEGFA entries. idtype="acc" asks for
# accession.version identifiers and retmax caps how many identifiers are returned.
stream = Entrez.esearch(db="protein", idtype="acc", term="VEGFA AND Homo sapiens[Orgn]", retmax=10000)

In [7]:
# Parse the search result into a dict-like record, then release the network
# connection: the stream is fully consumed by the parser and is not reused.
record = Entrez.read(stream)
stream.close()

In [8]:
# Total number of hits reported by the search; Entrez returns it as a string.
record["Count"]

'312'

In [9]:
# Accession.version identifiers of the hits (requested with idtype="acc" in the search).
record["IdList"]

['BHE53204.1', 'BHE53203.1', 'BHE53202.1', 'NP_112177.2', 'NP_005470.2', 'NP_001316057.1', 'NP_005525.2', 'Q14135.5', 'P15692.3', 'Q9BZL6.3', 'P35916.3', 'P17252.4', 'Q15139.2', 'P17948.2', 'P53667.3', 'P02788.6', 'Q9BUL8.1', 'P35613.2', 'P05771.4', 'P35968.2', 'P54851.1', 'P41159.1', 'P01106.2', 'B0L3A2.1', 'Q96Q45.2', 'Q96AV8.3', 'Q7Z5L9.2', 'Q69YL0.1', 'O14786.3', 'A0AVK6.1', 'O00548.2', 'Q5VWQ8.2', 'Q9Y653.2', 'P49591.3', 'Q9NYA1.1', 'P80188.2', 'P35625.2', 'P08034.1', 'P0DME0.2', 'P41250.3', 'Q9Y3V2.4', 'Q9H8L6.2', 'P39060.5', 'Q99828.4', 'P42336.2', 'Q9C0D0.3', 'Q9Y2X7.2', 'Q9NQQ7.2', 'Q8IXJ6.2', 'P49763.2', 'O15431.1', 'Q9Y5L2.1', 'P27701.1', 'P21741.1', 'P08962.2', 'NP_004111.2', 'NP_057288.1', 'NP_003217.4', 'NP_001401407.1', 'NP_001401408.1', 'NP_001401403.1', 'NP_001401405.1', 'NP_001401402.1', 'NP_001401406.1', 'NP_001401404.1', 'NP_001401401.1', 'NP_001398004.1', 'NP_001381326.1', 'NP_001268917.1', 'NP_653170.3', 'NP_001381327.1', 'NP_001381328.1', 'NP_001369580.1', 'NP_00

In [11]:
#len(record["IdList"])

In [12]:
# Collapse the accession list into the single comma-separated string that is
# passed to efetch as its `id` argument.
idlist = ",".join(record["IdList"])

In [13]:
#idlist

In [14]:
# Download the GenBank-format ("gb") text records for every accession in idlist.
stream = Entrez.efetch(db="protein", id=idlist, rettype="gb", retmode="text")

In [16]:
# Parse the GenBank text into sequence records. Per the Biopython docs, SeqIO.parse
# returns a one-shot iterator that reads from `stream` on demand, so it has no
# len() and can only be consumed once.
records = SeqIO.parse(stream, "gb")

In [17]:
#print(len(records))

In [21]:
output_filename = "output.fasta"
# `records` is parsed lazily from the efetch network stream, so the stream has to
# stay open while SeqIO.write consumes it; close it as soon as the write is done
# (or has failed) so the connection is not leaked.
try:
    with open(output_filename, "w") as handle:
        SeqIO.write(records, handle, "fasta")
finally:
    stream.close()
# A BiopythonParserWarning ("Dropping bond qualifier in feature location") may pop up here; it can be ignored.

In [22]:
#for record in records:
    #print(f"{record.name}, length {len(record)}, with {len(record.features)} features")
#    print(f">{record.id}    {record.description}\n{record.seq}")

In [23]:
# Echo the FASTA file that was just written, with surrounding whitespace
# (including the trailing newline) removed from every line.
with open(output_filename, "r") as fasta_file:
    stripped_lines = [raw_line.strip() for raw_line in fasta_file]
for text in stripped_lines:
    print(text)

In [38]:
def create_FASTA(protein_name, species, count):
    """Create a FASTA file of a protein of interest in a particular species.

    Example: create_FASTA("VEGFA", "Homo sapiens", 20)

    The NCBI protein database is searched for "<protein_name> AND <species>",
    the GenBank records of the hits are downloaded, and their sequences are
    written to FASTA_files/<protein_name>_<species>_output.fasta (spaces and
    path separators in the names become underscores), relative to the current
    working directory. The written file is then printed to standard output.

    Args:
        protein_name: Protein name to search for; an accession ID also works.
        species: Species to combine with the protein name in the search term,
            e.g. "Homo sapiens".
        count: Maximum number of matches to request (passed to esearch as
            retmax).

    Returns:
        None when a file was written. When the search has no matches, a
        message string is returned instead and nothing is downloaded or
        written.
    """
    print("Genereating FASTA file...")
    search_stream = Entrez.esearch(db="protein", idtype="acc", term=f'{protein_name} AND {species}', retmax=count)
    try:
        record = Entrez.read(search_stream)
    finally:
        # The search response is fully parsed at this point; release the connection.
        search_stream.close()

    accessions = record["IdList"]
    n_found = len(accessions)

    print(f'The length of the generated FASTA file for {protein_name} in {species} is {n_found}.')
    if n_found == 0:
        # Bail out before efetch: an empty id list is not a valid fetch request,
        # so asking for it would fail instead of reporting "no results".
        return ">>>>> There are no results for the search terms you have provided! <<<<<"
    if n_found != count:
        print(f'Requested number of matches ({count}) is unavaialble, this is all you get.')

    def _filename_part(text):
        # Spaces become underscores; path separators are replaced as well so the
        # name cannot point outside (or into a missing subfolder of) FASTA_files.
        for char in (" ", "/", "\\"):
            text = text.replace(char, "_")
        return text

    folder_name = "FASTA_files"
    os.makedirs(folder_name, exist_ok=True)
    file_name = f'{_filename_part(protein_name)}_{_filename_part(species)}_output.fasta'
    output_filename = os.path.join(folder_name, file_name)

    fetch_stream = Entrez.efetch(db="protein", id=",".join(accessions), rettype="gb", retmode="text")
    print()
    print("FASTA file output: \n ----------------")
    try:
        # SeqIO.parse reads from the network stream lazily, so the stream must
        # stay open until SeqIO.write has consumed every record.
        records = SeqIO.parse(fetch_stream, "gb")
        with open(output_filename, "w") as handle:
            SeqIO.write(records, handle, "fasta")
    finally:
        fetch_stream.close()

    # Echo the file that was just written.
    with open(output_filename, "r") as f:
        for line in f:
            print(line.strip())
    

In [46]:
# create_FASTA writes a FASTA file into a folder called "FASTA_files", which it creates if needed
# in the current working directory (normally the folder this Jupyter Notebook is open in).

create_FASTA("Vascular endothelial growth factor A", "Homo sapiens", 3)

Genereating FASTA file...
The length of the generated FASTA file for Vascular endothelial growth factor A in Homo sapiens is 3.

FASTA file output: 
 ----------------
>BHE53204.1 vascular endothelial growth factor A, partial [Homo sapiens]
GQEEGASLRVSGTRFLTRKD
>BHE53203.1 vascular endothelial growth factor A, partial [Homo sapiens]
GQEEGASLRVSGTRSLTRKD
>BHE53202.1 vascular endothelial growth factor A, partial [Homo sapiens]
GQEEGASLRVSGTRSLTRKD
