In [1]:
# import libraries
import os
from time import sleep
from urllib.parse import urlencode

import pandas as pd
import requests
from tqdm.auto import tqdm

In [2]:
# Function to download from URL
def download(url, fileName, maxAttempts=10, timeout=None):
    """Download ``url`` to ``fileName``, retrying with a growing pause.

    The response is streamed to disk in chunks, so large files can be
    downloaded without holding them in memory.

    Parameters
    ----------
    url : str
        Address to download.
    fileName : str
        Path of the file to write. An existing file with this name is
        deleted before every attempt.
    maxAttempts : int (Default=10)
        Number of attempts before giving up.
    timeout : float or tuple or None (Default=None)
        Passed on to ``requests.get``. ``None`` means no timeout.

    Returns
    -------
    fileName : str or None
        ``fileName`` when the download succeeded, ``None`` when every
        attempt failed. Callers must check for ``None``; no exception is
        raised for a failed download.
    """
    for i in range(maxAttempts):
        try:
            # Delete existing files with filename (best effort: a missing
            # file is the normal case).
            try:
                os.remove(fileName)
            except OSError:
                pass

            # Stream the body so only one chunk is in memory at a time.
            with requests.get(url, stream=True, timeout=timeout) as r:
                r.raise_for_status()
                with open(fileName, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            return fileName
        except Exception as error:
            # `Exception` instead of a bare `except:` so that
            # KeyboardInterrupt / SystemExit still stop the notebook.

            # Do not leave a truncated file behind that could be mistaken
            # for a complete download.
            try:
                os.remove(fileName)
            except OSError:
                pass

            if i == maxAttempts - 1:
                # No point in sleeping when there is no attempt left.
                print("Download", url, "failed:", i, ",giving up:", repr(error))
                break

            w = 10*i
            print("Download", url, "failed:", i, ",wait for", w, "seconds:", repr(error))
            sleep(w)
    return None
            
def uniprotRetrieve(fileName, query="",format="list",columns="",include="no",compress="no",limit=0,offset=0):
    """Download a file from UniProt for the given query parameters.

    If no parameters are given the function will download a list of all the
    protein IDs. More information about how the URL should be constructed can
    be found on:
    https://www.uniprot.org/help/api%5Fqueries

    Parameters
    ----------
    fileName : str
        Name for the downloaded file.
    query : str (Default='')
        Query as you would type it in the web interface on
        https://www.uniprot.org/. If no query is provided, all protein entries
        are selected.
    format : str (Default='list')
        File format you want to retrieve from UniProt. Available formats are:
        html | tab | xls | fasta | gff | txt | xml | rdf | list | rss
    columns : str (Default='')
        Column information you want to know for each entry in the query
        when format tab or xls is selected.
    include : str (Default='no')
        Include isoform sequences when the format parameter is set to fasta.
        Include description of referenced data when the format parameter is set to rdf.
        This parameter is ignored for all other values of the format parameter.
    compress : str (Default='no')
        Download the file in gzipped compression format.
    limit : int (Default=0)
        Limit the number of results that is given. 0 means you download all.
    offset : int (Default=0)
        When you limit the number of results, offset determines where to start.

    Returns
    -------
    fileName : str
        Whatever ``download`` returns for the generated URL: the name of the
        downloaded file, or ``None`` when ``download`` gave up.
    """
    # NOTE(review): this is the URL layout described on the help page above;
    # confirm the endpoint is still served before a large run.
    baseURL = "https://www.uniprot.org/uniprot/?"

    # urlencode turns spaces into "+" (as the hand-written version did) and
    # additionally percent-escapes characters such as "&", "#", "+" and "="
    # that would otherwise cut the query short or change its meaning.
    parameters = urlencode({
        "query": query,
        "format": format,
        "columns": columns,
        "include": include,
        "compress": compress,
        "limit": limit,
        "offset": offset,
    })
    URL = baseURL + parameters
    return download(URL, fileName)

In [3]:
# Read in organism names
# Load the organism table (tab-separated, first column as index) and keep
# each distinct value of its "Organism" column once.
organisms = (
    pd.read_csv(
        "../2020-03-30.getOrganisms/organisms.tab",
        sep="\t",
        index_col=0,
    )
    .loc[:, "Organism"]
    .unique()
)
organisms

array(['Shigella boydii', 'Yersinia pestis', 'Buchnera aphidicola', ...,
       'Photorhabdus bodei', 'Klebsiella sp. OBRC7',
       'Serratia sp. 16ANAIII'], dtype=object)

In [None]:
SUCCES="succes.log"
FAILED="failed.log"
# Start both logs from scratch; the first line names the log.
for FILE in [SUCCES,FAILED]:
    with open(FILE,"w") as f:
        f.write(FILE.replace(".log",""))
        f.write("\n\n")

# The downloads are written into these folders; create them up front so a
# missing folder is not mistaken for a failing download.
for folder in ("cytoplasm", "periplasm"):
    os.makedirs(folder, exist_ok=True)

sleepTime = 5
for org in tqdm(organisms):
    try:
        sleep(sleepTime)

        fileNameBaseRaw="{0}".format(org)
        fileNameBase = fileNameBaseRaw.replace("/","").replace("(","").replace(")","").replace(" ","_")

        # Download protein of cytoplasm
        QUERY='(goa:(cytoplasm) OR goa:(cytosol)) NOT annotation:(type:signal) AND organism:"{0}"'.format(org)
        FORMAT='fasta'
        FILENAME1= "cytoplasm/{0}.{1}".format(fileNameBase,FORMAT)
        # A failed download is reported by a None return value, not by an
        # exception, so it has to be turned into one here.
        if uniprotRetrieve(FILENAME1,query=QUERY,format=FORMAT) is None:
            raise RuntimeError("cytoplasm download failed for {0}".format(org))

        sleep(sleepTime)

        # Download protein of Periplasm
        QUERY='annotation:(type:signal) goa:("periplasmic space") AND organism:"{0}"'.format(org)
        FORMAT='fasta'
        FILENAME2= "periplasm/{0}.{1}".format(fileNameBase,FORMAT)
        if uniprotRetrieve(FILENAME2,query=QUERY,format=FORMAT) is None:
            raise RuntimeError("periplasm download failed for {0}".format(org))

        # Log Succes
        with open(SUCCES,"a") as f:
            f.write(fileNameBaseRaw)
            f.write("\n")

        # let it wait less long
        sleepTime=max(5,sleepTime-1)
    except Exception:
        # `Exception` instead of a bare `except:` so interrupting the kernel
        # actually stops the loop instead of being logged as a failure.

        # let it wait longer
        sleepTime=min(60,sleepTime+5)

        # Log Fail (str() so a non-string organism value cannot crash the
        # error handler itself)
        with open(FAILED,"a") as f:
            f.write(str(org))
            f.write("\n")

HBox(children=(IntProgress(value=0, max=3102), HTML(value='')))