In [1]:
# Import libraries
import requests
import os
from time import sleep
import pandas as pd

In [2]:
#------------------------------------------------------------------------------
#   Section with general functions
#------------------------------------------------------------------------------
def download(url, fileName):
    """Download ``url`` to ``fileName``, retrying up to 10 times.

    The response is streamed to disk in 8 KiB chunks, so large files can be
    fetched without holding them in memory. Any existing file called
    ``fileName`` is deleted before every attempt.

    Parameters
    ----------
    url : str
        Address to fetch with an HTTP GET request.
    fileName : str
        Path the response body is written to.

    Returns
    -------
    fileName : str or None
        ``fileName`` when the download succeeded. ``None`` when all attempts
        failed; in that case no partially written file is left behind.
    """
    attempts = 10
    secondsBetweenAttempts = 5

    def removeIfPresent(path):
        """Best-effort delete: a missing or undeletable file is not an error here."""
        try:
            os.remove(path)
        except OSError:
            pass

    for i in range(attempts):
        try:
            # Start every attempt from a clean slate.
            removeIfPresent(fileName)

            # Stream the body so the whole file never has to fit in memory.
            with requests.get(url, stream=True) as r:
                r.raise_for_status()
                with open(fileName, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:  # skip keep-alive chunks
                            f.write(chunk)
            return fileName
        except Exception as error:
            # ``Exception`` instead of a bare ``except`` so that Ctrl-C / a
            # kernel interrupt still stops a long-running download.
            print("Download", url, "failed:", i, "-", repr(error))
            if i < attempts - 1:
                # No point in waiting after the final attempt.
                sleep(secondsBetweenAttempts)

    # Every attempt failed: do not leave a truncated file that could be
    # mistaken for a complete download.
    removeIfPresent(fileName)
    return None
            
def uniprotRetrieve(fileName, query="",format="list",columns="",include="no",compress="no",limit=0,offset=0):
    """Download a file from UniProt for the given query parameters.

    If no parameters are given the function downloads a list of all protein
    IDs. More information about how the URL is constructed can be found on:
    https://www.uniprot.org/help/api%5Fqueries

    All values are URL-encoded before they are sent, so pass them as plain
    text (not pre-encoded). Spaces become "+", and the characters ``: , ( )``
    are kept literal.

    NOTE(review): UniProt moved its programmatic access to rest.uniprot.org
    in 2022 -- confirm that this legacy endpoint and these column names still
    answer before relying on the result.

    Parameters
    ----------
    fileName : str
        Name for the downloaded file.
    query : str (Default='')
        Query as you would type it in the web interface on
        https://www.uniprot.org/. If no query is provided, all protein entries
        are selected.
    format : str (Default='list')
        File format to retrieve from UniProt. Available formats are:
        html | tab | xls | fasta | gff | txt | xml | rdf | list | rss
    columns : str (Default='')
        Comma-separated column names to return for each entry when format
        tab or xls is selected.
    include : str (Default='no')
        Include isoform sequences when the format parameter is set to fasta.
        Include description of referenced data when the format parameter is set to rdf.
        This parameter is ignored for all other values of the format parameter.
    compress : str (Default='no')
        Download the file in gzipped compression format.
    limit : int (Default=0)
        Limit the number of results that is returned. 0 means download all.
    offset : int (Default=0)
        When the number of results is limited, offset determines where to start.

    Returns
    -------
    fileName : str
        Whatever ``download`` returns for the generated URL: the name of the
        downloaded file.
    """
    # Function-scope import keeps this cell runnable on its own.
    from urllib.parse import urlencode

    baseURL = "https://www.uniprot.org/uniprot/?"
    # A list of pairs fixes the parameter order in the generated URL.
    parameters = [
        ("query", query),
        ("format", format),
        ("columns", columns),
        ("include", include),
        ("compress", compress),
        ("limit", limit),
        ("offset", offset),
    ]
    # urlencode escapes characters such as "&", "#", "%" and "=" that would
    # otherwise split or truncate the query string. ``safe`` keeps the
    # punctuation UniProt queries and column names normally contain unescaped.
    URL = baseURL + urlencode(parameters, safe=":,()")
    return download(URL, fileName)

In [None]:
# Download organisms of all proteins in gammaproteobacteria
# Requests the organism, species and genus columns in tab format for every
# entry matching the taxonomy query and stores them in FILENAME.
QUERY='taxonomy:gammaproteobacteria'
FORMAT="tab"
COLUMNS="organism,lineage(SPECIES),lineage(GENUS)"
FILENAME="gammaproteobacteriaOrganisms.tab"
# FILENAME is rebound to the return value of uniprotRetrieve, which passes on
# the result of download(): the file name, or None when every attempt failed.
FILENAME = uniprotRetrieve(FILENAME,query=QUERY,format=FORMAT,columns=COLUMNS)

In [5]:
# Read in proteins
# NOTE(review): this reads the ".tab.bu" file, not the ".tab" file written by
# the download cell; no visible cell creates the ".bu" file -- presumably a
# manually made backup copy. TODO confirm and document how it is produced.
FILENAME="gammaproteobacteriaOrganisms.tab.bu"
PROTEINS= pd.read_csv(FILENAME,sep="\t")
# Bare last expression: display the loaded frame as the cell output.
PROTEINS

Unnamed: 0,Organism,Taxonomic lineage (SPECIES),Taxonomic lineage (GENUS)
0,Xanthomonas campestris pv. campestris (strain ...,Xanthomonas campestris,Xanthomonas
1,Alkalilimnicola ehrlichii (strain ATCC BAA-110...,Alkalilimnicola ehrlichii,Alkalilimnicola
2,Xanthomonas oryzae pv. oryzae (strain PXO99A),Xanthomonas oryzae,Xanthomonas
3,Shewanella pealeana (strain ATCC 700345 / ANG-...,Shewanella pealeana,Shewanella
4,Hydrogenovibrio crunogenus (strain XCL-2) (Thi...,Hydrogenovibrio crunogenus,Hydrogenovibrio
...,...,...,...
27597072,Brenneria alni,Brenneria alni,Brenneria
27597073,Alcanivorax sp. DG881,Alcanivorax sp. DG881,Alcanivorax
27597074,Salmonella typhimurium (strain SL1344),Salmonella enterica (Salmonella choleraesuis),Salmonella
27597075,Pseudomonas soli,Pseudomonas soli,Pseudomonas


In [9]:
# Report the number of distinct values per taxonomic level
# (a missing value, if present, counts as one distinct value).
for LVL in ("Organism", "Taxonomic lineage (SPECIES)", "Taxonomic lineage (GENUS)"):
    print(PROTEINS[LVL].nunique(dropna=False), LVL)

25663 Organism
20950 Taxonomic lineage (SPECIES)
455 Taxonomic lineage (GENUS)


In [10]:
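# Disabled draft of the export done in the last cell. It refers to `proteins`,
# while the frame loaded in this notebook is named PROTEINS, and it exports
# the "Organism" column rather than the species column.
#uniqueOrganisms = proteins["Organism"].unique()
#organisms = pd.DataFrame(uniqueOrganisms,columns=["Organism"])
#organisms.to_csv("organisms.tab",sep="\t")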


In [12]:
# Download organisms of all proteins in Enterobacterales
# Requests the organism, species and genus columns in tab format for every
# entry matching the taxonomy query and stores them in FILENAME.
QUERY='taxonomy:Enterobacterales'
FORMAT="tab"
COLUMNS="organism,lineage(SPECIES),lineage(GENUS)"
FILENAME="enterobacteralesProteins.tab"
# FILENAME is rebound to the return value of uniprotRetrieve, which passes on
# the result of download(): the file name, or None when every attempt failed.
FILENAME = uniprotRetrieve(FILENAME,query=QUERY,format=FORMAT,columns=COLUMNS)

In [15]:
# Read in proteins
# NOTE(review): this reads the ".tab.bu" file, not the ".tab" file written by
# the download cell; no visible cell creates the ".bu" file -- presumably a
# manually made backup copy. TODO confirm and document how it is produced.
# This rebinds PROTEINS, replacing the frame loaded earlier in the notebook.
FILENAME="enterobacteralesProteins.tab.bu"
PROTEINS= pd.read_csv(FILENAME,sep="\t")
# Bare last expression: display the loaded frame as the cell output.
PROTEINS

Unnamed: 0,Organism,Taxonomic lineage (SPECIES),Taxonomic lineage (GENUS)
0,Shigella boydii serotype 4 (strain Sb227),Shigella boydii,Shigella
1,Yersinia pestis bv. Antiqua (strain Antiqua),Yersinia pestis,Yersinia
2,Buchnera aphidicola subsp. Baizongia pistaciae...,Buchnera aphidicola,Buchnera (aphid P-endosymbionts)
3,Cronobacter sakazakii (strain ATCC BAA-894) (E...,Cronobacter sakazakii (Enterobacter sakazakii),Cronobacter
4,Klebsiella pneumoniae (strain 342),Klebsiella pneumoniae,Klebsiella
...,...,...,...
8878902,Shigella boydii serotype 18 (strain CDC 3083-9...,Shigella boydii,Shigella
8878903,Yokenella regensburgei ATCC 43003,Yokenella regensburgei,Yokenella
8878904,Serratia sp. DD3,Serratia sp. DD3,Serratia
8878905,Brenneria alni,Brenneria alni,Brenneria


In [16]:
# Report the number of distinct values per taxonomic level
# (a missing value, if present, counts as one distinct value).
for LVL in ("Organism", "Taxonomic lineage (SPECIES)", "Taxonomic lineage (GENUS)"):
    print(PROTEINS[LVL].nunique(dropna=False), LVL)

6127 Organism
3102 Taxonomic lineage (SPECIES)
99 Taxonomic lineage (GENUS)


In [17]:
# Export the distinct species names of the currently loaded PROTEINS frame.
# NOTE(review): the values come from the species column but are written under
# the header "Organism" -- presumably what the consumer of organisms.tab
# expects; confirm before renaming.
uniqueOrganisms = PROTEINS["Taxonomic lineage (SPECIES)"].unique()
organisms = pd.DataFrame(uniqueOrganisms,columns=["Organism"])
# Tab-separated output; the row index is written as well (to_csv default).
organisms.to_csv("organisms.tab",sep="\t")