# Skrypt do pobierania danych z bazy UniProt w formacie TSV

In [1]:
# Base address the query string is appended to.
# NOTE(review): this looks like UniProt's legacy query endpoint - confirm it
# is still served before relying on the download cells below.
UNIPROT_URL = "https://www.uniprot.org/uniprot/"

# Search expression: the phrase "antibiotic resistance" restricted to
# taxonomy id 562.
QUERY = '"antibiotic resistance" taxonomy:"[562]"'

# Column identifiers requested from the service; they are joined with commas
# when the request URL is assembled.
FIELDS = ["id", "entry name", "protein names", "genes(PREFERRED)", "reviewed"]

# File name used for the table downloaded with QUERY.
OUTPUT_FILE = "uniprot_data.tsv"

In [2]:
from urllib.parse import quote, urlencode

# Comma-separated column list, kept under its original name because other
# cells read it.
_fields = ",".join(FIELDS)

# QUERY and the column names contain spaces, quotes, brackets and parentheses,
# so the query string is percent-encoded instead of being pasted in verbatim.
# quote_via=quote encodes spaces as %20 rather than "+".
_params = {
    "query": QUERY,
    "format": "tab",
    "columns": _fields,
}
_url = f"{UNIPROT_URL}?{urlencode(_params, quote_via=quote)}"

# Destination of the downloaded table.
_file = f"./data/{OUTPUT_FILE}"

In [None]:
import requests

def download_file(url, filename=None):
    """
    Stream the body of a GET request for ``url`` into a local file.

    The response is written in 8192-byte chunks so the whole body never has
    to be held in memory. The output file is opened only after the HTTP
    status has been checked, so a failed request does not leave a file behind.

    Parameters
    ----------
    url : str
        Address to download.
    filename : str or path-like, optional
        Destination path. When omitted, the last segment of the URL path is
        used as a file name relative to the current working directory.

    Raises
    ------
    ValueError
        If ``filename`` is omitted and the URL path has no final segment to
        use as a name. Raised before any request is sent.
    requests.HTTPError
        Propagated from ``raise_for_status()`` for an unsuccessful response.
    """
    if filename is None:
        # The signature advertises an optional filename, but open(None) would
        # only fail after the request had already been made; resolve the
        # name up front instead.
        import posixpath
        from urllib.parse import unquote, urlsplit

        filename = posixpath.basename(unquote(urlsplit(url).path))
        if not filename:
            raise ValueError(
                "filename was not given and cannot be derived from the URL"
            )

    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(filename, "wb") as out:
            for chunk in response.iter_content(chunk_size=8192):
                out.write(chunk)

In [11]:
def download_antibiotics_resistance_by_taxonomy(taxonomy_id, folder_path="./data"):
    """
    Download the "antibiotic resistance" entries for one taxonomy id.

    Queries ``UNIPROT_URL`` for ``"antibiotic resistance"`` restricted to
    ``taxonomy_id``, requesting the columns listed in ``FIELDS`` in ``tab``
    format, and saves the response as ``<folder_path>/<taxonomy_id>.tsv``.

    Parameters
    ----------
    taxonomy_id : int or str
        Taxonomy identifier; interpolated into the query and used as the
        stem of the output file name.
    folder_path : str or path-like, optional
        Directory that receives the file. It is created (including parents)
        when it does not exist yet.

    Raises
    ------
    Whatever ``download_file`` raises for a failed request or write.
    """
    from pathlib import Path
    from urllib.parse import quote, urlencode

    params = {
        "query": f'"antibiotic resistance" taxonomy:"[{taxonomy_id}]"',
        "format": "tab",
        # Built from FIELDS directly so the function does not depend on a
        # scratch variable defined in another cell.
        "columns": ",".join(FIELDS),
    }
    # The query contains spaces, quotes and brackets; percent-encode it
    # rather than interpolating it raw into the URL.
    url = f"{UNIPROT_URL}?{urlencode(params, quote_via=quote)}"

    folder = Path(folder_path)
    folder.mkdir(parents=True, exist_ok=True)
    target = folder / f"{taxonomy_id}.tsv"
    download_file(url, str(target))

In [4]:
# Fetch the table for the default query and store it at the prepared path.
download_file(url=_url, filename=_file)

In [5]:
import pandas as pd

# Load the tab-separated file written by the download cell and preview it.
df = pd.read_csv(_file, delimiter="\t")
df.head()

Unnamed: 0,Entry,Entry name,Protein names,Gene names (primary ),Status
0,P0A9U4,YBIT_ECOL6,Probable ATP-binding protein YbiT,ybiT,reviewed
1,A0A0H2VBH0,YHES_ECOL6,Probable ATP-binding protein YheS,yheS,reviewed
2,P33941,YOJI_ECOLI,ABC transporter ATP-binding/permease protein YojI,yojI,reviewed
3,Q0TD49,UPPP_ECOL5,Undecaprenyl-diphosphatase (EC 3.6.1.27) (Baci...,uppP,reviewed
4,P0A8V2,RPOB_ECOLI,DNA-directed RNA polymerase subunit beta (RNAP...,rpoB,reviewed


In [7]:
# Read the tab-separated taxonomy table kept in the data folder.
taxon_df = pd.read_csv(
    "./data/taxonomy-ancestor_562.tab",
    delimiter="\t",
)

In [8]:
# Preview the first five rows of the taxonomy table.
taxon_df.head(n=5)

Unnamed: 0,Taxon,Mnemonic,Scientific name,Common name,Synonym,Other Names,Reviewed,Rank,Lineage,Parent,Virus hosts
0,1005443,,Escherichia coli 0.1197,,,,,,Bacteria; Proteobacteria; Gammaproteobacteria;...,562,
1,1005442,,Escherichia coli 0.1288,,,,annotated,,Bacteria; Proteobacteria; Gammaproteobacteria;...,562,
2,1005441,,Escherichia coli 0.1304,,,,annotated,,Bacteria; Proteobacteria; Gammaproteobacteria;...,562,
3,1005566,,Escherichia coli 07798,,,,,,Bacteria; Proteobacteria; Gammaproteobacteria;...,562,
4,1081888,,Escherichia coli 08BKT055439,,,Escherichia coli str. 08BKT055439; Escherichia...,,,Bacteria; Proteobacteria; Gammaproteobacteria;...,562,


In [12]:
# Run the per-taxon download for taxonomy id 562 into the default folder.
download_antibiotics_resistance_by_taxonomy(taxonomy_id=562)