# Script for downloading data as TSV from the UniProt database

In [1]:
# Example of the query string sent to UniProt (kept for reference only):
# QUERY = '"antibiotic resistance" taxonomy:"[562]"'

# Base address that the query parameters are appended to.
UNIPROT_URL = "https://www.uniprot.org/uniprot/"

# File name referenced only by the commented-out single-query example.
OUTPUT_FILE = "uniprot_data.tsv"

# Columns requested from UniProt, in the order they are sent.
FIELDS = ["id", "entry name", "protein names", "genes(PREFERRED)", "reviewed"]

In [7]:
# Comma-separated column list, in the form used for the "columns" URL parameter.
_fields = ",".join(FIELDS)
# Earlier single-query prototype, kept commented out for reference:
# _url= (
#     f"{UNIPROT_URL}"
#     f"?query={QUERY}"
#     f"&format=tab"
#     f"&columns={_fields}"
# )
# _file = f"./data/{OUTPUT_FILE}"

In [8]:
import os
from urllib.parse import urlencode, urlsplit

import requests

def download_file(url, filename=None, timeout=60):
    """
    Stream the content at ``url`` into a local file.

    Args:
        url: Address to fetch with an HTTP GET request.
        filename: Destination path. When omitted, the last segment of the
            URL path is used, relative to the current directory.
        timeout: Seconds to wait on the server before giving up; passed
            unchanged to ``requests.get``.

    Returns:
        The path the response body was written to.

    Raises:
        ValueError: If ``filename`` is omitted and the URL path has no
            final segment that could serve as a name.
        requests.HTTPError: If the server answers with an error status.
    """
    if filename is None:
        # Use only the path component so the query string and fragment
        # never end up in the file name.
        filename = os.path.basename(urlsplit(url).path)
        if not filename:
            raise ValueError(f"cannot derive a file name from URL: {url!r}")

    with requests.get(url, stream=True, timeout=timeout) as r:
        # Check the status before touching the disk so a failed request
        # does not leave an empty file behind.
        r.raise_for_status()
        # open() does not create missing directories.
        parent = os.path.dirname(filename)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return filename

In [9]:
def download_antibiotics_resistance_by_taxonomy(taxonomy_id, folder_path="./data"):
    """
    Download the UniProt "antibiotic resistance" entries for one taxon.

    The result is requested in tab format with the columns listed in
    ``FIELDS`` and saved as ``<folder_path>/<taxonomy_id>.tsv``, with spaces
    in the taxonomy id replaced by underscores.

    Args:
        taxonomy_id: Value placed inside ``taxonomy:"[...]"`` in the query.
            It is converted with ``str()`` for the file name, so both
            numbers and strings are accepted.
        folder_path: Directory the file is written to.
    """
    _query = f'"antibiotic resistance" taxonomy:"[{taxonomy_id}]"'
    # Percent-encode every parameter so that quotes, spaces, brackets or an
    # "&" inside a value cannot change the structure of the URL.
    # The column list is built from FIELDS here rather than read from a
    # separately computed global, so it cannot go stale.
    _params = urlencode({
        "query": _query,
        "format": "tab",
        "columns": ",".join(FIELDS),
    })
    _url = f"{UNIPROT_URL}?{_params}"

    _tax_id = str(taxonomy_id).replace(' ', '_')
    _file = f"{folder_path}/{_tax_id}.tsv"
    download_file(_url, _file)

In [10]:
# Query by taxon name instead of a numeric id: the string is inserted into the
# query as-is, and the output file becomes ./data/Mycoplasma_pneumoniae_M129.tsv.
download_antibiotics_resistance_by_taxonomy('Mycoplasma pneumoniae M129')

In [13]:
# Query by numeric taxonomy id; the result is saved as ./data/226185.tsv.
download_antibiotics_resistance_by_taxonomy(226185)

In [14]:
import pandas as pd

# Load the tab-separated file written by the download for taxon 226185
# and preview its first rows.
df = pd.read_csv('./data/226185.tsv', sep='\t')
df.head()

Unnamed: 0,Entry,Entry name,Protein names,Gene names (primary ),Status
0,Q06893,VANB_ENTFA,Vancomycin B-type resistance protein VanB (EC ...,vanB,reviewed
1,Q47744,VANR_ENTFA,Regulatory protein VanRB,vanRB,reviewed
2,Q47745,VANS_ENTFA,Sensor protein VanSB (EC 2.7.13.3) (Vancomycin...,vanSB,reviewed
3,Q47746,VANY_ENTFA,"D-alanyl-D-alanine carboxypeptidase (D,D-carbo...",vanYB,reviewed
4,Q831R1,UPPP1_ENTFA,Undecaprenyl-diphosphatase (EC 3.6.1.27) (Baci...,uppP,reviewed


In [None]:
# Load a tab-separated taxonomy table from disk. Nothing in this notebook
# creates this file, so it must already exist.
# NOTE(review): presumably a UniProt taxonomy export for taxon 562 - confirm.
taxon_df = pd.read_csv('./data/taxonomy-ancestor_562.tab', sep='\t')

In [9]:
# Preview the first rows of the taxonomy table.
taxon_df.head()

Unnamed: 0,Taxon,Mnemonic,Scientific name,Common name,Synonym,Other Names,Reviewed,Rank,Lineage,Parent,Virus hosts
0,1005443,,Escherichia coli 0.1197,,,,,,Bacteria; Proteobacteria; Gammaproteobacteria;...,562,
1,1005442,,Escherichia coli 0.1288,,,,annotated,,Bacteria; Proteobacteria; Gammaproteobacteria;...,562,
2,1005441,,Escherichia coli 0.1304,,,,annotated,,Bacteria; Proteobacteria; Gammaproteobacteria;...,562,
3,1005566,,Escherichia coli 07798,,,,,,Bacteria; Proteobacteria; Gammaproteobacteria;...,562,
4,1081888,,Escherichia coli 08BKT055439,,,Escherichia coli str. 08BKT055439; Escherichia...,,,Bacteria; Proteobacteria; Gammaproteobacteria;...,562,


In [None]:
# Same name-based download as in the earlier cell; running it again
# overwrites ./data/Mycoplasma_pneumoniae_M129.tsv.
download_antibiotics_resistance_by_taxonomy('Mycoplasma pneumoniae M129')

In [11]:
import time

# Download one result file per distinct value in the "Taxon" column.
taxons = taxon_df.Taxon.unique()

# open() does not create missing directories, so make sure the target
# folder exists before the first download.
os.makedirs("./data/562_ancestors", exist_ok=True)

# Taxa whose download raised a requests error, kept for a later retry.
failed_taxons = []

for t in taxons:
    try:
        download_antibiotics_resistance_by_taxonomy(
            t, folder_path="./data/562_ancestors"
        )
    except requests.RequestException as err:
        # One bad response should not abort the rest of the batch.
        failed_taxons.append(t)
        print(f"Download failed for taxon {t}: {err}")
    # Pause between requests (presumably to limit load on the server).
    time.sleep(5)