In [4]:
import re
from io import StringIO
import time
import pandas as pd
import requests
import jsonpath_ng as jp



 ## Acceso a Uniprot API

In [5]:


# Define the base URL for the Uniprot API
base_url = "https://rest.uniprot.org/uniprotkb/search"
# Define the parameters for the API request
params = {
  # TP53 in mammals (taxonomy ID 40674)
  "query": "(reviewed:true) AND (taxonomy_id:40674) AND (gene:TP53)",
  "format": "tsv",
  "fields": "accession,id,sequence,protein_name,organism_name,go",
  "size": 10
}

# Make the API request
response = requests.get(base_url, params=params, timeout=30)

response.raise_for_status()



 ### Extract and print protein information

In [6]:
data = pd.read_csv(
  StringIO(response.content.decode("utf-8")),
  sep="\t",
  header=0
)

print(data)

    Entry Entry Name                                           Sequence  \
0  P04637  P53_HUMAN  MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLS...   
1  P67939  P53_BOVIN  MEESQAELNVEPPLSQETFSDLWNLLPENNLLSSELSAPVDDLLPY...   
2  P10361    P53_RAT  MEDSQSDMSIELPLSQETFSCLWKLLPPDDILPTTATGSPNSMEDL...   
3  P02340  P53_MOUSE  MTAMEESQSDISLELPLSQETFSGLWKLLPPEDILPSPHCMDDLLL...   
4  Q64662  P53_OTOBE  DLWNLLPENNVLSPVLSPPMDDLLLSSEDVENWFDKGPDEALQMSA...   
5  Q95330  P53_RABIT  MEESQSDLSLEPPLSQETFSDLWKLLPENNLLTTSLNPPVDDLLSA...   
6  Q9WUR6  P53_CAVPO  MEEPHSDLSIEPPLSQETFSDLWKLLPENNVLSDSLSPPMDHLLLS...   
7  O36006  P53_MARMO  MEEAQSDLSIEPPLSQETFSDLWNLLPENNVLSPVLSPPMDDLLLS...   
8  P13481  P53_CHLAE  MEEPQSDPSIEPPLSQETFSDLWKLLPENNVLSPLPSQAVDDLMLS...   
9  Q29480  P53_EQUAS  YSPALNKMFCQLAKTCPVYLRISSPPPPGTRVRAMAIYKKSEFMTE...   

                                       Protein names  \
0  Cellular tumor antigen p53 (Antigen NY-CO-13) ...   
1  Cellular tumor antigen p53 (Tumor suppressor p53)   
2  Cel



 ### Obtener el resto de los resultados

In [9]:
response.headers["link"]

'<https://rest.uniprot.org/uniprotkb/search?format=tsv&fields=accession,id,sequence,protein_name,organism_name,go&query=%28reviewed%3Atrue%29%20AND%20%28taxonomy_id%3A40674%29%20AND%20%28gene%3ATP53%29&cursor=1le4n4faph6gnbg3cp612wro081sdyrzf247&size=10>; rel="next"'

In [10]:
pattern = re.compile(".*<(.+)>.*$")

match = re.match(pattern, response.headers["link"])

if match:
  next_page_url = match.group(1)
  print(next_page_url)
else:
  print("No match found in the link header.")

https://rest.uniprot.org/uniprotkb/search?format=tsv&fields=accession,id,sequence,protein_name,organism_name,go&query=%28reviewed%3Atrue%29%20AND%20%28taxonomy_id%3A40674%29%20AND%20%28gene%3ATP53%29&cursor=1le4n4faph6gnbg3cp612wro081sdyrzf247&size=10


In [11]:


response = requests.get(next_page_url, timeout=30)

response.raise_for_status()
data = pd.read_csv(
  StringIO(response.content.decode("utf-8")),
  sep="\t",
  header=0
)
print(data)

    Entry Entry Name                                           Sequence  \
0  P67938  P53_BOSIN  MEESQAELNVEPPLSQETFSDLWNLLPENNLLSSELSAPVDDLLPY...   
1  Q00366  P53_MESAU  MEEPQSDLSIELPLSQETFSDLWKLLPPNNVLSTLPSSDSIEELFL...   
2  P51664  P53_SHEEP  MEESQAELGVEPPLSQETFSDLWNLLPENNLLSSELSAPVDDLLPY...   
3  O09185  P53_CRIGR  MEEPQSDLSIELPLSQETFSDLWKLLPPNNVLSTLPSSDSIEELFL...   
4  Q8SPZ3  P53_DELLE  MEESQAELGVEPPLSQETFSDLWKLLPENNLLSSELSPAVDDLLLS...   
5  Q9TTA1  P53_TUPBE  MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLS...   
6  P61260  P53_MACFU  MEEPQSDPSIEPPLSQETFSDLWKLLPENNVLSPLPSQAVDDLMLS...   
7  P41685  P53_FELCA  MQEPPLELTIEPPLSQETFSELWNLLPENNVLSSELSSAMNELPLS...   
8  P56424  P53_MACMU  MEEPQSDPSIEPPLSQETFSDLWKLLPENNVLSPLPSQAVDDLMLS...   
9  P79892  P53_HORSE  PAVNNLLLSPDVVNWLDEGPDEAPRMPAAPAPLAPAPATSWPLSSF...   

                                       Protein names  \
0  Cellular tumor antigen p53 (Tumor suppressor p53)   
1  Cellular tumor antigen p53 (Tumor suppressor p53)   
2  Cel



 ### Campos disponibles
 | Campo                   | Descripción                           |
 |-------------------------|---------------------------------------|
 | accession               | Acceso principal de la entrada        |
 | id                      | Nombre del entry (como `INSR_HUMAN`)  |
 | protein_name            | Nombre de la proteína                 |
 | gene_primary            | Nombre principal del gen              |
 | gene_synonym            | Sinónimos del gen                     |
 | organism_name           | Nombre del organismo                  |
 | organism_id             | ID taxonómico (NCBI) del organismo    |
 | length                  | Longitud de la secuencia              |
 | sequence                | Secuencia de aminoácidos completa     |
 | function / cc_function  | Función biológica anotada             |
 | cc_subcellular_location | Localización subcelular               |
 | cc_pathway              | Rutas metabólicas en las que participa|
 | go                      | Anotaciones Gene Ontology (todas)     |
 | xref_pdb                | Referencias a estructuras 3D (PDB)    |
 | xref_geneid             | Enlace al NCBI Gene ID                |
 | xref_kegg               | Enlace a la base de datos KEGG        |
 | ft_binding              | Sitios de unión                       |
 | ft_active               | Sitios activos                        |
 | ft_domain               | Dominios anotados                     |

 Lista completa: [https://www.uniprot.org/help/return_fields](https://www.uniprot.org/help/return_fields)

In [12]:


base_url = "https://rest.uniprot.org/uniparc/search"
# Define the parameters for the API request
params = {
  # TP53 in mammals (taxonomy ID 40674)
  "query": "(taxonomy_id:40674) AND (gene:TP53)",
  "format": "json",
  "fields": "accession,sequence",
  "size": 10
}

# Make the API request
response = requests.get(base_url, params=params, timeout=30)

response.raise_for_status()

data = response.json()

In [15]:
import json
print(json.dumps(data, indent=2))

{
  "results": [
    {
      "uniParcId": "UPI00000E5DE2",
      "uniProtKBAccessions": [
        "Q920Y0"
      ],
      "sequence": {
        "value": "MEEPQSDLSIEPPLSQETFSDLWKLLPPKNLLSALEPMEDLLLPQDVTSWLGDADEALPVCTAPAEGPAPEAPAPAAPAPPASWPLSSFVPSHKTFQGNYGFRLGFLQSGTAKSVTCTYSPSLNKLFCQLAKTCPVQLWVSSAPPPGTRVRAMAIYKNSQHMTEVVRRCPHHERCSENEASDPRGRAPPQHLIRVEGNLHAEYVDDRQTFRHSVLVPYESPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDPSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKQRCPELPQGSAKRALPTNTSSSPQSKRKPADGEYFTLKIRGRKRFEVFRELNEALELKDAQAAGESGDGRAQASCLKTKKDKSTSPRKNPMIKREEPDSD"
      },
      "oldestCrossRefCreated": "2001-12-01",
      "mostRecentCrossRefUpdated": "2025-04-02"
    },
    {
      "uniParcId": "UPI00038EA708",
      "uniProtKBAccessions": [
        "A0A8C2YMX2"
      ],
      "sequence": {
        "value": "MEEPQSDLSIEPPLSQETFSDLWKLLPENNVLSNSLPSPMDDLLLCPEDVVNWLEENPDEDVQMSAAPVPEPPTPAAPAPAAPPPATSWPLSSSVPSHKPYRGNYGFRLGFLQSGTAKSVTCTYSPDLNKLFCQLAKTCPVQVWVESPPPPGTRVRALAIYKKSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLHAEYL

In [16]:


sequences = [x.value for x in jp.parse("$..sequence.value").find(data)]
print(sequences)

['MEEPQSDLSIEPPLSQETFSDLWKLLPPKNLLSALEPMEDLLLPQDVTSWLGDADEALPVCTAPAEGPAPEAPAPAAPAPPASWPLSSFVPSHKTFQGNYGFRLGFLQSGTAKSVTCTYSPSLNKLFCQLAKTCPVQLWVSSAPPPGTRVRAMAIYKNSQHMTEVVRRCPHHERCSENEASDPRGRAPPQHLIRVEGNLHAEYVDDRQTFRHSVLVPYESPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDPSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKQRCPELPQGSAKRALPTNTSSSPQSKRKPADGEYFTLKIRGRKRFEVFRELNEALELKDAQAAGESGDGRAQASCLKTKKDKSTSPRKNPMIKREEPDSD', 'MEEPQSDLSIEPPLSQETFSDLWKLLPENNVLSNSLPSPMDDLLLCPEDVVNWLEENPDEDVQMSAAPVPEPPTPAAPAPAAPPPATSWPLSSSVPSHKPYRGNYGFRLGFLQSGTAKSVTCTYSPDLNKLFCQLAKTCPVQVWVESPPPPGTRVRALAIYKKSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLHAEYLDDRTTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGKVLGRNSFEVRVCACPGRDRRTEEENFRKKGGSCPEPTPGSIKRALPTSTSSSPQPKKKPLDGEYFTLKIRGRERFEMFRELNEALELKDAQTEKEPGESRPHSSYLKSKKGQSTSCHKKPMFKREGPDSD', 'MEESQAELGVEPPLSQETFSDLWNLLPENNLLSSELSAPVDDLLPYSEDVATWLVECPNEAPQMPEPPAQAALPPATSWPLSSFVPSQKTYPGNYGFRLGFLHSGTAKSVTCTYSPSLNKLFCQLAKTCPVQLWVDSPPPPGTRVRAMAIYKKLEHMTEVVRRCPHHERSSDYSDGLAPPQHLIRVEGNLRAEYLDDRNTFRHSVVVPY

In [17]:


base_url = "https://rest.uniprot.org/uniref/search"
# Define the parameters for the API request
params = {
  "query": "(taxonomy_id:9606) AND (name:FUS) AND (identity:0.5)",
  "format": "json",
  "fields": "id,members",
  "size": 10
}

# Make the API request
response = requests.get(base_url, params=params, timeout=30)

response.raise_for_status()

data = response.json()
print(data)

{'results': [{'id': 'UniRef50_Q70T19', 'members': ['Q70T19', 'A0A452UEQ5']}, {'id': 'UniRef50_P35637', 'members': ['P35637', 'Q28009', 'Q6IBQ5', 'A0A8B7GAM2', 'A0A8B7GB87', 'A0A2K6B648', 'F7D056', 'A0A2K5M862', 'I7GH09', 'A0A096NHY1']}, {'id': 'UniRef50_Q16273', 'members': ['Q16273', 'UPI00174F7CC9']}, {'id': 'UniRef50_UPI0002643217', 'members': ['UPI0002643217', 'UPI00101E8685', 'UPI000C6FB2D4']}, {'id': 'UniRef50_Q70T17', 'members': ['Q70T17', 'L5KJG0']}, {'id': 'UniRef50_A0AAQ5BIK3', 'members': ['A0AAQ5BIK3', 'UPI001C05AFAA']}, {'id': 'UniRef50_A0A286NGJ4', 'members': ['A0A286NGJ4', 'A0A286NGJ7', 'A0A286NGJ5', 'A0A286NGJ8']}, {'id': 'UniRef50_B4DVJ7', 'members': ['B4DVJ7']}]}


In [18]:


sequences = [x.value for x in jp.parse("$..members").find(data)]
print(sequences)

[['Q70T19', 'A0A452UEQ5'], ['P35637', 'Q28009', 'Q6IBQ5', 'A0A8B7GAM2', 'A0A8B7GB87', 'A0A2K6B648', 'F7D056', 'A0A2K5M862', 'I7GH09', 'A0A096NHY1'], ['Q16273', 'UPI00174F7CC9'], ['UPI0002643217', 'UPI00101E8685', 'UPI000C6FB2D4'], ['Q70T17', 'L5KJG0'], ['A0AAQ5BIK3', 'UPI001C05AFAA'], ['A0A286NGJ4', 'A0A286NGJ7', 'A0A286NGJ5', 'A0A286NGJ8'], ['B4DVJ7']]




 ## Ejercicios


In [136]:
import json
from io import StringIO
import re

import pandas as pd
import matplotlib.pyplot as plt
import requests
import jsonpath_ng.ext as jp

def uniprot(
    query: str,
    fields: list[str],
    db:str = ["uniprotkb", "uniref", "uniparc"][0],
    format:str = "tsv"
):
    url = f"https://rest.uniprot.org/{db}/search"
    params = {
        "query": query,
        "fields": fields,
        "size": 100,
        "format": format
    }
    response = requests.get(url, params=params, timeout=10)
    response.raise_for_status()
    if format == "tsv":
        return pd.read_csv(
            StringIO(response.content.decode("utf8")),
            sep="\t"
        )
    if format == "json":
        return response.json()
    return response.content.decode("utf8")

def uniprot_stream(
    query: str,
    fields: list[str],
    db:str = ["uniprotkb", "uniref", "uniparc"][0],
    format:str = "tsv"
):
    url = f"https://rest.uniprot.org/{db}/search"
    params = {
        "query": query,
        "fields": fields,
        "size": 100,
        "format": format
    }
    loop_counter = 0
    all_results = []
    while url:
        loop_counter += 1
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        all_results.append(response.content.decode("utf8"))
        link = [x.value for x in jp.parse("$..Link").find(response.headers)]
        if len(link) > 0:
            link = re.sub("^<", "", link[0])
            url = re.sub(">;.+$", "", link)
            params = None
        else:
            url = None
    if format == "tsv":
        return pd.concat(
            [pd.read_csv(StringIO(j), sep="\t") for j in all_results]
        ).reset_index(drop=True)
    if format == "json":
        json_data = {}
        all_json = [json.loads(x) for x in all_json]
        for j in all_json:
            json_data.update(j)
        return json_data
    all_contents = "\n".join(all_results)
    return all_contents



 1. Recuperar de UniprotKB todas las proteínas del género *Caenorhabditis* que
    estén anotadas con el E.C. number 1.1.1.1 (alcohol dehidrogenasa).
    Recuperar los campos `accession`, `id`, `sequence`, `protein_name`,
    `organism_name`, `gene_name` y `reviewed`, en formato TSV.

In [17]:
query = '(taxonomy_name:"Caenorhabditis elegans") AND (ec:1.1.1.1)'
fields = [
    "accession",
    "id",
    "sequence",
    "protein_name",
    "organism_name",
    "gene_names",
    "reviewed"
]
ce_adh = uniprot(query, fields)
ce_adh

Unnamed: 0,Entry,Gene Names,Entry Name,Organism,Protein names,Reviewed,Sequence
0,A5JYX5,dhs-3 T02E1.5,DHS3_CAEEL,Caenorhabditis elegans,Protein dhs-3 (Alcohol dehydrogenase dhs-3) (E...,reviewed,MPYVFLLSPQLEIASQWDGYYEKTFEVSDHVHKEIILKVSGQTVLI...
1,Q17335,adh-5 H24K24.3,ADHX_CAEEL,Caenorhabditis elegans,Alcohol dehydrogenase class-3 (EC 1.1.1.1) (Al...,reviewed,MSSTAGQVINCKAAVAWSAKAPLSIETIQVAPPKAHEVRVKILYTA...
2,Q17334,sodh-1 K12G11.3,ADH1_CAEEL,Caenorhabditis elegans,Alcohol dehydrogenase 1 (EC 1.1.1.1) (Sorbitol...,reviewed,MTVELPSTQRALVFDTWNGPLEVRQVPVPSPADDEILVKIEYSGIC...
3,O45687,sodh-2 K12G11.4,ADH2_CAEEL,Caenorhabditis elegans,Alcohol dehydrogenase 2 (EC 1.1.1.1) (Sorbitol...,reviewed,MSSANIPATQSALIFEKYGGPLEVRQVSVPQPQENELLVKIEYSGI...
4,Q9UAT1,CELE_D2063.1 D2063.1,Q9UAT1_CAEEL,Caenorhabditis elegans,alcohol dehydrogenase (EC 1.1.1.1),unreviewed,MVSSDVPKTQRALIFESYGGPLEIKQLPIPQPNEDELLVKMEYSGI...




 2. Recuperar de UniprotKB todas las proteínas que interactúan
    con la proteína *BRCA1* en humanos (revisada). Recuperar además los campos
   `accession`, `id`, y las proteínas que interactúan con cada una de ellas.
    Generar una matriz de interacción, en formato TSV.

In [20]:
query = "(reviewed:true) AND (gene:BRCA1) AND (organism_id:9606)"
fields = [
    "accession",
    "id",
    "cc_interaction"
]
brca1_result = uniprot(query, fields)
display(brca1_result)

Unnamed: 0,Entry,Interacts with,Entry Name
0,P38398,Q6UWZ7; Q13085; Q92560; Q99728; Q7Z569; Q6PJG6...,BRCA1_HUMAN


In [39]:
brca1_acc = brca1_result.iloc[0]["Entry"]
interactors = brca1_result.iloc[0]["Interacts with"].split("; ")
print(brca1_acc)
len(interactors)

P38398


43

In [None]:
interactions = []
for i in interactors:
    interactions.append([brca1_acc, i])
    interactions.append([i, brca1_acc])

    query = f"(accession:{i}) AND (reviewed:true)"
    fields = ["cc_interaction"]
    result = uniprot(query, fields)
    if result.count().all() == 0:
        continue
    for j in result.iloc[0]["Interacts with"].split("; "):
        interactions.append([i, j])
        interactions.append([j, i])

In [None]:
interactions_df = (
    pd.DataFrame(interactions, columns=["from", "to"])
        .assign(value = 1)
        .drop_duplicates()
)
interactions_df

Unnamed: 0,from,to,value
0,P38398,Q6UWZ7,1
1,Q6UWZ7,P38398,1
4,Q6UWZ7,P46736,1
5,P46736,Q6UWZ7,1
6,Q6UWZ7,P13569,1
...,...,...,...
4041,P51660,P42858,1
4042,P42858,P29994,1
4043,P29994,P42858,1
4044,P42858,Q99KR7,1


In [None]:
interactions_df.pivot(index="from", values="value", columns="to").fillna(0)

to,A0A0C4DH22,A0A0S2Z3N6,A0A0S2Z4M1,A0A0S2Z4Q8,A0A0S2Z5G4,A0A384DVV8,A0A384NQ31,A0A3G5BIZ0,A0A6Q8PF08,A0AVK6,...,Q9Y5V3,Q9Y605,Q9Y639-1,Q9Y691,Q9Y6C2-2,Q9Y6H1,Q9Y6K9,Q9Y6M9,Q9Y6Q9,W5RWE1
from,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0A0C4DH22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0A0S2Z3N6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0A0S2Z4M1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0A0S2Z4Q8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0A0S2Z5G4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q9Y6H1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Q9Y6K9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Q9Y6M9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Q9Y6Q9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0




 3. Recuperar de swissprot todas las proteínas humanas cuya ubicación
    subcelular se la mitocondria y el peroxisoma (que contengan ambas
    anotaciones), en formato TSV.

In [78]:
# mithocondrion_term = "SL-0173"
mithocondrion_term = "Mitochondrion"
# peroxisoma_term = "SL-0204"
peroxisoma_term = "Peroxisome"
query = f"(organism_id:9606) AND (cc_scl_term:{mithocondrion_term}) AND (cc_scl_term:{peroxisoma_term}) AND (reviewed:true)"
fields = ["accession", "id", "protein_name"]

result = uniprot(query, fields)

result



Unnamed: 0,Entry,Entry Name,Protein names
0,O75521,ECI2_HUMAN,Enoyl-CoA delta isomerase 2 (EC 5.3.3.8) (DRS-...
1,Q6YN16,HSDL2_HUMAN,Hydroxysteroid dehydrogenase-like protein 2 (E...
2,Q9UHK6,AMACR_HUMAN,Alpha-methylacyl-CoA racemase (EC 5.1.99.4) (2...
3,A3KMH1,VWA8_HUMAN,von Willebrand factor A domain-containing prot...
4,Q13011,ECH1_HUMAN,"Delta(3,5)-Delta(2,4)-dienoyl-CoA isomerase, m..."
5,O95822,DCMC_HUMAN,"Malonyl-CoA decarboxylase, mitochondrial (MCD)..."
6,P30044,PRDX5_HUMAN,"Peroxiredoxin-5, mitochondrial (EC 1.11.1.24) ..."
7,P35914,HMGCL_HUMAN,"Hydroxymethylglutaryl-CoA lyase, mitochondrial..."
8,Q9NTK1,DEPP1_HUMAN,Protein DEPP1 (Decidual protein induced by pro...
9,Q709F0,ACD11_HUMAN,Acyl-CoA dehydrogenase family member 11 (ACAD-...




 4. Recuperar de UniprotKB todas DNA helicasas de bacterias del grupo
    Cyanobacteriota.
    Recuperar los el accession, el id, la secuencia, el nombre de la proteína,
    el nombre del organismo, el nombre del gen y la longitud de la secuencia.

In [None]:
# query = "(protein_name:DNA helicase) AND (taxonomy_id:1117) AND (reviewed:true)"
query = "(protein_name:DNA helicase) AND (taxonomy_id:1117)"
fields = [
    "accession",
    "id",
    "sequence",
    "protein_name",
    "organism_name",
    "gene_names",
    "length"
]
result = uniprot_stream(query, fields)
result

Unnamed: 0,Entry,Gene Names,Entry Name,Length,Organism,Protein names,Sequence
0,Q55418,dnaB slr0833,DNAB_SYNY3,872,Synechocystis sp. (strain ATCC 27184 / PCC 680...,Replicative DNA helicase DnaB (EC 5.6.2.3) (DN...,MAANPALPPQNIEAEECILGGILLDPEAMGRIIDLLVVDAFYVKAH...
1,P73421,recQ slr1536,RECQ_SYNY3,478,Synechocystis sp. (strain ATCC 27184 / PCC 680...,ATP-dependent DNA helicase RecQ (EC 5.6.2.4) (...,MADRQSLEEALRRIWGYDHFRYPQGEVIDCLLARRDCLVVLPTGGG...
2,Q55681,recG slr0020,RECG_SYNY3,831,Synechocystis sp. (strain ATCC 27184 / PCC 680...,ATP-dependent DNA helicase RecG (EC 5.6.2.4) (...,MQCSLVEVSVSVDWPRLQKALTVEVERGFQNLQGKQHRFGDFLCLS...
3,Q8YZA1,dnaB all0578,DNAB_NOSS1,879,Nostoc sp. (strain PCC 7120 / SAG 25.82 / UTEX...,Replicative DNA helicase DnaB (EC 5.6.2.3) (DN...,MAEELSFQGDGSNRLPPQNIEAEEAILGGILLDPEAIGRVSEVLIT...
4,P74397,priA sll0270,PRIA_SYNY3,831,Synechocystis sp. (strain ATCC 27184 / PCC 680...,ATP-dependent DNA helicase PriA (EC 5.6.2.4) (...,MTVSPSALAELGFNYQEEDSVRPWAAVLVDLPQNEGVYTYAIPPGL...
5,P52023,dnaN Synpcc7942_0001,DPO3B_SYNE7,375,Synechococcus elongatus (strain ATCC 33912 / P...,Beta sliding clamp (Beta clamp) (Sliding clamp...,MKLVCRQNELNTSLSLVSRAVPSRPNHPVLANVLLAADAGTQRLSL...
6,P72856,dnaN slr0965,DPO3B_SYNY3,391,Synechocystis sp. (strain ATCC 27184 / PCC 680...,Beta sliding clamp (Beta clamp) (Sliding clamp...,MKLICRQSDLSSGLSLVSRAVSSRPTHPVLGNVLLEADADKNYLRL...




 5. Recuperar de Uniref los grupos de proteínas al 50% de identidad que
    contengan, alguna de las isoformas humanas de la enzima
    phosphofructokinase. ¿Las isoformas humanas están en el mismo grupo?
    ¿Cuantos organismos diferentes hay en cada grupo?

In [118]:
query = "(organism_id:9606) AND (protein_name:phosphofructokinase) AND (reviewed:true)"
fields = ["accession"]

human_isoforms = uniprot(query, fields)["Entry"].to_list()
human_isoforms

['Q01813', 'P08237', 'P17858']

In [134]:
clusters = {}
for hi in human_isoforms:
    query = f"(uniprotkb:{hi}) AND (identity:0.5)"
    fields = ["members"]
    clusters[hi] = uniprot(query, fields, db="uniref")["Cluster members"].iloc[0].split("; ")

In [135]:
for ref, cluster in clusters.items():
    print(f"El cluster de {ref}:")
    for i in human_isoforms:
        if i in cluster:
            print(f"   -> Contiene a {i}")

El cluster de Q01813:
   -> Contiene a Q01813
El cluster de P08237:
   -> Contiene a P08237
   -> Contiene a P17858
El cluster de P17858:
   -> Contiene a P08237
   -> Contiene a P17858




 6. Recuperar de UniprotKb las anotaciones de Regiones, dominios y Zinc-finger
    de todas las proteínas del gene *FUS* (revisadas).
    Para cada entrada, generar un tabla que contenga estas anotaciones.

In [None]:
query = "(gene:FUS) AND (reviewed:true)"
fields = ["accession", "id", "ft_region", "ft_domain", "ft_zn_fing"]
result = uniprot(query, fields)
result


7. Recuperar los Uniprot AC de los miembros del clusters UniRef50 en el que se encuentra la proteía "HELI_NPVAC".
   Para cada miembro recuperar el organismo y el linaje taxonomico y si son parte de swissprot.

In [148]:
query = "(uniprotkb:HELI_NPVAC) AND (identity:0.5)"
fields = ["members"]
cluster_members = uniprot(query, fields, db ="uniref").iloc[0]["Cluster members"].split('; ')
cluster_members

['P24307',
 'Q83950',
 'A1YRD4',
 'Q8B9G3',
 'A0A4P7XN32',
 'A0A2L0WU16',
 'A8HT75',
 'A0A2Z6C5Y0',
 'A8W7F9',
 'Q1HH36']

In [None]:
all_resuts = []
for member in cluster_members:
    query = f"(accession:{member})"
    fields = ["accession", "reviewed", "lineage", "organism_name"]
    result = uniprot(query, fields)
    all_resuts.append(result)

all_resuts = pd.concat(all_resuts)
all_resuts

Unnamed: 0,Entry,Taxonomic lineage,Organism,Reviewed
0,P24307,"Viruses (superkingdom), Naldaviricetes (class)...",Autographa californica nuclear polyhedrosis vi...,reviewed
0,Q83950,"Viruses (superkingdom), Naldaviricetes (class)...",Orgyia pseudotsugata multicapsid polyhedrosis ...,reviewed
0,A1YRD4,"Viruses (superkingdom), Naldaviricetes (class)...",Maruca vitrata nucleopolyhedrovirus,unreviewed
0,Q8B9G3,"Viruses (superkingdom), Naldaviricetes (class)...",Rachiplusia ou multiple nucleopolyhedrovirus (...,unreviewed
0,A0A4P7XN32,"Viruses (superkingdom), Naldaviricetes (class)...",Troides aeacus nucleopolyhedrovirus,unreviewed
0,A0A2L0WU16,"Viruses (superkingdom), Naldaviricetes (class)...",Oxyplax ochracea nucleopolyhedrovirus,unreviewed
0,A8HT75,"Viruses (superkingdom), Naldaviricetes (class)...",Samia cynthia nucleopolyhedrovirus,unreviewed
0,A0A2Z6C5Y0,"Viruses (superkingdom), Naldaviricetes (class)...",Antheraea proylei nucleopolyhedrovirus,unreviewed
0,A8W7F9,"Viruses (superkingdom), Naldaviricetes (class)...",Antheraea pernyi nuclear polyhedrosis virus (A...,unreviewed
0,Q1HH36,"Viruses (superkingdom), Naldaviricetes (class)...",Antheraea pernyi nuclear polyhedrosis virus (A...,unreviewed




 ## Uniprot Mapping ID API

In [19]:
base_url = "https://rest.uniprot.org/idmapping/run"

params = {
  "from": "UniProtKB_AC-ID",
  "to": "PDB",
  "ids": "P12345,Q8N158"
}

response = requests.post(base_url, data=params, timeout=30)
response.raise_for_status()

job_id = response.json()["jobId"]

In [20]:
print(job_id)

yHqIxcdk1s


In [21]:


# Check the status of the job
status_url = f"https://rest.uniprot.org/idmapping/status/{job_id}"
while True:
  time.sleep(2)
  status_response = requests.get(status_url, timeout=30, allow_redirects=False)
  status_response.raise_for_status()
  status_data = status_response.json()
  if "jobStatus" not in status_data:
    print("Job status not found in response.")
    print(status_data)
    break
  if status_data["jobStatus"] == "FINISHED":
    break

In [22]:


# Retrieve the results
results_url = f"https://rest.uniprot.org/idmapping/results/{job_id}"
results_response = requests.get(results_url, timeout=30)
results_response.raise_for_status()

# Parse and display the results
results_data = results_response.json()
for result in results_data["results"]:
  print(f"From: {result['from']} -> To: {result['to']}")

From: Q8N158 -> To: 6WJL
From: Q8N158 -> To: 7T62


In [24]:
print(json.dumps(results_data, indent=2))

{
  "results": [
    {
      "from": "Q8N158",
      "to": "6WJL"
    },
    {
      "from": "Q8N158",
      "to": "7T62"
    }
  ],
  "failedIds": [
    "P12345"
  ]
}




 ## Biomart Service

In [42]:
url = 'http://www.ensembl.org/biomart/martservice'

query = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query  virtualSchemaName = "default" formatter = "FASTA" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >
			
	<Dataset name = "hsapiens_gene_ensembl" interface = "default" >
		<Filter name = "entrezgene_id" value = "7157"/>
		<Attribute name = "ensembl_gene_id" />
		<Attribute name = "transcript_exon_intron" />
	</Dataset>
</Query>"""

query = query.replace("\n", "").replace("\t", "")

params = {
  "query": query
}

response = requests.get(url, params, timeout=30)

# print(response.request.url)

response.raise_for_status()

data = response.content.decode("utf-8")


In [44]:
print(data)

>ENSG00000141510
TGAGGCCAGGAGATGGAGGCTGCAGTGAGCTGTGATCACACCACTGTGCTCCAGCCTGAG
TGACAGAGCAAGACCCTATCTCAAAAAAAAAAAAAAAAAAGAAAAGCTCCTGAGGTGTAG
ACGCCAACTCTCTCTAGCTCGCTAGTGGGTTGCAGGAGGTGCTTACGCATGTTTGTTTCT
TTGCTGCCGTCTTCCAGTTGCTTTATCTGTTCACTTGTGCCCTGACTTTCAACTCTGTCT
CCTTCCTCTTCCTACAGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGA
CCTGCCCTGTGCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCA
TGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATG
AGCGCTGCTCAGATAGCGATGGTGAGCAGCTGGGGCTGGAGAGACGACAGGGCTGGTTGC
CCAGGGTCCCCAGGCCTCTGATTCCTCACTGATTGCTCTTAGGTCTGGCCCCTCCTCAGC
ATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTC
GACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTCTGGTTTGCAACTGGGGTCTCTG
GGAGGAGGGGTTAAGGGTGGTTGTCAGTGGCCCTCCAGGTGAGCAGTAGGGGGGCTTTCT
CCTGCTGCTTATTTGACCTCCCTATAACCCCATGAGATGTGCAAAGTAAATGGGTTTAAC
TATTGCACAGTTGAAAAAACTGAAGCTTACAGAGGCTAAGGGCCTCCCCTGCTTGGCTGG
GCGCAGTGGCTCATGCCTGTAATCCCAGCACTTTGGGAGGCCAAGGCAGGCGGATCACGA
GGTTGGGAGATCGAGACCATCCTGGCTAACGGTGAAACCCCGTCTCTACTGAAAAATACA
AAAAAAA

In [28]:
pd.read_csv(
  StringIO(data),
  sep="\t",
  header=None
)

Unnamed: 0,0,1,2,3
0,ENSG00000142208,ENSG00000142208.19,ENST00000554581,ENST00000554581.5
1,ENSG00000142208,ENSG00000142208.19,ENST00000407796,ENST00000407796.7
2,ENSG00000142208,ENSG00000142208.19,ENST00000649815,ENST00000649815.2
3,ENSG00000142208,ENSG00000142208.19,ENST00000349310,ENST00000349310.7
4,ENSG00000142208,ENSG00000142208.19,ENST00000714123,ENST00000714123.1
5,ENSG00000142208,ENSG00000142208.19,ENST00000553506,ENST00000553506.5
6,ENSG00000142208,ENSG00000142208.19,ENST00000557552,ENST00000557552.1
7,ENSG00000142208,ENSG00000142208.19,ENST00000402615,ENST00000402615.6
8,ENSG00000142208,ENSG00000142208.19,ENST00000714130,ENST00000714130.1
9,ENSG00000142208,ENSG00000142208.19,ENST00000555528,ENST00000555528.5




 ## Ejercicios Uniprot Mapping y Biomart



 1. Recuperar todas las todos los identificadores de UniProtKB de la proteína
    codificada en el gen *AKT1* en humanos, Mus musculus, Drosophila
    melanogaster, Danio rerio y Sus scrofa. Primero recuperar los Gene IDs de
    NCBI Gene y luego mapearlos a UniProtKB, usando la API de Uniprot Mapping.


In [155]:

ESearch = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
def eSearch(term: str, db: str, retstart=0, retmax=20) -> dict:
    params = {
        "retmode": "json",
        "db":db,
        "term": term,
        "retstart": retstart,
        "retmax": retmax
    }
    response = requests.get(ESearch, params=params, timeout=10)
    response.raise_for_status()
    return response.json()

In [161]:
organisms = [
    "Homo sapiens",
    "Mus musculus",
    "Drosophila melanogaster",
    "Danio rerio",
    "Sus scrofa"
]

organism_gene_ids = {}
for org  in organisms:
    response = eSearch(f'AKT1[Gene Name] AND "{org}"[Organism]', db ="gene")
    ids = [x.value for x in jp.parse("$..idlist[*]").find(response)]
    organism_gene_ids[org] = ids[0]

organism_gene_ids



{'Homo sapiens': '207',
 'Mus musculus': '11651',
 'Drosophila melanogaster': '41957',
 'Danio rerio': '101910198',
 'Sus scrofa': '100126861'}

In [189]:
def id_mapping_run(ids: str, idfrom: str, idto: str):
    url = "https://rest.uniprot.org/idmapping/run"
    params = {
        "ids": ids,
        "from": idfrom,
        "to": idto
    }
    response = requests.post(url, data=params, timeout=10)
    response.raise_for_status()
    return jp.parse("$..jobId").find(response.json())[0].value

def id_mapping_status(jobId:str):
    url = f"https://rest.uniprot.org/idmapping/status/{jobId}"
    response = requests.get(url, timeout=10, allow_redirects=False)
    response.raise_for_status()
    return jp.parse("$..jobStatus").find(response.json())[0].value == "FINISHED"

def id_mapping_results(jobId:str):
    url = f"https://rest.uniprot.org/idmapping/results/{jobId}"
    params = {
        "format" : "tsv"
    }
    response = requests.get(url, params=params, timeout=10, allow_redirects=False)
    response.raise_for_status()
    return pd.read_csv(
        StringIO(response.content.decode("utf8")),
        sep="\t"
    )


In [191]:
jobIds = []
for org, gene_id in organism_gene_ids.items():
    jobId = id_mapping_run(
        ids = gene_id,
        idfrom = 'GeneID',
        idto= "UniProtKB"
    )
    jobIds.append((org, jobId))

print(jobIds)

[('Homo sapiens', 'onG2lp9VEG'), ('Mus musculus', 'lNGiRFAMQP'), ('Drosophila melanogaster', 'wD7fWdEed7'), ('Danio rerio', 'drMC2ccaVp'), ('Sus scrofa', 'wNCWCYwN1J')]


In [192]:
all_results = []
finished = 0
while finished < len(jobIds):
    for org, jobId in jobIds:
        if id_mapping_status(jobId):
            df = id_mapping_results(jobId)
            df = df.assign(organism = org)
            all_results.append(df)
            finished +=1

In [194]:
pd.concat(all_results)

Unnamed: 0,From,To,organism
0,207,P31749,Homo sapiens
1,207,B0LPE5,Homo sapiens
2,207,B3KVH4,Homo sapiens
0,11651,P31750,Mus musculus
0,41957,Q8INB9,Drosophila melanogaster
1,41957,A0A0B4LIA3,Drosophila melanogaster
0,101910198,M4MD44,Danio rerio
0,100126861,A0A4X1TYM4,Sus scrofa
1,100126861,C1PIG3,Sus scrofa
2,100126861,G9BWQ1,Sus scrofa



 2. Repetir el ejercicio anterior, pero usando la API de Biomart. Comparar los
    resultados obtenidos con ambos métodos.





 ### Formatos de salida
 | Formato                              | Tipo MIME                   | Ext.  |
 |--------------------------------------|-----------------------------|------ |
 | JavaScript Object Notation (JSON)    | application/json            | json  |
 | Extensible Markup Language (XML)     | application/xml             | xml   |
 | Text file representation             | text/plain; format=flatfile | txt   |
 | List of one or more IDs              | text/plain; format=list     | list  |
 | Tab-Separated-Values                 | text/plain; format=tsv      | tsv   |
 | FASTA: a text-based                  | text/plain; format=fasta    | fasta |
 | Genomic Feature Format (GFF)         | text/plain; format=gff      | gff   |
 | Open Biomedical Ontologies (OBO)     | text/plain; format=obo      | obo   |
 | Resource Description Framework (RDF) | application/rdf+xml         | rdf   |
 | Excel                                | application/vnd.ms-excel    | xlsx  |

In [None]:
from io import StringIO
import pandas as pd
import requests

BIOMART = "https://www.ensembl.org/biomart/martservice"

datasets =[
    ("hsapiens_gene_ensembl", 207),
    ("sscrofa_gene_ensembl", 100126861),
    ("dmelanogaster_gene_ensembl", 41957),
    ("drerio_gene_ensembl",101910198),
    ("mmusculus_gene_ensembl", 11651)
]

all_results = []
for ds, i in datasets:
    query = f"""<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query  virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >
<Dataset name = "{ds}" interface = "default" >
<Filter name = "entrezgene_id" value = "{i}"/>
<Attribute name = "uniprotswissprot" />
<Attribute name = "uniprotsptrembl" />
</Dataset>
</Query>""".replace("\n", "")
    response = requests.get(BIOMART, params={"query": query}, timeout=30)
    response.raise_for_status()
    content = response.content.decode("utf8")
    if not content:
        continue
    data = pd.read_csv(StringIO(content), sep="\t", header=None)
    data.columns = ["Accession", "ID"]
    data = data.assign(GeneId=i)
    all_results.append(data)

P31749	B0LPE5
P31749	B0LPE5
P31749	B0LPE5
P31749	B0LPE5
	
P31749	B0LPE5
	
P31749	B0LPE5
P31749	B0LPE5
	G3V3X1
	A0A804HJM6
	G3V2I6
	A0A087WY56
P31749	B0LPE5

	A0A287AL02
	A0A287B2S5
	K7GNW4
	G9BWQ1
	A0A4X1TYM4

	A0A0B4LIA3
	A0A0B4LIA3
	A0A0B4LIA3
	A0A0B4LIA3
	A0A0B4LIA3
	A0A0B4LIA3
	A0A0B4LIA3
	A0A0B4LIA3
	A0A0B4LIA3
	A0A0B4LIA3
	A0A0B4LIA3
	A0A0B4LIA3
Q8INB9	H1ZY63
Q8INB9	H1ZY64
Q8INB9	H1ZY63
Q8INB9	H1ZY64
Q8INB9	H1ZY63
Q8INB9	H1ZY64
Q8INB9	H1ZY63
Q8INB9	H1ZY64
Q8INB9	H1ZY63
Q8INB9	H1ZY64
Q8INB9	H1ZY63
Q8INB9	H1ZY64
	A0A0B4LIA3
	A0A0B4LIA3
	A0A0B4LIA3
	A0A0B4LIA3
	A0A0B4LIA3
	A0A0B4LIA3
	A0A0B4LIA3
	A0A0B4LIA3
	A0A0B4LIA3
	A0A0B4LIA3
	A0A0B4LIA3
	A0A0B4LIA3


P31750	
	D3Z783
	D3YXX3
	D3YYP9



In [25]:
final_results = pd.concat(
    all_results
)
final_results = final_results.drop_duplicates()
display(final_results)

Unnamed: 0,Accession,ID,GeneId
0,P31749,B0LPE5,207
4,,,207
9,,G3V3X1,207
10,,A0A804HJM6,207
11,,G3V2I6,207
12,,A0A087WY56,207
0,,A0A287AL02,100126861
1,,A0A287B2S5,100126861
2,,K7GNW4,100126861
3,,G9BWQ1,100126861


 3. Recuperar las secuencias de todos los exones de todos los transcriptos del
    gen *TP53* humano.

In [27]:
query = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query  virtualSchemaName = "default" formatter = "FASTA" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >
<Dataset name = "hsapiens_gene_ensembl" interface = "default" >
<Filter name = "entrezgene_id" value = "7157"/>
<Attribute name = "gene_exon" />
<Attribute name = "external_gene_name" />
<Attribute name = "ensembl_exon_id" />
<Attribute name = "exon_chrom_start" />
<Attribute name = "exon_chrom_end" />
<Attribute name = "strand" />
</Dataset>
</Query>
""".replace("\n", "")

response = requests.get(BIOMART, params={"query": query}, timeout=30)
response.raise_for_status()
content = response.content.decode("utf8")
print(content)



>TP53|ENSE00002337729|7676521|7676594|-1
ATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCA
GACCTATGGAAACT
>TP53|ENSE00003545950|7670609|7670715|-1
ATCCGTGGGCGTGAGCGCTTCGAGATGTTCCGAGAGCTGAATGAGGCCTTGGAACTCAAG
GATGCCCAGGCTGGGAAGGAGCCAGGGGGGAGCAGGGCTCACTCCAG
>TP53|ENSE00003625790|7675994|7676272|-1
TCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAA
TGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCCC
GTGGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCC
CTGTCATCTTCTGTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGC
TTCTTGCATTCTGGGACAGCCAAGTCTGTGACTTGCACG
>TP53|ENSE00003725258|7673701|7673837|-1
TGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGAGA
CCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGCCCCC
AGGGAGCACTAAGCGAG
>TP53|ENSE00003752869|7676521|7676622|-1
CAGCCAGACTGCCTTCCGGGTCACTGCCATGGAGGAGCCGCAGTCAGATCCTAGCGTCGA
GCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACT
>TP53|ENSE00003786593|7673535|7673608|-1
CACTGCCCAACAACACCAGCTCCTCTCCCCAGCCAAAGAAG

4. Recuperar los nombres de todos los genes humanos que están asociados con el
   término GO "apoptosis" (GO:0006915), junto con sus identificadores NCBI Gene ID
   y el Gene symbol, utilizando la API de Biomart.



In [32]:
query = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query  virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >
<Dataset name = "hsapiens_gene_ensembl" interface = "default" >
<Filter name = "go_parent_term" value = "GO:0006915"/>
<Attribute name = "external_gene_name" />
<Attribute name = "entrezgene_id" />
<Attribute name = "hgnc_symbol" />
</Dataset>
</Query>""".replace("\n", "")

response = requests.get(BIOMART, params={"query": query}, timeout=90)
response.raise_for_status()
content = response.content.decode("utf8")
print(content)

RYBP	23429	RYBP
RYBP	23429	RYBP
SLC25A6	293	SLC25A6
BCL2L14	79370	BCL2L14
BCL2L14	79370	BCL2L14
BCL2L14	79370	BCL2L14
BCL2L14	79370	BCL2L14
BCL2L14	79370	BCL2L14
BCL2L14	79370	BCL2L14
BCL2L14	79370	BCL2L14
BCL2L14	79370	BCL2L14
BCL2L14	79370	BCL2L14
BCL2L14	79370	BCL2L14
BCL2L14	79370	BCL2L14
BCL2L14	79370	BCL2L14
BCL2L14	79370	BCL2L14
ECSCR	641700	ECSCR
RRP1B	23076	RRP1B
POU4F1	5457	POU4F1
RNF152	220441	RNF152
RNF152	220441	RNF152
BRSK2	9024	BRSK2
BRSK2	9024	BRSK2
BRSK2	9024	BRSK2
BRSK2	9024	BRSK2
BRSK2	9024	BRSK2
BRSK2	9024	BRSK2
BRSK2	9024	BRSK2
FOXO1	2308	FOXO1
FOXO1	2308	FOXO1
PHLPP1	23239	PHLPP1
PIM3	415116	PIM3
FAM3B	54097	FAM3B
FAM3B	54097	FAM3B
FAM3B	54097	FAM3B
GAS6	2621	GAS6
GAS6	2621	GAS6
GAS6	2621	GAS6
SLC25A6	293	SLC25A6
ADORA2A	135	ADORA2A
ADORA2A	135	ADORA2A
ADORA2A	135	ADORA2A
ADORA2A	135	ADORA2A
STK24	8428	STK24
STK24	8428	STK24
MAPK1	5594	MAPK1
MAPK1	5594	MAPK1
MAPK1	5594	MAPK1
MAPK1	5594	MAPK1
MAPK1	5594	MAPK1
MAPK1	5594	MAPK1
TMTC4	84899	TMTC4
TNFRSF19	55504	TNFRSF

In [44]:
final_results = (
    pd.read_csv(
        StringIO(content),
        sep="\t",
        header=None
    )
        .drop_duplicates()
        .reset_index(drop=True)
        .set_axis(["External Gene Name", "NCBI Gene ID", "HGNC Symbol"], axis=1)
)
final_results

Unnamed: 0,External Gene Name,NCBI Gene ID,HGNC Symbol
0,RYBP,23429,RYBP
1,SLC25A6,293,SLC25A6
2,BCL2L14,79370,BCL2L14
3,ECSCR,641700,ECSCR
4,RRP1B,23076,RRP1B
...,...,...,...
943,CD5L,922,CD5L
944,STEAP3,55240,STEAP3
945,NHLH2,4808,NHLH2
946,PLK3,1263,PLK3
