# In this notebook, I use UniProt's RESTful API to access GO terms and EC numbers for specific proteins. This will be done through Python's requests library.
## Author: Korede Ogundele
## Date Created: March 11, 2024
## Last Modified: March 22, 2024

In [1]:
import requests
import re
from bs4 import BeautifulSoup # for using XML response format

The function below uses requests.get to get protein data from UniProt's API, extracts the relevant information into named variables, then returns a tuple.

### function below does not work

In [None]:
def get_protein_info(protein_id):
    """
    This function uses UniProt's API to retrive information for specific proteins.

    Arguments
    ---------
    protein_id : str
        ID of protein whose info we want

    Returns
    -------
    protein_name : 
    go_terms : 
    ec_numbers : 

    """

    url = 'https://rest.uniprot.org/uniprotkb/stream?compressed=false&format=fasta&query=%28organism_id%3A2697049%29%20AND%20%28reviewed%3Atrue%29'

    # create a dictionary containing parameters for the UniProt API request.
    params = {
        "query": f"id:{protein_id}",
        "format": "json",
        "columns": "id,entry_name,go(molecular function),ec", # columns to retrieve
    }

    # send HTTP GET request to UniProt API
    all_fastas = requests.get(url, params=params).text


    # # convert response to Python dictionary w/ json method
    # data = response.json()

    # # extract relevant information
    # entry = data["entries"][0]
    # protein_name = entry["entry_name"]
    # go_terms = entry.get("go", {}).get("MF", [])
    # ec_numbers = entry.get("ec", [])

    # return protein_name, go_terms, ec_numbers
    

### The following two cells contain the provided example code from UniProt's api access instructions

In [None]:
url = "https://rest.uniprot.org/uniprotkb/search?groupBy=go&query=%28reviewed%3Atrue%29+AND+%28organism_id%3A9606%29"
all_fastas = requests.get(url).text

To get a single result, such as all all sequences with header mentioning SPIKE

In [None]:
fasta_list = re.split(r'\n(?=>)', all_fastas)
[fasta for fasta in fasta_list if 'SPIKE' in fasta]

# ~~~
# the following two cells work but outputs fastas of all proteins for an organism

In [None]:
def get_protein_info_for_organism(organism_id, reviewed_only=True):

    base_url = "https://rest.uniprot.org/uniprotkb/stream"
    
    # query parameters
    query_params = {
        "compressed": "false",
        "format": "fasta",
        "query": f"(organism_id:{organism_id})",    }
    

    # send HTTP GET request to UniProt API
    response = requests.get(base_url, params=query_params)
    
    # check if request was successful. print error otherwise
    if response.status_code == 200:
        return response.text
    else:
        print(f"Error accessing UniProt API. Status code: {response.status_code}")
        return None

In [None]:
organism_id = 2697049  # human
protein_info = get_protein_info_for_organism(organism_id)

protein_info

# ~~~ 

# WORKING ON FUNCTION BELOW 
### i've got go and ec urls. now figure out how to extract info for particular proteins from the soup.

In [14]:
def get_protein_info(protein_id):
  """
  This function retrieves protein information from UniProt's REST API based on protein ID.

  Arguments
  ---------
  protein_id : str
    protein ID

  Returns
  -------
  protein_info : dictionary
    dictionary containing protein information (GO, EC, etc.)
  """

  # base_url = "https://rest.uniprot.org/uniprotkb/search?query=human&fields=accession,xref_proteomes"
  go_url = "https://rest.uniprot.org/uniprotkb/search?groupBy=go&query=%28reviewed%3Atrue%29+AND+%28organism_id%3A9606%29"
  #ec_url = "https://rest.uniprot.org/uniprotkb/search?groupBy=ec&query=%28reviewed%3Atrue%29+AND+%28organism_id%3A9606%29"

  # build query string with protein ID
  #url = f"{base_url}/{protein_id}"
  
  try:
    # send get request to UniProt API
    response = requests.get(go_url)
    response.raise_for_status()  # Raise error for any status code other than 200
    print("*Status raise passed")

    # change response to string
    response_data = response.text

    # parse the XML response
    soup = BeautifulSoup(response_data, 'html.parser')
    print("*Parsing complete")

    # make dictionary for protein info
    protein_info = {}

    # get GO terms
    go_terms = []
    for go in soup.find_all('GO'):
      print("*Entering loop to find GO terms")
      go_terms.append(go.text.strip())
    protein_info["GO"] = go_terms

    # get EC numbers
    ec_numbers = []
    for ec in soup.find_all('ec'):
      ec_numbers.append(ec.text.strip())
      print("*Found EC number")
    protein_info["EC"] = ec_numbers

    return protein_info

  except requests.HTTPError as err:
    print(f"Error retrieving protein information: {err}")
    return None
  except Exception as e:
    print(f"Unexpected error parsing response: {e}")
    return None

Not every protein has EC and GO; make sure you test with one that does

In [15]:
protein_id = "25738459" 
protein_data = get_protein_info(protein_id)

if protein_data:
  print(f"Protein ID: {protein_id}")
  print(f"GO terms: {', '.join(protein_data['GO'])}")
  print(f"EC numbers: {', '.join(protein_data['EC'])}")
else:
  print("No protein information found.")

*Status raise passed
*Parsing complete
Protein ID: 25738459
GO terms: 
EC numbers: 


In [12]:
%%capture cap

print(protein_data)

# Save the captured output to a text file
with open('output.txt', 'w') as file:
    file.write(cap.stdout)