In [4]:
import requests
import json
import urllib.parse

In [48]:
# Use Ensembl to find orthologues
# TODO: add ability to query multiple genes, or a list of genes related to some disease
# OR all genes in a subset of species
server = "https://rest.ensembl.org"
ext = "/homology/id/ENSG00000157764?"

response = requests.get(server+ext, headers={"Content-Type": "application/json"})
data = json.loads(response.content)
# data is a dictionary with only one key, which is "data"
# data["data"] is list with only one element
# data["data"][0] is a dictionary with the following keys:
#  "id", and the value is just a string with the gene ID (e.g. "ENSG00000157764")
# "homologies", and the value is a list with 227 elements, iterating over the homologues (orthologues, paralogues)

In [49]:
print(len(data["data"][0]["homologies"]))
print(data["data"][0]["homologies"][0]['source'])

227
{'id': 'ENSG00000157764', 'perc_id': 18.1462, 'species': 'homo_sapiens', 'taxon_id': 9606, 'cigar_line': '58M2D78M3D2MDM15D10M9D2M5D6M4DM20D2M3DMD3M5DM2DM18D2M5D2M3D4MD12M3D3M7D2M15DM9D2M5D3M12D2MD2M4D2M4D5M2D12M3DM4D3M7D3M3D5M3DM2DM2D3MD8MD5M2D4M8DM6D3M9D2M2DM3D3M2D3M4D14M3D4MD3M6D4M4D5MD2M3D8M3D6M13D3MD6M2D5M15D5MD7MD4M6D20MD25MD4M6D5M3D11MD11MD6M2D13M3D41M5D12M5D6M3DMD8MD5M2D12M2D19M5D27M2D16MD4M3D9M4D16M2D16M2D15MD14M4D9MD6M3D16MD6MD4M2D2MDMD7M3DM8D3M5DM5DM8DM5DMD2M4DM4DM3D2MDMDM7D3M2DM8D4M3DMD5M3DM2D8M4D4MDM5D3M4D4M2DMD4M2D', 'protein_id': 'ENSP00000493543', 'perc_pos': 28.7206, 'align_seq': 'MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQ--EHIEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTVTSSSSSSLSVLPSSLS---VF-Q---------------NPTDVARSNP---------KS-----PQKPIV----R--------------------VF---L-PNK-----Q--R------------------TV-----VP---ARCG-VTVRDSLKKALM---MRG-------LI---------------P---------EC-----CAV------------YR-IQ----DG----EKKPI--GWDTDISWLTGE---E----LHV-

In [50]:
species_of_interest = ['mus_musculus', 'homo_sapiens']
orthologues = []
for homology in data["data"][0]["homologies"]:
    if homology["target"]["species"] in species_of_interest:  # orthologues within the species of interest
        target_gene = homology["target"]["id"]
        target_species = homology["target"]["species"]
        target_orthology = homology["type"]
        orthologues.append((target_gene, target_species, target_orthology))
    else:
        continue
print(orthologues)
print(len(orthologues))

[('ENSG00000116783', 'homo_sapiens', 'other_paralog'), ('ENSG00000104312', 'homo_sapiens', 'other_paralog'), ('ENSG00000172680', 'homo_sapiens', 'other_paralog'), ('ENSG00000129465', 'homo_sapiens', 'other_paralog'), ('ENSG00000078061', 'homo_sapiens', 'other_paralog'), ('ENSG00000182541', 'homo_sapiens', 'other_paralog'), ('ENSG00000006432', 'homo_sapiens', 'other_paralog'), ('ENSG00000188906', 'homo_sapiens', 'other_paralog'), ('ENSG00000166333', 'homo_sapiens', 'other_paralog'), ('ENSG00000173327', 'homo_sapiens', 'other_paralog'), ('ENSG00000070759', 'homo_sapiens', 'other_paralog'), ('ENSG00000168404', 'homo_sapiens', 'other_paralog'), ('ENSG00000107140', 'homo_sapiens', 'other_paralog'), ('ENSG00000091436', 'homo_sapiens', 'other_paralog'), ('ENSG00000137275', 'homo_sapiens', 'other_paralog'), ('ENSG00000139625', 'homo_sapiens', 'other_paralog'), ('ENSG00000130758', 'homo_sapiens', 'other_paralog'), ('ENSG00000073803', 'homo_sapiens', 'other_paralog'), ('ENSG00000143674', 'homo_s

In [None]:
# Use UniProt to find the proteins produced by the orthologues
proteins = []
for orthologue in orthologues:
    uniprot_url = f"https://rest.uniprot.org/uniprotkb/stream?compressed=false&format=json&query=%28{orthologue[1]}%20{orthologue[0]}%29"
    response = requests.get(uniprot_url)
    # convert response.content from bytes to dictionary
    data = json.loads(response.content)
    print(data["results"])
    # If the data is empty, skip it
    # data is a dictionary with only one key, which is "results"
    # data["results"] is a list with 3 elements, one for each protein
    # data["results"][0] is a dictionary with these entries: 'entryType', 'primaryAccession', 'uniProtkbId', 'entryAudit', 'annotationScore', 'organism', 'proteinExistence', 'proteinDescription', 'genes', 'comments', 'features', 'keywords', 'references', 'uniProtKBCrossReferences', 'sequence', 'extraAttributes'
    if(len(data["results"]) == 0):
        continue
    elif(orthologue[1] in species_of_interest):
    # check if organism match with the orthologue input, print both
    # print(data["results"][0]["organism"]["scientificName"], orthologue[1])
    # print(data["results"][0]["primaryAccession"], data["results"][0]["uniProtkbId"])
        print(orthologue[0], orthologue[1])
        protein_species = (data["results"][0]["primaryAccession"], data["results"][0]["organism"]["taxonId"])
        proteins.append(protein_species)

print(proteins)

In [None]:
# Set the URL for the API endpoint and the parameters for the request
url = "https://reactome.org/ContentService/data/pathways/top/9606"
params = {
    "species": "Homo sapiens",
    "page": 1,
    "pageSize": 50,
}

# Make a GET request to the API endpoint and retrieve the response
response = requests.get(url, params=params)

# Check if the request was successful
if response.status_code == 200:
    # Extract the data from the response
    data = response.json()
    print(len(data))
    for pathway in range(0, len(data)):
        print(data[pathway]['name'])

    # Print the names of all pathways in the response
    #for pathway in data["pathways"]:
    #    print(pathway["name"])
else:
    # Print an error message if the request was unsuccessful
    print(f"Request failed with status code {response.status_code}: {response.text}")
    

In [85]:
# Define a dictionary mapping species names to KEGG organism codes
species_codes = {'mus_musculus': 'mmu', 'homo_sapiens': 'hsa'}

for protein in proteins:
    # Set the base URL for the KEGG API
    base_url = 'http://rest.kegg.jp/'

    # Set the protein ID and species name
    protein_id = 'Q9Z2X5'
    species_name = 'mus_musculus'

    species_code = species_codes.get(species_name)

    # Build the URL for the API query
    url = f'{base_url}get/{species_code}:{protein_id}/pathway'

    # Make the API request
    response = requests.get(url)

    # Print the pathways
    if response.status_code == 200:
        print(response.text)
    else:
        print('Error:', response.status_code)

Error: 400
Error: 400
Error: 400
Error: 400
Error: 400


KeyboardInterrupt: 