In [1]:
import requests
import json
import urllib.parse

In [4]:
import requests

# Define the API endpoint
api_url = 'http://mygene.info/v3/query'

# Define the search term
search_term = 'cardiovascular disease'

# Define the API parameters
params = {
    'q': search_term,
    'fields': 'symbol,name,summary',
    'species': 'human',
    'size': 10  # maximum number of results to return
}

# Send the GET request to the API and retrieve the response
response = requests.get(api_url, params=params)

# Check if the request was successful
if response.ok:
    # Retrieve the JSON data from the response
    data = response.json()
    
    
    # Extract gene information from the response
    for result in data['hits']:
        gene_symbol = result['symbol']
        gene_name = result['name']
        gene_summary = result['summary']
        
        # Print the gene information
        print('Symbol:', gene_symbol)
        print('Name:', gene_name)
        print('Summary:', gene_summary)
        print()
else:
    # Print an error message if the request was unsuccessful
    print('Error:', response.status_code, response.text)

Symbol: CDKN2B-AS1
Name: CDKN2B antisense RNA 1
Summary: This gene is located within the CDKN2B-CDKN2A gene cluster at chromosome 9p21. The gene product is a functional RNA molecule that interacts with polycomb repressive complex-1 (PRC1) and -2 (PRC2), leading to epigenetic silencing of other genes in this cluster. This region is a significant genetic susceptibility locus for cardiovascular disease, and has also been linked to a number of other pathologies, including several cancers, intracranial aneurysm, type-2 diabetes, periodontitis, Alzheimer's disease, endometriosis, frailty in the elderly, and glaucoma. Multiple alternatively processed transcript variants have been detected, some of which may take the form of circular RNA molecules (PMID:21151960). [provided by RefSeq, May 2014].

Symbol: MGP
Name: matrix Gla protein
Summary: This gene encodes a member of the osteocalcin/matrix Gla family of proteins. The encoded vitamin K-dependent protein is secreted by chondrocytes and vascu

In [2]:
# Use Ensembl to find orthologues
# TODO: add ability to query multiple genes, or a list of genes related to some disease
# OR all genes in a subset of species
server = "https://rest.ensembl.org"
ext = "/homology/id/ENSG00000157764?"

response = requests.get(server+ext, headers={"Content-Type": "application/json"})
data = json.loads(response.content)
# data is a dictionary with only one key, which is "data"
# data["data"] is list with only one element
# data["data"][0] is a dictionary with the following keys:
#  "id", and the value is just a string with the gene ID (e.g. "ENSG00000157764")
# "homologies", and the value is a list with 227 elements, iterating over the homologues (orthologues, paralogues)

In [3]:
print(len(data["data"][0]["homologies"]))
print(data["data"][0]["homologies"][0]['source'])

227
{'species': 'homo_sapiens', 'perc_id': 18.1462, 'taxon_id': 9606, 'id': 'ENSG00000157764', 'cigar_line': '58M2D78M3D2MDM15D10M9D2M5D6M4DM20D2M3DMD3M5DM2DM18D2M5D2M3D4MD12M3D3M7D2M15DM9D2M5D3M12D2MD2M4D2M4D5M2D12M3DM4D3M7D3M3D5M3DM2DM2D3MD8MD5M2D4M8DM6D3M9D2M2DM3D3M2D3M4D14M3D4MD3M6D4M4D5MD2M3D8M3D6M13D3MD6M2D5M15D5MD7MD4M6D20MD25MD4M6D5M3D11MD11MD6M2D13M3D41M5D12M5D6M3DMD8MD5M2D12M2D19M5D27M2D16MD4M3D9M4D16M2D16M2D15MD14M4D9MD6M3D16MD6MD4M2D2MDMD7M3DM8D3M5DM5DM8DM5DMD2M4DM4DM3D2MDMDM7D3M2DM8D4M3DMD5M3DM2D8M4D4MDM5D3M4D4M2DMD4M2D', 'perc_pos': 28.7206, 'protein_id': 'ENSP00000493543', 'align_seq': 'MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQ--EHIEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTVTSSSSSSLSVLPSSLS---VF-Q---------------NPTDVARSNP---------KS-----PQKPIV----R--------------------VF---L-PNK-----Q--R------------------TV-----VP---ARCG-VTVRDSLKKALM---MRG-------LI---------------P---------EC-----CAV------------YR-IQ----DG----EKKPI--GWDTDISWLTGE---E----LHV-

In [4]:
species_of_interest = ['mus_musculus', 'homo_sapiens']
orthologues = []
for homology in data["data"][0]["homologies"]:
    if homology["target"]["species"] in species_of_interest:  # orthologues within the species of interest
        target_gene = homology["target"]["id"]
        target_species = homology["target"]["species"]
        target_orthology = homology["type"]
        orthologues.append((target_gene, target_species, target_orthology))
    else:
        continue
print(orthologues)
print(len(orthologues))

[('ENSG00000116783', 'homo_sapiens', 'other_paralog'), ('ENSG00000104312', 'homo_sapiens', 'other_paralog'), ('ENSG00000172680', 'homo_sapiens', 'other_paralog'), ('ENSG00000129465', 'homo_sapiens', 'other_paralog'), ('ENSG00000078061', 'homo_sapiens', 'other_paralog'), ('ENSG00000182541', 'homo_sapiens', 'other_paralog'), ('ENSG00000006432', 'homo_sapiens', 'other_paralog'), ('ENSG00000188906', 'homo_sapiens', 'other_paralog'), ('ENSG00000166333', 'homo_sapiens', 'other_paralog'), ('ENSG00000173327', 'homo_sapiens', 'other_paralog'), ('ENSG00000070759', 'homo_sapiens', 'other_paralog'), ('ENSG00000168404', 'homo_sapiens', 'other_paralog'), ('ENSG00000107140', 'homo_sapiens', 'other_paralog'), ('ENSG00000091436', 'homo_sapiens', 'other_paralog'), ('ENSG00000137275', 'homo_sapiens', 'other_paralog'), ('ENSG00000139625', 'homo_sapiens', 'other_paralog'), ('ENSG00000130758', 'homo_sapiens', 'other_paralog'), ('ENSG00000073803', 'homo_sapiens', 'other_paralog'), ('ENSG00000143674', 'homo_s

In [29]:
# Use UniProt to find the proteins produced by the orthologues
proteins = []
for orthologue in orthologues:
    uniprot_url = f"https://rest.uniprot.org/uniprotkb/stream?compressed=false&format=json&query=%28{orthologue[1]}%20{orthologue[0]}%29"
    response = requests.get(uniprot_url)
    # convert response.content from bytes to dictionary
    data = json.loads(response.content)
    # If the data is empty, skip it
    # data is a dictionary with only one key, which is "results"
    # data["results"] is a list with 3 elements, one for each protein
    # data["results"][0] is a dictionary with these entries: 'entryType', 'primaryAccession', 'uniProtkbId', 'entryAudit', 'annotationScore', 'organism', 'proteinExistence', 'proteinDescription', 'genes', 'comments', 'features', 'keywords', 'references', 'uniProtKBCrossReferences', 'sequence', 'extraAttributes'
    if(len(data["results"]) == 0):
        continue
    elif(orthologue[1] in species_of_interest):
    # check if organism match with the orthologue input, print both
    # print(data["results"][0]["organism"]["scientificName"], orthologue[1])
    # print(data["results"][0]["primaryAccession"], data["results"][0]["uniProtkbId"])
        #print(orthologue[0], orthologue[1])
        print(data['results'][0]['comments'][5])
        print('============================')
        protein_species = (data["results"][0]["primaryAccession"], data["results"][0]["organism"]["taxonId"], data['results'][0]['genes'][0]['geneName']['value'])
        proteins.append(protein_species)

print(proteins)

{'commentType': 'INTERACTION', 'interactions': [{'interactantOne': {'uniProtKBAccession': 'Q59H18', 'intActId': 'EBI-704142'}, 'interactantTwo': {'uniProtKBAccession': 'Q9UKT5', 'geneName': 'FBXO4', 'intActId': 'EBI-960409'}, 'numberOfExperiments': 3, 'organismDiffer': False}, {'interactantOne': {'uniProtKBAccession': 'Q59H18', 'intActId': 'EBI-704142'}, 'interactantTwo': {'uniProtKBAccession': 'P19429', 'geneName': 'TNNI3', 'intActId': 'EBI-704146'}, 'numberOfExperiments': 2, 'organismDiffer': False}, {'interactantOne': {'uniProtKBAccession': 'Q59H18-2', 'intActId': 'EBI-10762055'}, 'interactantTwo': {'uniProtKBAccession': 'P19429', 'geneName': 'TNNI3', 'intActId': 'EBI-704146'}, 'numberOfExperiments': 2, 'organismDiffer': False}]}
{'commentType': 'INTERACTION', 'interactions': [{'interactantOne': {'uniProtKBAccession': 'O43353', 'intActId': 'EBI-358522'}, 'interactantTwo': {'uniProtKBAccession': 'Q13490', 'geneName': 'BIRC2', 'intActId': 'EBI-514538'}, 'numberOfExperiments': 3, 'orga

IndexError: list index out of range

In [None]:
# Set the URL for the API endpoint and the parameters for the request
url = "https://reactome.org/ContentService/data/pathways/top/9606"
params = {
    "species": "Homo sapiens",
    "page": 1,
    "pageSize": 50,
}

# Make a GET request to the API endpoint and retrieve the response
response = requests.get(url, params=params)

# Check if the request was successful
if response.status_code == 200:
    # Extract the data from the response
    data = response.json()
    print(len(data))
    for pathway in range(0, len(data)):
        print(data[pathway]['name'])

    # Print the names of all pathways in the response
    #for pathway in data["pathways"]:
    #    print(pathway["name"])
else:
    # Print an error message if the request was unsuccessful
    print(f"Request failed with status code {response.status_code}: {response.text}")
    

In [12]:
# Define a dictionary mapping species names to KEGG organism codes
species_codes = {10090: 'mmu', 9606: 'hsa'}

kegg_ids = []
for protein in proteins:
    # Define the base URL for the KEGG API
    base_url = "http://rest.kegg.jp/"

    # Define the search term (gene name)
    search_term = protein[2]

    # Construct the full URL for the API query
    url = base_url + "find/genes/" + search_term

    # Send the API request and get the response
    response = requests.get(url)

    # Parse the response to extract the KEGG IDs for the matching genes
    for line in response.text.split("\n"):
        if line.startswith("hsa:"):
            gene_id = line.split("\t")[0]
            kegg_ids.append(gene_id)

    # Print the list of KEGG IDs
    #print(kegg_ids)

print(len(kegg_ids))

376


In [139]:
pathways_dict = {}

# Iterate over the KEGG IDs and retrieve the pathways for each one
for kegg_id in kegg_ids:
    # Construct the full URL for the API query
    url = base_url + "link/pathway/" + kegg_id
    
    # Send the API request and get the response
    response = requests.get(url)
    
    # Parse the response to extract the pathways for the current KEGG ID
    pathways = []
    for line in response.text.split("\n"):
        if line.startswith("PATH"):
            pathway_name = line.split("\t")[1].strip()
            pathways.append(pathway_name)
    
    # Store the pathways for the current KEGG ID in the pathways_dict
    pathways_dict[kegg_id] = pathways

# Print the pathways for each KEGG ID
for kegg_id, pathways in pathways_dict.items():
    print(f"KEGG ID: {kegg_id}")
    print(f"Pathways: {pathways}")
    print("---------------------------")

KEGG ID: hsa:100526835
Pathways: []
---------------------------
KEGG ID: hsa:51086
Pathways: []
---------------------------
KEGG ID: hsa:8767
Pathways: []
---------------------------
KEGG ID: hsa:124905557
Pathways: []
---------------------------
KEGG ID: hsa:124905450
Pathways: []
---------------------------
KEGG ID: hsa:107984732
Pathways: []
---------------------------
KEGG ID: hsa:124903713
Pathways: []
---------------------------
KEGG ID: hsa:107986375
Pathways: []
---------------------------
KEGG ID: hsa:54991
Pathways: []
---------------------------
KEGG ID: hsa:54996
Pathways: []
---------------------------
KEGG ID: hsa:101059915
Pathways: []
---------------------------
KEGG ID: hsa:56245
Pathways: []
---------------------------
KEGG ID: hsa:342918
Pathways: []
---------------------------
KEGG ID: hsa:123775
Pathways: []
---------------------------
KEGG ID: hsa:140680
Pathways: []
---------------------------
KEGG ID: hsa:206412
Pathways: []
---------------------------
KEGG ID: 