## Guide for validate hugo gene name and query mygene info 

1. query mygene to get uniprot id 
2. query mygene to get gene info for all genes in the root node 

In [None]:
## Step 1: Download the HGNC data (if not already in the data folder)

!sh ./download_genenames.sh

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  849k    0  849k    0     0   163k      0 --:--:--  0:00:05 --:--:--  172k


### Example for validate hugo gene name and update the gene name 

In [1]:
from hugo import GeneValidator

# Usage example:
file_path = "./hgnc_genes.tsv"
genes = ['ATP5MPL', 'DKK3', 'DMAC2', 'MCM3AP']
validator = GeneValidator(file_path)
result = validator.validate_human_genes(genes)
print(result)

updated_gene_symbols = list(result['official_genes'])
invalid_gene_symbols = list(result['invalid'])
updated_genes_mapping = result['updated_genes']
print("Updated gene symbols:", updated_gene_symbols)
print("Invalid gene symbols:", invalid_gene_symbols)
print("Updated genes mapping:", updated_genes_mapping)


validate Hugo symbol for ATP5MPL
validate Hugo symbol for DKK3
validate Hugo symbol for DMAC2
validate Hugo symbol for MCM3AP
{'official_genes': {'ATP5MJ', 'MCM3AP', 'DKK3', 'DMAC2'}, 'invalid': {'ATP5MPL'}, 'updated_genes': {'ATP5MPL': 'ATP5MJ'}}
Updated gene symbols: ['ATP5MJ', 'MCM3AP', 'DKK3', 'DMAC2']
Invalid gene symbols: ['ATP5MPL']
Updated genes mapping: {'ATP5MPL': 'ATP5MJ'}


### Test code for get only uniprot id

In [2]:
from hugo import GeneValidator
from query_mygene import get_uniprot_id

# Example usage
file_path = "./hgnc_genes.tsv"
genes = ['ATP5MPL', 'DKK3', 'DMAC2', 'MCM3AP']
validator = GeneValidator(file_path)
result = validator.validate_human_genes(genes)
updated_gene_symbols = list(result['official_genes'])
uniprot_ids = get_uniprot_id(updated_gene_symbols)
print(uniprot_ids)



validate Hugo symbol for ATP5MPL
validate Hugo symbol for DKK3
validate Hugo symbol for DMAC2
validate Hugo symbol for MCM3AP
{'ATP5MJ': 'P56378', 'MCM3AP': 'O60318', 'DKK3': 'Q9UBP4', 'DMAC2': 'Q9NW81'}


# Example query mygene info for MuSIC root node 

* save the mygene info for root node, no need to query mygene for each systems
* just read the json file for getting uniprot id 

In [1]:
import os
from file_io import get_model_directory_path
import json

%env MODEL_ANNOTATION_ROOT=Projects/cellmaps_annotate_hierarchy/cellmaps_annotate_hierarchy/

model_name = "MuSIC2_Maps"
version = "v1.1_April2023"
model_cx2_filename = "MuSIC2_v1.1_April2023.cx2"
print(get_model_directory_path(model_name, version))
model_path = os.path.join(get_model_directory_path(model_name, version), model_cx2_filename)
print(model_path)
with open(model_path, encoding='utf-8') as f:
    data = f.read()
    model = json.loads(data)
# print(model)

env: MODEL_ANNOTATION_ROOT=Projects/cellmaps_annotate_hierarchy/cellmaps_annotate_hierarchy/
/cellar/users/mhu/Projects/cellmaps_annotate_hierarchy/cellmaps_annotate_hierarchy/MuSIC2_Maps/v1.1_April2023
/cellar/users/mhu/Projects/cellmaps_annotate_hierarchy/cellmaps_annotate_hierarchy/MuSIC2_Maps/v1.1_April2023/MuSIC2_v1.1_April2023.cx2


In [2]:
from query_mygene import get_mygene_for_system
from model_cx2 import get_system
from file_io import write_system_json, get_root_path

# this step takes less then 2 minutes for 5183 genes (the original Hugo info query takes >5 minutes for genes starting with A)

root_node= get_system(model, 'Cluster0-0')
hugo_file_path = "./hgnc_genes.tsv"

results = get_mygene_for_system(root_node, hugo_file_path)

write_system_json(results, model_name, version, 'root_node', 'my_gene', get_root_path())
# results 
# genes[-10:]

In [10]:
for result in results:
    if result['query'] == 'DKK3':
        print(result)

{'query': 'DKK3', 'AllianceGenome': '2893', 'HGNC': '2893', 'MIM': '605416', '_id': '27122', '_score': 18.25229, 'accession': {'genomic': ['AB035182.2', 'AB057804.1', 'AC124276.5', 'CH471064.2', 'CP068267.2', 'NC_000011.10', 'NC_060935.1'], 'protein': ['AAC35996.1', 'AAF02676.1', 'AAH07660.1', 'AAK92488.1', 'AAQ88744.1', 'AAS86757.1', 'BAA85488.1', 'BAA87044.2', 'BAA90548.1', 'BAB84360.1', 'BAB84361.1', 'BAC03555.1', 'BAF82586.1', 'BAG52632.1', 'BAG58381.1', 'BAG58448.1', 'EAW68533.1', 'EAW68534.1', 'EAW68535.1', 'EAW68536.1', 'EAW68537.1', 'EAW68538.1', 'EAW68539.1', 'NP_001018067.1', 'NP_001317149.1', 'NP_037385.2', 'NP_056965.3', 'Q9UBP4.2', 'XP_006718241.1', 'XP_016873044.1', 'XP_047282730.1', 'XP_047282731.1'], 'rna': ['AB033421.1', 'AB034203.1', 'AB057591.1', 'AF052161.1', 'AF177396.1', 'AF400439.1', 'AK090952.1', 'AK092979.1', 'AK098756.1', 'AK225156.1', 'AK289897.1', 'AK295441.1', 'AK295539.1', 'AL535720.3', 'AY358378.1', 'AY587550.1', 'BC007660.2', 'BF724383.1', 'BQ690088.1', 

In [4]:
from model_cx2 import get_system
from file_io import read_system_json, get_root_path
from hugo import get_gene_symbols
from uniprot import query_uniprot_by_id, filter_uniprot_response
# get uniprot data
def get_uniprot_data(system, json_file, hgnc_file_path = './hgnc_genes.tsv'):
    gene_names = get_gene_symbols(system, hgnc_file_path)
    # print(gene_names)
    analysis_data = {}

    # Iterate through the list of genes and extract UniProt IDs
    for result in json_file:
        hugo_gene_symbol = result["query"]
        if hugo_gene_symbol in gene_names:
            uniprot = result.get("uniprot", {})
            uniprot_id = uniprot.get("Swiss-Prot")
            print(uniprot_id)
            if uniprot_id is not None:
                uniprot_data = query_uniprot_by_id(uniprot_id)
                if uniprot_data:
                    filtered_data = filter_uniprot_response(uniprot_data)
                    analysis_data[hugo_gene_symbol] = filtered_data
        else:
            print(f'no uniprot id found for {hugo_gene_symbol}')

    return analysis_data
root_mygene_dict = read_system_json(model_name, version, 'root_node', 'data', get_root_path())
system= get_system(model, 'Cluster3-38')
res = get_uniprot_data(system, root_mygene_dict)
res

no uniprot id found for DHX57
no uniprot id found for SENP1
no uniprot id found for KLHL12
no uniprot id found for PIK3R1
no uniprot id found for GTF2B
no uniprot id found for CDH8
no uniprot id found for CHI3L1
no uniprot id found for SIRT1
no uniprot id found for CSNK1G2
no uniprot id found for EXOG
no uniprot id found for SEC62
no uniprot id found for CD2AP
no uniprot id found for PRRC2A
no uniprot id found for SKA1
no uniprot id found for POLR2H
no uniprot id found for PRPS1L1
no uniprot id found for ACTN2
no uniprot id found for KLHDC2
no uniprot id found for METTL15
no uniprot id found for NCBP1
no uniprot id found for GNG12
no uniprot id found for LPXN
no uniprot id found for ATP6V1H
no uniprot id found for MAPK3
no uniprot id found for PCDHGA11
no uniprot id found for PCDHGA5
no uniprot id found for CCNL2
no uniprot id found for ULBP3
no uniprot id found for BCAR1
no uniprot id found for RNPEP
no uniprot id found for LACTB
no uniprot id found for RPS6KA6
no uniprot id found for

{'DMAC2': {'UniProtKB_ID': 'DMAC2_HUMAN',
  'Description': 'Distal membrane-arm assembly complex protein 2',
  'GO': ['mitochondrion',
   'SCF ubiquitin ligase complex',
   'mitochondrial respiratory chain complex I assembly',
   'SCF-dependent proteasomal ubiquitin-dependent protein catabolic process'],
  'Location': ['mitochondrion'],
  'Disease': [],
  'Disease_description': [],
  'Complexes': ['Interacts with incompletely assembled mitochondrial NADH:ubiquinone oxidoreductase complex (complex I)']},
 'DKK3': {'UniProtKB_ID': 'DKK3_HUMAN',
  'Description': 'Dickkopf-related protein 3',
  'GO': ['extracellular space',
   'co-receptor binding',
   'receptor antagonist activity',
   'adrenal gland development',
   'anatomical structure morphogenesis',
   'negative regulation of aldosterone biosynthetic process',
   'negative regulation of anti-Mullerian hormone signaling pathway',
   'negative regulation of canonical Wnt signaling pathway',
   'negative regulation of cortisol biosynthe