https://github.com/leahvschaffer/U2OS_music_pipeline

### Load the MuSIC 2, April 2023 model

https://www.ndexbio.org/viewer/networks/24137326-d962-11ed-b4a3-005056ae23aa

In [1]:
import os 
# Resolve the current user's home directory; the annotation root below is joined onto it.
home_dir =os.path.expanduser("~")
# IPython magic: set MODEL_ANNOTATION_ROOT in the kernel's environment.
# NOTE: the next cell sets this variable again, to a different directory.
%env MODEL_ANNOTATION_ROOT=Projects/U2OS_MuSIC/MuSIC_maps/

# Last expression of the cell: display the annotation root joined onto the home directory.
os.path.join(home_dir, os.getenv("MODEL_ANNOTATION_ROOT"))

env: MODEL_ANNOTATION_ROOT=Projects/U2OS_MuSIC/MuSIC_maps/


'/cellar/users/mhu/Projects/U2OS_MuSIC/MuSIC_maps/'

In [2]:
import os
import json
from file_io import get_model_directory_path

# IPython magic: set MODEL_ANNOTATION_ROOT for the rest of the notebook
# (this replaces the value assigned in the previous cell).
%env MODEL_ANNOTATION_ROOT=Projects/cellmaps_annotate_hierarchy/cellmaps_annotate_hierarchy/

# Identify the model: name, version, and the CX2 file that holds it.
model_name = "MuSIC2_Maps"
version = "v1.1_April2023"
model_cx2_filename = "MuSIC2_v1.1_April2023.cx2"
# Print the model directory so the resolved location can be checked by eye.
print(get_model_directory_path(model_name, version))

model_path = os.path.join(get_model_directory_path(model_name, version), model_cx2_filename)

# Read the CX2 file as UTF-8 text and parse it as JSON; `model` is used by the following cells.
with open(model_path, encoding='utf-8') as f:
    data = f.read()
    model = json.loads(data)
# print(model)

env: MODEL_ANNOTATION_ROOT=Projects/cellmaps_annotate_hierarchy/cellmaps_annotate_hierarchy/
/cellar/users/mhu/Projects/cellmaps_annotate_hierarchy/cellmaps_annotate_hierarchy/MuSIC2_Maps/v1.1_April2023


### Select the system


In [3]:
from file_io import write_system_json, get_root_path
from model_cx2 import get_system, get_genes

# Name of the system to annotate; later cells reuse `system_name`, `system` and `genes`.
system_name = "Cluster3-38"
# Look the system up by name in the loaded model.
system = get_system(model, system_name)
# system
# Write the selected system via write_system_json using the "data" label.
write_system_json(system, model_name, version, system_name, "data", get_root_path())
genes = get_genes(system)
# Print the gene list once on a single line, then one gene per line.
print(f'{system_name}: {genes}')
for gene in genes:
    print(gene)

Cluster3-38: ['ATP5MPL', 'DKK3', 'DMAC2', 'MCM3AP', 'SLC25A25', 'TTC21B', 'TULP2', 'VWA8']
ATP5MPL
DKK3
DMAC2
MCM3AP
SLC25A25
TTC21B
TULP2
VWA8


### Get HUGO data

In [4]:
from file_io import write_system_json
from hugo import get_hugo_data

# Fetch the HUGO data for every gene in the selected system.
hugo_data = get_hugo_data(system)
# hugo_data
# Write the result with the "hugo" label: the "Get Uniprot Data" cell reloads it with
# read_system_json(..., "hugo", ...), so skipping this write makes that cell depend on a
# file left behind by an earlier run (possibly for a different system).
write_system_json(hugo_data, model_name, version, system_name, "hugo", get_root_path())


getting Hugo data for ATP5MPL
getting Hugo data for DKK3
getting Hugo data for DMAC2
getting Hugo data for MCM3AP
getting Hugo data for SLC25A25
getting Hugo data for TTC21B
getting Hugo data for TULP2
getting Hugo data for VWA8


In [4]:
from hugo import get_gene_symbols
from uniprot_mygene import get_uniprot_id

# Path (relative to the notebook's working directory) of the HGNC gene table passed to get_gene_symbols.
file_path = "./hgnc_genes.tsv"
# Validate the system's gene symbols; in the saved output ATP5MPL comes back as ATP5MJ.
updated_gene_symbols = get_gene_symbols(file_path, system)
# Look up a UniProt id for each validated symbol; the printed result is a {symbol: id} dict.
uniprot_ids = get_uniprot_id(updated_gene_symbols)
print(uniprot_ids)

validate Hugo symbol for ATP5MPL
validate Hugo symbol for DKK3
validate Hugo symbol for DMAC2
validate Hugo symbol for MCM3AP
validate Hugo symbol for SLC25A25
validate Hugo symbol for TTC21B
validate Hugo symbol for TULP2
validate Hugo symbol for VWA8
{'TULP2': 'O00295', 'VWA8': 'A3KMH1', 'SLC25A25': 'Q6KCM7', 'DMAC2': 'Q9NW81', 'TTC21B': 'Q7Z4L5', 'ATP5MJ': 'P56378', 'DKK3': 'Q9UBP4', 'MCM3AP': 'O60318'}


### Get UniProt Data
Gathers each protein's function, pathway, disease association, aliases, and summary description from the UniProt database using its REST API.

In [None]:
from uniprot import get_uniprot_data_for_system
from file_io import read_system_json

# Reload the HUGO data from the system's "hugo" JSON instead of reusing the in-memory value;
# this requires that file to already exist on disk for the current system.
# NOTE(review): the saved output below lists genes (COLQ, FLT4, ...) that are not the
# Cluster3-38 genes printed earlier — it appears to come from a run on another system.
hugo_data = read_system_json(model_name, version, system_name, "hugo", get_root_path())
# Query UniProt for the system's proteins, passing the HUGO data along.
uniprot_data = get_uniprot_data_for_system(system, hugo_data=hugo_data)
# Write the UniProt result with the "uniprot" label.
write_system_json(uniprot_data, model_name, version, system_name, "uniprot", get_root_path())

gene names: ['COLQ', 'FLT4', 'MYADM', 'NMT2', 'SLC4A2', 'TOR1B']
gene name = COLQ
uniprot_ids = ['Q9Y215']
querying uniprot id Q9Y215
gene name = FLT4
uniprot_ids = ['P35916']
querying uniprot id P35916
gene name = MYADM
uniprot_ids = ['Q96S97']
querying uniprot id Q96S97
gene name = NMT2
uniprot_ids = ['O60551']
querying uniprot id O60551
gene name = SLC4A2
uniprot_ids = ['P04920']
querying uniprot id P04920
gene name = TOR1B
uniprot_ids = ['O14657']
querying uniprot id O14657


### Summarized Features
Analyze the UniProt information to find features shared by n or more of the system's proteins.

In [None]:
import pandas as pd
from io import StringIO
from file_io import write_system_tsv
from uniprot import summarize_uniprot_features, summarized_uniprot_features_to_tsv

# Summarize the UniProt data into per-feature records and serialize them as TSV text.
summarized_features = summarize_uniprot_features(uniprot_data)
tsv_data = summarized_uniprot_features_to_tsv(summarized_features)
# Write the TSV with the "uniprot_summary" label; `tsv_data` is reused by the prompt cell.
write_system_tsv(tsv_data, model_name, version, system_name, "uniprot_summary", get_root_path())


# Parse the same TSV text back into a DataFrame purely for display in the notebook.
tsv_file = StringIO(tsv_data)
df = pd.read_csv(tsv_file, sep='\t')
df

Unnamed: 0,Feature,Number of Genes,Genes
0,plasma membrane,5,"COLQ, FLT4, MYADM, NMT2, SLC4A2"
1,membrane,5,"FLT4, MYADM, NMT2, SLC4A2, TOR1B"
2,Disease variant,2,"COLQ, FLT4"
3,cytosol,2,"FLT4, NMT2"
4,ATP binding,2,"FLT4, TOR1B"
...,...,...,...
121,endoplasmic reticulum organization,1,TOR1B
122,nuclear membrane organization,1,TOR1B
123,protein localization to nucleus,1,TOR1B
124,response to unfolded protein,1,TOR1B


### Create Prompts

In [None]:
from model_cx2 import get_genes
from io import StringIO
import pandas as pd
from chatgpt_prompts import create_system_prompt_page
from pages import write_system_page, write_model_page, dataframe_to_html_table

def create_music_2_chatGPT_prompt(system, tsv_data, n_genes=2, gene_candidacy_text=''):
    """
    Create a ChatGPT prompt for a system, wrapped in an HTML snippet with a copy button.

    :param system: The system object; its protein list is obtained with ``get_genes(system)``.
    :param tsv_data: A string containing TSV formatted uniprot summary data.
    :param n_genes: An integer representing the minimum number of genes for a feature to be included.
    :param gene_candidacy_text: Currently unused; kept so existing callers keep working.
    :return: A string containing the ChatGPT prompt in HTML format.
    """
    import html  # stdlib; local import so the function does not depend on a separate import cell

    # Read the TSV data into a DataFrame
    df = pd.read_csv(StringIO(tsv_data), sep='\t')

    protein_list = get_genes(system)

    # Assemble the plain-text prompt. The first sentence now ends with ". " so it no longer
    # runs straight into the second one.
    prompt_text = "Your response should be formatted as HTML paragraphs. "
    prompt_text += "The following is a system of interacting proteins."
    prompt_text += ' Write a critical analysis of this system, describing your reasoning as you go.'
    prompt_text += '\nWhat mechanisms and biological processes are performed by this system?'
    prompt_text += '\nWhat cellular components and complexes are involved in this system?'
    prompt_text += "\nDiscuss potential names for the system. Select the best name and place it in a paragraph at the beginning of your output?"
    prompt_text += '\nProteins: '
    prompt_text += ", ".join(protein_list) + ".\n\n"
    prompt_text += "\nA critical goal of the analysis is to determine what, if any, relationship this system has to cancer, and specifically to pediatric cancer" #or osteosarcoma
    prompt_text += '\n\nSystem features from a Uniprot analysis: \n'

    # Forward n_genes so the caller's threshold is honored (it was previously ignored and the
    # helper's own default was always used).
    prompt_text = add_uniprot_feature_summary(prompt_text, df, n_genes=n_genes)

    # Escape &, < and > so feature or gene text cannot break the surrounding markup; the
    # browser's innerText still yields the unescaped prompt when it is copied.
    escaped_prompt = html.escape(prompt_text, quote=False)

    prompt = "<div class='code-section'><button class='copy-prompt-button' onclick='copyPrompt()'>Copy Prompt</button>"
    prompt += f"<pre><code id='prompt-code'>{escaped_prompt}</code></pre></div>"
    prompt += "<script>function copyPrompt() {var copyText = document.getElementById('prompt-code').innerText; navigator.clipboard.writeText(copyText);}</script>"
    return prompt

def add_uniprot_feature_summary(prompt_text, feature_dataframe, n_genes=2):
    """
    Append one line per sufficiently shared feature to the prompt text.

    :param prompt_text: The prompt text built so far.
    :param feature_dataframe: DataFrame with 'Feature', 'Number of Genes' and 'Genes' columns.
    :param n_genes: Minimum gene count a feature needs in order to be listed.
    :return: ``prompt_text`` followed by a "<feature>: <count> proteins: <genes>" line for each
        qualifying row, in row order.
    """
    feature_lines = []
    for _, row in feature_dataframe.iterrows():
        raw_count = row['Number of Genes']
        # A missing cell read by pandas is NaN, not None, and int(NaN) raises ValueError;
        # treat any missing count as "no genes" and skip the row.
        if pd.isna(raw_count):
            continue
        number_of_genes = int(raw_count)
        if number_of_genes >= n_genes:
            feature_lines.append(f"{row['Feature']}: {number_of_genes} proteins: {row['Genes']}\n")
    # Join once at the end instead of growing the string inside the loop.
    return prompt_text + "".join(feature_lines)

            
def create_music_2_system_analysis_page(system_name, tsv_data, n_genes=2, system_data=None):
    """
    Build the HTML analysis page for a system.

    :param system_name: Name of the system, used in the page title and heading.
    :param tsv_data: A string containing TSV formatted uniprot summary data.
    :param n_genes: Minimum 'Number of Genes' a feature needs to appear in the table.
    :param system_data: Optional system object whose genes are listed on the page. When
        omitted, the notebook-level ``system`` variable is used, as before.
    :return: A string containing the complete HTML page.
    """
    import html  # stdlib; local import so the function does not depend on a separate import cell

    # Read the TSV data into a DataFrame
    df = pd.read_csv(StringIO(tsv_data), sep='\t')

    # Filter the DataFrame based on the n_genes criterion
    df = df[df['Number of Genes'] >= n_genes]

    uniprot_table = dataframe_to_html_table(df)

    # Create the ChatGPT analysis section with a placeholder for the analysis text
    chatgpt_analysis = "<h2>ChatGPT 4 Analysis</h2>\n<p>Paste ChatGPT analysis here:</p>\n<!-- Analysis goes here -->"

    # Prefer an explicitly passed system; fall back to the notebook global only when the
    # caller did not provide one, so existing calls keep their behavior.
    if system_data is None:
        system_data = system

    # Escape interpolated text so a name containing &, < or > cannot break the markup.
    safe_name = html.escape(str(system_name), quote=False)
    gene_list = html.escape(', '.join(get_genes(system_data)), quote=False)

    # Create the HTML page with the system summary
    page_title = f"{safe_name} Analysis"
    html_page = f"<!DOCTYPE html>\n<html>\n<head>\n<title>{page_title}</title>\n</head>\n<body>\n<h1>{safe_name} System Summary</h1>\n<h2>Proteins</h2>\n<p>{gene_list}</p>\n<h2>UniProt Data</h2>\n{uniprot_table}\n{chatgpt_analysis}\n</body>\n</html>"

    return html_page

# Build the ChatGPT prompt for the selected system, wrap it in a prompt page and write it
# with the "chatgpt_prompt" label.
prompt = create_music_2_chatGPT_prompt(system, tsv_data)
prompt_page = create_system_prompt_page(system_name, prompt)
write_system_page(prompt_page, model_name, version, system_name, "chatgpt_prompt", get_root_path())
# Build the companion analysis page (UniProt table plus a placeholder for the pasted
# ChatGPT analysis) and write it with the "analysis" label.
analysis_page = create_music_2_system_analysis_page(system_name, tsv_data)
write_system_page(analysis_page, model_name, version, system_name, "analysis", get_root_path())
# update the model page to include links to the new pages
write_model_page(model_name, version, get_root_path())




# Test the create-prompt Python file (`create_prompt_for_MuSIC2.py`)

In [6]:
# First script argument: same value as the MODEL_ANNOTATION_ROOT used earlier in this notebook.
workdir = 'Projects/cellmaps_annotate_hierarchy/cellmaps_annotate_hierarchy/'
# Second script argument: text file of significant systems.
# NOTE(review): presumably one system name per line — confirm against the script.
# NOTE(review): absolute, user-specific path; the cell will not run on another machine as-is.
sig_sys_txt = '/cellar/users/mhu/Projects/U2OS_MuSIC/Mutation/hisig_res/April2023_V1.1/pan_pediatric/042623_sig_systems_q0.3.txt'

# IPython magic: run the script in this kernel; $workdir and $sig_sys_txt expand to the variables above.
%run create_prompt_for_MuSIC2.py $workdir $sig_sys_txt

/cellar/users/mhu/Projects/cellmaps_annotate_hierarchy/cellmaps_annotate_hierarchy/MuSIC2_Maps/v1.1_April2023
getting Hugo data for CRY1
getting Hugo data for CSNK1D
getting Hugo data for FBXL17
getting Hugo data for FBXL19
getting Hugo data for FBXO11
getting Hugo data for FBXO46
getting Hugo data for FBXW2
getting Hugo data for FBXW9
getting Hugo data for KDM2A
getting Hugo data for LZTR1
getting Hugo data for MDM4
getting Hugo data for SKP1
getting Hugo data for TP53
gene names: ['CRY1', 'CSNK1D', 'FBXL17', 'FBXL19', 'FBXO11', 'FBXO46', 'FBXW2', 'FBXW9', 'KDM2A', 'LZTR1', 'MDM4', 'SKP1', 'TP53']
gene name = CRY1
uniprot_ids = ['Q16526']
querying uniprot id Q16526
gene name = CSNK1D
uniprot_ids = ['P48730']
querying uniprot id P48730
gene name = FBXL17
uniprot_ids = ['Q9UF56']
querying uniprot id Q9UF56
gene name = FBXL19
uniprot_ids = ['Q6PCT2']
querying uniprot id Q6PCT2
gene name = FBXO11
uniprot_ids = ['Q86XK2']
querying uniprot id Q86XK2
gene name = FBXO46
uniprot_ids = ['Q6PJ61

In [None]:
# workdir = 'Projects/cellmaps_annotate_hierarchy/cellmaps_annotate_hierarchy/'
# sig_sys_txt = '/cellar/users/mhu/Projects/U2OS_MuSIC/Mutation/hisig_res/April2023_V1.1/pan_pediatric/042623_sig_systems_q0.3.txt'


# import os
# import json
# import sys
# from file_io import get_model_directory_path,write_system_json, get_root_path
# from model_cx2 import get_system, get_genes

# from hugo import get_hugo_data
# from file_io import read_system_json, write_system_tsv
# import pandas as pd
# from io import StringIO
# from uniprot import get_uniprot_data_for_system, summarize_uniprot_features, summarized_uniprot_features_to_tsv

# os.environ["MODEL_ANNOTATION_ROOT"] = workdir

# # load gene symbol metadata

# ## load the model 
# model_name = "MuSIC2_Maps"
# version = "v1.1_April2023"
# model_cx2_filename = "MuSIC2_v1.1_April2023.cx2"
# print(get_model_directory_path(model_name, version))

# model_path = os.path.join(get_model_directory_path(model_name, version), model_cx2_filename)

# with open(model_path, encoding='utf-8') as f:
#     data = f.read()
#     model = json.loads(data)
# # print(model)

# # load the system list
# with open(sig_sys_txt, 'r') as f:
#     systems = f.readlines()
# # get genes for each system
# for system_name in systems:
#     # select systems
#     system = get_system(model, system_name)
#     # system
#     write_system_json(system, model_name, version, system_name, "data", get_root_path())
#     genes = get_genes(system)
#     # print(f'{system_name}: {genes}')
#     # for gene in genes:
#     #     print(gene)

#     for gene in genes: 

#     if len(genes)<50: # right now checking small systems
#         ## convert all gene names to the most up-to-date gene symbol
#         # get hugo data
#         hugo_data = get_hugo_data(system)

#         write_system_json(hugo_data, model_name, version, system_name, "hugo", get_root_path())

#         hugo_data = read_system_json(model_name, version, system_name, "hugo", get_root_path())
#         # get uniprot data
#         uniprot_data = get_uniprot_data_for_system(system, hugo_data=hugo_data)
#         write_system_json(uniprot_data, model_name, version, system_name, "uniprot", get_root_path())
            
#         # get uniprot summary
#         summarized_features = summarize_uniprot_features(uniprot_data)
#         tsv_data = summarized_uniprot_features_to_tsv(summarized_features)
#         write_system_tsv(tsv_data, model_name, version, system_name, "uniprot_summary", get_root_path())



/cellar/users/mhu/Projects/cellmaps_annotate_hierarchy/cellmaps_annotate_hierarchy/MuSIC2_Maps/v1.1_April2023


AttributeError: 'NoneType' object has no attribute 'get'