### Load the multi-scale map


In [1]:
import sys,os,os.path
# Default root of the model_annotation tree. setdefault keeps a value that is
# already exported in the environment instead of overwriting it, so the
# notebook can be pointed at another checkout without editing this cell.
os.environ.setdefault('MODEL_ANNOTATION_ROOT', 'Desktop/projects/model_annotation/')

In [2]:
os.getenv("MODEL_ANNOTATION_ROOT")

'Desktop/projects/model_annotation/'

In [3]:
## Parameters to edit
# runMode selects which systems are processed: "test" uses a single hard-coded
# system; any other value (intended: "full") uses a slice of the topological sort.
runMode   = "test" # "test",  "full"
# Gene-count limit: the per-system loop stops at the first system with more genes than this.
sizeThresh = 50 # ToDo: use third quartile value
# When True, gene symbols are passed through FixGeneSymbols.fixGeneSymbol before use.
fixGeneNames = True 

### Load the libraries and functions

In [5]:
import os
import pandas as pd
import json
import networkx as nx

from file_io import get_model_directory_path, read_system_json, write_system_json, write_system_tsv, get_root_path
from model_cx2 import get_system, get_genes
from hugo import get_hugo_data
from uniprot import get_uniprot_data_for_system, summarize_uniprot_features, summarized_uniprot_features_to_tsv
from io import StringIO
from chatgpt_prompts import create_system_prompt_page, create_nesa_chatGPT_prompt, add_uniprot_feature_summary
from pages import write_system_page, write_model_page, dataframe_to_html_table
import cx2_network



In [6]:
import FixGeneSymbols

# HGNC reference table (tab-separated). The identifier-like columns are forced
# to str so they are not type-inferred by pandas.
hgnc_raw_DF = pd.read_csv(
    'lib/hgnc_idsymbolnamelocus_grouplocus_typestatus.txt',
    sep='\t',
    dtype={'hgnc_id': str, 'uniprot_ids': str, 'symbol': str},
)


In [7]:
def dataframe_to_dict(df):
    """
    Build a row lookup from a DataFrame, keyed on its first column.

    :param df: The pandas DataFrame to convert (not modified).
    :return: A dict mapping each first-column value to a dict of that row's
        remaining columns, i.e. {first_column_value: {column_name: value}}.
    """
    key_column = df.columns[0]
    indexed = df.set_index(key_column)
    return indexed.to_dict(orient='index')

def make_gene_candidacy_text(gene_data, selected_genes):
    """
    Summarize the gene-candidacy flags of the selected genes as plain text.

    :param gene_data: dict mapping gene symbol -> dict of attribute flags
        (the shape produced by dataframe_to_dict). A flag counts as set when
        its value equals 1; a flag missing from a record counts as not set.
    :param selected_genes: container of gene symbols to report on; genes in
        gene_data that are not in it are ignored.
    :return: one line per attribute that has at least one flagged gene, in the
        form "<description> <gene1>, <gene2>, ...", joined by newlines.
        Returns an empty string when nothing is flagged.
    """
    attribute_descriptions = {
        'hasHighConfidenceMut': "Genes with high confidence mutation in ASD-diagnosed individuals:",
        'in_WES_2020': "ASD-risk genes identified in Satterstrom et al., 2020:",
       # 'in_WES_2022': "ASD-risk genes identified in Fu et al., 2022:",
        'connectedToASDPPI': "Proteins connected to ASD-risk proteins (AP-MS experiment):" # ASD-PPI preys

      #  'in_SFARI_cat_2_3': "ASD-risk in SFARI categories 2 and 3:"
    }
    flagged = {attribute: [] for attribute in attribute_descriptions}

    for gene, attributes_data in gene_data.items():
        if gene not in selected_genes:
            continue
        for attribute, genes in flagged.items():
            # .get(): a record without this column is treated as "not flagged"
            # instead of aborting the whole summary with a KeyError.
            if attributes_data.get(attribute) == 1:
                genes.append(gene)

    lines = [
        f"{attribute_descriptions[attribute]} {', '.join(genes)}"
        for attribute, genes in flagged.items()
        if genes
    ]
    return '\n'.join(lines).strip()

In [8]:
def create_nesa_system_analysis_page(model_name, version, system_name, protein_list, tsv_data, n_genes=2):
    """
    Render the HTML analysis page for one system.

    :param model_name: model identifier shown in the page sub-heading.
    :param version: model version shown next to the model name.
    :param system_name: system identifier used in the title and main heading.
    :param protein_list: iterable of protein/gene name strings, shown comma-separated.
    :param tsv_data: tab-separated feature summary text; it must contain a
        'Number of Genes' column.
    :param n_genes: minimum 'Number of Genes' value for a row to stay in the table.
    :return: the complete HTML document as a string.
    """
    page_title = f"{system_name} Summary"

    # Placeholder section; the analysis text itself is meant to be pasted in later
    chatgpt_analysis = "<h2>ChatGPT 4 Analysis</h2>\n<p>Paste ChatGPT analysis here:</p>\n<!-- Analysis goes here -->"

    # Parse the TSV text and keep only the rows that meet the n_genes threshold
    feature_rows = pd.read_csv(StringIO(tsv_data), sep='\t')
    feature_rows = feature_rows[feature_rows['Number of Genes'] >= n_genes]
    uniprot_table = dataframe_to_html_table(feature_rows)

    # The backslash continuations keep each following line's leading spaces
    # inside the string, so this template is intentionally left untouched.
    html_page = f"<!DOCTYPE html>\n<html>\n<head>\n<title>{page_title}</title>\n<style>\n \
              body {{background-color: skyblue;}} \n \
              h1, h2 {{color: white; font-family: 'Roboto', sans-serif;}} \n \
              </style>\n</head>\n<body>\n<h1>{system_name} System Summary</h1>\n \
              <h2>{model_name}: {version}</h2>\n \
              \n{chatgpt_analysis}\
              <h2>Proteins</h2>\n<p>{', '.join(protein_list)}</p>\n \
              <h2>UniProt Data</h2>\n{uniprot_table}\n \
              </body>\n</html>"

    return html_page

## Multi-scale map level 

In [9]:
model_name = "nesa"
version = "Krogan_230424"
model_cx2_filename = "hidef_50_0.75_5_leiden.edges.cx2"
print(get_model_directory_path(model_name, version))
model_path = os.path.join(get_model_directory_path(model_name, version), model_cx2_filename)
print(model_path)
# Parse the CX2 JSON straight from the file handle; the raw file text is not
# kept around in a separate global.
with open(model_path, encoding='utf-8') as f:
    model = json.load(f)

/Users/salkhairy/Desktop/projects/model_annotation/nesa/Krogan_230424
/Users/salkhairy/Desktop/projects/model_annotation/nesa/Krogan_230424/hidef_50_0.75_5_leiden.edges.cx2


In [None]:
# print(model)

In [11]:
# This NeSA-specific Excel spreadsheet contains ASD gene candidacy information.
# Build the path to 'geneCandidacy_DF.xlsx' inside the model/version directory.
file_path = os.path.join(get_model_directory_path(model_name, version ), 'geneCandidacy_DF.xlsx')  # SA: Note 

# Load the first worksheet of the Excel file into a DataFrame
df = pd.read_excel(file_path, sheet_name=0)

# Convert the DataFrame to a dictionary indexed by the first column
# (gene_data is later passed to make_gene_candidacy_text in the per-system loop)
gene_data = dataframe_to_dict(df)


### Perform topological sorting of systems

In [None]:
## Topological sorting is done with R's igraph package (via TopologicalSorting.R, run below) because Python has major limitations

In [None]:
import subprocess

In [None]:
modelPath = get_model_directory_path(model_name, version )

In [None]:
edgesFile = 'hidef_50_0.75_5_leiden.edges'

In [None]:
# Run the R script with an argument list and no shell, so paths containing
# spaces or shell metacharacters reach Rscript intact. check=True raises
# CalledProcessError when Rscript exits non-zero, instead of silently
# continuing to the cell that reads the script's output file.
subprocess.run(
    ['/usr/local/bin/Rscript', '--vanilla', 'TopologicalSorting.R', modelPath, edgesFile],
    check=True,
)


In [None]:
topologicalSort_DF = pd.read_csv(os.path.join(get_model_directory_path(model_name, version ), 'topologicalSort_DF.txt'), sep="\t", header=None)

In [None]:
topologicalSort_list = list(topologicalSort_DF.iloc[:,0])

In [None]:
topologicalSort_list

## System level 

In [None]:
# system_name_list = ["Cluster5-3", "Cluster4-10", "Cluster1-26", "Cluster3-16" , "Cluster7-0", "Cluster5-8", "Cluster4-14", "Cluster2-41", "Cluster2-20"]

In [12]:
# "test" mode processes one hard-coded system; any other runMode takes
# entries 1-4 of the topological sort.
system_name_list = ["Cluster5-3"] if runMode == "test" else topologicalSort_list[1:5]

In [62]:
def getSystemIndex(model, system_name):
    """
    Find the position of a system within the model's node list.

    :param model: model structure; node records are read from model[4]['nodes'].
    :param system_name: name to match against each node's ['v']['n'] value.
    :return: index of the first matching node, or None when no node matches.
    """
    matches = (
        position
        for position, node in enumerate(model[4]['nodes'])
        if node['v']['n'] == system_name
    )
    return next(matches, None)

In [None]:
def set_genes(model, system_name, genes_fixed_str):
    """
    Overwrite the gene member list of one system, in place.

    :param model: model structure whose node records live in model[4]['nodes'].
    :param system_name: name of the system (node ['v']['n']) to update.
    :param genes_fixed_str: space-separated gene symbols; a list/tuple of
        symbols is also accepted and joined with single spaces.
    :raises ValueError: if no node is named system_name.
    """
    systemInd = getSystemIndex(model, system_name)
    if systemInd is None:
        raise ValueError(f"System {system_name!r} not found in model")
    if not isinstance(genes_fixed_str, str):
        genes_fixed_str = ' '.join(genes_fixed_str)
    # Write the member list, not 'n': 'n' holds the system name and must stay
    # intact so the system can still be looked up afterwards.
    model[4]['nodes'][systemInd]['v']['CD_MemberList'] = genes_fixed_str

In [56]:
systemList = model[4]['nodes']

In [64]:
systemList[1]['v']

{'n': 'Cluster1-0',
 'CD_MemberList': 'AAAS AARS1 ABCD3 ABLIM2 ACADM ACBD3 ACTL6A ACTL6B ADH5 ADNP ADNP2 ADSL AEBP2 AFG3L2 AGMAT AHCTF1 AIM2 AK7 ALDH3A2 ALKBH5 ALPK2 ALX4 AMPD1 AMPD2 ANAPC7 ANKEF1 ANKRD11 ANKRD13B ANP32B ANP32E AP3B1 AP3D1 AP3M1 APLF AR ARFGEF1 ARFGEF3 ARFIP2 ARHGEF25 ARID1A ARID1B ARID2 ARID4A ARID4B ARID5B ARIH1 ARL10 ARL2BP ARL6IP5 ARMH3 ASCL2 ASCL3 ASCL4 ASF1A ASF1B ASH1L ASH2L ASXL1 ASXL2 ASXL3 ATE1 ATF7 ATM ATN1 ATOH7 ATP2B1 ATP6V1H ATR ATRX AUP1 B2M B3GNT8 BAALC BAG2 BAG5 BAHCC1 BAHD1 BAP1 BARX2 BAX BBX BCL11A BCL11B BCL7A BCL7B BCL7C BCL9 BEND3 BICRA BICRAL BOD1 BOD1L1 BPTF BRCA1 BRD2 BRD3 BRD4 BRD7 BRD9 BRMS1 BRMS1L BSG BTAF1 BTBD17 BTN3A1 BTN3A3 BZW2 C15orf40 C16orf87 C17orf49 C19orf38 C19orf48 C1orf112 C1orf52 C21orf58 C22orf15 C22orf31 C2orf49 C3orf18 CA14 CABIN1 CACNA1E CACNG3 CAMK2D CAMK2G CAND2 CAP1 CAPN2 CARNMT1 CASP8AP2 CATSPERE CAVIN3 CBX1 CBX3 CBX5 CCDC65 CCDC71L CCDC88A CCNH CCNJ CCNL1 CCNL2 CCT8L2 CD274 CD40 CD70 CD80 CDAN1 CDC42BPA CDC73 CDCA2 CDC

In [63]:
getSystemIndex(model, system_name)

158

In [45]:
from model_cx2 import get_nodes

In [33]:
# NOTE(review): scratch cell. `model` is a plain list (see type(model) below), so
# this call raises the AttributeError recorded in the output. It also relies on
# system_name / genes_fixed, which are only assigned inside the per-system loop
# further down. TODO: remove once the loop handles the update.
model.set_node_attribute(system_name, "CD_MemberList", genes_fixed)


AttributeError: 'list' object has no attribute 'set_node_attribute'

In [36]:
type(model)

list

In [None]:
# NOTE(review): bare name left over from debugging; nothing visible in this
# notebook defines or imports set_node_attribute. TODO: remove or replace.
set_node_attribute

In [None]:
# Per-system pipeline: fix gene symbols, fetch HUGO and UniProt data, summarize
# shared features, then write the prompt and analysis pages for each system.
for system_name in system_name_list:
    print("================================================")
    print(system_name)
    ## Select the system and get genes
    system = get_system(model, system_name)
    system["genes_attribute"] = "CD_MemberList"
    genes = get_genes(system)

    if len(genes) > sizeThresh:
        # NOTE(review): `break` stops processing ALL remaining systems, not just
        # this one. Confirm whether `continue` (skip only large systems) is intended.
        break # need to use different approach
        # ToDo: write up approach for larger systems

    # print(f'{system_name}: {genes}')

    ## Fix names
    if fixGeneNames:
        print("Fixing gene names")
        genes_fixed = [FixGeneSymbols.fixGeneSymbol(gene) for gene in genes]
        # ToDo: save as data frame the genes with names that were fixed
    else:
        genes_fixed = genes

    ## Replace genes with fixed names in the model itself because every other function is reading from the same name.
    # `model` is a plain list (parsed JSON), so it has no set_node_attribute
    # method; write the space-separated member list into the node record directly.
    system_index = getSystemIndex(model, system_name)
    if system_index is None:
        raise ValueError(f"System {system_name!r} not found in model")
    model[4]['nodes'][system_index]['v']['CD_MemberList'] = ' '.join(genes_fixed)

    ## Get the system again because modified names
    system = get_system(model, system_name)
    # Re-apply the marker in case get_system returned a fresh object
    # (harmless if it returned the same one) -- TODO confirm against get_system.
    system["genes_attribute"] = "CD_MemberList"

    ## Get HUGO data
    print("Getting HUGO data")
    hugo_data = get_hugo_data(genes_fixed) # SA modified -- using fixed gene names instead of system_name # ToDo: get back to system name after writing

    write_system_json(hugo_data, model_name, version, system_name, "hugo", get_root_path())

    ## Get genes from model data for system
    gene_candidacy_text = make_gene_candidacy_text(gene_data, get_genes(system))

    ## Get Uniprot Data
    print("Getting Uniprot data")

    # SA: here getting Uniprot IDs
    # NOTE(review): uniprotIDs is not used further down yet (work in progress).
    uniprotIDs = [FixGeneSymbols.latestGeneSymbol_2_uniprotID(gene) for gene in genes_fixed]

    # Q: a couple of genes map to multiple uniprot IDs, what to do with them?
    ## Till here

    # Gathers a protein's function, pathway, disease association, aliases, and summary description data from the uniprot database using its REST api
    # NOTE(review): the HUGO data is written above with `version` but read back here
    # with `version/system_name` -- confirm both resolve to the same file.
    hugo_data = read_system_json(model_name, version +'/'+ system_name, system_name, "hugo", get_root_path()) # SA modified
    uniprot_data = get_uniprot_data_for_system(system, hugo_data=hugo_data)
    write_system_json(uniprot_data, model_name, version, system_name, "uniprot", get_root_path()) # SA modified

    ## Summarized Features
    # analyze the information to find features shared between n or more system proteins
    print("Summarizing features")
    summarized_features = summarize_uniprot_features(uniprot_data)
    tsv_data = summarized_uniprot_features_to_tsv(summarized_features)
    write_system_tsv(tsv_data, model_name, version +'/' + system_name, system_name, "uniprot_summary", get_root_path()) # SA modified
    tsv_file = StringIO(tsv_data)
    df = pd.read_csv(tsv_file, sep='\t')

    ## Create Prompts
    print("Creating prompts")
    prompt = create_nesa_chatGPT_prompt(get_genes(system), tsv_data, gene_candidacy_text =gene_candidacy_text)
    prompt_page = create_system_prompt_page(system_name, prompt)
    write_system_page(prompt_page, model_name, version +'/'+ system_name, system_name, "chatgtp_prompt", get_root_path())
    analysis_page = create_nesa_system_analysis_page(model_name, version, system_name, get_genes(system), tsv_data)
    write_system_page(analysis_page, model_name, version +'/'+ system_name, system_name, "analysis", get_root_path())

    
    ## ToDo: 
        # automatically call chatGPT with prompt
        # save chatGPT response
        # grab the name that chatGPT provided
        # validate references - Ingoo 
    

In [None]:
# update the model page to include links to the new pages
write_model_page(model_name, version , get_root_path())

In [None]:
# return prompt as html or json