### Load the multi-scale map


In [1]:
import sys,os,os.path
os.environ['MODEL_ANNOTATION_ROOT']='Desktop/projects/model_annotation/'

In [2]:
os.getenv("MODEL_ANNOTATION_ROOT")

'Desktop/projects/model_annotation/'

### Load the libraries and functions

In [3]:
import os
import pandas as pd
import json
import networkx as nx

from file_io import get_model_directory_path, read_system_json, write_system_json, write_system_tsv, get_root_path
from model_cx2 import get_system, get_genes
from hugo import get_hugo_data
from uniprot import get_uniprot_data_for_system, summarize_uniprot_features, summarized_uniprot_features_to_tsv
from io import StringIO
from chatgpt_prompts import create_system_prompt_page, create_nesa_chatGPT_prompt, add_uniprot_feature_summary
from pages import write_system_page, write_model_page, dataframe_to_html_table



In [4]:
def dataframe_to_dict(df):
    """
    Convert a pandas DataFrame into a dictionary indexed by the first column.

    :param df: The pandas DataFrame to convert.
    :return: A dictionary indexed by the first column.
    """
    # Set the index to be the first column
    df = df.set_index(df.columns[0])

    # Convert the DataFrame to a dictionary
    result_dict = df.to_dict(orient='index')

    return result_dict

def make_gene_candidacy_text(gene_data, selected_genes):
    attribute_descriptions = {
        'hasHighConfidenceMut': "Genes with high confidence mutation in ASD-diagnosed individuals:",
        'in_WES_2020': "ASD-risk genes identified in Satterstrom et al., 2020:",
       # 'in_WES_2022': "ASD-risk genes identified in Fu et al., 2022:",
        'connectedToASDPPI': "Proteins connected to ASD-risk proteins (AP-MS experiment):" # ASD-PPI preys

      #  'in_SFARI_cat_2_3': "ASD-risk in SFARI categories 2 and 3:"
    }
    attributes = {key: [] for key in attribute_descriptions.keys()}

    for gene, attributes_data in gene_data.items():
        if gene not in selected_genes:
            continue
        for attribute in attributes.keys():
            if attributes_data[attribute] == 1:
                attributes[attribute].append(gene)

    text_output = ''
    for attribute, genes in attributes.items():
        if len(genes) != 0:
            gene_list = ', '.join(genes)
            text_output += f"{attribute_descriptions[attribute]} {gene_list}\n"

    return text_output.strip()

In [5]:
def create_nesa_system_analysis_page(model_name, version, system_name, protein_list, tsv_data, n_genes=2):


    # Read the TSV data into a DataFrame
    tsv_file = StringIO(tsv_data)
    df = pd.read_csv(tsv_file, sep='\t')

    # Filter the DataFrame based on the n_genes criterion
    df = df[df['Number of Genes'] >= n_genes]
    
    uniprot_table = dataframe_to_html_table(df)

    # Create the ChatGPT analysis section with a placeholder for the analysis text
    chatgpt_analysis = "<h2>ChatGPT 4 Analysis</h2>\n<p>Paste ChatGPT analysis here:</p>\n<!-- Analysis goes here -->"

    page_title = f"{system_name} Summary"
    
    # Create the HTML page with the system summary
    html_page = f"<!DOCTYPE html>\n<html>\n<head>\n<title>{page_title}</title>\n<style>\n \
              body {{background-color: skyblue;}} \n \
              h1, h2 {{color: white; font-family: 'Roboto', sans-serif;}} \n \
              </style>\n</head>\n<body>\n<h1>{system_name} System Summary</h1>\n \
              <h2>{model_name}: {version}</h2>\n \
              \n{chatgpt_analysis}\
              <h2>Proteins</h2>\n<p>{', '.join(protein_list)}</p>\n \
              <h2>UniProt Data</h2>\n{uniprot_table}\n \
              </body>\n</html>"
    
    #html_page = f"<!DOCTYPE html>\n<html>\n<head>\n<title>{page_title}</title>\n</head>\n<body>\n<h1>{system_name} System Summary</h1>\n<h2>Proteins</h2>\n<p>{', '.join(protein_list)}</p>\n<h2>UniProt Data</h2>\n{uniprot_table}\n{chatgpt_analysis}\n</body>\n</html>"

    return html_page

## Multi-scale map level 

In [6]:
model_name = "nesa"
version = "Krogan_230424"
model_cx2_filename = "hidef_50_0.75_5_leiden.edges.cx2"
print(get_model_directory_path(model_name, version))
model_path = os.path.join(get_model_directory_path(model_name, version), model_cx2_filename)
print(model_path)
with open(model_path, encoding='utf-8') as f:
    data = f.read()
    model = json.loads(data)
#print(model)

/Users/salkhairy/Desktop/projects/model_annotation/nesa/Krogan_230424
/Users/salkhairy/Desktop/projects/model_annotation/nesa/Krogan_230424/hidef_50_0.75_5_leiden.edges.cx2


In [8]:
print(model)

[{'CXVersion': '2.0', 'hasFragments': False}, {'metaData': [{'name': 'networkAttributes', 'elementCount': 1}, {'name': 'visualProperties', 'elementCount': 1}, {'name': 'visualEditorProperties', 'elementCount': 1}, {'name': 'attributeDeclarations', 'elementCount': 1}, {'name': 'cyTableColumn', 'elementCount': 31}, {'name': 'edges', 'elementCount': 264}, {'name': 'cyHiddenAttributes', 'elementCount': 1}, {'name': 'nodes', 'elementCount': 202}]}, {'attributeDeclarations': [{'nodes': {'CD_MemberList': {'d': 'string'}, 'CD_AnnotatedMembers_Pvalue': {'d': 'double'}, 'CD_AnnotatedMembers_Size': {'d': 'integer'}, 'CommunityDetectionTally::Unmatched': {'d': 'integer'}, 'CD_AnnotatedMembers': {'d': 'string'}, 'CD_CommunityName': {'d': 'string'}, 'CommunityDetectionTally::in_diffInter': {'d': 'integer'}, 'CommunityDetectionTally::hasHighConfidenceMut': {'d': 'integer'}, 'CommunityDetectionTally::connectedToASDPPI': {'d': 'integer'}, 'CD_AnnotatedMembers_Overlap': {'d': 'double'}, 'CommunityDetect

In [7]:
# print(model)

In [7]:
# This NeSA-specific excel spreadsheet contains ASD gene candidacy information
# Set the file path for 'geneCandidacy_DF.xlsx' in the 'nesa' folder
file_path = os.path.join(get_model_directory_path(model_name, version ), 'geneCandidacy_DF.xlsx')  # SA: Note 

# Load the first worksheet of the Excel file into a DataFrame
df = pd.read_excel(file_path, sheet_name=0)

# Convert the DataFrame to a dictionary indexed by the first column
gene_data = dataframe_to_dict(df)


### Perform topological sorting of systems

In [55]:
# Get edge list and convert to directed graph from child to parent
edgesFilePath = os.path.join(get_model_directory_path(model_name, version ), 'hidef_50_0.75_5_leiden.edges')  # SA: Note 
edgesFile_DF = pd.read_csv(edgesFilePath, header=None, sep="\t")  
edgesFile_DF = edgesFile_DF.rename(columns={0: "parentSystem", 1: "childSystem", 2: "relType"})

In [64]:
edgesFilePath

'/Users/salkhairy/Desktop/projects/model_annotation/nesa/Krogan_230424/hidef_50_0.75_5_leiden.edges'

In [56]:
edgesFile_DF.head()

Unnamed: 0,parentSystem,childSystem,relType
0,Cluster0-0,Cluster1-0,default
1,Cluster0-0,Cluster1-1,default
2,Cluster0-0,Cluster1-2,default
3,Cluster0-0,Cluster1-3,default
4,Cluster0-0,Cluster1-4,default


In [57]:
edgesFile_graph = nx.from_pandas_edgelist(edgesFile_DF, 'childSystem', 'parentSystem')
edgesFile_graph  = edgesFile_graph.to_directed()

In [58]:
not nx.is_directed_acyclic_graph(edgesFile_graph)

True

In [63]:
nx.minimum_cycle_basis(edgesFile_graph)

NetworkXNotImplemented: not implemented for directed type

In [45]:
# if not DAG, convert to DAG
if  not nx.is_directed_acyclic_graph(edgesFile_graph): 
    edgesFile_graph_DAG =  nx.transitive_reduction(edgesFile_graph)

NetworkXError: Directed Acyclic Graph required for transitive_reduction

In [38]:
list(nx.simple_cycles(edgesFile_graph))

KeyboardInterrupt: 

In [36]:
list(nx.topological_sort(edgesFile_graph))

NetworkXUnfeasible: Graph contains a cycle or graph changed during iteration

In [49]:
list(nx.topological_generations(edgesFile_graph))

NetworkXUnfeasible: Graph contains a cycle or graph changed during iteration

## System level 

In [9]:
system_name_list = ["Cluster5-3", "Cluster4-10", "Cluster1-26", "Cluster3-16" , "Cluster7-0", "Cluster5-8", "Cluster4-14", "Cluster2-41", "Cluster2-20"]

In [10]:
for system_name in system_name_list:
    print("================================================")
    print(system_name)
    ## Select the system and get genes
    system = get_system(model, system_name)
    system["genes_attribute"] = "CD_MemberList"
    genes = get_genes(system)
    # print(f'{system_name}: {genes}')
    
    
    ## Get HUGO data
    print("Getting HUGO data")
    hugo_data = get_hugo_data(system)
    write_system_json(hugo_data, model_name, version, system_name, "hugo", get_root_path()) 

    ## Get genes from model data for system
    gene_candidacy_text = make_gene_candidacy_text(gene_data, get_genes(system))
    
    ## Get Uniprot Data
    print("Getting Uniprot data")
    # Gathers a protein's function, pathway, disease association, aliases, and summary description data from the uniprot database using its REST api
    hugo_data = read_system_json(model_name, version +'/'+ system_name, system_name, "hugo", get_root_path()) # SA modified
    uniprot_data = get_uniprot_data_for_system(system, hugo_data=hugo_data)
    write_system_json(uniprot_data, model_name, version, system_name, "uniprot", get_root_path()) # SA modified
    
    ## Summarized Features
    # analyze the information to find features shared between n or more system proteins
    print("Summarizing features")
    summarized_features = summarize_uniprot_features(uniprot_data)
    tsv_data = summarized_uniprot_features_to_tsv(summarized_features)
    write_system_tsv(tsv_data, model_name, version +'/' + system_name, system_name, "uniprot_summary", get_root_path()) # SA modified
    tsv_file = StringIO(tsv_data)
    df = pd.read_csv(tsv_file, sep='\t')
    
    ## Create Prompts
    print("Creating prompts")
    prompt = create_nesa_chatGPT_prompt(get_genes(system), tsv_data, gene_candidacy_text =gene_candidacy_text)
    prompt_page = create_system_prompt_page(system_name, prompt)
    write_system_page(prompt_page, model_name, version +'/'+ system_name, system_name, "chatgtp_prompt", get_root_path())
    analysis_page = create_nesa_system_analysis_page(model_name, version, system_name, get_genes(system), tsv_data)
    write_system_page(analysis_page, model_name, version +'/'+ system_name, system_name, "analysis", get_root_path())

    
    ## ToDo: 
        # automatically call chatGPT with prompt
        # save chatGPT response
        # grab the name that chatGPT provided
        # validate references - Ingoo 
    

Cluster5-3
Getting HUGO data
Getting Uniprot data
Summarizing features
Creating prompts
Cluster4-10
Getting HUGO data
Getting Uniprot data
Summarizing features
Creating prompts
Cluster1-26
Getting HUGO data
Getting Uniprot data
Summarizing features
Creating prompts
Cluster3-16
Getting HUGO data
Getting Uniprot data
Summarizing features
Creating prompts
Cluster7-0
Getting HUGO data
Getting Uniprot data
Summarizing features
Creating prompts
Cluster5-8
Getting HUGO data
Getting Uniprot data
Summarizing features
Creating prompts
Cluster4-14
Getting HUGO data
Getting Uniprot data
Summarizing features
Creating prompts
Cluster2-41
Getting HUGO data
Getting Uniprot data
Summarizing features
Creating prompts
Cluster2-20
Getting HUGO data
Getting Uniprot data
Summarizing features
Creating prompts


In [None]:
# update the model page to include links to the new pages
write_model_page(model_name, version , get_root_path())

In [None]:
# return prompt as html or json