### Load the NeSA, April 2023 model


In [None]:

import os
from file_io import get_model_directory_path
import json

# Identify the model release to work with. These names are reused by later
# cells when reading and writing per-system files.
model_name = "nesa"
version = "april_2023"
# Name of the CX2 file, inside the model directory, that holds the model.
model_cx2_filename = "hidef_50_0.75_5_leiden_pruned.edges.cx2"
# Show the directory that get_model_directory_path resolves for this model/version.
print(get_model_directory_path(model_name, version))


# Full path of the CX2 file: the model directory joined with the file name.
model_path = os.path.join(get_model_directory_path(model_name, version), model_cx2_filename)
print(model_path)
# Read the file as UTF-8 text and parse it as JSON; `model` holds the parsed
# content and is passed to get_system in the next cell.
with open(model_path, encoding='utf-8') as f:
    data = f.read()
    model = json.loads(data)
# Uncomment to dump the whole parsed model (large output).
#print(model)

### Select the system

In [3]:
from file_io import write_system_json, get_root_path
from model_cx2 import get_system, get_genes

# Name of the system to analyze; reused by later cells for file and page names.
system_name = "Cluster2-157"
# Look the system up by name in the model loaded above.
system = get_system(model, system_name)
# Record which attribute holds the member gene list. This mutates `system` in
# place; presumably get_genes reads this key -- confirm in model_cx2.
system["genes_attribute"] = "CD_MemberList"
genes = get_genes(system)
print(f'{system_name}: {genes}')
# Bare last expression: display the full system record as the cell output.
system

Cluster2-157: ['CHST9', 'DKK1', 'LRP5', 'LRP6', 'RASD2', 'TK2']


{'id': 3909069,
 'x': -1296.4951064560441,
 'y': -232.88461538461434,
 'v': {'systemSize': 6,
  'CD_AnnotatedMembers_Pvalue': 2.079760112471739e-05,
  'CD_AnnotatedMembers_Size': 3,
  'System_index': 17453,
  'Genes': 'LRP6,LRP5,CHST9,DKK1,RASD2,TK2',
  'CommunityDetectionTally::hasHighConfidenceMut': 2,
  'CD_AnnotatedMembers_Overlap': 0.0,
  'Selection_pressure': 0.00383,
  'CommunityDetectionTally::in_SFARI_cat_2_3': 0,
  'CD_MemberList_LogSize': 2.584962500721156,
  'Rank_of_model_input': '73|243|17295|17295|17295|17295',
  'CommunityDetectionTally::in_WES_2020': 0,
  'CommunityDetectionTally::in_WES_2022': 0,
  'CD_MemberList': 'CHST9 DKK1 LRP5 LRP6 RASD2 TK2',
  'CommunityDetectionTally::Unmatched': 4,
  'CD_AnnotatedMembers': 'LRP5 DKK1 LRP6',
  'CD_CommunityName': 'regulation of ossification (GO:0030278)',
  'CommunityDetectionTally::connectedToASDPPI': 0,
  'numKnownASDGenes': 0.0,
  'Column1': 15,
  'geneCount': 'LRP6(3);LRP5(2);CHST9(NaN);DKK1(NaN);RASD2(NaN);TK2(NaN)',
  'p

### Get HUGO data

In [4]:
from file_io import write_system_json
from hugo import get_hugo_data

# Fetch HUGO data for the selected system (the helper logs one
# "getting Hugo data for ..." line per gene, as shown in the cell output).
hugo_data = get_hugo_data(system)

# Persist the result under the "hugo" name for this model/version/system so
# later cells can reload it with read_system_json. get_root_path is imported
# in the "Select the system" cell.
write_system_json(hugo_data, model_name, version, system_name, "hugo", get_root_path())

getting Hugo data for CHST9
getting Hugo data for DKK1
getting Hugo data for LRP5
getting Hugo data for LRP6
getting Hugo data for RASD2
getting Hugo data for TK2


### Load Model Data
This NeSA-specific Excel spreadsheet contains ASD gene candidacy information.

In [7]:
import pandas as pd
import os
from file_io import read_system_json

def dataframe_to_dict(df):
    """
    Turn a pandas DataFrame into a nested dictionary keyed by its first column.

    Each value of the first column becomes a key; the value stored under it is
    a dictionary mapping the remaining column names to that row's values.

    :param df: The pandas DataFrame to convert.
    :return: A dictionary indexed by the first column.
    """
    key_column = df.columns[0]
    # Re-index on the key column, then let pandas emit one inner dict per row.
    return df.set_index(key_column).to_dict(orient='index')

def make_gene_candidacy_text(gene_data, selected_genes):
    """
    Summarize, as plain text, which selected genes carry each ASD candidacy flag.

    :param gene_data: A dictionary mapping a gene name to a dictionary of its
        attributes; each gene that is in selected_genes must provide every
        flag listed below.
    :param selected_genes: The collection of gene names to report on; genes in
        gene_data that are not in it are ignored.
    :return: One line per flag that is set (equal to 1) for at least one
        selected gene, formatted as "<description> <gene>, <gene>, ...".
        Lines follow the order of the flags below, genes follow the order of
        gene_data, and the result is an empty string when no flag is set.
    """
    attribute_descriptions = {
        'hasHighConfidenceMut': "Genes with high confidence mutation in ASD-diagnosed individuals:",
        'in_WES_2020': "ASD-risk genes identified in Satterstrom et al., 2020:",
        'in_WES_2022': "ASD-risk genes identified in Fu et al., 2022:",
        'connectedToASDPPI': "Proteins connected to ASD-risk proteins (AP-MS experiment):",
        'in_SFARI_cat_2_3': "ASD-risk in SFARI categories 2 and 3:"
    }

    # Bucket every selected gene under each flag it has set, visiting the
    # genes once in gene_data order.
    flagged = {name: [] for name in attribute_descriptions}
    for gene, record in gene_data.items():
        if gene in selected_genes:
            for name, members in flagged.items():
                if record[name] == 1:
                    members.append(gene)

    # Emit a line only for flags that matched at least one gene.
    lines = [
        f"{attribute_descriptions[name]} {', '.join(members)}\n"
        for name, members in flagged.items()
        if members
    ]
    return ''.join(lines).strip()


# Path of 'geneCandidacy_DF.xlsx' inside the directory resolved for this
# model and version (get_model_directory_path is imported in the first cell).
file_path = os.path.join(get_model_directory_path(model_name, version), 'geneCandidacy_DF.xlsx')

# Load the first worksheet of the Excel file into a DataFrame
df = pd.read_excel(file_path, sheet_name=0)

# Convert the DataFrame to a dictionary indexed by the first column
# (presumably the gene symbol -- confirm against the spreadsheet).
gene_data = dataframe_to_dict(df)
# Build the candidacy summary for the genes of the selected system only.
gene_candidacy_text = make_gene_candidacy_text(gene_data, get_genes(system))
# Bare last expression: display the summary text as the cell output.
gene_candidacy_text   

'Genes with high confidence mutation in ASD-diagnosed individuals: LRP5, LRP6'

### Get Uniprot Data
Gathers each protein's function, pathway, disease association, alias, and summary description data from the UniProt database using its REST API.

In [8]:
from uniprot import get_uniprot_data_for_system
from file_io import read_system_json

# Reload the HUGO data written by the "Get HUGO data" cell from disk, so this
# cell does not depend on the in-memory `hugo_data` from that cell.
hugo_data = read_system_json(model_name, version, system_name, "hugo", get_root_path())
# Query UniProt for the system, passing the HUGO data to the helper.
uniprot_data = get_uniprot_data_for_system(system, hugo_data=hugo_data)
# Persist the result under the "uniprot" name for this model/version/system.
write_system_json(uniprot_data, model_name, version, system_name, "uniprot", get_root_path())

gene names: ['CHST9', 'DKK1', 'LRP5', 'LRP6', 'RASD2', 'TK2']
gene name = CHST9
uniprot_ids = ['Q7L1S5']
querying uniprot id Q7L1S5
gene name = DKK1
uniprot_ids = ['O94907']
querying uniprot id O94907
gene name = LRP5
uniprot_ids = ['O75197']
querying uniprot id O75197
gene name = LRP6
uniprot_ids = ['O75581']
querying uniprot id O75581
gene name = RASD2
uniprot_ids = ['Q96D21']
querying uniprot id Q96D21
gene name = TK2
uniprot_ids = ['O00142']
querying uniprot id O00142


### Summarized Features
Analyze the UniProt information to find features shared by n or more of the system's proteins.

In [9]:
import pandas as pd
from io import StringIO
from file_io import write_system_tsv
from uniprot import summarize_uniprot_features, summarized_uniprot_features_to_tsv

# Summarize the UniProt data gathered above and serialize the summary as TSV
# text; `tsv_data` is reused by the "Create Prompts" cell.
summarized_features = summarize_uniprot_features(uniprot_data)
tsv_data = summarized_uniprot_features_to_tsv(summarized_features)
# Persist the TSV under the "uniprot_summary" name for this model/version/system.
write_system_tsv(tsv_data, model_name, version, system_name, "uniprot_summary", get_root_path())


# Parse the TSV text back into a DataFrame purely for display.
# Note: this rebinds `df`, which previously held the gene-candidacy spreadsheet.
tsv_file = StringIO(tsv_data)
df = pd.read_csv(tsv_file, sep='\t')
# Bare last expression: render the summary table as the cell output.
df

Unnamed: 0,Feature,Number of Genes,Genes
0,membrane,4,"CHST9, LRP5, LRP6, RASD2"
1,plasma membrane,4,"DKK1, LRP5, LRP6, RASD2"
2,extracellular region,3,"CHST9, DKK1, LRP6"
3,canonical Wnt signaling pathway,3,"DKK1, LRP5, LRP6"
4,Disease variant,3,"LRP5, LRP6, TK2"
...,...,...,...
179,thymidine metabolic process,1,TK2
180,Mitochondrial DNA depletion syndrome 2,1,TK2
181,Progressive external ophthalmoplegia with mito...,1,TK2
182,Primary mitochondrial disease,1,TK2


### Create Prompts

In [17]:
from model_cx2 import get_genes
from io import StringIO
import pandas as pd
from model_cx2 import get_genes
from chatgpt_prompts import create_system_prompt_page
from pages import write_system_page, write_model_page, dataframe_to_html_table


def create_nesa_chatGPT_prompt(protein_list, tsv_data, n_genes=2, gene_candidacy_text=''):
    """
    Create a ChatGPT prompt based on the given protein list and TSV data.

    The prompt text is assembled from fixed instruction blocks, the protein
    names, the optional ASD candidacy facts and the feature summary, and is
    then wrapped in an HTML snippet that offers a "Copy Prompt" button.

    :param protein_list: A list of protein names.
    :param tsv_data: A string containing TSV formatted summary data.
    :param n_genes: An integer representing the minimum number of genes for a feature to be included.
    :param gene_candidacy_text: Optional text with ASD-related facts about the
        proteins. When it is empty or blank, the ASD-facts section is left out
        of the prompt instead of emitting a header with nothing under it.
    :return: A string containing the ChatGPT prompt in HTML format.
    """
    # Local import: the standard-library escaper is only needed here.
    from html import escape

    # Read the TSV data into a DataFrame
    tsv_file = StringIO(tsv_data)
    df = pd.read_csv(tsv_file, sep='\t')

    preamble = "You are assisting a molecular biologist in the analysis of a system of interacting proteins \n"
    
    autism_instructions = "\nA critical goal of the analysis is to determine what, if any, relationship this system has to ASD (Autism Spectrum Disorder)"
    
    general_analysis_instructions = "\nSave any summary analysis of the system to the last paragraph. \
                \nAvoid overly general statements of how the proteins are involved in various cellular processes\n\
                \nAvoid recapitualting the goals of the analysis. \n\
                \nYour response should be formatted as HTML paragraphs"
    
    task_instructions = "\nFirst, write a critical analysis of this system, describing your reasoning as you go.\
    \nWhat mechanisms and biological processes are performed by this system?\
    \nWhat cellular components and complexes are involved in this system?\
    \nSecond, analyze the proteins to discuss which of them might be the product of novel ASD-risk genes\
    \nThird, discuss potential names for the system. Select the best name and place it in a paragraph \
    at the beginning of your output"

    # Assemble the plain-text prompt
    prompt_text = "\n"
    prompt_text += preamble
    prompt_text += autism_instructions
    prompt_text += task_instructions
    prompt_text += general_analysis_instructions

    prompt_text += '\nProteins: '
    prompt_text += ", ".join(protein_list) + ".\n\n"

    # Only announce ASD-related facts when there are facts to show.
    if gene_candidacy_text and gene_candidacy_text.strip():
        prompt_text += "\nHere are some ASD-related facts about these proteins"
        prompt_text += f"\n{gene_candidacy_text}"

    prompt_text += '\n\nSystem features from a Uniprot analysis: \n'
    # Forward n_genes so the caller's threshold is honored rather than the
    # helper's own default.
    prompt_text = add_uniprot_feature_summary(prompt_text, df, n_genes=n_genes)

    # Escape &, < and > so text taken from external data cannot break the
    # surrounding markup. copyPrompt() reads innerText, so the copied prompt
    # still contains the original characters.
    escaped_prompt_text = escape(prompt_text, quote=False)

    prompt = "<div class='code-section'><button class='copy-prompt-button' onclick='copyPrompt()'>Copy Prompt</button>"
    prompt += f"<pre><code id='prompt-code'>{escaped_prompt_text}</code></pre></div>"
    prompt += "<script>function copyPrompt() {var copyText = document.getElementById('prompt-code').innerText; navigator.clipboard.writeText(copyText);}</script>"
    return prompt

def add_uniprot_feature_summary(prompt_text, feature_dataframe, n_genes=2):
    """
    Append one line per sufficiently shared feature to the prompt text.

    :param prompt_text: The prompt text built so far.
    :param feature_dataframe: A pandas DataFrame with 'Feature',
        'Number of Genes' and 'Genes' columns.
    :param n_genes: Minimum 'Number of Genes' value for a feature to be listed.
    :return: prompt_text followed by a
        "<Feature>: <count> proteins: <Genes>" line for every qualifying row,
        in DataFrame order. Rows whose count is missing are treated as 0.
    """
    feature_lines = []
    for _, row in feature_dataframe.iterrows():
        raw_count = row['Number of Genes']
        # pandas stores a missing cell as NaN rather than None, and int(NaN)
        # raises ValueError, so test with pd.isna (which also covers None).
        number_of_genes = 0 if pd.isna(raw_count) else int(raw_count)
        if number_of_genes >= n_genes:
            # Print the integer count: a column containing NaN is float-typed
            # and would otherwise render as e.g. "4.0".
            feature_lines.append(f"{row['Feature']}: {number_of_genes} proteins: {row['Genes']}\n")
    return prompt_text + ''.join(feature_lines)


def create_nesa_system_analysis_page(model_name, version, system_name, protein_list, tsv_data, n_genes=2):
    """
    Build the HTML analysis page for one system.

    The page contains a placeholder section for a pasted ChatGPT analysis, the
    comma-separated protein list, and a table of the summary rows whose
    'Number of Genes' value is at least n_genes.

    :param model_name: Model name shown in the page's sub-heading.
    :param version: Model version shown in the page's sub-heading.
    :param system_name: System name used in the page title and main heading.
    :param protein_list: A list of protein names.
    :param tsv_data: A string containing TSV formatted summary data with a
        'Number of Genes' column.
    :param n_genes: Minimum 'Number of Genes' value for a row to be shown.
    :return: A string containing the complete HTML page.
    """
    # Parse the TSV summary and keep only rows that meet the n_genes threshold
    features = pd.read_csv(StringIO(tsv_data), sep='\t')
    meets_threshold = features['Number of Genes'] >= n_genes
    uniprot_table = dataframe_to_html_table(features.loc[meets_threshold])

    page_title = f"{system_name} Summary"

    # Placeholder section; the analysis text is pasted into the page by hand
    chatgpt_analysis = "<h2>ChatGPT 4 Analysis</h2>\n<p>Paste ChatGPT analysis here:</p>\n<!-- Analysis goes here -->"

    # The backslash continuations keep each source line's leading spaces inside
    # the string, so the layout of this literal is part of the emitted page.
    page_html = f"<!DOCTYPE html>\n<html>\n<head>\n<title>{page_title}</title>\n<style>\n \
              body {{background-color: skyblue;}} \n \
              h1, h2 {{color: white; font-family: 'Roboto', sans-serif;}} \n \
              </style>\n</head>\n<body>\n<h1>{system_name} System Summary</h1>\n \
              <h2>{model_name}: {version}</h2>\n \
              \n{chatgpt_analysis}\
              <h2>Proteins</h2>\n<p>{', '.join(protein_list)}</p>\n \
              <h2>UniProt Data</h2>\n{uniprot_table}\n \
              </body>\n</html>"

    return page_html

# Build the ChatGPT prompt. gene_candidacy_text comes from the "Load Model Data"
# cell; passing it fills the prompt's ASD-facts section, which would otherwise
# be empty because the parameter defaults to ''.
prompt = create_nesa_chatGPT_prompt(get_genes(system), tsv_data, gene_candidacy_text=gene_candidacy_text)
prompt_page = create_system_prompt_page(system_name, prompt)
# NOTE(review): "chatgtp_prompt" looks like a typo of "chatgpt_prompt"; it is
# left unchanged because existing files or links may depend on this name.
write_system_page(prompt_page, model_name, version, system_name, "chatgtp_prompt", get_root_path())
# Build and write the analysis page for the same system.
analysis_page = create_nesa_system_analysis_page(model_name, version, system_name, get_genes(system), tsv_data)
write_system_page(analysis_page, model_name, version, system_name, "analysis", get_root_path())
# update the model page to include links to the new pages
write_model_page(model_name, version, get_root_path())