## NeSA System Report
#### Goal: 
 - Assign a candidate name for the system
 - Explain the assignment, including alternatives
 - Provide information about the system and its proteins
 - Provide results of analyses
 
#### Report Structure:
 - the system ID
 - The assigned name (human chosen)
 - A brief summary of the system
 - ChatGPT candidate names
     - Including an explanation of each name
 - Supporting information
     - system level
         - summary of shared features
         - features include processes, cellular components, and diseases
     - per gene from uniprot (TBD)
         - gene name
         - gene summary
         - go BP
         - go CC
         - Disease association
     - ChatGPT
         - General analysis
         - search suggestions
             - google
             - pubmed
         - Proteins of special interest, such as candidate drug targets
         

### Model information

In [2]:
import os
from file_io import get_model_path
# Get the user's home directory
home_dir = os.path.expanduser("~")

# Create the path for the "models" folder in the home directory
models_path = os.path.join(home_dir, "models")

print(models_path)

model_name = "nesa"
version = "1"

# The original cell printed model_path without ever assigning it (NameError).
# Build it as <models folder>/<model name>/<version>, which matches the
# output recorded for this cell.
# NOTE(review): file_io.get_model_path is imported above but never called;
# presumably it was the intended helper -- confirm its signature before
# switching to it.
model_path = os.path.join(models_path, model_name, version)
print(model_path)

/Users/depratt/models
/Users/depratt/models/nesa/1


### Load the NeSA model

This needs to be updated to get the data from the models/nesa/v1 folder 

In [4]:
import json
# Name of the .cx2 file that holds the model.
cx2_filename = "hidef_50_0.75_5_leiden_pruned.edges.cx2"
# NOTE(review): this resolves to ~/<model_name>/<file>, not to the
# ~/models/<model_name>/<version> folder; the markdown above flags this as
# still to be updated.
temp_model_path = os.path.join(home_dir, model_name, cx2_filename)
with open(temp_model_path, encoding='utf-8') as f:
    # Read the whole file as text, then parse that text as JSON.
    data = f.read()
    model = json.loads(data)


### Select the system


In [7]:
from file_io import find_system_in_systems, find_first_dict_with_key, write_system_json, get_root_path

# root_path = os.getenv("MODEL_ANNOTATION_ROOT")
# print(f'model annotation root = {root_path}')

# Name of the system that the rest of the notebook annotates.
system_name = "Cluster4-40"
# Take the "nodes" entry of the dict returned by find_first_dict_with_key
# (presumably the first aspect of the model that has a "nodes" key -- confirm
# in file_io).
systems = find_first_dict_with_key(model, "nodes")["nodes"]
# Keep only the "v" entry of the matching system.
# NOTE(review): assumes find_system_in_systems returns a dict; a missing
# system would make .get fail here -- confirm.
system = find_system_in_systems(systems, system_name).get("v")
# Store the selected system under the "data" label for later cells to re-read.
write_system_json(system, model_name, version, system_name, "data", get_root_path())


### Load Model Data
This function is NeSA-specific
It needs to be updated to get the data from models/nesa

In [9]:
import pandas as pd
import os
from file_io import read_system_json

def dataframe_to_dict(df):
    """
    Build a row lookup from a DataFrame, keyed by the values of its first column.

    :param df: The pandas DataFrame to convert.
    :return: A dict mapping each first-column value to a dict of the
        remaining columns ({column name: cell value}) for that row.
    """
    key_column = df.columns[0]
    return df.set_index(key_column).to_dict(orient='index')

def make_gene_candidacy_text(gene_data, selected_genes):
    """
    Describe which of the selected genes carry each ASD-related flag.

    :param gene_data: Dict mapping gene name to a dict of flag columns; every
        selected gene must have all five flag keys used below.
    :param selected_genes: Collection of gene names to report on.
    :return: One line per flag that has at least one selected gene with value
        1, formatted as "<description> <gene>, <gene>, ...". Lines are joined
        with newlines; the result is an empty string when nothing matches.
    """
    flag_labels = {
        'hasHighConfidenceMut': "Genes with high confidence mutation in ASD-diagnosed individuals:",
        'in_WES_2020': "ASD-risk genes identified in Satterstrom et al., 2020:",
        'in_WES_2022': "ASD-risk genes identified in Fu et al., 2022:",
        'connectedToASDPPI': "Proteins connected to ASD-risk proteins (AP-MS experiment):",
        'in_SFARI_cat_2_3': "ASD-risk in SFARI categories 2 and 3:"
    }

    report_lines = []
    for flag, label in flag_labels.items():
        # Genes keep the iteration order of gene_data.
        flagged = [
            name
            for name, flags in gene_data.items()
            if name in selected_genes and flags[flag] == 1
        ]
        if flagged:
            report_lines.append(f"{label} {', '.join(flagged)}")

    return "\n".join(report_lines).strip()


# Get the path to the 'nesa' folder in your home directory
home_dir = os.path.expanduser("~")
nesa_folder_path = os.path.join(home_dir, "nesa")

# Set the file path for 'geneCandidacy_DF.xlsx' in the 'nesa' folder
file_path = os.path.join(nesa_folder_path, 'geneCandidacy_DF.xlsx')

# Load the first worksheet of the Excel file into a DataFrame
df = pd.read_excel(file_path, sheet_name=0)

# Convert the DataFrame to a dictionary indexed by the first column
# (presumably the gene symbol column -- confirm against the spreadsheet).
gene_data = dataframe_to_dict(df)
# Re-read the system stored under the "data" label and split its
# space-separated "CD_MemberList" value into individual gene names.
system = read_system_json(model_name, version, system_name, "data", get_root_path())
gene_names = system.get("CD_MemberList").split(" ")
print(gene_names)
# Summarise the ASD-related flags of the system's genes as text.
gene_candidacy_text = make_gene_candidacy_text(gene_data, gene_names)
gene_candidacy_text             


['ADAMTS13', 'ADGRV1', 'C16orf46', 'C2CD4A', 'C2CD4B', 'CACNG2', 'CARD16', 'CASP1', 'CC2D1B', 'CELSR1', 'CELSR2', 'CELSR3', 'CHRM5', 'CLBA1', 'CMA1', 'COL20A1', 'DCANP1', 'DCDC1', 'DCDC2B', 'DCHS1', 'DICER1', 'DTWD2', 'DTX4', 'FAM169BP', 'FAT1', 'FAT3', 'FAT4', 'FREM2', 'FZD3', 'HPSE2', 'IFNE', 'INSL4', 'LAG3', 'LINC01588', 'LINC02694', 'LIX1L', 'NUP210P1', 'NXPH2', 'PASK', 'PCDH20', 'PCDH7', 'PPIAL4E', 'PPIAL4G', 'PRSS37', 'RBPJL', 'RPS14P3', 'RPSAP52', 'SMTNL1', 'SOCS1', 'SPATA4', 'SSX6P', 'TAFA5', 'UCN3', 'XAGE1A', 'XAGE1B', 'XCL1', 'XYLB', 'ZBBX']


'Genes with high confidence mutation in ASD-diagnosed individuals: DCDC2B, DICER1, FAT3\nASD-risk in SFARI categories 2 and 3: FAT1'

In [11]:
from file_io import write_system_json
from hugo import get_hugo_data

# Fetch the HUGO data for the selected system (the recorded output below
# shows one "getting Hugo data for <gene>" line per gene).
hugo_data = get_hugo_data(model_name, version, system_name)
#write_system_file(system_name, hugo_data, "hugo")
# Store the result under the "hugo" label so later cells can re-read it.
write_system_json(hugo_data, model_name, version, system_name, "hugo", get_root_path())
# Bare expression: displays the data as the notebook cell output.
hugo_data
        

reading system info from root_path = /Users/depratt/Dropbox (Personal)/GitHub/model_annotation
['ADAMTS13', 'ADGRV1', 'C16orf46', 'C2CD4A', 'C2CD4B', 'CACNG2', 'CARD16', 'CASP1', 'CC2D1B', 'CELSR1', 'CELSR2', 'CELSR3', 'CHRM5', 'CLBA1', 'CMA1', 'COL20A1', 'DCANP1', 'DCDC1', 'DCDC2B', 'DCHS1', 'DICER1', 'DTWD2', 'DTX4', 'FAM169BP', 'FAT1', 'FAT3', 'FAT4', 'FREM2', 'FZD3', 'HPSE2', 'IFNE', 'INSL4', 'LAG3', 'LINC01588', 'LINC02694', 'LIX1L', 'NUP210P1', 'NXPH2', 'PASK', 'PCDH20', 'PCDH7', 'PPIAL4E', 'PPIAL4G', 'PRSS37', 'RBPJL', 'RPS14P3', 'RPSAP52', 'SMTNL1', 'SOCS1', 'SPATA4', 'SSX6P', 'TAFA5', 'UCN3', 'XAGE1A', 'XAGE1B', 'XCL1', 'XYLB', 'ZBBX']
getting Hugo data for ADAMTS13
getting Hugo data for ADGRV1
getting Hugo data for C16orf46
getting Hugo data for C2CD4A
getting Hugo data for C2CD4B
getting Hugo data for CACNG2
getting Hugo data for CARD16
getting Hugo data for CASP1
getting Hugo data for CC2D1B
getting Hugo data for CELSR1
getting Hugo data for CELSR2
getting Hugo data for CEL

{'ADAMTS13': {'hgnc_id': 'HGNC:1366',
  'symbol': 'ADAMTS13',
  'name': 'ADAM metallopeptidase with thrombospondin type 1 motif 13',
  'status': 'Approved',
  'locus_type': 'gene with protein product',
  'prev_symbol': ['C9orf8'],
  'prev_name': ['a disintegrin-like and metalloprotease (reprolysin type) with thrombospondin type 1 motif, 13'],
  'alias_symbol': ['VWFCP',
   'TTP',
   'vWF-CP',
   'FLJ42993',
   'MGC118899',
   'MGC118900',
   'DKFZp434C2322'],
  'location': '9q34.2',
  'date_approved_reserved': '1999-08-23T00:00:00Z',
  'date_modified': '2023-03-15T00:00:00Z',
  'date_name_changed': '2015-11-09T00:00:00Z',
  'ena': ['AJ011374'],
  'entrez_id': '11093',
  'mgd_id': ['MGI:2685556'],
  'iuphar': 'objectId:1685',
  'merops': 'M12.241',
  'orphanet': 117776,
  'pubmed_id': [11557746, 11535495],
  'refseq_accession': ['NM_139025'],
  'gene_group': ['ADAM metallopeptidases with thrombospondin type 1 motif'],
  'date_symbol_changed': '2001-09-21T00:00:00Z',
  'vega_id': 'OTTHUM

### Get Uniprot Data
Gathers a protein's function, pathway, disease association, aliases, and summary description data from the uniprot database using its REST api


In [None]:
from uniprot import get_uniprot_data_for_system

# Re-read the system and its HUGO data from the files written by earlier cells.
system = read_system_json(model_name, version, system_name, "data", get_root_path())
hugo_data = read_system_json(model_name, version, system_name, "hugo", get_root_path())
# Gather the UniProt data for the system; the HUGO data is passed along
# (presumably to map gene symbols to UniProt entries -- confirm in uniprot.py).
uniprot_data = get_uniprot_data_for_system(system, hugo_data=hugo_data)
# Store the result under the "uniprot" label.
write_system_json(uniprot_data, model_name, version, system_name, "uniprot", get_root_path())

### Summarized Features 
Analyze the information to find features shared by n or more of the system's proteins (SPs).


NameError: name 'uniprot_data' is not defined

In [None]:
import pandas as pd

def create_nesa_chatGPT_prompt(protein_list, tsv_data, n_genes=2, gene_candidacy_text=''):
    """
    Create a ChatGPT prompt based on the given protein list and TSV data.

    :param protein_list: A list of protein names.
    :param tsv_data: A string containing TSV formatted summary data with the
        columns 'Feature', 'Number of Genes' and 'Genes'.
    :param n_genes: An integer representing the minimum number of genes for a feature to be included.
    :param gene_candidacy_text: Text describing ASD-related facts about the
        proteins; inserted verbatim into the prompt.
    :return: A string containing the ChatGPT prompt in HTML format (a block
        with the prompt text and a "Copy Prompt" button).
    """
    # Imported locally because the notebook never imports these at cell level
    # (the original raised NameError on StringIO).
    from html import escape
    from io import StringIO

    # Read the TSV data into a DataFrame
    df = pd.read_csv(StringIO(tsv_data), sep='\t')

    # Blank or non-numeric counts are read as NaN, which the original
    # "is not None" check let through to int() and crashed on. Treat them as 0.
    gene_counts = (
        pd.to_numeric(df['Number of Genes'], errors='coerce').fillna(0).astype(int)
    )

    parts = [
        # The newline after this sentence was missing, which fused it with the
        # next one ("...HTML paragraphsThe following...").
        "Your response should be formatted as HTML paragraphs\n",
        "The following is a system of interacting proteins.",
        " Write a critical analysis of this system, describing your reasoning as you go.",
        "\nWhat mechanisms and biological processes are performed by this system?",
        "\nWhat cellular components and complexes are involved in this system?",
        "\nProteins: ",
        ", ".join(protein_list) + ".\n\n",
        "\nA critical goal of the analysis is to determine what, if any, relationship this system has to ASD (Autism Spectrum Disorder)",
        "\nHere are some ASD-related facts about these proteins",
        f"\n{gene_candidacy_text}",
        "\n\nSystem features from a Uniprot analysis: \n",
    ]

    # Only features shared by at least n_genes genes are listed.
    for feature, count, genes in zip(df['Feature'], gene_counts, df['Genes']):
        if count >= n_genes:
            parts.append(f"{feature}: {count} proteins: {genes}\n")

    # Escape &, < and > so feature or gene text cannot break the surrounding
    # markup; the copy button reads innerText, which yields the unescaped text.
    prompt_text = escape("".join(parts), quote=False)

    prompt = "<div class='code-section'><button class='copy-prompt-button' onclick='copyPrompt()'>Copy Prompt</button>"
    prompt += f"<pre><code id='prompt-code'>{prompt_text}</code></pre></div>"
    prompt += "<script>function copyPrompt() {var copyText = document.getElementById('prompt-code').innerText; navigator.clipboard.writeText(copyText);}</script>"
    return prompt

def create_system_prompt_page(system_name, prompt):
    """
    Wrap a ChatGPT prompt in a minimal standalone HTML document.

    :param system_name: The name of the system; used in the page title.
    :param prompt: The prompt markup placed inside the page body.
    :return: A string containing the HTML page.
    """
    page_lines = [
        "<!DOCTYPE html>",
        "<html>",
        "<head>",
        f"<title>{system_name} Summary ChatGPT Prompt</title>",
        "</head>",
        "<body>",
        f"{prompt}",
        "</body>",
        "</html>",
    ]
    return "\n".join(page_lines)

def dataframe_to_html_table(df):
    """
    Render a DataFrame as a plain HTML table.

    The header row holds the column names and each DataFrame row becomes one
    table row. Values are inserted as-is (no HTML escaping) and the index is
    not included.

    :param df: The pandas DataFrame to render.
    :return: A string containing the <table> markup.
    """
    header_cells = "".join(f"<th>{col}</th>" for col in df.columns)
    pieces = [
        "<table>\n",
        "<thead>\n<tr>\n",
        header_cells,
        "</tr>\n</thead>\n<tbody>\n",
    ]
    for _, record in df.iterrows():
        body_cells = "".join(f"<td>{val}</td>" for val in record.values)
        pieces.append(f"<tr>\n{body_cells}</tr>\n")
    pieces.append("</tbody>\n</table>")
    return "".join(pieces)


def create_nesa_system_analysis_page(protein_list, tsv_data, n_genes=2, system_name=None):
    """
    Create the HTML summary page for a system.

    :param protein_list: A list of protein names shown in the "Proteins" section.
    :param tsv_data: A string containing TSV formatted summary data; it must
        have a 'Number of Genes' column.
    :param n_genes: Minimum 'Number of Genes' value for a row to be listed.
    :param system_name: Name used in the page title and heading. When omitted,
        the notebook-level ``system_name`` variable is used, which is what the
        original implementation read implicitly.
    :return: A string containing the HTML page.
    :raises NameError: If no system name is passed and none is defined globally.
    """
    # Imported locally because the notebook never imports StringIO at cell
    # level (the original raised NameError here).
    from io import StringIO

    if system_name is None:
        # Keep existing three-argument calls working by falling back to the global.
        system_name = globals().get("system_name")
        if system_name is None:
            raise NameError("name 'system_name' is not defined")

    # Read the TSV data into a DataFrame
    df = pd.read_csv(StringIO(tsv_data), sep='\t')

    # Filter the DataFrame based on the n_genes criterion
    df = df[df['Number of Genes'] >= n_genes]

    uniprot_table = dataframe_to_html_table(df)

    # Create the ChatGPT analysis section with a placeholder for the analysis text
    chatgpt_analysis = "<h2>ChatGPT 4 Analysis</h2>\n<p>Paste ChatGPT analysis here:</p>\n<!-- Analysis goes here -->"

    # Create the HTML page with the system summary
    page_title = f"{system_name} Summary"
    html_page = f"<!DOCTYPE html>\n<html>\n<head>\n<title>{page_title}</title>\n</head>\n<body>\n<h1>{system_name} System Summary</h1>\n<h2>Proteins</h2>\n<p>{', '.join(protein_list)}</p>\n<h2>UniProt Data</h2>\n{uniprot_table}\n{chatgpt_analysis}\n</body>\n</html>"

    return html_page


def get_file_links(model_name, version, root_path):
    """
    Build the HTML list items that link to every file of a model version.

    Walks <root_path>/<model_name>/<version>. Each directory visited
    (including the top one) contributes a bold heading item with its base
    name, followed by one link item per file; link targets are relative to
    the model version directory.

    :param model_name: The name of the model.
    :param version: The version of the model.
    :param root_path: The root path of the model.
    :return: The concatenated <li> elements (no enclosing list element).
    """
    model_path = os.path.join(root_path, model_name, version)
    items = []

    for current_dir, _subdirs, filenames in os.walk(model_path):
        # Heading for the directory being listed.
        items.append(f"<li><strong>{os.path.basename(current_dir)}</strong></li>")

        # One link per file, addressed relative to the model directory.
        for filename in filenames:
            target = os.path.relpath(os.path.join(current_dir, filename), model_path)
            items.append(f"<li><a href='{target}'>{filename}</a></li>")

    return "".join(items)

# Re-read the selected system and split its space-separated member list.
# (Renamed from `sys`, which shadowed the standard-library module name; the
# other cells already call this value `system`.)
system = read_system_json(model_name, version, system_name, "data", get_root_path())
protein_list = system.get("CD_MemberList").split(" ")

# The original called create_chatGPT_prompt, which is not defined anywhere in
# this notebook; the prompt builder defined in this cell is
# create_nesa_chatGPT_prompt.
# NOTE(review): tsv_data is not assigned anywhere in the notebook -- it
# presumably comes from the "Summarized Features" step; confirm its source.
prompt = create_nesa_chatGPT_prompt(protein_list, tsv_data, gene_candidacy_text=gene_candidacy_text)
prompt_page = create_system_prompt_page(system_name, prompt)
# NOTE(review): write_system_page and write_model_page are never imported in
# this notebook -- presumably they live in file_io; confirm and import them.
# NOTE(review): "chatgtp_prompt" looks like a typo for "chatgpt_prompt" but is
# kept because it determines the name of the written page.
write_system_page(prompt_page, model_name, version, system_name, "chatgtp_prompt", get_root_path())
analysis_page = create_nesa_system_analysis_page(protein_list, tsv_data)
write_system_page(analysis_page, model_name, version, system_name, "analysis", get_root_path())

write_model_page(model_name, version, get_root_path())

In [None]:
# Next round of prompts
'''
Your response should be formatted as HTML paragraphs
The following is a system of interacting proteins. 
Write a critical analysis of this system, describing your reasoning as you go.
What mechanisms and biological processes are performed by this system?
What cellular components and complexes are involved in this system?
Do not recapitulate the list of proteins, do not simply restate the other information provided. 
'''

'''
Give me 5 candidate names for this system. 
The names should not include direct references to ASD. Do not create acronyms.
format your output as an HTML list

'''

'''

Genes with high confidence mutation in ASD-diagnosed individuals: DCDC2B, DICER1, FAT3
ASD-risk in SFARI categories 2 and 3: FAT1
Are any of the genes with high-confidence mutations in this system potentially novel ASD-risk genes? 
If a gene is included in one of the ASD-risk gene sets, such as SFARI, it is not novel. 
What other evidence supports each candidate? 
For example, is it associated with a disease that is co-morbid with ASD? 
Give specific, analytic reasons for each candidate. Be succinct and omit general caveats.
Format your output as HTML paragraphs

'''




### Uniprot Report

In [None]:
import pandas as pd

def format_uniprot_report(system_id, uniprot_data):
    """
    Write the UniProt data of a system to "<system_id>_uniprot_report.html".

    :param system_id: Identifier used in the page title, heading and file name.
    :param uniprot_data: Dict of per-gene dicts; it is transposed so that each
        gene becomes one table row and each field one column.
    :return: None. The report is written to the current working directory and
        a confirmation line is printed.
    """
    df = pd.DataFrame(uniprot_data).transpose()
    # DataFrame.to_html() already returns a complete <table> element, so it is
    # inserted directly; the original wrapped it in a second <table>, which
    # produced invalid nested tables.
    html_table = df.to_html()

    html_content = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{system_id} UniProt Report</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            margin: 40px;
        }}
        table {{
            border-collapse: collapse;
            width: 100%;
        }}
        th, td {{
            border: 1px solid #dddddd;
            padding: 8px;
            text-align: left;
        }}
        th {{
            background-color: #f2f2f2;
        }}
    </style>
</head>
<body>
    <h1>{system_id} UniProt Report</h1>
    {html_table}
</body>
</html>
"""

    # Write as UTF-8 so the file matches the charset declared in the page.
    with open(f"{system_id}_uniprot_report.html", "w", encoding="utf-8") as f:
        f.write(html_content)

    print(f"UniProt report saved to {system_id}_uniprot_report.html")


### Get Tissue Expression Data

In [None]:
import pandas as pd

# GTEx
def download_gtex_data():
    """
    Download the GTEx v8 sample annotations and gene TPM expression table.

    :return: A tuple (sample metadata DataFrame, expression DataFrame). Only
        the first 100 expression rows are read (demonstration limit).
    """
    sample_attributes_url = "https://storage.googleapis.com/gtex_analysis_v8/sample_attributes/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"
    gene_tpm_url = "https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz"

    sample_metadata = pd.read_csv(sample_attributes_url, sep='\t')
    # The first two lines of the gzip-compressed file are skipped before the
    # header row is read.
    expression = pd.read_csv(
        gene_tpm_url,
        sep='\t',
        skiprows=2,
        compression='gzip',
        nrows=100,  # Limit to 100 rows for demonstration purposes
    )
    return sample_metadata, expression

def filter_gtex_brain_expression(gtex_sample_metadata, gtex_expression_data):
    """
    Keep only the brain-sample columns of the GTEx expression table.

    :param gtex_sample_metadata: DataFrame with 'SAMPID' and 'SMTS' columns;
        rows whose 'SMTS' equals 'Brain' identify the brain samples.
    :param gtex_expression_data: DataFrame with 'Name' and 'Description'
        columns plus one column per sample id.
    :return: DataFrame with 'Name', 'Description' and the brain sample columns
        present in the expression table, in the expression table's own order.
    """
    brain_samples = gtex_sample_metadata[gtex_sample_metadata['SMTS'] == 'Brain']
    brain_sample_ids = set(brain_samples['SAMPID'])

    # Select in the expression table's column order. The original took the
    # columns from a set intersection, so their order changed from run to run.
    brain_columns = list(dict.fromkeys(
        column for column in gtex_expression_data.columns if column in brain_sample_ids
    ))

    return gtex_expression_data[['Name', 'Description'] + brain_columns]

# BrainSpan
def download_brainspan_data():
    """
    Download BrainSpan metadata and expression data.

    The metadata is the first HTML table found on the BrainSpan download
    page; the expression data is read as tab-separated text.

    :return: A tuple (metadata DataFrame, expression DataFrame). Only the
        first 100 expression rows are read (demonstration limit).
    """
    download_page_url = "http://www.brainspan.org/static/download.html"
    expression_file_url = "http://www.brainspan.org/api/v2/well_known_file_download/267666525"

    tables_on_page = pd.read_html(download_page_url)
    metadata = tables_on_page[0]
    expression = pd.read_csv(
        expression_file_url,
        sep='\t',
        nrows=100,  # Limit to 100 rows for demonstration purposes
    )
    return metadata, expression

def filter_brainspan_brain_expression(brainspan_metadata, brainspan_expression_data):
    """
    Keep only the brain columns of the BrainSpan expression table.

    :param brainspan_metadata: DataFrame with 'Column Type' and 'Column ID'
        columns; rows whose 'Column Type' equals 'brain' identify the columns
        to keep.
    :param brainspan_expression_data: DataFrame with 'gene_id' and
        'ensembl_gene_id' columns plus the data columns.
    :return: DataFrame with 'gene_id', 'ensembl_gene_id' and the brain columns
        present in the expression table, in the expression table's own order.
    """
    brain_regions = brainspan_metadata[brainspan_metadata['Column Type'] == 'brain']
    brain_column_ids = set(brain_regions['Column ID'])

    # Select in the expression table's column order. The original took the
    # columns from a set intersection, so their order changed from run to run.
    brain_columns = list(dict.fromkeys(
        column for column in brainspan_expression_data.columns if column in brain_column_ids
    ))

    return brainspan_expression_data[['gene_id', 'ensembl_gene_id'] + brain_columns]

def download_and_filter_brain_expression_data():
    """
    Download the GTEx and BrainSpan data and reduce both to brain columns.

    Prints the first five rows of each filtered table.

    :return: A tuple (BrainSpan brain expression, GTEx brain expression) --
        note that BrainSpan comes first.
    """
    # GTEx: download, then filter
    gtex_brain = filter_gtex_brain_expression(*download_gtex_data())

    # BrainSpan: download, then filter
    brainspan_brain = filter_brainspan_brain_expression(*download_brainspan_data())

    # Show a sample of each result
    for heading, table in (
        ("GTEx brain expression data (first 5 rows):", gtex_brain),
        ("\nBrainSpan brain expression data (first 5 rows):", brainspan_brain),
    ):
        print(heading)
        print(table.head())

    return brainspan_brain, gtex_brain



In [None]:
# Download the BrainSpan metadata and expression tables (network access);
# neither table is filtered to brain columns at this point.
brainspan_metadata, brainspan_expression_data = download_brainspan_data()

### Tissue Expression Report
Report on the BrainSpan and GTEx data

In [None]:
def format_expression_report(system_id, gtex_data, brainspan_data):
    """
    Write the GTEx and BrainSpan data to "<system_id>_expression_report.html".

    :param system_id: Identifier used in the page title, heading and file name.
    :param gtex_data: GTEx data; anything accepted by the pandas DataFrame
        constructor.
    :param brainspan_data: BrainSpan data; anything accepted by the pandas
        DataFrame constructor.
    :return: None. The report is written to the current working directory and
        a confirmation line is printed.
    """
    gtex_df = pd.DataFrame(gtex_data)
    brainspan_df = pd.DataFrame(brainspan_data)

    # DataFrame.to_html() already returns complete <table> elements, so they
    # are inserted directly; the original wrapped each one in a second
    # <table>, which produced invalid nested tables.
    gtex_html_table = gtex_df.to_html()
    brainspan_html_table = brainspan_df.to_html()

    html_content = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{system_id} Expression Report</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            margin: 40px;
        }}
        table {{
            border-collapse: collapse;
            width: 100%;
        }}
        th, td {{
            border: 1px solid #dddddd;
            padding: 8px;
            text-align: left;
        }}
        th {{
            background-color: #f2f2f2;
        }}
    </style>
</head>
<body>
    <h1>{system_id} Expression Report</h1>
    <h2>GTEx Data</h2>
    {gtex_html_table}
    <h2>BrainSpan Data</h2>
    {brainspan_html_table}
</body>
</html>
"""

    # Write as UTF-8 so the file matches the charset declared in the page.
    with open(f"{system_id}_expression_report.html", "w", encoding="utf-8") as f:
        f.write(html_content)

    print(f"Expression report saved to {system_id}_expression_report.html")


In [None]:
# The original call used system_id, gtex_brain_expression and
# brainspan_brain_expression, none of which is assigned at notebook level.
# The filtered tables are produced by download_and_filter_brain_expression_data
# (which returns BrainSpan first, then GTEx), and the notebook's identifier
# for the selected system is system_name.
# NOTE(review): this downloads both datasets (network access) -- confirm this
# is acceptable at this point of the notebook.
brainspan_brain_expression, gtex_brain_expression = download_and_filter_brain_expression_data()
format_expression_report(system_name, gtex_brain_expression, brainspan_brain_expression)

### ChatGPT Summarize Analysis Parcel


In [None]:
import os
import openai
import json

openai.api_key = os.getenv("OPENAI_API_KEY")

def create_parcel(data):
    """
    Flatten a dict into text with one "key: value" line per entry.

    :param data: The dict to flatten.
    :return: The lines joined with newlines (empty string for an empty dict).
    """
    lines = []
    for field, content in data.items():
        lines.append(f"{field}: {content}")
    return "\n".join(lines)

def summarize_parcel(parcel):
    """
    Ask the OpenAI completion endpoint to summarise a parcel of text.

    The parcel is appended to a fixed instruction asking for brief text plus
    relevant keywords and key phrases.

    :param parcel: The text to summarise.
    :return: The text of the first completion choice, stripped of surrounding
        whitespace. The completion is capped at 100 tokens.
    """
    completion = openai.Completion.create(
        engine="text-davinci-002",
        prompt=f"Summarize this information about my set of proteins as brief text and a set of relevant keywords and key phrases:\n\n{parcel}",
        max_tokens=100,
        n=1,
        stop=None,
        temperature=0.5,
    )
    first_choice = completion.choices[0]
    return first_choice.text.strip()

def write_analysis_to_file(filename, analysis_data):
    """
    Save analysis data as JSON indented by four spaces.

    :param filename: Path of the file to create or overwrite.
    :param analysis_data: JSON-serialisable data to write.
    """
    with open(filename, mode="w") as out_file:
        json.dump(obj=analysis_data, fp=out_file, indent=4)



### Summarize and select keywords
select subsets of the gathered information and query ChatGPT to summarize and extract key concepts

GPT prompts:
 - I am analyzing a system of proteins: *SPs*
 - *DOI proteins* are known to be associated with <DOI>
 - The following table lists disease associations that are shared by two or more of the proteins that are involved in a shared biological process or mechanism
 - These sets of proteins share a disease association
    

### Summarize the Analysis

**ChatGPT prompts: required some cleanup because they were originally generated as "main" functions.**

Write a function in which makes parcels of (1) the uniprot data file created by the uniprot data downloader (2) the GTEX and BrainSpan data file created by its downloader. The summarize each into a datastructure with a description and a list of keywords. Then format the summaries and keywords to provide to ChatGPT along with  "Review these summaries and keywords for my set of proteins. Critique them and synthesize the information into (1) candidate names for the set of proteins and (2) an outline of the reasoning behind the candidate names. Return this as a datastructure. ". Then append this to the datastructure with the parcels, include the list of gene names, and write out as <system id>analysis

In [None]:
import json

def read_data_from_file(filename):
    """
    Load and return the JSON content of a file.

    :param filename: Path of the JSON file to read.
    :return: The parsed JSON data.
    """
    with open(filename, "r") as source:
        return json.load(source)

def create_parcel_from_uniprot_data(uniprot_data):
    """
    Flatten one gene's UniProt record into "key: value" lines.

    The 'function', 'pathway' and 'disease_association' entries are left out.

    :param uniprot_data: Dict holding the UniProt fields of a single gene.
    :return: The remaining entries, one per line, joined with newlines.
    """
    skipped_fields = ('function', 'pathway', 'disease_association')
    lines = []
    for field, content in uniprot_data.items():
        if field in skipped_fields:
            continue
        lines.append(f"{field}: {content}")
    return "\n".join(lines)

def create_parcel_from_expression_data(expression_data):
    """
    Prefix the string form of the expression data with a fixed label.

    :param expression_data: The expression data to embed.
    :return: "Gene expression data from GTEx and BrainSpan: <expression_data>".
    """
    return f"Gene expression data from GTEx and BrainSpan: {expression_data}"

def summarize_analyses(system_id="001"):
    """
    Summarise the UniProt and expression data of a system and save the result.

    Each gene's UniProt record and the combined expression data are turned
    into parcels and summarised with summarize_parcel. The summaries are then
    sent through summarize_parcel once more to obtain candidate names and the
    reasoning behind them. Everything is written to
    "system_<system_id>_analysis.json".

    :param system_id: Identifier used to build the input and output file
        names. The original signature wrote ``system_id:"001"``, which is an
        annotation rather than a default; "001" is now a real default.
    :return: Dict with the keys 'gene_names', 'parcels' and 'chatgpt_response'.
    """
    output_filename = f"system_{system_id}_analysis.json"

    # NOTE(review): the UniProt input name has no extension and differs from
    # the output name only by ".json" -- confirm this is the intended file.
    uniprot_data_filename = f"system_{system_id}_analysis"
    gtex_brain_expression_filename = "gtex_brain_expression.json"
    brainspan_brain_expression_filename = "brainspan_brain_expression.json"

    uniprot_data = read_data_from_file(uniprot_data_filename)
    gtex_brain_expression = read_data_from_file(gtex_brain_expression_filename)
    brainspan_brain_expression = read_data_from_file(brainspan_brain_expression_filename)

    # One parcel (and one summarisation request) per gene.
    parcels = []
    for gene_name, gene_uniprot_data in uniprot_data.items():
        parcel = create_parcel_from_uniprot_data(gene_uniprot_data)
        summary = summarize_parcel(parcel)
        parcels.append({
            "gene_name": gene_name,
            "parcel": parcel,
            "summary": summary
        })

    # A single extra parcel for the expression data; it has no "gene_name" key.
    expression_data = {
        "gtex": gtex_brain_expression,
        "brainspan": brainspan_brain_expression
    }
    parcel = create_parcel_from_expression_data(expression_data)
    summary = summarize_parcel(parcel)
    parcels.append({
        "parcel": parcel,
        "summary": summary
    })

    summaries_and_keywords = [parcel["summary"] for parcel in parcels]
    chatgpt_prompt = f"Review these summaries and keywords for my set of proteins:\n\n{summaries_and_keywords}\n\nCritique them and synthesize the information into (1) candidate names for the set of proteins and (2) an outline of the reasoning behind the candidate names."

    # NOTE(review): summarize_parcel prepends its own "Summarize this
    # information..." instruction to this prompt -- confirm that is intended.
    chatgpt_response = summarize_parcel(chatgpt_prompt)

    analysis_data = {
        "gene_names": list(uniprot_data.keys()),
        "parcels": parcels,
        "chatgpt_response": chatgpt_response
    }

    write_analysis_to_file(output_filename, analysis_data)
    print(f"Analysis data saved to {output_filename}")
    return analysis_data



### System Analysis Report

In [None]:
def format_system_analysis_report(system_id, analysis_data):
    """
    Write the analysis of a system to "<system_id>_system_analysis_report.html".

    :param system_id: Identifier used in the page title, heading and file name.
    :param analysis_data: Dict with the keys 'gene_names' (list of str),
        'parcels' (list of dicts with 'parcel' and 'summary', and optionally
        'gene_name') and 'chatgpt_response'.
    :return: None. The report is written to the current working directory and
        a confirmation line is printed.
    """
    gene_names = analysis_data['gene_names']
    parcels = analysis_data['parcels']
    chatgpt_response = analysis_data['chatgpt_response']

    sections = []
    for number, parcel in enumerate(parcels, start=1):
        # The expression parcel built by summarize_analyses has no
        # 'gene_name' key; the original raised KeyError on it.
        label = parcel.get('gene_name', 'Expression data')
        sections.append(f"<h3>Parcel {number}: {label}</h3>")
        sections.append(f"<h4>Original Data</h4><pre>{parcel['parcel']}</pre>")
        sections.append(f"<h4>Summary</h4><p>{parcel['summary']}</p>")
    parcels_html = "".join(sections)

    html_content = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{system_id} System Analysis Report</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            margin: 40px;
        }}
        pre {{
            background-color: #f2f2f2;
            padding: 1em;
            white-space: pre-wrap;
        }}
    </style>
</head>
<body>
    <h1>{system_id} System Analysis Report</h1>
    <h2>Gene Names</h2>
    <p>{', '.join(gene_names)}</p>
    <h2>Parcels</h2>
    {parcels_html}
    <h2>ChatGPT Synthesis</h2>
    <p>{chatgpt_response}</p>
</body>
</html>
"""

    # Write as UTF-8 so the file matches the charset declared in the page.
    with open(f"{system_id}_system_analysis_report.html", "w", encoding="utf-8") as f:
        f.write(html_content)

    print(f"System analysis report saved to {system_id}_system_analysis_report.html")


### Query INDRA, prioritizing more specific interactions

##### ChatGPT 4 prompts. The output needed a little cleanup and removing a first attempt at ranking statements, superseded by _sort_response_by_relationship_type_ 

Write a function that takes my set of proteins and queries INDRA for the statements for the interactions between each pair. For each pair, return up to 50 statements.

Give me a python list of the INDRA relationships you described ranked by an estimate how specific a relationship is.

Write a function, sort_response_by_relationship_type, that bins a list of Indra statements first by the order of the relationship (a relationship b vs b relationship a) and then by the relationship types in the list indra_relationships_ranked. The function should then make a list of bins ranked  from most specific to least specific. Finally, it should repeatedly loop through the bins, picking the statement with the highest confidence score and adding to a list ranked_statements until there are no statements remaining. If there are no statements remaining in a bin, skip the bin. Return the ranked statements. Also, comment the function.

In [None]:
import requests
from itertools import combinations
import indra

def query_indra(agent_a, agent_b, limit=50):
    """
    Query the INDRA DB REST service for statements between two agents.

    :param agent_a: Name sent as the "subject" of the query.
    :param agent_b: Name sent as the "object" of the query.
    :param limit: Maximum number of statements requested.
    :return: The "statements" entry of the JSON response, or an empty list
        when the service answers with a status other than 200.
    """
    base_url = "https://db.indra.bio/statements/from_agents"
    # The original had a dangling "indra." expression here (a syntax error)
    # and built a query string that was never used; both are removed.
    params = {
        "subject": agent_a,
        "object": agent_b,
        "offset": 0,
        "limit": limit,
        "format": "json",
    }

    response = requests.get(base_url, params=params)
    if response.status_code == 200:
        data = response.json()
        # NOTE(review): callers treat "statements" as a list of statement
        # dicts -- confirm the shape the service actually returns.
        print(data["statements"])
        return data["statements"]
    else:
        print(f"Error {response.status_code}: {response.text}")
        return []

def get_indra_pairwise_interactions(proteins, limit=50):
    """
    Collect ranked INDRA statements for every unordered pair of proteins.

    Each pair is queried in both directions (a -> b and b -> a); the combined
    statements are ranked with sort_response_by_relationship_type and cut to
    ``limit`` entries.

    :param proteins: Iterable of protein names.
    :param limit: Maximum number of statements requested per query and kept
        per pair.
    :return: Dict mapping (agent_a, agent_b) to the ranked statement list.
    """
    pairwise_interactions = {}
    for agent_a, agent_b in combinations(proteins, 2):
        print(f'Querying {agent_a} - {agent_b}')
        statements_a_b = query_indra(agent_a, agent_b, limit)
        print(f'Querying {agent_b} -> {agent_a}')
        statements_b_a = query_indra(agent_b, agent_a, limit)

        # The original left "statements =" unfinished (a syntax error), called
        # the undefined rank_statements and then discarded its result. Both
        # directions are combined and ranked with the sorter that, per the
        # notes above this cell, superseded the first ranking attempt.
        # NOTE(review): assumes query_indra returns lists of statement dicts.
        statements = list(statements_a_b) + list(statements_b_a)
        ranked_statements = sort_response_by_relationship_type(
            statements, indra_relationships_ranked
        )[:limit]
        pairwise_interactions[(agent_a, agent_b)] = ranked_statements

    return pairwise_interactions

'''
You can refer to the INDRA documentation for a more comprehensive 
and up-to-date list: https://indra.readthedocs.io/en/latest/statements.html
'''
# INDRA relationship (statement) type names in the order used by
# sort_response_by_relationship_type: earlier entries are picked first in each
# round. Statements whose type is not listed here are dropped by that function.
# NOTE(review): the list came from a ChatGPT prompt asking for a ranking by
# specificity -- verify the names and their order against the INDRA
# documentation.
indra_relationships_ranked = [
    "Complex",
    "Binding",
    "Activation",
    "Inhibition",
    "IncreaseAmount",
    "DecreaseAmount",
    "Translocation",
    "Phosphorylation",
    "Dephosphorylation",
    "Ubiquitination",
    "Deubiquitination",
    "Acetylation",
    "Deacetylation",
    "Methylation",
    "Demethylation",
    "Glycosylation",
    "Deglycosylation",
    "Palmitoylation",
    "Depalmitoylation",
    "Myristoylation",
    "Demyristoylation",
    "Hydroxylation",
    "Dehydroxylation",
    "Sumoylation",
    "Desumoylation",
    "Autophosphorylation",
    "SelfModification",
    "ActiveForm",
]

def _statement_confidence(statement):
    """Return the score used to order statements within one relationship bin."""
    evidence = statement.get("evidence") or []
    if evidence and "confidence" in evidence[0]:
        return evidence[0]["confidence"]
    # NOTE(review): falls back to the statement-level "belief" field --
    # confirm against the INDRA statement JSON format.
    return statement.get("belief", 0.0)


def sort_response_by_relationship_type(statements, indra_relationships_ranked):
    """
    Rank INDRA statements by relationship type and confidence.

    Statements are binned by their "type". Each bin is ordered by confidence
    (highest first), then the bins are visited round-robin in the order given
    by indra_relationships_ranked, taking one statement per non-empty bin per
    round until all bins are empty.

    :param statements: Iterable of statement dicts, each with a "type" key.
        Confidence is read from the first evidence entry when present.
        Statements with missing evidence or confidence no longer raise; they
        use the statement's "belief" value, or 0.0.
    :param indra_relationships_ranked: A list of INDRA relationship types ranked by specificity.
    :return: A list of ranked INDRA statements. Statements whose type is not
        in indra_relationships_ranked are left out.
    """
    # Create bins for each relationship type
    bins = {relation: [] for relation in indra_relationships_ranked}

    # Add statements to the appropriate bin based on the relationship type
    for statement in statements:
        relation = statement["type"]
        if relation in bins:
            bins[relation].append(statement)

    # One iterator per bin, highest confidence first. Iterators replace the
    # repeated list.pop(0) of the original, which is O(n) per call.
    queues = {
        relation: iter(sorted(members, key=_statement_confidence, reverse=True))
        for relation, members in bins.items()
    }

    ranked_statements = []
    exhausted = object()

    # Take one statement from each non-empty bin per round, most specific
    # relationship first, until a full round adds nothing.
    while True:
        added_in_round = False
        for relation in indra_relationships_ranked:
            statement = next(queues[relation], exhausted)
            if statement is not exhausted:
                ranked_statements.append(statement)
                added_in_round = True
        if not added_in_round:
            break

    return ranked_statements


In [None]:
# Test Query
#protein_list = system.get("CD_MemberList").split(" ")
protein_list = ["KCNMA1", "SCN1A", "SCN2A"]
pairwise_interactions = get_indra_pairwise_interactions(protein_list)

for (agent_a, agent_b), statements in pairwise_interactions.items():
    print(f"Interactions between {agent_a} and {agent_b}:")
    # Rank each pair's own statements. The original passed the whole
    # pair -> statements dict to the sorter (which expects statement dicts)
    # and then printed the same list for every pair.
    ranked_statements = sort_response_by_relationship_type(statements,
                                                           indra_relationships_ranked)
    for statement in ranked_statements:
        print(f"- {statement['type']} (Evidence: {statement['evidence'][0]['text']})")
    print()
