In [4]:
import pandas as pd
from utils.openai_query import openai_chat
import re
import json

In [5]:
# --- File locations and input column names ---------------------------------
jsonFilePath = "jsonFiles/OmicsRunLLM.json"
outputFilePath = "data/omics_revamped_LLM_DF.tsv"
LOG_FILE = "data/supporting_gene_log.json"
genesCol = "GeneList"
nameCol = "GeneSetName"

# --- Settings read from the JSON configuration file ------------------------
with open(jsonFilePath) as json_file:
    config = json.load(json_file)

# context, max_tokens, rate_per_token, LOG_FILE and DOLLAR_LIMIT are forwarded
# to openai_chat by get_genes_supporting_name further down; gpt_model and
# temperature are loaded here but that function uses its own values.
context = config["CONTEXT"]
gpt_model = config["GPT_MODEL"]
temperature = config["TEMP"]
max_tokens = config["MAX_TOKENS"]
rate_per_token = config["RATE_PER_TOKEN"]
DOLLAR_LIMIT = config["DOLLAR_LIMIT"]

In [30]:
# functions to compute the Jaccard Index and coverage for the query gene list and the GO term genes
def get_JI(GeneList: str, enrichr_genes) -> float:
    """Return the Jaccard index between two whitespace-separated gene lists.

    Args:
        GeneList: Query gene symbols separated by whitespace.
        enrichr_genes: Gene symbols of the term being compared, separated by
            whitespace. Any non-string value (for example a NaN coming from
            an empty table cell) is treated as "no genes".

    Returns:
        len(intersection) / len(union) of the two gene sets, or 0.0 when
        enrichr_genes is not a string or when both sets are empty.
    """
    if not isinstance(enrichr_genes, str):
        return 0.0

    # str.split() without an argument discards empty tokens, so repeated,
    # leading or trailing spaces (or an empty string) never add a phantom ""
    # "gene" to either set.
    query_genes = set(GeneList.split())
    term_genes = set(enrichr_genes.split())

    union = query_genes | term_genes
    if not union:
        return 0.0

    return len(query_genes & term_genes) / len(union)

def get_coverage(GeneList: str, enrichr_genes) -> float:
    """Return the fraction of the query genes that also occur in enrichr_genes.

    Args:
        GeneList: Query gene symbols separated by whitespace.
        enrichr_genes: Gene symbols of the term being compared, separated by
            whitespace. Any non-string value (for example a NaN coming from
            an empty table cell) is treated as "no genes".

    Returns:
        len(intersection) / len(unique query genes), or 0.0 when
        enrichr_genes is not a string or the query list is empty.
    """
    if not isinstance(enrichr_genes, str):
        return 0.0

    # str.split() without an argument discards empty tokens; using sets on
    # both sides keeps numerator and denominator in the same units (unique
    # gene symbols), so a duplicated symbol cannot push coverage below 1.
    query_genes = set(GeneList.split())
    term_genes = set(enrichr_genes.split())

    if not query_genes:
        return 0.0

    coverage = len(query_genes & term_genes) / len(query_genes)
    # The value was previously computed but never returned.
    return coverage

In [50]:
# Load the results of processing the 300 'omics datasets, then keep only the
# rows whose "LLM Name" is something other than "System of unrelated proteins".
temp = pd.read_csv(
    "data/omics_revamped_LLM_Enrichr_simVals_failure_by_apv_best_DF.tsv",
    sep="\t",
)
input_analyses = temp.loc[temp["LLM Name"].ne("System of unrelated proteins")]
print(len(input_analyses))
input_analyses.head(3)

135


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Source,GeneSetID,GeneSetName,GeneList,n_Genes,LLM Name,LLM Analysis,Score,...,Genes,Term,GO term,GO ID,GO_term_genes,LLM_name_GO_term_sim,enrichr_JI,coverage,LLM_success_TF,enrichr_success_TF
0,7148,0,L1000,BRD-A00546892_-666_MCF7_6.0_h_10.0_um,BRD-A00546892 -666 MCF7 6.0 h 10.0 um,CITED2 COL5A1 CRABP2 KCTD12 MDFIC MMP2 NRP1 OR...,24,Cellular Matrix Remodeling and Tissue Development,"1. CITED2, TWIST1, and LMO2 are transcriptiona...",0.85,...,CITED2;TWIST1,Regulation Of Peroxisome Proliferator Activate...,Regulation Of Peroxisome Proliferator Activate...,GO:0035358,ASXL2 ASXL1 PTGIS BMP2 CITED2 GPS2 LMO3 PLIN5 ...,0.245878,0.0625,0.083333,True,False
1,7149,0,L1000,BRD-A00993607_ALPRENOLOL_MCF7_6.0_h_10.0_um,BRD-A00993607 ALPRENOLOL MCF7 6.0 h 10.0 um,1060P11.3 ADM AHR AMIGO2 ARL4C ATP10D CAV2 CD4...,47,Cellular Adhesion and Extracellular Matrix Int...,"1. Several proteins in this system, such as CD...",0.85,...,NT5E;HPRT1,Purine Ribonucleoside Monophosphate Catabolic ...,Purine Ribonucleoside Monophosphate Catabolic ...,GO:0009169,PNP NT5C1A NT5E HPRT1 SORD PRTFDC1 NT5C2,0.166891,0.038462,0.042553,True,False
5,7374,0,L1000,BRD-A01593789_CHLORMADINONE ACETATE_MCF7_6.0_h...,BRD-A01593789 CHLORMADINONE ACETATE MCF7 6.0 h...,1060P11.3 ADH5 ADM ATF3 CASP1 CLEC2B CPA3 CRIM...,38,Stress Response and Apoptosis Regulation,"1. ATF3, Activating Transcription Factor 3, is...",0.85,...,TNC;FOLR1,Axon Regeneration (GO:0031103),Axon Regeneration,GO:0031103,GAP43 RTN4RL1 RTN4RL2 TNC NREP MTR MAPK8IP3 FO...,0.306703,0.044444,0.052632,True,False


In [100]:
# Presumably the previous version of the gene-support prompt (per the `_old`
# suffix); the run cell shown in this notebook passes
# count_genes_prompt_template instead. The text is filled in with str.format
# and expects three fields: genes_in_text, text and name.
count_genes_prompt_template_old = """
The provided text is an analysis of a gene set to describe its common functions and to provide a name for the gene set reflecting the predominant function or functions.

Do not critique the analysis or the name. Your job is to find the genes that support the name

You are provided with this list of the gene symbols mentioned in the text. Only consider these genes.

<genes mentioned in the text>{genes_in_text}</genes mentioned in the text>
 
For each gene from that list mentioned in a sentence or paragraph, decide whether the text is making a *definite* assertion about the gene that supports the name.
DO NOT make your own assesments about the gene's function or the validity of the analysis, just evaluate the text.
<example of definite assertion>
"XRCC1 is involved in the DNA damage response"
</example of definite assertion>
<example of non-definite assertion>
"E2F1 may be involved in homolougus recombination."
"SREBF1 could be related to RAD51's function."
</example of non-definite assertion>

As you work, briefly explain your reasoning for each gene symbol that supports the name.
<example of explanation>
Reasoning:

1. CITED2: Involved in transcriptional regulation influencing gene expression, which is crucial for tissue development.
2. TWIST1: Crucial for mesodermal tissue development and epithelial-mesenchymal transition (EMT), directly supporting tissue development.
3. LMO2: Implicated in hematopoiesis and vascular development, supporting tissue development.
4. COL5A1: Encodes a component of type V collagen, essential for connective tissue structure and function, supporting tissue development and extracellular matrix remodeling.
</example of explanation>

For each gene symbol that supports the name, add it to your list of supporting genes.

<text>{text}</text>

<name>{name}</name>

Output the supporting genes and non-supporting genes in the following format: 

<format>
-- Explanation --
<explanation of the reasoning for the supporting genes>
-- genes supporting the name: <list of gene symbols of genes supporting the name>
</format>
"""

# Prompt used to ask the LLM which genes in an analysis text support the
# suggested gene-set name. Filled in with str.format; expects three fields:
# genes_in_text, text and name. parse_analysis relies on the
# "genes supporting the name: " marker line requested in the <format> section.
# NOTE(review): the prompt says the gene list is comma-separated, but
# process_gene_set_analyses passes the GeneList column, whose values in the
# sample output are space-separated - confirm which one is intended.
count_genes_prompt_template = """
Analyze the provided text, which describes a gene set's common functions and suggests a name reflecting its predominant function(s). Your task is to identify genes that support this name based solely on the information given in the text.

Context: Gene sets are groups of genes that share common biological functions, pathways, or other characteristics. Naming these sets based on their predominant functions helps researchers quickly understand their significance.

Input:
1. A list of gene symbols mentioned in the text, provided in comma-separated format:
<genes mentioned in the text>{genes_in_text}</genes mentioned in the text>

2. The analysis text:
<text>{text}</text>

3. The suggested name for the gene set:
<name>{name}</name>

Instructions:
1. Evaluate each gene from the provided list that is mentioned in the text.
2. Determine if the text makes a definite assertion about the gene that supports the given name.
   - A definite assertion clearly states a gene's function or role without using speculative language.
   - Example of a definite assertion: "XRCC1 is involved in the DNA damage response"
   - Example of a non-definite assertion: "E2F1 may be involved in homologous recombination"
3. If a gene is mentioned multiple times, consider the strongest assertion made about it.
4. In case of contradictory statements about a gene, favor the most recent or specific assertion.
5. For each gene you determine supports the name:
   - Briefly explain your reasoning (max 50 words per gene)
   - Assign a confidence level (High, Medium, Low) based on the strength of the assertion
6. Handle acronyms or alternative gene names as equivalent to official gene symbols.
7. If no genes seem to support the name or if all genes support it, state this observation.

Output your analysis in the following format:

<format>
-- Summary --
[Provide a brief summary (max 100 words) of why the selected genes support the given name]

-- Explanation --
[Gene Symbol]: [Confidence Level]
[Explanation of reasoning (max 50 words)]

[Repeat for each supporting gene]

-- genes supporting the name: [List of gene symbols of genes supporting the name]

-- genes not supporting the name: [List of gene symbols of genes not supporting the name]

-- genes with unclear support: [List of gene symbols mentioned but not clearly linked to the name]
</format>

Do not critique the analysis or the name. Base your evaluation solely on the information provided in the text.
"""

def parse_analysis(text):
    """Extract the supporting-gene list from an LLM support analysis.

    Looks for the first line containing "genes supporting the name:" and
    reads the comma-separated symbols that follow it, with or without
    surrounding square brackets.

    Args:
        text: The raw analysis text returned by the LLM.

    Returns:
        A tuple (formatted_genes, supporting_count): the gene symbols joined
        by single spaces, and how many there are. An empty list (for example
        "[]" or nothing after the marker) yields ("", 0).

    Raises:
        Exception: if the marker line cannot be found or text cannot be
            searched; the offending text is printed first.
    """
    try:
        # [ \t]* and (.*) tolerate a marker with nothing (or only spaces)
        # after the colon; "." does not cross the line break.
        match = re.search(r"genes supporting the name:[ \t]*(.*)", text)
        # Drop trailing whitespace before removing the brackets, otherwise a
        # line ending in "] " would keep its closing bracket.
        listed = match.group(1).strip().strip("[]")
        # Split on bare commas and trim each symbol so "A,B" and "A , B"
        # parse the same way as "A, B"; skip empty entries so an empty list
        # is counted as zero genes rather than one.
        supporting_genes = [gene.strip() for gene in listed.split(",") if gene.strip()]
        supporting_count = len(supporting_genes)
        formatted_genes = " ".join(supporting_genes)
    except Exception as e:
        print(text)
        raise Exception("Error parsing response text: {}".format(e))
    return formatted_genes, supporting_count

def get_genecount_mentioned_in_text(genelist, text):
    """Count the distinct genes of genelist that appear in text as whole symbols.

    Args:
        genelist: Gene symbols separated by whitespace.
        text: The text to search (case-sensitive).

    Returns:
        The number of unique symbols from genelist found in text.
    """
    # str.split() without an argument discards empty tokens; an empty string
    # is a substring of every text and would otherwise always be counted.
    genes = set(genelist.split())

    def is_mentioned(gene):
        # Require that the symbol is not embedded in a longer alphanumeric
        # token, so "CD4" is not counted just because "CD44" appears.
        # Punctuation such as "-" or "." still counts as a boundary, which
        # keeps phrases like "TP53-mediated" matching TP53.
        pattern = r"(?<![A-Za-z0-9])" + re.escape(gene) + r"(?![A-Za-z0-9])"
        return re.search(pattern, text) is not None

    number_mentioned = sum(1 for gene in genes if is_mentioned(gene))
    return number_mentioned

def get_genes_supporting_name(name, text, prompt_template, genes_in_text, model="gpt-4o", temperature=0.0):
    """Ask the LLM which genes in an analysis text support the gene-set name.

    Fills prompt_template, sends it through openai_chat, prints the reply and
    parses the supporting genes out of it with parse_analysis.

    Reads the module-level context, max_tokens, rate_per_token, LOG_FILE and
    DOLLAR_LIMIT and forwards them to openai_chat.

    Args:
        name: The gene-set name; substituted for {name} in the template.
        text: The analysis text; substituted for {text}.
        prompt_template: A str.format template with the fields
            genes_in_text, text and name.
        genes_in_text: Substituted for {genes_in_text}.
        model: Model identifier passed to openai_chat. The default is the
            value this function previously hard-coded; the module-level
            gpt_model from the config is not used unless passed explicitly.
        temperature: Sampling temperature passed to openai_chat. The default
            is the previously hard-coded value; the module-level temperature
            from the config is not used unless passed explicitly.

    Returns:
        A tuple (supporting_genes, supporting_count, analysis): the
        space-joined gene symbols, their count, and the raw LLM reply.

    Raises:
        Exception: propagated from parse_analysis when the reply cannot be
            parsed.
    """
    prompt = prompt_template.format(genes_in_text=genes_in_text, text=text, name=name)
    # The second value returned by openai_chat is not needed here.
    analysis, _ = openai_chat(context, prompt, model, temperature, max_tokens, rate_per_token, LOG_FILE, DOLLAR_LIMIT)
    print(analysis)
    supporting_genes, supporting_count = parse_analysis(analysis)
    return supporting_genes, supporting_count, analysis

def process_gene_set_analyses(input_analyses, prompt_template, number_to_process=None, progress_file="data/supporting_gene_counts_progress.tsv"):
    """Run the gene-support LLM query on each analysis and tabulate the results.

    For every row, calls get_genes_supporting_name with the row's "LLM Name",
    "LLM Analysis" and "GeneList", and counts the listed genes mentioned in
    the analysis with get_genecount_mentioned_in_text.

    Args:
        input_analyses: DataFrame with at least the columns "Source",
            "GeneSetID", "GeneSetName", "GeneList", "n_Genes", "LLM Name"
            and "LLM Analysis". It is not modified.
        prompt_template: Template forwarded to get_genes_supporting_name.
        number_to_process: If given, only the first this-many rows are
            processed; None processes every row.
        progress_file: Tab-separated file the results are written to.

    Returns:
        A DataFrame with the identifying columns plus "Supporting Genes",
        "Supporting Count", "LLM Support Analysis" and
        "GenesMentionedInText", one row per processed analysis.

    Raises:
        Whatever the per-row processing raises. Rows completed before the
        failure are written to progress_file first, so the LLM calls that
        already succeeded are not lost.
    """
    if number_to_process is not None:
        analyses = input_analyses.head(number_to_process).copy()
    else:
        analyses = input_analyses.copy()

    print(f"Processing {len(analyses)} analyses")

    new_columns = ["Supporting Genes", "Supporting Count", "LLM Support Analysis", "GenesMentionedInText"]
    # One tuple per completed row, in new_columns order.
    records = []

    def build_results():
        # Pair the completed records with the leading rows they came from.
        completed = analyses.iloc[:len(records)].copy()
        for position, column in enumerate(new_columns):
            completed[column] = [record[position] for record in records]
        return completed[[
            "Source", "GeneSetID", "GeneSetName", "GeneList", "n_Genes", "Supporting Genes",
            "Supporting Count",
            "LLM Support Analysis", "GenesMentionedInText"]].copy()

    try:
        # An explicit loop (rather than DataFrame.apply) keeps the rows that
        # have already been answered available if a later row fails, and
        # also works when there are no rows at all.
        for _, row in analyses.iterrows():
            print(f'processing {row["GeneSetName"]}...')
            # NOTE(review): the prompt template describes genes_in_text as a
            # comma-separated list of genes mentioned in the text, but the
            # whole GeneList value is passed as-is - confirm this is intended.
            supporting_genes, supporting_count, analysis = get_genes_supporting_name(
                row["LLM Name"], row["LLM Analysis"], prompt_template, row["GeneList"])
            mentioned = get_genecount_mentioned_in_text(row["GeneList"], row["LLM Analysis"])
            records.append((supporting_genes, supporting_count, analysis, mentioned))
    except BaseException:
        # Save whatever finished before re-raising; skip the write when
        # nothing finished so an existing file is not replaced by an empty one.
        if records:
            build_results().to_csv(progress_file, sep='\t', index=False)
        raise

    results = build_results()
    results.to_csv(progress_file, sep='\t', index=False)

    return results

In [103]:
# Run the gene-support analysis on the first 10 rows of input_analyses only.
results_df = process_gene_set_analyses(
    input_analyses,
    prompt_template=count_genes_prompt_template,
    number_to_process=10,
)
results_df

Processing 10 analyses
processing BRD-A00546892 -666 MCF7 6.0 h 10.0 um...
1542
-- Summary --
The selected genes support the given name "Cellular Matrix Remodeling and Tissue Development" as they are involved in processes such as tissue development, extracellular matrix remodeling, and cellular signaling pathways that influence these functions. These processes are essential for organogenesis, tissue repair, and maintaining tissue architecture.

-- Explanation --
CITED2: High
Involved in transcriptional regulation influencing gene expression, crucial for developmental processes.

TWIST1: High
Crucial for mesodermal tissue development and epithelial-mesenchymal transition (EMT).

LMO2: High
Implicated in hematopoiesis and vascular development.

COL5A1: High
Encodes type V collagen, essential for connective tissue structure and function.

MMP2: High
Degrades extracellular matrix components, critical for tissue remodeling and development.

NRP1: High
Involved in angiogenesis and cardiovasc

Unnamed: 0,Source,GeneSetID,GeneSetName,GeneList,n_Genes,Supporting Genes,Supporting Count,LLM Support Analysis,GenesMentionedInText
0,L1000,BRD-A00546892_-666_MCF7_6.0_h_10.0_um,BRD-A00546892 -666 MCF7 6.0 h 10.0 um,CITED2 COL5A1 CRABP2 KCTD12 MDFIC MMP2 NRP1 OR...,24,CITED2 TWIST1 LMO2 COL5A1 MMP2 NRP1,6,-- Summary --\nThe selected genes support the ...,24
1,L1000,BRD-A00993607_ALPRENOLOL_MCF7_6.0_h_10.0_um,BRD-A00993607 ALPRENOLOL MCF7 6.0 h 10.0 um,1060P11.3 ADM AHR AMIGO2 ARL4C ATP10D CAV2 CD4...,47,CD44 EMP1 AMIGO2 POSTN COMP EXT1 CAV2 ARL4C CD...,15,-- Summary --\nThe selected genes support the ...,15
5,L1000,BRD-A01593789_CHLORMADINONE ACETATE_MCF7_6.0_h...,BRD-A01593789 CHLORMADINONE ACETATE MCF7 6.0 h...,1060P11.3 ADH5 ADM ATF3 CASP1 CLEC2B CPA3 CRIM...,38,ATF3 CASP1 ADM CRIM1 DKK3 MECOM PIK3R1 S100A8 ...,10,-- Summary --\nThe selected genes support the ...,10
6,L1000,BRD-A02006392_NITRENDIPINE_MCF7_6.0_h_10.0_um,BRD-A02006392 NITRENDIPINE MCF7 6.0 h 10.0 um,1060P11.3 ADAM17 AKAP12 ALCAM AMFR BIK CASP2 C...,76,TP53 CASP2 BIK HSPA1A GLUL GPX3 CDKN2A FOSL1 A...,11,-- Summary --\nThe selected genes support the ...,11
7,L1000,BRD-A02176148_TUBAIC ACID_MCF7_6.0_h_10.0_um,BRD-A02176148 TUBAIC ACID MCF7 6.0 h 10.0 um,CASP1 CAST CD52 CLC COL1A2 COL3A1 COL5A2 CPA3 ...,34,COL1A2 COL3A1 COL5A2 FN1 LUM SPON1 GJA1 GPNMB ...,12,-- Summary --\nThe selected genes support the ...,15
12,L1000,BRD-A04756508_NORGESTIMATE_MCF7_6.0_h_10.0_um,BRD-A04756508 NORGESTIMATE MCF7 6.0 h 10.0 um,1060P11.3 ACTN1 AMIGO2 ANKRD10 ATP11B C1QA C1Q...,63,C1QA C1QB HLA-C HLA-F CXCL13 IDO1 IGHM IGJ IGK...,22,-- Summary --\nThe selected genes support the ...,22
15,L1000,BRD-A06352418_TERFENADINE_MCF7_6.0_h_10.0_um,BRD-A06352418 TERFENADINE MCF7 6.0 h 10.0 um,1060P11.3 ACLY CDH2 CDKN1C CLU CST3 CXCL12 CXC...,67,HMGCR HMGCS1 FDFT1 DHCR24 DHCR7 MSMO1 SC5D SQL...,18,-- Summary --\nThe selected genes support the ...,18
18,L1000,BRD-A08003242_RHODOMYRTOXIN B_MCF7_6.0_h_10.0_um,BRD-A08003242 RHODOMYRTOXIN B MCF7 6.0 h 10.0 um,1060P11.3 ACKR1 ASNS ATP6V1D AXIN1 BCL6 BEX1 B...,74,DDIT4 DNAJA1 HSPA1A ATF5 ASNS FASN G6PD SCD BC...,20,-- Summary --\nThe selected genes support the ...,21
21,L1000,BRD-A09056319_-666_MCF7_6.0_h_10.0_um,BRD-A09056319 -666 MCF7 6.0 h 10.0 um,CHEK1 DAB2 EGLN3 MFAP5 MXRA5 PDCD4 PLEKHM1 PTP...,27,CHEK1 PDCD4 ZFP36L2 S100A10 ADRB2 AKR1B1 OAS1 ...,8,-- Summary --\nThe selected genes support the ...,27
24,L1000,BRD-A09539288_HOMATROPINE BROMIDE_MCF7_6.0_h_1...,BRD-A09539288 HOMATROPINE BROMIDE MCF7 6.0 h 1...,1060P11.3 BEX1 CD44 CDC42EP3 CHI3L1 EXT1 IGFBP...,40,CD44 VCAN CHI3L1 EXT1 XYLT1 TNFRSF21 TNFSF10 W...,14,-- Summary --\nThe selected genes support the ...,15


In [104]:
# LLM_coverage: the number of genes the LLM listed as supporting the name
# ("Supporting Count") divided by the size of the gene set ("n_Genes").
# Computed as a column-wise division instead of a row-wise apply, which also
# works when results_df has no rows.

results_df["LLM_coverage"] = results_df["Supporting Count"] / results_df["n_Genes"]

results_df

Unnamed: 0,Source,GeneSetID,GeneSetName,GeneList,n_Genes,Supporting Genes,Supporting Count,LLM Support Analysis,GenesMentionedInText,LLM_coverage
0,L1000,BRD-A00546892_-666_MCF7_6.0_h_10.0_um,BRD-A00546892 -666 MCF7 6.0 h 10.0 um,CITED2 COL5A1 CRABP2 KCTD12 MDFIC MMP2 NRP1 OR...,24,CITED2 TWIST1 LMO2 COL5A1 MMP2 NRP1,6,-- Summary --\nThe selected genes support the ...,24,0.25
1,L1000,BRD-A00993607_ALPRENOLOL_MCF7_6.0_h_10.0_um,BRD-A00993607 ALPRENOLOL MCF7 6.0 h 10.0 um,1060P11.3 ADM AHR AMIGO2 ARL4C ATP10D CAV2 CD4...,47,CD44 EMP1 AMIGO2 POSTN COMP EXT1 CAV2 ARL4C CD...,15,-- Summary --\nThe selected genes support the ...,15,0.319149
5,L1000,BRD-A01593789_CHLORMADINONE ACETATE_MCF7_6.0_h...,BRD-A01593789 CHLORMADINONE ACETATE MCF7 6.0 h...,1060P11.3 ADH5 ADM ATF3 CASP1 CLEC2B CPA3 CRIM...,38,ATF3 CASP1 ADM CRIM1 DKK3 MECOM PIK3R1 S100A8 ...,10,-- Summary --\nThe selected genes support the ...,10,0.263158
6,L1000,BRD-A02006392_NITRENDIPINE_MCF7_6.0_h_10.0_um,BRD-A02006392 NITRENDIPINE MCF7 6.0 h 10.0 um,1060P11.3 ADAM17 AKAP12 ALCAM AMFR BIK CASP2 C...,76,TP53 CASP2 BIK HSPA1A GLUL GPX3 CDKN2A FOSL1 A...,11,-- Summary --\nThe selected genes support the ...,11,0.144737
7,L1000,BRD-A02176148_TUBAIC ACID_MCF7_6.0_h_10.0_um,BRD-A02176148 TUBAIC ACID MCF7 6.0 h 10.0 um,CASP1 CAST CD52 CLC COL1A2 COL3A1 COL5A2 CPA3 ...,34,COL1A2 COL3A1 COL5A2 FN1 LUM SPON1 GJA1 GPNMB ...,12,-- Summary --\nThe selected genes support the ...,15,0.352941
12,L1000,BRD-A04756508_NORGESTIMATE_MCF7_6.0_h_10.0_um,BRD-A04756508 NORGESTIMATE MCF7 6.0 h 10.0 um,1060P11.3 ACTN1 AMIGO2 ANKRD10 ATP11B C1QA C1Q...,63,C1QA C1QB HLA-C HLA-F CXCL13 IDO1 IGHM IGJ IGK...,22,-- Summary --\nThe selected genes support the ...,22,0.349206
15,L1000,BRD-A06352418_TERFENADINE_MCF7_6.0_h_10.0_um,BRD-A06352418 TERFENADINE MCF7 6.0 h 10.0 um,1060P11.3 ACLY CDH2 CDKN1C CLU CST3 CXCL12 CXC...,67,HMGCR HMGCS1 FDFT1 DHCR24 DHCR7 MSMO1 SC5D SQL...,18,-- Summary --\nThe selected genes support the ...,18,0.268657
18,L1000,BRD-A08003242_RHODOMYRTOXIN B_MCF7_6.0_h_10.0_um,BRD-A08003242 RHODOMYRTOXIN B MCF7 6.0 h 10.0 um,1060P11.3 ACKR1 ASNS ATP6V1D AXIN1 BCL6 BEX1 B...,74,DDIT4 DNAJA1 HSPA1A ATF5 ASNS FASN G6PD SCD BC...,20,-- Summary --\nThe selected genes support the ...,21,0.27027
21,L1000,BRD-A09056319_-666_MCF7_6.0_h_10.0_um,BRD-A09056319 -666 MCF7 6.0 h 10.0 um,CHEK1 DAB2 EGLN3 MFAP5 MXRA5 PDCD4 PLEKHM1 PTP...,27,CHEK1 PDCD4 ZFP36L2 S100A10 ADRB2 AKR1B1 OAS1 ...,8,-- Summary --\nThe selected genes support the ...,27,0.296296
24,L1000,BRD-A09539288_HOMATROPINE BROMIDE_MCF7_6.0_h_1...,BRD-A09539288 HOMATROPINE BROMIDE MCF7 6.0 h 1...,1060P11.3 BEX1 CD44 CDC42EP3 CHI3L1 EXT1 IGFBP...,40,CD44 VCAN CHI3L1 EXT1 XYLT1 TNFRSF21 TNFSF10 W...,14,-- Summary --\nThe selected genes support the ...,15,0.35


In [105]:
# Write the gene-count results (including LLM_coverage) as a tab-separated file.
results_df.to_csv(
    path_or_buf="data/omics_revamped_LLM_Enrichr_simVals__genecounts_DF.tsv",
    sep="\t",
    index=False,
)