In [27]:

import re
from pathlib import Path
from typing import Set, List, Tuple, Optional

import pandas as pd
import plotly.express as px

# Minimum number of genes a gene set must contain to be kept. The same
# threshold is applied to the IPA reference pathways and to the sections
# parsed from the LLM output files; smaller sets are dropped before matching.
min_genes = 4

# %%
# Function definitions

def normalize_gene(gene: str) -> str:
    """
    Return a canonical key for a gene name.

    The name is trimmed and lowercased, a parenthetical at the end of the
    name (e.g. ``"abc (alias)"``) is dropped, and every remaining character
    that is not ``a-z`` or ``0-9`` is removed. The result may be an empty
    string when nothing alphanumeric is left.
    """
    trailing_parenthetical = re.compile(r"\s*\(.*\)$")
    non_alphanumeric = re.compile(r"[^a-z0-9]")

    lowered = gene.strip().lower()
    without_suffix = trailing_parenthetical.sub("", lowered)
    return non_alphanumeric.sub("", without_suffix)


def clean_gene_name(raw: str) -> Optional[str]:
    """
    Tidy one raw token taken from a comma-separated gene list.

    Surrounding whitespace and trailing commas are removed, then a
    parenthetical at the end of the token is dropped. Returns ``None`` when
    the token is empty, starts with ``"gene "`` (case-insensitive), or
    consists of more than one whitespace-separated word; otherwise returns
    the cleaned token with its original casing.
    """
    token = re.sub(r"\s*\(.*\)$", "", raw.strip().rstrip(','))

    if token == "":
        return None
    if token.lower().startswith("gene "):
        return None
    if len(token.split()) > 1:
        return None
    return token


def parse_sections(fp: Path) -> List[Tuple[str, Set[str]]]:
    """
    Parse a text file into named gene sections.

    A section starts with a header line ending with ':'; the next non-empty
    line is read as its comma-separated gene list. Each token is passed
    through ``clean_gene_name`` and ``normalize_gene``.

    A header that has no gene line is skipped: either the file ends, or the
    next non-empty line is itself a header. In the latter case the following
    header is parsed as a section of its own rather than being consumed as
    the gene list of the previous one.

    Args:
        fp: Path to a UTF-8 encoded text file.

    Returns:
        A list of ``(section_name, set_of_normalized_genes)`` in file order.
        The gene set may be empty when the gene line holds no valid token.
    """
    lines = fp.read_text(encoding="utf8").splitlines()
    sections: List[Tuple[str, Set[str]]] = []
    i = 0
    while i < len(lines):
        header = lines[i].strip()
        if not header.endswith(':'):
            i += 1
            continue

        name = header[:-1].strip()

        # Skip blank lines between the header and its gene list.
        j = i + 1
        while j < len(lines) and not lines[j].strip():
            j += 1
        if j >= len(lines):
            # Header at end of file: nothing to attach to it.
            break

        gene_line = lines[j].strip()
        if gene_line.endswith(':'):
            # The next non-empty line is another header, so this section has
            # no gene list. Resume at that header instead of treating it as genes.
            i = j
            continue

        genes: Set[str] = set()
        for part in gene_line.split(','):
            cleaned = clean_gene_name(part)
            if not cleaned:
                continue
            normalized = normalize_gene(cleaned)
            # Tokens made only of punctuation normalize to '' and must not
            # count as a gene.
            if normalized:
                genes.add(normalized)
        sections.append((name, genes))
        i = j + 1
    return sections

# %%
# Load and filter IPA reference pathways
ipa_df = pd.read_csv('IPA.csv').rename(
    columns={'Column 1':'pathway','Column 5':'Molecules'}
)
# A pathway row without molecules is read as NaN; fillna('') turns it into an
# empty gene set instead of crashing the set-building lambda. Subtracting {''}
# drops blank tokens and names that normalize to nothing.
ipa_df['genes'] = ipa_df['Molecules'].fillna('').str.split(',').apply(
    lambda lst: {normalize_gene(g) for g in lst} - {''}
)
# Keep only reference pathways with at least min_genes
ipa_df = ipa_df[ipa_df['genes'].apply(len) >= min_genes]
# pathway name -> set of normalized gene names (a later duplicate name wins)
ipa_lookup = dict(zip(ipa_df['pathway'], ipa_df['genes']))

# %%
# Collect and filter LLM outputs
base_dir = Path('configurations ran')
if not base_dir.is_dir():
    # The path is relative, so it depends on the notebook's working directory;
    # report both to make a wrong cwd obvious.
    raise FileNotFoundError(
        f"LLM output directory not found: {base_dir.resolve()} (cwd: {Path.cwd()})"
    )

llm_records = []
# sorted() makes the row order independent of filesystem enumeration order,
# so final_data.csv is reproducible across machines.
for sub in sorted(base_dir.iterdir()):
    if not (sub.is_dir() and sub.name.startswith('With')):
        continue
    for txt in sorted(sub.glob('*.txt')):
        for section, genes in parse_sections(txt):
            # Drop sections too small to be matched meaningfully
            if len(genes) < min_genes:
                continue
            llm_records.append({
                'Configuration': sub.name,
                'file': txt.name,
                'Pathway': section,
                'genes': genes,
            })
# Explicit columns keep the schema stable even when no section passes the filter
llm_df = pd.DataFrame(llm_records, columns=['Configuration', 'file', 'Pathway', 'genes'])

# %%
# Compute overlap metrics per section
# For every LLM section, find the IPA pathway sharing the most genes with it
# (first pathway wins on ties) and report three percentages:
#   - share of the LLM genes found in that pathway,
#   - share of the pathway's genes found in the LLM section,
#   - the mean of the two.
result_columns = [
    'Configuration',
    'file',
    'Pathway',
    'Best IPA Match',
    'Coverage of answers in the pathway',
    'Coverage of Pathway in Answers',
    'Average coverage',
    'Original Genes',
    'Genes from hit',
]
results = []
for _, row in llm_df.iterrows():
    genes_llm = row['genes']
    if not genes_llm:
        continue
    best_ref = None
    best_genes = set()
    # Start at 0 so a pathway is only reported as the best match when it
    # shares at least one gene; with no overlap at all the match stays None
    # instead of naming an arbitrary pathway.
    max_ov = 0
    for ref_name, genes_ref in ipa_lookup.items():
        ov = len(genes_llm & genes_ref)
        if ov > max_ov:
            max_ov = ov
            best_ref = ref_name
            best_genes = genes_ref
    tot_llm = len(genes_llm)
    tot_ref = len(best_genes)
    cov_llm = (max_ov / tot_llm * 100) if tot_llm else 0.0
    cov_ref = (max_ov / tot_ref * 100) if tot_ref else 0.0
    avg_cov = (cov_llm + cov_ref) / 2
    results.append({
        'Configuration': row['Configuration'],
        'file': row['file'],
        'Pathway': row['Pathway'],
        'Best IPA Match': best_ref,
        'Coverage of answers in the pathway': cov_llm,
        'Coverage of Pathway in Answers': cov_ref,
        'Average coverage': avg_cov,
        'Original Genes': sorted(genes_llm),
        'Genes from hit': sorted(genes_llm & best_genes),
    })
# Explicit columns so an empty result still has the schema the later
# groupby/plot cells expect.
final_data = pd.DataFrame(results, columns=result_columns)

# %%
# Save final_data
# Writes the per-section results to 'final_data.csv' in the current working
# directory, without the DataFrame index. The list-valued columns
# ('Original Genes', 'Genes from hit') are written as their string form.
final_data.to_csv('final_data.csv', index=False)

# %%
# Compute average coverage metrics per directory
# One row per configuration, holding the mean of each coverage metric over
# all of that configuration's sections; also written to 'overlap_df.csv'.
coverage_columns = [
    'Coverage of answers in the pathway',
    'Coverage of Pathway in Answers',
    'Average coverage',
]
coverage_by_configuration = final_data.groupby('Configuration')[coverage_columns]
overlap_df = coverage_by_configuration.mean().reset_index()
overlap_df.to_csv('overlap_df.csv', index=False)

# %%
# Plot separate metrics with colored boxplots
# One interactive HTML box plot per coverage metric, written under ./PNG.
plot_dir = Path('./PNG')
plot_dir.mkdir(parents=True, exist_ok=True)
metrics = [
    'Coverage of answers in the pathway',
    'Coverage of Pathway in Answers',
    'Average coverage',
]
for metric in metrics:
    # Configurations are laid out left to right by descending median
    medians_by_configuration = final_data.groupby('Configuration')[metric].median()
    order = medians_by_configuration.sort_values(ascending=False).index.tolist()
    fig = px.box(
        final_data,
        x='Configuration',
        y=metric,
        color='Configuration',
        points='all',
        category_orders={'Configuration': order},
        template='plotly_white',
        title=f'{metric} by configuration',
    )
    # Overlay the individual points on the box, centred with some jitter
    fig.update_traces(pointpos=0, jitter=0.3, marker={'size': 6, 'opacity': 0.6})
    out = plot_dir / f"{metric.replace(' ','_')}_boxplot.html"
    fig.write_html(str(out), include_plotlyjs='cdn')
    print(f"Saved {metric} plot at {out}")


Saved Coverage of answers in the pathway plot at PNG\Coverage_of_answers_in_the_pathway_boxplot.html
Saved Coverage of Pathway in Answers plot at PNG\Coverage_of_Pathway_in_Answers_boxplot.html
Saved Average coverage plot at PNG\Average_coverage_boxplot.html


In [16]:
# NOTE(review): the saved output of this cell lists the columns 'Directory' and
# 'Average Coverage', while the pipeline cell builds final_data with
# 'Configuration' and 'Average coverage'. The output looks stale; re-run the
# notebook top to bottom on a fresh kernel before relying on it.
final_data

Unnamed: 0,Directory,file,Pathway,Best IPA Match,Coverage of answers in the pathway,Coverage of Pathway in Answers,Average Coverage,Original Genes,Genes from hit
0,With RAG with scope,o3-with-rag-with-scope-1-115.26.txt,Axon Guidance and Growth Cone Signaling,Axonal Guidance Signaling,60.000000,42.857143,51.428571,"[cntn6, dpysl3, dpysl5, efna5, gldn, itga8, ki...","[dpysl5, efna5, itga8, nrp1, nrp2, ntn1, plxnb..."
1,With RAG with scope,o3-with-rag-with-scope-1-115.26.txt,Peripheral Myelination and Node of Ranvier For...,EGR2 and SOX10-mediated initiation of Schwann ...,40.000000,80.000000,60.000000,"[cers6, cldn19, cxcl12, drp2, gldn, mag, mal, ...","[drp2, mag, mpz, pmp22]"
2,With RAG with scope,o3-with-rag-with-scope-1-115.26.txt,Synaptic Vesicle Cycling and Neurotransmitter ...,RHOA Signaling,20.000000,60.000000,40.000000,"[cpe, frrs1, kif1a, map1lc3a, numb, pfn2, psap...","[pfn2, septin4, septin5]"
3,With RAG with scope,o3-with-rag-with-scope-1-115.26.txt,Calcium and Mechanosensory Ion Channel Signaling,NAD Signaling Pathway,33.333333,40.000000,36.666667,"[itpripl1, p2ry2, piezo2, ryr3, scn7a, tpcn1]","[ryr3, tpcn1]"
4,With RAG with scope,o3-with-rag-with-scope-1-115.26.txt,Non-Canonical Wnt Signaling in Neural Development,Hepatic Fibrosis Signaling Pathway,40.000000,14.285714,27.142857,"[cdh13, dyrk1a, lef1, tnik, wnt5a]","[lef1, wnt5a]"
...,...,...,...,...,...,...,...,...,...
684,Without RAG without scope,o3-without-rag-without-scope-9-57.55.txt,Angiogenesis and Vascular Remodeling,Axonal Guidance Signaling,30.769231,14.285714,22.527473,"[efna5, epas1, gpnmb, has2, lef1, nampt, nrp1,...","[efna5, nrp1, nrp2, wnt5a]"
685,Without RAG without scope,o3-without-rag-without-scope-9-57.55.txt,Immune and Inflammatory Response,Hepatic Fibrosis / Hepatic Stellate Cell Activ...,41.666667,41.666667,41.666667,"[a2m, cxcl12, cxcl14, icoslg, ifnar1, igf2bp2,...","[a2m, ifnar1, igfbp3, timp1, tnfrsf1a]"
686,Without RAG without scope,o3-without-rag-without-scope-9-57.55.txt,Oxidative Stress and Metabolic Detoxification,Neutrophil degranulation,27.272727,27.272727,27.272727,"[eif2ak2, fth1, hadhb, hsp90aa1, hspa12a, namp...","[fth1, hsp90aa1, psap]"
687,Without RAG without scope,o3-without-rag-without-scope-9-57.55.txt,Wnt/TGF-β Signaling,S100 Family Signaling Pathway,37.500000,33.333333,35.416667,"[dyrk1a, epas1, lef1, ltbp1, ppp1r14c, runx3, ...","[lef1, serpinf1, wnt5a]"


In [17]:
# NOTE(review): the saved output of this cell lists the columns 'Directory' and
# 'Average Coverage', while overlap_df is grouped by 'Configuration' and carries
# 'Average coverage'. The output looks stale; re-run on a fresh kernel.
overlap_df

Unnamed: 0,Directory,Coverage of answers in the pathway,Coverage of Pathway in Answers,Average Coverage
0,With RAG with scope,41.344962,45.226569,43.285766
1,With RAG without scope,36.549101,48.143377,42.346239
2,Without RAG with scope,43.719328,44.527978,44.123653
3,Without RAG without scope,34.506653,47.161663,40.834158


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'output/test_files/configurations ran/'

In [3]:
# Debug aid: print the kernel's current working directory. Relative paths such
# as 'IPA.csv' and 'configurations ran' are resolved against it.
# NOTE(review): presumably added to diagnose the FileNotFoundError shown above;
# consider removing this cell from the finished notebook.
import os
print(os.getcwd())

C:\Python\RAG\supporting scripts\validation
