In [10]:
import os
import pandas as pd

base_dir = 'output/test_files/configurations ran/'
with_dirs = [
    d for d in os.listdir(base_dir)
    if d.lower().startswith('with') and os.path.isdir(os.path.join(base_dir, d))
]

patterns = {
    'Axon_Guidance': ['axon guidance signaling', 'axon guidance', 'axonal guidance'],
    'ECM_Organization': ['extracellular matrix organization', 'ecm', 'extracellular matrix'],
    'Myelination': ['myelination', 'myel'],
}


records = []
for cfg in with_dirs:
    cfg_path = os.path.join(base_dir, cfg)
    for fname in os.listdir(cfg_path):
        fpath = os.path.join(cfg_path, fname)
        if not os.path.isfile(fpath):
            continue
        with open(fpath, 'r', encoding='utf-8', errors='ignore') as f:
            text = f.read().lower()
        rec = {'Configuration': cfg, 'File': fname}
        for p, terms in patterns.items():
            rec[p] = int(any(term in text for term in terms))
        rec['Pathway_Count'] = text.count(':')
        records.append(rec)

file_df = pd.DataFrame(records)

agg = (
    file_df
    .groupby('Configuration')
    .agg(
        n_files=('File', 'size'),
        Axon_Guidance_sum=('Axon_Guidance', 'sum'),
        ECM_Organization_sum=('ECM_Organization', 'sum'),
        Myelination_sum=('Myelination', 'sum'),
        Avg_Pathway_Count=('Pathway_Count', 'mean')
    )
)


for p in patterns:
    agg[f'{p}_pct'] = (agg[f'{p}_sum'] / agg['n_files'] * 100).round(1)

pct_df = agg[[f'{p}_pct' for p in patterns]].copy()
pct_df.columns = [col.replace('_pct', '') for col in pct_df.columns]
pct_df = pct_df.T
pct_df.loc['Average_Pathways'] = agg['Avg_Pathway_Count'].round(1)
pct_df.iloc[:-1] = pct_df.iloc[:-1].astype(str) + '%'

print("Percentage of files with each pathway by configuration:")
print(pct_df)

print("\nFiles missing any pathways:")
missing = file_df[file_df[list(patterns)].sum(axis=1) < len(patterns)]
for _, row in missing.iterrows():
    missing_items = [p for p in patterns if row[p] == 0]
    print(f"{row['Configuration']}/{row['File']}: missing {', '.join(missing_items)}")


Percentage of files with each pathway by configuration:
Configuration    With RAG with scope With RAG without scope  \
Axon_Guidance                 100.0%                  92.0%   
ECM_Organization               88.0%                 100.0%   
Myelination                   100.0%                 100.0%   
Average_Pathways                 6.6                    6.7   

Configuration    Without RAG with scope Without RAG without scope  
Axon_Guidance                    100.0%                    100.0%  
ECM_Organization                 100.0%                    100.0%  
Myelination                      100.0%                     62.5%  
Average_Pathways                    7.4                       7.8  

Files missing any pathways:
With RAG with scope/o3-with-rag-with-scope-20-85.35.txt: missing ECM_Organization
With RAG with scope/o3-with-rag-with-scope-22-127.31.txt: missing ECM_Organization
With RAG with scope/o3-with-rag-with-scope-23-114.09.txt: missing ECM_Organization
With RAG wi

  pct_df.iloc[:-1] = pct_df.iloc[:-1].astype(str) + '%'
  pct_df.iloc[:-1] = pct_df.iloc[:-1].astype(str) + '%'
  pct_df.iloc[:-1] = pct_df.iloc[:-1].astype(str) + '%'
  pct_df.iloc[:-1] = pct_df.iloc[:-1].astype(str) + '%'


In [11]:
pct_df

Configuration,With RAG with scope,With RAG without scope,Without RAG with scope,Without RAG without scope
Axon_Guidance,100.0%,92.0%,100.0%,100.0%
ECM_Organization,88.0%,100.0%,100.0%,100.0%
Myelination,100.0%,100.0%,100.0%,62.5%
Average_Pathways,6.6,6.7,7.4,7.8
