In [3]:
import os
import re
import glob

with open('all_genes_300.txt', 'r', encoding='utf-8') as f:
    gene_set = {line.strip().lower() for line in f if line.strip()}

def parse_genes_from_file(filepath):
    """
    Parse a file (opened with UTF-8 encoding) to extract distinct gene names.

    Handles multiple formats:
      - Files where pathway headers and gene lists are on separate lines.
      - Files where headers and gene lists appear on the same line.
      - Files with Markdown-style headers.

    Heuristics used:
      • Lines starting with "Pathway Name:" or wrapped in "**" are considered headers.
      • Lines without a comma and containing multiple words are assumed to be headers.
      • If a line contains a colon (e.g. an inline header), the part after the colon is used only if it has commas.
      • Gene names with trailing annotations in parentheses are cleaned.
      • Placeholder tokens (e.g., "Gene A") and multiword phrases are ignored.

    Returns:
      A set of unique gene symbols found in the file.
    """
    genes_set = set()
    gene_dict_counter = {}
    pathways_counter = 0
    
    with open(filepath, 'r', encoding="utf8") as f:
        lines = f.readlines()
    
    # Flag to signal that the next non-empty line is expected to be a gene list.
    expecting_genes = False

    for line in lines:
        line = line.strip()
        if not line:
            expecting_genes = False
            continue

        if line.startswith("Pathway Name:") or (line.startswith("**") and line.endswith("**")):
            expecting_genes = True
            pathways_counter += 1
            continue

        if ',' not in line and len(line.split()) > 1:
            expecting_genes = True
            continue

        if ':' in line and not expecting_genes:
            parts = line.split(':', 1)
            candidate = parts[1].strip()
            if ',' in candidate:
                line = candidate

        expecting_genes = False

        # Split the line by commas and process each gene.
        for gene in line.split(','):
            gene_clean = gene.strip().strip(',')
            # Remove any trailing annotation in parentheses,
            # e.g., "Mpz (implicit via pathway context)" → "Mpz"
            gene_clean = re.sub(r'\s*\(.*\)$', '', gene_clean)
            if gene_clean.startswith("Gene "):
                continue
            # If the cleaned entry contains multiple words, assume it isn’t a proper gene.
            if len(gene_clean.split()) > 1:
                continue
            if gene_clean:
                gene_clean = gene_clean.lower()
                genes_set.add(gene_clean)
                if gene_clean not in gene_dict_counter:
                    gene_dict_counter[gene_clean] = 1
                else:
                    gene_dict_counter[gene_clean] += 1
      
    
    return genes_set, pathways_counter            
                    
            

results = []
files = glob.glob("../../output/test_files/*.txt")
#print(files)
for txt_file_path in files:
    filename = os.path.basename(txt_file_path)
    print(f"Processing file: {filename}")
    try:
        unique_genes_in_txt, pathways_count = parse_genes_from_file(txt_file_path)
    except Exception as e:
        print(f"Error processing file {filename}: {e}")
        continue

    total_occurrences = len(unique_genes_in_txt)
    matched_occurrences = sum(1 for gene in unique_genes_in_txt if gene in gene_set)
    occurrence_percentage = (matched_occurrences / total_occurrences * 100) if total_occurrences else 0
    #print(f"({occurrence_percentage}, occurrence_percentage)")

    non_matched_unique_genes = unique_genes_in_txt - gene_set
    non_matched_unique_percentage = (len(non_matched_unique_genes) / total_occurrences * 100) if total_occurrences else 0
    #print("\n\n", non_matched_unique_genes)
    results.append((
        filename,
        occurrence_percentage,
        non_matched_unique_percentage,
        sorted(list(non_matched_unique_genes)),
        total_occurrences,
        pathways_count
    ))

results.sort(key=lambda x: x[2], reverse=False)

print("\n--- FINAL RESULTS ---")
for filename, occ_perc, non_match_perc, non_matched_genes, unique_count, pathways_count in results:
    print(f"File: {filename}")
    print(f"-> {occ_perc:.2f}% of all mentions are from the gene list.")
    print(f"Unique genes in file: {unique_count}")
    print(f"-> {non_match_perc:.2f}% of unique genes in the file are not in the gene list.")
    print("Genes in file but NOT in gene list:", ", ".join(non_matched_genes))
    print(f"Total returned pathways (gene lists found): {pathways_count}")
    print()

config_pathways = {}
for filename, occ_perc, non_match_perc, non_matched_genes, unique_count, pathways_count in results:
    parts = filename.split("_", 1)
    if len(parts) > 1:
        config = parts[1].rsplit(".txt", 1)[0]
    else:
        config = filename.rsplit(".txt", 1)[0]
    if config not in config_pathways:
        config_pathways[config] = []
    config_pathways[config].append(pathways_count)

def average(lst):
    return sum(lst) / len(lst) if lst else 0

# print("Average returned pathways per configuration:")
# for config, counts in config_pathways.items():
#     avg_count = average(counts)
#     print(f"{config}: {avg_count:.2f} pathways on average over {len(counts)} runs")


Processing file: grok-3-mini-beta-GSEA-1-35.55.txt
Processing file: grok-3-mini-beta-GSEA-2-29.8.txt
Processing file: grok-3-mini-beta-GSEA-3-44.53.txt
Processing file: grok-3-mini-beta-GSEA-4-96.57.txt
Processing file: grok-3-mini-beta-GSEA-5-36.85.txt
Processing file: grok-3-mini-beta-GSEA-6-31.44.txt
Processing file: grok-3-mini-beta-GSEA-7-41.67.txt
Processing file: grok-3-mini-beta-GSEA-8-634.07.txt

--- FINAL RESULTS ---
File: grok-3-mini-beta-GSEA-1-35.55.txt
-> 100.00% of all mentions are from the gene list.
Unique genes in file: 51
-> 0.00% of unique genes in the file are not in the gene list.
Genes in file but NOT in gene list: 
Total returned pathways (gene lists found): 0

File: grok-3-mini-beta-GSEA-2-29.8.txt
-> 100.00% of all mentions are from the gene list.
Unique genes in file: 38
-> 0.00% of unique genes in the file are not in the gene list.
Genes in file but NOT in gene list: 
Total returned pathways (gene lists found): 0

File: grok-3-mini-beta-GSEA-3-44.53.txt
-> 1

In [2]:
import os
#os.chdir(r"../../test_rag/RAG_LUMC/supporting scripts/calculate_overlap")
os.getcwd()

'C:\\test_rag\\RAG_LUMC\\supporting scripts\\calculate_overlap'

In [6]:
gene_set

{'arpc1b',
 'sptlc2',
 'synm',
 'man2a2',
 'id2',
 'scn7a',
 'ttll5',
 'iah1',
 'hadha',
 'bst2',
 'dyrk2',
 'adam12',
 'htra1',
 'ensrnog00000062503',
 'a2m',
 'ctbs',
 'synj2bp',
 'ctsl',
 'gstm5l',
 'pbx3',
 'cep97',
 'dhrs7',
 'sparc',
 'large1',
 'ccnjl',
 'mbp',
 'igsf9',
 'synpr',
 'dglucy',
 'cd55',
 'adgra3',
 'tubb2a',
 'septin4',
 'slc25a22',
 'ankmy2',
 'patz1',
 'ppm1f',
 'atrnl1',
 'epn2',
 'naa50',
 'ighm',
 'stat3',
 'tubb4a',
 'laptm4a',
 'fuca2',
 'b4galnt3',
 'tfpi',
 'ggt7',
 'tmlhe',
 'eapp',
 'robo2',
 'p3h1',
 'cux1',
 'fam20c',
 'plxnb1',
 'dpysl5',
 'acap2',
 'pgm1',
 'olfml1',
 'psat1',
 'smpd1',
 'trim21',
 'cep170b',
 'efemp2',
 'camk4',
 'drp2',
 'met',
 'mmp28',
 'pnpla8',
 'selenon',
 'defb29',
 'nid2.ps16',
 'prorp',
 'cd80',
 'trim13',
 'thbs2',
 'rhox5',
 'pkp1',
 'casp3',
 'bnip3',
 'suclg2',
 'cfl2',
 'tmem117',
 'ahnak',
 'gm2a',
 'peg3',
 'cav3',
 'prelid2',
 'gba1',
 'psmc1',
 'dgcr2',
 'aabr07064719.2',
 'gng2',
 'atf5',
 'adamts1',
 'psma6',
 'y

In [15]:
import os
import re
import glob

# Load the complete and the short gene lists.
with open('all_genes_complete.txt', 'r', encoding='utf-8') as f:
    all_gene_set = {line.strip().lower() for line in f if line.strip()}

with open('all_genes_1000.txt', 'r', encoding='utf-8') as f:
    gene_set = {line.strip().lower() for line in f if line.strip()}

pattern = re.compile(
    r"Genes involved:\s*(.*?)(?=\n\s*\n|\n(?:Pathway Name:|\d+\.\s|={5,})|\Z)",
    re.DOTALL,
)
valid_gene_pattern = re.compile(r"^[A-Za-z0-9\-_]+$")

results = []
files = glob.glob('./test_files/*.txt')

for txt_file_path in files:
    with open(txt_file_path, 'r', encoding='utf-8') as f:
        text_content = f.read()
    gene_blocks = pattern.findall(text_content)
    all_genes_in_txt = []
    for block in gene_blocks:
        tokens = re.split(r"[,\n]", block)
        tokens = [re.sub(r'\*+', '', token).strip().lower() for token in tokens if token.strip()]
        genes = [token for token in tokens if valid_gene_pattern.match(token)]
        all_genes_in_txt.extend(genes)
    
    # Total gene mentions in the file
    total_occurrences = len(all_genes_in_txt)
    
    # Count of gene mentions that are in the short gene list (all_genes.txt)
    matched_occurrences = sum(1 for gene in all_genes_in_txt if gene in gene_set)
    occurrence_percentage = (matched_occurrences / total_occurrences * 100) if total_occurrences else 0
    
    # Hallucination: gene mentions not in the short list.
    hallucinated_occurrences = total_occurrences - matched_occurrences
    hallucination_percentage_total = (hallucinated_occurrences / total_occurrences * 100) if total_occurrences else 0

    # Count of gene mentions that are in the complete gene list (all_genes_complete.txt)
    matched_complete_occurrences = sum(1 for gene in all_genes_in_txt if gene in all_gene_set)
    hallucination_percentage_complete = ((total_occurrences - matched_complete_occurrences) / total_occurrences * 100) if total_occurrences else 0

    # Among the hallucinated mentions (not in the short list), count how many are present in the complete list.
    hallucinated_in_complete = sum(1 for gene in all_genes_in_txt if gene not in gene_set and gene in all_gene_set)
    perc_halluc_in_complete = (hallucinated_in_complete / hallucinated_occurrences * 100) if hallucinated_occurrences else 0

    # Store relevant values.
    results.append(
        (
            os.path.basename(txt_file_path),
            occurrence_percentage,
            hallucination_percentage_total,
            hallucination_percentage_complete,
            perc_halluc_in_complete,
            matched_occurrences,
            total_occurrences,
            matched_complete_occurrences,
            hallucinated_occurrences,
            hallucinated_in_complete,
            all_genes_in_txt,  
        )
    )

results.sort(key=lambda x: x[2], reverse=False)

for (filename,
     occ_perc,
     halluc_total,
     halluc_complete,
     perc_halluc_in_complete,
     matched_occurrences,
     total_occurrences,
     matched_complete_occurrences,
     hallucinated_occurrences,
     hallucinated_in_complete,
     all_genes_in_txt) in results:
    
    print(f"File: {filename}")
    print(f"-> {occ_perc:.2f}% of all mentions are from the short gene list (all_genes.txt) "
          f"({matched_occurrences} out of {total_occurrences}).")
    print(f"-> {halluc_total:.2f}% of all mentions are hallucinated (not in the short gene list) "
          f"({hallucinated_occurrences} out of {total_occurrences}).")
    print(f"-> {halluc_complete:.2f}% of all mentions are hallucinated (not in the complete gene list) "
          f"({total_occurrences - matched_complete_occurrences} out of {total_occurrences}).")
    print(f"-> {perc_halluc_in_complete:.2f}% of the hallucinated mentions are present in the complete gene list "
          f"({hallucinated_in_complete} out of {hallucinated_occurrences}).")
    
    # List genes that are not in the short gene list.
    genes_not_in_short = [gene for gene in all_genes_in_txt if gene not in gene_set]
    print("Genes in file but NOT in the short gene list:", ", ".join(genes_not_in_short))
    print()

o3_mini_high_percents = []
o1_preview_percents = []
gpt4o_percents = []

o3_mini_high_halluc_complete_percents = []
o1_preview_halluc_complete_percents = []
gpt4o_halluc_complete_percents = []

o3_mini_high_in_complete_percents = []
o1_preview_in_complete_percents = []
gpt4o_in_complete_percents = []

gpt4o_names = {"answer1.txt", "answer2.txt", "answer3.txt", "answer4.txt", "answer5.txt"}

for (filename,
     occ_perc,
     halluc_total,
     halluc_complete,
     perc_halluc_in_complete,
     matched_occurrences,
     total_occurrences,
     matched_complete_occurrences,
     hallucinated_occurrences,
     hallucinated_in_complete,
     all_genes_in_txt) in results:
    
    if "o3-mini-high" in filename:
        o3_mini_high_percents.append(halluc_total)
        o3_mini_high_halluc_complete_percents.append(halluc_complete)
        o3_mini_high_in_complete_percents.append(perc_halluc_in_complete)
    elif "o1-preview" in filename:
        o1_preview_percents.append(halluc_total)
        o1_preview_halluc_complete_percents.append(halluc_complete)
        o1_preview_in_complete_percents.append(perc_halluc_in_complete)
    elif filename in gpt4o_names:
        gpt4o_percents.append(halluc_total)
        gpt4o_halluc_complete_percents.append(halluc_complete)
        gpt4o_in_complete_percents.append(perc_halluc_in_complete)

def average(lst):
    return sum(lst) / len(lst) if lst else 0

avg_o3 = average(o3_mini_high_percents)
avg_o1 = average(o1_preview_percents)
avg_gpt4o = average(gpt4o_percents)

avg_o3_halluc_complete = average(o3_mini_high_halluc_complete_percents)
avg_o1_halluc_complete = average(o1_preview_halluc_complete_percents)
avg_gpt4o_halluc_complete = average(gpt4o_halluc_complete_percents)

avg_o3_in_complete = average(o3_mini_high_in_complete_percents)
avg_o1_in_complete = average(o1_preview_in_complete_percents)
avg_gpt4o_in_complete = average(gpt4o_in_complete_percents)

print("Average Hallucination Percentages (Total Occurrences):")
print(f"o3-mini-high files: {avg_o3:.2f}% (short list), {avg_o3_halluc_complete:.2f}% (complete list), "
      f"with {avg_o3_in_complete:.2f}% of hallucinated mentions in the complete gene list")
print(f"o1-preview files: {avg_o1:.2f}% (short list), {avg_o1_halluc_complete:.2f}% (complete list), "
      f"with {avg_o1_in_complete:.2f}% of hallucinated mentions in the complete gene list")
print(f"GPT4o (answer1.txt to answer5.txt): {avg_gpt4o:.2f}% (short list), {avg_gpt4o_halluc_complete:.2f}% (complete list), "
      f"with {avg_gpt4o_in_complete:.2f}% of hallucinated mentions in the complete gene list")


Average Hallucination Percentages (Total Occurrences):
o3-mini-high files: 0.00% (short list), 0.00% (complete list), with 0.00% of hallucinated mentions in the complete gene list
o1-preview files: 0.00% (short list), 0.00% (complete list), with 0.00% of hallucinated mentions in the complete gene list
GPT4o (answer1.txt to answer5.txt): 0.00% (short list), 0.00% (complete list), with 0.00% of hallucinated mentions in the complete gene list


In [None]:
import pyperclip
import re

# Get user input
user_input = input("Enter a string: ")

match = re.search(r"(genes involved:|genes:)\s*(.*)", user_input, re.IGNORECASE)

if match:
    genes_string = match.group(2)  
    modified_string = genes_string.replace(', ', ' ')  
    
    # Copy to clipboard
    pyperclip.copy(modified_string)
    
    print("Extracted genes copied to clipboard:", modified_string)
else:
    print("No matching pattern found.")
