In [39]:
import os
import re
import glob

# Load the reference gene list: one name per line, blank lines skipped,
# names lower-cased so later membership tests are case-insensitive.
with open('all_genes.txt', 'r', encoding='utf-8') as f:
    gene_set = {
        stripped.lower()
        for stripped in map(str.strip, f)
        if stripped
    }

# A cleaned token counts as a gene name only if it consists solely of
# ASCII letters, digits, hyphens and underscores.
valid_gene_pattern = re.compile(r"^[A-Za-z0-9\-_]+$")

# Captures the text following "Genes involved:" up to (not including) the
# first of: a blank line, a line starting with "Pathway Name:", a line
# starting with a numbered item ("3. "), a line starting with five or more
# "=" characters, or the end of the text. DOTALL lets the capture span
# several lines.
pattern = re.compile(
    r"Genes involved:\s*(.*?)(?=\n\s*\n|\n(?:Pathway Name:|\d+\.\s|={5,})|\Z)",
    flags=re.DOTALL,
)

def extract_genes(text, block_pattern, token_pattern):
    """Return the gene-like tokens found in the gene blocks of *text*.

    Args:
        text: Full contents of one report file.
        block_pattern: Compiled regex whose ``findall`` yields the raw text
            of each gene block.
        token_pattern: Compiled regex a cleaned token must match to be kept.

    Returns:
        A list of lower-cased tokens in order of appearance; duplicates are
        kept so that callers can count mentions.
    """
    genes = []
    for block in block_pattern.findall(text):
        # Genes are separated by commas and/or line breaks.
        for raw_token in re.split(r"[,\n]", block):
            # Drop asterisks (e.g. markdown bold markers) before normalizing.
            token = re.sub(r'\*+', '', raw_token).strip().lower()
            if token and token_pattern.match(token):
                genes.append(token)
    return genes


results = []
# glob returns paths in arbitrary, filesystem-dependent order. Sorting them
# makes the processing order reproducible, which matters because any later
# stable sort of `results` keeps tied files in processing order.
files = sorted(glob.glob('./test_files/*.txt'))

for txt_file_path in files:
    with open(txt_file_path, 'r', encoding='utf-8') as f:
        text_content = f.read()
    all_genes_in_txt = extract_genes(text_content, pattern, valid_gene_pattern)

    # Mention-level statistic: share of all gene mentions that are in gene_set.
    total_occurrences = len(all_genes_in_txt)
    matched_occurrences = sum(1 for gene in all_genes_in_txt if gene in gene_set)
    occurrence_percentage = (matched_occurrences / total_occurrences * 100) if total_occurrences else 0

    # Unique-level statistic: share of distinct genes that are NOT in gene_set.
    unique_genes_in_txt = set(all_genes_in_txt)
    non_matched_unique_genes = unique_genes_in_txt - gene_set
    non_matched_unique_percentage = (len(non_matched_unique_genes) / len(unique_genes_in_txt) * 100) if unique_genes_in_txt else 0

    # Tuple layout: (file name, % matched mentions, % unmatched unique genes,
    # sorted unmatched genes, number of unique genes).
    results.append((os.path.basename(txt_file_path), occurrence_percentage,
                    non_matched_unique_percentage, sorted(non_matched_unique_genes),
                    len(unique_genes_in_txt)))

# Order the report by the percentage of unique genes missing from the gene
# list (tuple index 2), lowest first.
results.sort(key=lambda row: row[2])

# Print a short summary per file, followed by a blank separator line.
for row in results:
    filename, occ_perc, non_match_perc, non_matched_genes, unique_count = row
    missing_listing = ", ".join(non_matched_genes)
    print(f"File: {filename}")
    print(f"-> {occ_perc:.2f}% of all mentions are from the gene list.")
    print(f"Unique genes in file: {unique_count}")
    print(f"-> {non_match_perc:.2f}% of unique genes in the file are not in the gene list.")
    print("Genes in file but NOT in gene list:", missing_listing)
    print()


File: gpt_o3-mini-high-noref-no_scope.txt
-> 100.00% of all mentions are from the gene list.
Unique genes in file: 90
-> 0.00% of unique genes in the file are not in the gene list.
Genes in file but NOT in gene list: 

File: gpt_o3-mini-high-noref.txt
-> 100.00% of all mentions are from the gene list.
Unique genes in file: 68
-> 0.00% of unique genes in the file are not in the gene list.
Genes in file but NOT in gene list: 

