In [17]:
import os
import re
import glob

with open('all_genes_1000.txt', 'r', encoding='utf-8') as f:
    gene_set = {line.strip().lower() for line in f if line.strip()}

pattern = re.compile(
    r"(?:Pathway Name:.*?\n\s*|Genes involved:\s*)([^\n]+)"
)

valid_gene_pattern = re.compile(r"^[a-z0-9\-_]+$", re.IGNORECASE)

results = []
files = glob.glob("../../output/test_files/*.txt")
print(files)
for txt_file_path in files:
    filename = os.path.basename(txt_file_path)
    print(f"Processing file: {filename}")
    try:
        with open(txt_file_path, 'r', encoding='utf-8') as f:
            text_content = f.read()
    except Exception as e:
        print(f"Error reading file {filename}: {e}")
        continue

    gene_list_strings = pattern.findall(text_content)

    pathways_count = len(gene_list_strings)
    all_genes_in_txt = []

    for gene_block_string in gene_list_strings:
        tokens = gene_block_string.split(',')
        cleaned_genes = []
        for token in tokens:
            cleaned_token = re.sub(r'\*+', '', token).strip().lower()
            if cleaned_token and valid_gene_pattern.match(cleaned_token):
                 cleaned_genes.append(cleaned_token)
        all_genes_in_txt.extend(cleaned_genes)

    print("ALL_GENES!!!!!!!\n\n", all_genes_in_txt, txt_file_path)

    total_occurrences = len(all_genes_in_txt)
    matched_occurrences = sum(1 for gene in all_genes_in_txt if gene in gene_set)
    occurrence_percentage = (matched_occurrences / total_occurrences * 100) if total_occurrences else 0
    print(f"({occurrence_percentage}, occurrence_percentage)")

    unique_genes_in_txt = set(all_genes_in_txt)
    non_matched_unique_genes = unique_genes_in_txt - gene_set
    non_matched_unique_percentage = (
        len(non_matched_unique_genes) / len(unique_genes_in_txt) * 100 if unique_genes_in_txt else 0
    )
    print("\n\n", non_matched_unique_genes)
    results.append((
        filename,
        occurrence_percentage,
        non_matched_unique_percentage,
        sorted(list(non_matched_unique_genes)),
        len(unique_genes_in_txt),
        pathways_count
    ))

results.sort(key=lambda x: x[2], reverse=False)

print("\n--- FINAL RESULTS ---")
for filename, occ_perc, non_match_perc, non_matched_genes, unique_count, pathways_count in results:
    print(f"File: {filename}")
    print(f"-> {occ_perc:.2f}% of all mentions are from the gene list.")
    print(f"Unique genes in file: {unique_count}")
    print(f"-> {non_match_perc:.2f}% of unique genes in the file are not in the gene list.")
    print("Genes in file but NOT in gene list:", ", ".join(non_matched_genes))
    print(f"Total returned pathways (gene lists found): {pathways_count}")
    print()

config_pathways = {}
for filename, occ_perc, non_match_perc, non_matched_genes, unique_count, pathways_count in results:
    parts = filename.split("_", 1)
    if len(parts) > 1:
        config = parts[1].rsplit(".txt", 1)[0]
    else:
        config = filename.rsplit(".txt", 1)[0]

    if config not in config_pathways:
        config_pathways[config] = []
    config_pathways[config].append(pathways_count)

def average(lst):
    return sum(lst) / len(lst) if lst else 0

print("Average returned pathways per configuration:")
for config, counts in config_pathways.items():
    avg_count = average(counts)
    print(f"{config}: {avg_count:.2f} pathways on average over {len(counts)} runs")

['../../output/test_files\\o3-mini-GSEA-1.txt', '../../output/test_files\\o3-mini-without-rag-without-scope-1.txt']
Processing file: o3-mini-GSEA-1.txt
ALL_GENES!!!!!!!

 ['sema4f', 'nrp1', 'nrp2', 'plxnb1', 'slit2', 'robo2', 'efna5', 'epha4', 'cdh13', 'itga6', 'itga7', 'itga8', 'vangl2', 'sema3g', 'mpz', 'mbp', 'plp1', 'mag', 'prx', 'pmp22', 'cldn19', 'mal', 'col9a3', 'col15a1', 'col16a1', 'hapln1', 'cntn6', 'lama4', 'fbln1', 'sparc', 'thbs2', 'emilin1', 'matn3', 'col5a2', 'col7a1', 'col8a2', 'col14a1', 'col20a1', 'apod', 'fabp5', 'gstz1', 'scd', 'acadl', 'hadha', 'cers6', 'sgms1', 'plpp1', 'htra1', 'eif2ak2', 'atf3', 'atf4', 'creb3', 'eif2s1', 'mapk1', 'grb14', 'sos1', 'mapk8ip1', 'pak1', 'pak2', 'map2k6', 'wnt5a', 'lef1', 'ctnnb1', 'ctsd', 'ctsl', 'atg3', 'fabp5', 'acadl', 'hadha', 'bdnf', 'synpr', 'vamp2', 'grb14', 'gabpa', 'adcy1', 'cdk5r1', 'psma6', 'psmc3', 'psmd2', 'ube2l3', 'itga6', 'itga7', 'itga8', 'itga1', 'hes6', 'stat3', 'rhoj', 'rhpn2', 'cdc42se2'] ../../output/test_file

In [2]:
import os
import re
import glob

# Load the complete and the short gene lists.
with open('all_genes_complete.txt', 'r', encoding='utf-8') as f:
    all_gene_set = {line.strip().lower() for line in f if line.strip()}

with open('all_genes_1000.txt', 'r', encoding='utf-8') as f:
    gene_set = {line.strip().lower() for line in f if line.strip()}

pattern = re.compile(
    r"Genes involved:\s*(.*?)(?=\n\s*\n|\n(?:Pathway Name:|\d+\.\s|={5,})|\Z)",
    re.DOTALL,
)
valid_gene_pattern = re.compile(r"^[A-Za-z0-9\-_]+$")

results = []
files = glob.glob('./test_files/*.txt')

for txt_file_path in files:
    with open(txt_file_path, 'r', encoding='utf-8') as f:
        text_content = f.read()
    gene_blocks = pattern.findall(text_content)
    all_genes_in_txt = []
    for block in gene_blocks:
        tokens = re.split(r"[,\n]", block)
        tokens = [re.sub(r'\*+', '', token).strip().lower() for token in tokens if token.strip()]
        genes = [token for token in tokens if valid_gene_pattern.match(token)]
        all_genes_in_txt.extend(genes)
    
    # Total gene mentions in the file
    total_occurrences = len(all_genes_in_txt)
    
    # Count of gene mentions that are in the short gene list (all_genes.txt)
    matched_occurrences = sum(1 for gene in all_genes_in_txt if gene in gene_set)
    occurrence_percentage = (matched_occurrences / total_occurrences * 100) if total_occurrences else 0
    
    # Hallucination: gene mentions not in the short list.
    hallucinated_occurrences = total_occurrences - matched_occurrences
    hallucination_percentage_total = (hallucinated_occurrences / total_occurrences * 100) if total_occurrences else 0

    # Count of gene mentions that are in the complete gene list (all_genes_complete.txt)
    matched_complete_occurrences = sum(1 for gene in all_genes_in_txt if gene in all_gene_set)
    hallucination_percentage_complete = ((total_occurrences - matched_complete_occurrences) / total_occurrences * 100) if total_occurrences else 0

    # Among the hallucinated mentions (not in the short list), count how many are present in the complete list.
    hallucinated_in_complete = sum(1 for gene in all_genes_in_txt if gene not in gene_set and gene in all_gene_set)
    perc_halluc_in_complete = (hallucinated_in_complete / hallucinated_occurrences * 100) if hallucinated_occurrences else 0

    # Store relevant values.
    results.append(
        (
            os.path.basename(txt_file_path),
            occurrence_percentage,
            hallucination_percentage_total,
            hallucination_percentage_complete,
            perc_halluc_in_complete,
            matched_occurrences,
            total_occurrences,
            matched_complete_occurrences,
            hallucinated_occurrences,
            hallucinated_in_complete,
            all_genes_in_txt,  
        )
    )

results.sort(key=lambda x: x[2], reverse=False)

for (filename,
     occ_perc,
     halluc_total,
     halluc_complete,
     perc_halluc_in_complete,
     matched_occurrences,
     total_occurrences,
     matched_complete_occurrences,
     hallucinated_occurrences,
     hallucinated_in_complete,
     all_genes_in_txt) in results:
    
    print(f"File: {filename}")
    print(f"-> {occ_perc:.2f}% of all mentions are from the short gene list (all_genes.txt) "
          f"({matched_occurrences} out of {total_occurrences}).")
    print(f"-> {halluc_total:.2f}% of all mentions are hallucinated (not in the short gene list) "
          f"({hallucinated_occurrences} out of {total_occurrences}).")
    print(f"-> {halluc_complete:.2f}% of all mentions are hallucinated (not in the complete gene list) "
          f"({total_occurrences - matched_complete_occurrences} out of {total_occurrences}).")
    print(f"-> {perc_halluc_in_complete:.2f}% of the hallucinated mentions are present in the complete gene list "
          f"({hallucinated_in_complete} out of {hallucinated_occurrences}).")
    
    # List genes that are not in the short gene list.
    genes_not_in_short = [gene for gene in all_genes_in_txt if gene not in gene_set]
    print("Genes in file but NOT in the short gene list:", ", ".join(genes_not_in_short))
    print()

o3_mini_high_percents = []
o1_preview_percents = []
gpt4o_percents = []

o3_mini_high_halluc_complete_percents = []
o1_preview_halluc_complete_percents = []
gpt4o_halluc_complete_percents = []

o3_mini_high_in_complete_percents = []
o1_preview_in_complete_percents = []
gpt4o_in_complete_percents = []

gpt4o_names = {"answer1.txt", "answer2.txt", "answer3.txt", "answer4.txt", "answer5.txt"}

for (filename,
     occ_perc,
     halluc_total,
     halluc_complete,
     perc_halluc_in_complete,
     matched_occurrences,
     total_occurrences,
     matched_complete_occurrences,
     hallucinated_occurrences,
     hallucinated_in_complete,
     all_genes_in_txt) in results:
    
    if "o3-mini-high" in filename:
        o3_mini_high_percents.append(halluc_total)
        o3_mini_high_halluc_complete_percents.append(halluc_complete)
        o3_mini_high_in_complete_percents.append(perc_halluc_in_complete)
    elif "o1-preview" in filename:
        o1_preview_percents.append(halluc_total)
        o1_preview_halluc_complete_percents.append(halluc_complete)
        o1_preview_in_complete_percents.append(perc_halluc_in_complete)
    elif filename in gpt4o_names:
        gpt4o_percents.append(halluc_total)
        gpt4o_halluc_complete_percents.append(halluc_complete)
        gpt4o_in_complete_percents.append(perc_halluc_in_complete)

def average(lst):
    return sum(lst) / len(lst) if lst else 0

avg_o3 = average(o3_mini_high_percents)
avg_o1 = average(o1_preview_percents)
avg_gpt4o = average(gpt4o_percents)

avg_o3_halluc_complete = average(o3_mini_high_halluc_complete_percents)
avg_o1_halluc_complete = average(o1_preview_halluc_complete_percents)
avg_gpt4o_halluc_complete = average(gpt4o_halluc_complete_percents)

avg_o3_in_complete = average(o3_mini_high_in_complete_percents)
avg_o1_in_complete = average(o1_preview_in_complete_percents)
avg_gpt4o_in_complete = average(gpt4o_in_complete_percents)

print("Average Hallucination Percentages (Total Occurrences):")
print(f"o3-mini-high files: {avg_o3:.2f}% (short list), {avg_o3_halluc_complete:.2f}% (complete list), "
      f"with {avg_o3_in_complete:.2f}% of hallucinated mentions in the complete gene list")
print(f"o1-preview files: {avg_o1:.2f}% (short list), {avg_o1_halluc_complete:.2f}% (complete list), "
      f"with {avg_o1_in_complete:.2f}% of hallucinated mentions in the complete gene list")
print(f"GPT4o (answer1.txt to answer5.txt): {avg_gpt4o:.2f}% (short list), {avg_gpt4o_halluc_complete:.2f}% (complete list), "
      f"with {avg_gpt4o_in_complete:.2f}% of hallucinated mentions in the complete gene list")


Average Hallucination Percentages (Total Occurrences):
o3-mini-high files: 0.00% (short list), 0.00% (complete list), with 0.00% of hallucinated mentions in the complete gene list
o1-preview files: 0.00% (short list), 0.00% (complete list), with 0.00% of hallucinated mentions in the complete gene list
GPT4o (answer1.txt to answer5.txt): 0.00% (short list), 0.00% (complete list), with 0.00% of hallucinated mentions in the complete gene list


In [None]:
import pyperclip
import re

# Get user input
user_input = input("Enter a string: ")

match = re.search(r"(genes involved:|genes:)\s*(.*)", user_input, re.IGNORECASE)

if match:
    genes_string = match.group(2)  
    modified_string = genes_string.replace(', ', ' ')  
    
    # Copy to clipboard
    pyperclip.copy(modified_string)
    
    print("Extracted genes copied to clipboard:", modified_string)
else:
    print("No matching pattern found.")
