In [26]:
import re

# -----------------------------
# 1. Load the ground-truth gene names (all_genes.txt)
# -----------------------------
# Every non-blank line, with surrounding whitespace trimmed, becomes one
# entry; collecting into a set collapses repeated lines.
genes_file_path = 'all_genes.txt'
with open(genes_file_path, encoding='utf-8') as f:
    # filter(None, ...) discards the empty strings left by blank lines.
    gene_set = set(filter(None, map(str.strip, f)))

print("Unique genes in gene list (ground truth):", len(gene_set))

# -----------------------------
# 2. Load the answers file under test (all_answers_openai.txt)
# -----------------------------
# The file is read in one piece and kept as a single string.
txt_file_path = './test_files/all_answers_openai.txt'
with open(txt_file_path, encoding='utf-8') as f:
    text_content = f.read()

# -----------------------------
# 3. Pull out every "Genes involved:" block with a regex
# -----------------------------
# Each match starts at the literal "Genes involved:" label. Whitespace right
# after the label is skipped, then text is captured lazily (across newlines)
# up to the first of:
#   - a blank line                                   -> \n\s*\n
#   - a new line starting with "Pathway Name:", a
#     numbered item such as "3. ", or five or more
#     "=" characters                                 -> \n(?:Pathway Name:|\d+\.\s|={5,})
#   - the end of the text                            -> \Z
# The terminator is a lookahead, so it is never part of the captured block.
pattern = re.compile(
    r"Genes involved:\s*(.*?)(?=\n\s*\n|\n(?:Pathway Name:|\d+\.\s|={5,})|\Z)",
    flags=re.S,  # let "." match newlines so a block can span several lines
)
gene_blocks = [hit.group(1) for hit in pattern.finditer(text_content)]

# -----------------------------
# 4. Break the captured blocks into individual gene tokens
# -----------------------------
# A token is kept only when it consists entirely of ASCII letters, digits
# and hyphens; anything else (free text, punctuation, underscores) is dropped.
valid_gene_pattern = re.compile(r"^[A-Za-z0-9\-]+$")

all_genes_in_txt = []
for raw_block in gene_blocks:
    # Genes inside a block may be separated by commas, line breaks, or both.
    for piece in re.split(r"[,\n]", raw_block):
        candidate = piece.strip()
        # Skip empty fragments and anything that does not look like a gene name.
        if candidate and valid_gene_pattern.match(candidate):
            all_genes_in_txt.append(candidate)

# Duplicates are kept on purpose: this is a count of mentions, not of genes.
total_occurrences = len(all_genes_in_txt)
print("\nTotal gene mentions in OpenAI file:", total_occurrences)

# -----------------------------
# 5. Mention-level comparison (repeated mentions count each time)
# -----------------------------
# Each extracted mention is checked against the ground-truth set, so a gene
# mentioned several times contributes several times to the totals.
matched_occurrences = sum(gene in gene_set for gene in all_genes_in_txt)
non_matched_occurrences = total_occurrences - matched_occurrences

# With no mentions at all, both percentages are reported as 0 instead of
# dividing by zero.
if total_occurrences:
    occurrence_percentage = (matched_occurrences / total_occurrences) * 100
    non_matched_occurrence_percentage = (non_matched_occurrences / total_occurrences) * 100
else:
    occurrence_percentage = 0
    non_matched_occurrence_percentage = 0

print("Mentions from gene list in OpenAI file:", matched_occurrences)
print(f"-> {occurrence_percentage:.2f}% of all mentions are from the gene list.")

print("Mentions NOT in gene list:", non_matched_occurrences)
print(f"-> {non_matched_occurrence_percentage:.2f}% of all mentions are NOT in the gene list.")

# -----------------------------
# 6. Unique Gene Comparison
# -----------------------------
# Duplicates are collapsed first, so each gene counts once here no matter
# how many times it was mentioned.
unique_genes_in_txt = set(all_genes_in_txt)
print("\nUnique genes found in OpenAI file:", len(unique_genes_in_txt))

# Coverage: the share of the ground-truth list that appears in the file.
# Reported as 0 when the ground-truth list is empty, to avoid dividing by zero.
matched_genes = gene_set & unique_genes_in_txt
coverage_percentage = (len(matched_genes) / len(gene_set)) * 100 if gene_set else 0

print("Ground truth genes found in OpenAI file:", len(matched_genes))
# The gene-list size is taken from the data rather than hard-coded, so the
# message stays correct when all_genes.txt changes.
print(f"-> Of the {len(gene_set)} genes in your gene list, {len(matched_genes)} (or {coverage_percentage:.2f}%) are present in the OpenAI file.")

# Extras: genes extracted from the file that are absent from the ground truth,
# as a share of the unique extracted genes (0 when nothing was extracted).
non_matched_unique_genes = unique_genes_in_txt - gene_set
non_matched_unique_percentage = (len(non_matched_unique_genes) / len(unique_genes_in_txt)) * 100 if unique_genes_in_txt else 0

print("\nUnique genes in OpenAI file NOT in gene list:", len(non_matched_unique_genes))
print(f"-> {non_matched_unique_percentage:.2f}% of unique genes in the OpenAI file are not in the gene list.")

# -----------------------------
# 7. Report the extracted genes that are missing from the gene list
# -----------------------------
print("\nGenes in OpenAI file but NOT in the gene list:")
# Sorted so the listing is stable between runs; one gene per line.
unlisted_genes = sorted(non_matched_unique_genes)
if unlisted_genes:
    # Guarded so that an empty result prints nothing after the header.
    print("\n".join(unlisted_genes))


Unique genes in gene list (ground truth): 250

Total gene mentions in OpenAI file: 576
Mentions from gene list in OpenAI file: 576
-> 100.00% of all mentions are from the gene list.
Mentions NOT in gene list: 0
-> 0.00% of all mentions are NOT in the gene list.

Unique genes found in OpenAI file: 192
Ground truth genes found in OpenAI file: 192
-> Of the 250 genes in your gene list, 192 (or 76.80%) are present in the OpenAI file.

Unique genes in OpenAI file NOT in gene list: 0
-> 0.00% of unique genes in the OpenAI file are not in the gene list.

Genes in OpenAI file but NOT in the gene list:
