# Evaluation of gene name extraction and runtime calculation

# 1) Install libraries and load dataset

In [None]:
import os
import re
import sys
import time
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from matplotlib.colors import LinearSegmentedColormap

# Confirm that the imports above succeeded and report the interpreter in use.
print("Import successful!")
print(f"Current Python version: {sys.version}")

In [None]:
# Directory placeholders and input file names used by the cells below.
# Replace the placeholder strings with real paths before running.
working_directory, input_directory, output_directory = (
    "WORKING_DIRECTORY",
    "INPUT_DIRECTORY",
    "OUTPUT_DIRECTORY",
)
articles_file, genes_file = "articles.csv", "genes.csv"

# Move into the output directory and report where the process now is.
os.chdir(output_directory)
print(f"Current Working Directory: {os.getcwd()}")

# 2) Merge all datasets and visualize performance

In [None]:
# Load the per-approach evaluation files and plot the total number of
# extracted entity mentions for each approach.
os.chdir(working_directory)
print("Current Working Directory:", os.getcwd())

# Evaluation CSV file name -> label used for that approach in tables and plots.
file_mapping = {
    "human_evaluation_results.csv": "Human",
    "string_matching_evaluation_results.csv": "String_Matching",
    "scispacy_evaluation_bionlp13cg.csv": "bionlp13cg",
    "scispacy_evaluation_jnlpba.csv": "jnlpba",
    "scispacy_evaluation_craft.csv": "craft",
    "BioBERT_evaluation_results.csv": "BioBERT",
    "LLM_evaluation_gpt4o.csv": "gpt4o"
}

datasets = {}
for file, model in file_mapping.items():
    file_path = os.path.join(working_directory, file)
    df = pd.read_csv(file_path)
    # Strip stray whitespace from the headers, as the later loading cells do,
    # so the "Sum_Entity_Mentions" lookup below does not fail on padded names.
    df.columns = df.columns.str.strip()
    datasets[model] = df

# One grand total per approach, summed over all rows of its evaluation file.
sum_entity_mentions = {
    model: df["Sum_Entity_Mentions"].sum()
    for model, df in datasets.items()
}

plt.figure(figsize=(12, 6))
# Hand matplotlib concrete lists rather than dict views.
plt.bar(
    list(sum_entity_mentions.keys()),
    list(sum_entity_mentions.values()),
    color="blue",
    alpha=0.7,
)
plt.xlabel("Approach")
plt.ylabel("Total Extracted Genes")
plt.title("Total Number of Extracted Genes Per Approach")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Build a genes x approaches matrix holding, for every gene, the column total
# found in each approach's evaluation file, then save and print it.
os.chdir(working_directory)
print(f"Current Working Directory: {os.getcwd()}")

# Evaluation CSV file name -> label used for that approach.
file_mapping = {
    "human_evaluation_results.csv": "Human",
    "string_matching_evaluation_results.csv": "String_Matching",
    "scispacy_evaluation_bionlp13cg.csv": "bionlp13cg",
    "scispacy_evaluation_jnlpba.csv": "jnlpba",
    "scispacy_evaluation_craft.csv": "craft",
    "BioBERT_evaluation_results.csv": "BioBERT",
    "LLM_evaluation_gpt4o.csv": "gpt4o"
}

datasets = {}
for csv_name, approach in file_mapping.items():
    table = pd.read_csv(os.path.join(working_directory, csv_name))
    # Normalise headers so gene names match regardless of padding.
    table.columns = table.columns.str.strip()
    datasets[approach] = table

print("\n All datasets loaded successfully!")

# The gene list is read relative to the input directory; switch there and back.
os.chdir(input_directory)
genes = pd.read_csv(genes_file, header=None)
gene_list = genes[0].tolist()
print(" Genes import successful!")
os.chdir(working_directory)

# Keep only the genes that appear as columns in the human annotation file.
human_df = datasets["Human"]
human_columns = human_df.columns
actual_gene_columns = [gene for gene in gene_list if gene in human_columns]

# Column totals per approach, inserted in the order of file_mapping.
column_totals = {
    approach: table[actual_gene_columns].sum()
    for approach, table in datasets.items()
}
gene_count_matrix = pd.DataFrame(index=actual_gene_columns)
for approach, totals in column_totals.items():
    gene_count_matrix[approach] = totals

output_file = os.path.join(working_directory, "final_evaluation_matrix.csv")
gene_count_matrix.to_csv(output_file)
print(f"\n Gene count matrix saved as '{output_file}'")

print(gene_count_matrix.to_string())

# 3) Create evaluation matrix and figure

In [None]:
# Create confusion and evaluation matrix: compare each approach's per-gene
# totals with the human annotation totals and derive summary metrics.
os.chdir(working_directory)
print("Current Working Directory:", os.getcwd())

# Evaluation CSV file name -> label used for that approach.
file_mapping = {
    "human_evaluation_results.csv": "Human",
    "string_matching_evaluation_results.csv": "String_Matching",
    "scispacy_evaluation_bionlp13cg.csv": "bionlp13cg",
    "scispacy_evaluation_jnlpba.csv": "jnlpba",
    "scispacy_evaluation_craft.csv": "craft",
    "BioBERT_evaluation_results.csv": "BioBERT",
    "LLM_evaluation_gpt4o.csv": "gpt4o"
}

datasets = {}
for file, model in file_mapping.items():
    file_path = os.path.join(working_directory, file)
    df = pd.read_csv(file_path)
    df.columns = df.columns.str.strip()
    datasets[model] = df

print("\n All datasets loaded successfully!")

# The gene list is read relative to the input directory; switch there and back.
os.chdir(input_directory)
genes = pd.read_csv(genes_file, header=None)
gene_list = genes[0].tolist()
print(" Genes import successful!")
os.chdir(working_directory)

# Only genes present as columns in the human annotation file are evaluated.
human_df = datasets["Human"]
actual_gene_columns = [gene for gene in gene_list if gene in human_df.columns]

gene_count_matrix = pd.DataFrame(index=actual_gene_columns)

for model, df in datasets.items():
    # Sum only the gene columns this file actually contains. A gene column that
    # is missing from a model's file is left as NaN by the index-aligned
    # assignment and becomes 0 in the fillna below, instead of raising KeyError.
    present_columns = [gene for gene in actual_gene_columns if gene in df.columns]
    gene_count_matrix[model] = df[present_columns].sum()

gene_count_matrix = gene_count_matrix.fillna(0).astype(int)
human_counts = gene_count_matrix["Human"]


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0


evaluation_results = {}

for model in datasets:
    if model == "Human":
        continue
    model_counts = gene_count_matrix[model]

    # Per gene: the overlap with the human total counts as true positives, any
    # excess over it as false positives, any shortfall as false negatives.
    TP = np.minimum(model_counts, human_counts).sum()
    FP = (model_counts - human_counts).clip(lower=0).sum()
    FN = (human_counts - model_counts).clip(lower=0).sum()
    # NOTE(review): TN is the number of genes with a zero total on both sides,
    # whereas TP/FP/FN are sums of counts - confirm that mixing these units in
    # specificity and accuracy is intended.
    TN = ((model_counts == 0) & (human_counts == 0)).sum()

    sensitivity = _safe_ratio(TP, TP + FN)
    specificity = _safe_ratio(TN, TN + FP)
    precision = _safe_ratio(TP, TP + FP)
    accuracy = _safe_ratio(TP + TN, TP + TN + FP + FN)
    f1_score = _safe_ratio(2 * (precision * sensitivity), precision + sensitivity)
    jaccard_index = _safe_ratio(TP, TP + FP + FN)

    # The four raw counts come first; the heatmap cell relies on this order.
    evaluation_results[model] = {
        "True Positives": TP,
        "False Positives": FP,
        "False Negatives": FN,
        "True Negatives": TN,
        "Sensitivity (recall)": sensitivity,
        "Specificity": specificity,
        "Precision": precision,
        "Accuracy": accuracy,
        "F1-Score": f1_score,
        "Jaccard index": jaccard_index,
    }

evaluation_df = pd.DataFrame.from_dict(evaluation_results, orient="index")
output_file = os.path.join(working_directory, "confusion_matrix_evaluation.csv")
evaluation_df.to_csv(output_file)
print(f"\n Evaluation results saved as '{output_file}'")
print(evaluation_df)

In [None]:
# Create heatmap figure of the ratio metrics for every evaluated approach.
# The first four columns of evaluation_df hold the raw TP/FP/FN/TN counts,
# so only the columns after them are plotted.
metrics_data = evaluation_df.iloc[:, 4:]
# Span the colour scale over the observed range of all plotted metrics.
vmin = metrics_data.min().min()
vmax = metrics_data.max().max()

custom_cmap = LinearSegmentedColormap.from_list(
    "orange_to_green",
    ["#fbe8a6", "#a3c586", "#004d00"]
)
plt.figure(figsize=(8, 4))
sns.set(style="whitegrid")
heatmap = sns.heatmap(
    metrics_data,
    annot=True,
    fmt=".2f",
    cmap=custom_cmap,
    linewidths=0.5,
    annot_kws={"size": 9},
    vmin=vmin,
    vmax=vmax
)

cbar = heatmap.collections[0].colorbar
cbar.ax.tick_params(labelsize=8)
plt.title("Model evaluation (compared to ground truth)", fontsize=10, color='black')
plt.ylabel("Model", fontsize=9, color='black', labelpad=25)
plt.xlabel("Performance metrics", fontsize=9, color='black')
plt.xticks(rotation=45, fontsize=8, color='black')

# Right-align the rotated labels so each one ends at its tick.
for label in heatmap.get_xticklabels():
    label.set_ha('right')

heatmap.tick_params(
    axis='x',
    which='both',
    bottom=True,
    top=False,
    length=5,
    width=1,
    direction='out',
    pad=2
)

plt.yticks(fontsize=8, color='black')

# Replace internal approach keys with display names in a single pass, so one
# rename cannot overwrite another; unmapped labels are kept unchanged.
display_names = {
    'gpt4o': 'GPT-4o (LLM)',
    'String_Matching': 'String-matching',
}
custom_y_labels = [
    display_names.get(label.get_text(), label.get_text())
    for label in heatmap.get_yticklabels()
]
heatmap.set_yticklabels(custom_y_labels, rotation=0, fontsize=8, color='black')

# Lay out the figure only after the final (possibly longer) labels are in place.
plt.tight_layout()
plt.savefig("heatmap_confusion_matrix.png", dpi=300, bbox_inches="tight")
plt.savefig("heatmap_confusion_matrix.pdf", bbox_inches="tight")
plt.show()

# 4) Calculate runtime

In [None]:
# Switch to the working directory and read the runtime table from it.
os.chdir(working_directory)
runtime_csv = "model_runtimes.csv"
runtime = pd.read_csv(runtime_csv)
print(runtime)

In [None]:
# Bar chart creation for runtime: one horizontal bar per model, labelled with
# the value of the "min" column.
# Harmonise the model names used as axis labels.
runtime["model"] = runtime["model"].replace({
    "LLM": "LLM",
    "BioBERT": "BioBERT",
    "scispaCy": "SciSpaCy",
    "String-matching": "String-matching"
})

# Fix the display order of the models instead of relying on alphabetical sorting.
model_order = ["String-matching", "SciSpaCy", "BioBERT", "LLM"]
runtime["model"] = pd.Categorical(runtime["model"], categories=model_order, ordered=True)
runtime_sorted = runtime.sort_values("model")

plt.figure(figsize=(4, 4))
sns.set(style="whitegrid")

# `width` is barplot's own parameter for the bar thickness along the
# categorical axis. `height` is not a barplot parameter: it would be forwarded
# to matplotlib and collide with the bar extent that seaborn computes itself.
ax = sns.barplot(
    data=runtime_sorted,
    y="model",
    x="min",
    color="darkorange",
    width=0.6
)

# Write each value just to the right of its bar; rows are already in bar order.
for i, v in enumerate(runtime_sorted["min"]):
    ax.text(v + 10, i, f"{int(v):,}", color='black', va='center', fontsize=8)
ax.xaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'{int(x):,}'))
# Leave 20% headroom on the x-axis so the value labels fit inside the axes.
x_max = runtime_sorted["min"].max()
plt.xlim(0, x_max * 1.2)

plt.title("Runtime to process 100,000 articles", fontsize=10, color='black', loc='left', x=-0.05)
plt.xlabel("Time (minutes)", fontsize=9, color='black')
plt.ylabel("Model", fontsize=9, color='black')
plt.xticks(fontsize=8, color='black')
plt.yticks(fontsize=8, color='black')
plt.tight_layout()
plt.savefig("barplot_model_runtimes_narrow.png", dpi=300, bbox_inches="tight")
plt.savefig("barplot_model_runtimes_narrow.pdf", bbox_inches="tight")
plt.show()