# Evaluation and confusion matrix creation

# 1) Set up libraries and datasets

In [None]:
import os
import re
import sys
import time
import logging
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from pathlib import Path
from functools import reduce
from collections import Counter
from datetime import datetime, timedelta
from sklearn.metrics import f1_score, precision_score, recall_score

print("Success!")

In [None]:
# Set the working directory and file paths.
# NOTE(review): the values below look like placeholders — replace them with
# real paths before running the notebook.
# Directory that holds the input CSV files; later cells chdir into it and read from it.
working_directory = "INPUT_DIRECTORY"
# NOTE(review): not referenced anywhere in the visible part of this notebook.
NLP_directory = "NLP_DIRECTORY"
# NOTE(review): not referenced anywhere in the visible part of this notebook;
# the final figure is saved relative to the current working directory instead.
output_directory = "OUTPUT_DIRECTORY"

In [None]:
# Merge the per-model LLM extraction files (and the human annotation file, if
# present) into a single table keyed on PaperId.
os.chdir(working_directory)

# os.listdir() returns entries in arbitrary order; sorting keeps the choice of
# the metadata source file and the order of the response columns reproducible.
llm_files = sorted(
    f for f in os.listdir()
    if f.startswith("LLM_variant_extraction_") and f.endswith("_prompt3.csv")
)
if not llm_files:
    # Without this guard the merge below fails with an opaque TypeError
    # (reduce() on an empty list, or a merge against meta_df=None).
    raise FileNotFoundError(
        f"No 'LLM_variant_extraction_*_prompt3.csv' files found in {os.getcwd()!r}"
    )

df_list = []

meta_columns = ["PaperId", "PaperTitle", "Abstract"]
meta_df = None

for file in llm_files:
    # The model name is whatever sits between the fixed prefix and suffix.
    model_match = re.search(r"LLM_variant_extraction_(.*?)_prompt3\.csv", file)
    model_name = model_match.group(1) if model_match else f"model_{len(df_list)+1}"

    df = pd.read_csv(file)
    # Unparseable ids are coerced to 0 so the key column is int64 in every frame.
    df["PaperId"] = pd.to_numeric(df["PaperId"], errors="coerce").fillna(0).astype("int64")

    # Paper metadata is taken from the first file only.
    if meta_df is None:
        meta_df = df[meta_columns]

    # The model response is taken to be the last column of each file.
    response_column = df.columns[-1]
    df = df[["PaperId", response_column]].rename(columns={response_column: f"LLM_Response_{model_name}"})

    df_list.append(df)

# Load human variant extraction file
human_file = "Human_variant_extraction.csv"
if os.path.exists(human_file):
    human_df = pd.read_csv(human_file)
    human_df["PaperId"] = pd.to_numeric(human_df["PaperId"], errors="coerce").fillna(0).astype("int64")
    human_col = human_df.columns[-1]
    human_df = human_df[["PaperId", human_col]].rename(columns={human_col: "Human_analysis"})
    df_list.append(human_df)
else:
    # Keep the best-effort behaviour, but make the omission visible.
    print(f"Warning: '{human_file}' not found; the merged dataset will have no 'Human_analysis' column.")

# Outer-join all response frames on PaperId, then attach them to the metadata.
merged_df = reduce(lambda left, right: pd.merge(left, right, on="PaperId", how="outer"), df_list)
Variant_LLM_dataset_Evaluation = pd.merge(meta_df, merged_df, on="PaperId", how="left")
Variant_LLM_dataset_Evaluation = Variant_LLM_dataset_Evaluation.sort_values(by="PaperId").reset_index(drop=True)

print("Merged dataset shape:", Variant_LLM_dataset_Evaluation.shape)
print("Column names:", Variant_LLM_dataset_Evaluation.columns.tolist())

In [None]:
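# Manual step: check the merged data by hand and combine it with the NLP results into one common file (the next cell reads "merged_variant_extraction_llm_nlp_human.csv").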

In [None]:
# Read the combined LLM / NLP / human table and print a quick overview of it.
file_path = os.path.join(working_directory, "merged_variant_extraction_llm_nlp_human.csv")
confusion_matrix_df = pd.read_csv(file_path)

# Shape, column names and the first 20 rows, in that order.
for overview in (
    confusion_matrix_df.shape,
    confusion_matrix_df.columns.tolist(),
    confusion_matrix_df.head(20),
):
    print(overview)

## RUN EVALUATION

In [None]:
# Columns holding the automatic extractions and the human reference
model_columns = [
    'LLM_Response_llama31-70b',
    'LLM_Response_gpt4o',
    'LLM_Response_llama33-70b',
    'LLM_Response_deepseek_v3',
    'NLP'
]
human_col = 'Human_analysis'


### --- Part 1: Exact Row Match (Model vs Human) ---
def _stripped_text(column_name):
    """Return the column with missing cells as "" and surrounding whitespace removed."""
    return confusion_matrix_df[column_name].fillna("").str.strip()


exact_match_counts = {}

for model in model_columns:
    # A row matches when the whole cell text is identical after stripping.
    match_series = _stripped_text(model) == _stripped_text(human_col)
    n_matches = match_series.sum()
    n_rows = len(match_series)
    exact_match_counts[model] = {
        "Total Matches": n_matches,
        "Total Rows": n_rows,
        "Match Percentage": 100 * n_matches / n_rows
    }

# One row per model, one column per statistic.
exact_match_df = pd.DataFrame(exact_match_counts).T
print("Exact string match results:")
print(exact_match_df)


### --- Part 2: Count Variant-Gene Pairs in Each Column ---
def count_variant_gene_pairs(text):
    """Count the "Variant: ..., Gene: ..." entries contained in a cell.

    Args:
        text: Cell content. Anything that is not a string (NaN, None,
            numeric cells produced by ``read_csv``) is treated as holding
            no pairs instead of raising ``TypeError`` inside ``re``.

    Returns:
        Number of matches; repeated pairs are counted every time they occur.
    """
    if not isinstance(text, str):
        return 0
    # Each entry is terminated by a newline or the end of the text; the
    # labels are matched case-insensitively.
    pattern = r'Variant:\s*.+?,\s*Gene:\s*.+?(?:\n|$)'
    return len(re.findall(pattern, text, re.IGNORECASE))

# Per-column statistics, keyed by column name
variant_gene_counts = {}
variant_gene_row_counts = {}

for col in [*model_columns, human_col]:
    # Number of variant-gene pairs found in every row of this column
    counts = confusion_matrix_df[col].apply(count_variant_gene_pairs)
    rows_with_pairs = counts > 0
    rows_without_pairs = counts == 0

    # Part 2: overall total and the largest count found in one row
    variant_gene_counts[col] = {
        "Total Variant-Gene Pairs": counts.sum(),
        "Max in Single Entry": counts.max()
    }

    # Part 3: how many rows contain at least one pair, and how many none
    variant_gene_row_counts[col] = {
        "Rows With ≥1 Variant-Gene Pair": rows_with_pairs.sum(),
        "Rows With 0 Variant-Gene Pairs": rows_without_pairs.sum(),
        "Total Rows": len(counts),
    }

# One row per column, one column per statistic.
variant_gene_df = pd.DataFrame(variant_gene_counts).T
variant_detection_df = pd.DataFrame(variant_gene_row_counts).T

print("\nVariant-Gene pair counts (Total and Max):")
print(variant_gene_df)

print("\nRows with at least one Variant-Gene pair detected:")
print(variant_detection_df)

In [None]:
# Load data: the combined LLM / NLP / human table, read again into `df`
# so that this cell does not depend on `confusion_matrix_df` from the cell above.
file_path = os.path.join(working_directory, "merged_variant_extraction_llm_nlp_human.csv")
df = pd.read_csv(file_path)

# Define columns: the automatic extractions to evaluate ...
model_columns = [
    'LLM_Response_llama31-70b',
    'LLM_Response_gpt4o',
    'LLM_Response_llama33-70b',
    'LLM_Response_deepseek_v3',
    'NLP'
]
# ... and the column used as the reference they are compared against.
human_col = 'Human_analysis'

# Extract variant-gene pairs from text
def extract_pairs(text):
    """Return the distinct "Variant: ..., Gene: ..." pairs found in a cell.

    Args:
        text: Cell content. Anything that is not a string (NaN, None,
            numeric cells produced by ``read_csv``) yields an empty set
            instead of raising ``TypeError`` inside ``re``.

    Returns:
        A set of normalized strings ``"Variant: <variant>, Gene: <gene>"``
        with surrounding whitespace stripped from both parts. Labels are
        matched case-insensitively; the variant and gene text itself is
        kept as written, so pairs differing only in case stay distinct.
    """
    if not isinstance(text, str):
        return set()
    # Each entry is terminated by a newline or the end of the text.
    pattern = r'Variant:\s*(.+?),\s*Gene:\s*(.+?)(?:\n|$)'
    matches = re.findall(pattern, text, re.IGNORECASE)
    return {f"Variant: {v.strip()}, Gene: {g.strip()}" for v, g in matches}

# Containers for the per-model metrics and a few example errors
results = {}
examples = {}


def _ratio_or_zero(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0


# The reference annotations are identical for every model, so parse them once.
human_pair_sets = [extract_pairs(cell) for cell in df[human_col]]

# Evaluate each model against the human reference
for model in model_columns:
    tp = fp = fn = tn = 0
    false_positives, false_negatives = [], []

    for idx, human_pairs, model_cell in zip(df.index, human_pair_sets, df[model]):
        model_pairs = extract_pairs(model_cell)

        extra_pairs = model_pairs - human_pairs      # reported by the model only
        missed_pairs = human_pairs - model_pairs     # present in the reference only

        # TP / FP / FN are counted per variant-gene pair ...
        tp += len(model_pairs & human_pairs)
        fp += len(extra_pairs)
        fn += len(missed_pairs)
        # ... whereas a true negative is a whole row in which neither side has any pair.
        if not (human_pairs or model_pairs):
            tn += 1

        if extra_pairs:
            false_positives.append({"Row": idx, "Pairs": list(extra_pairs)})
        if missed_pairs:
            false_negatives.append({"Row": idx, "Pairs": list(missed_pairs)})

    # Metrics (each falls back to 0 when its denominator is 0)
    precision = _ratio_or_zero(tp, tp + fp)
    recall = _ratio_or_zero(tp, tp + fn)
    f1 = _ratio_or_zero(2 * precision * recall, precision + recall)
    accuracy = _ratio_or_zero(tp + tn, tp + fp + fn + tn)
    specificity = _ratio_or_zero(tn, tn + fp)

    results[model] = {
        "True Positives (TP)": tp,
        "False Positives (FP)": fp,
        "False Negatives (FN)": fn,
        "True Negatives (TN)": tn,
        "F1 Score": f1,
        "Sensitivity (Recall)": recall,
        "Specificity": specificity,
        "Precision": precision,
        "Accuracy": accuracy
    }

    # Keep only the first three offending rows of each kind for display.
    examples[model] = {
        "False Positives": false_positives[:3],
        "False Negatives": false_negatives[:3]
    }

# Create results DataFrame: one row per model, one column per metric
results_df = pd.DataFrame(results).T

# Figure 1: bar chart of the F1 score of every evaluated column.
plt.figure(figsize=(10, 6))
# reset_index() moves the model names out of the index into a column that is
# then addressed as 'index' on the x axis.
# NOTE(review): recent seaborn releases may warn when `palette` is given
# without `hue` — confirm against the installed version.
sns.barplot(data=results_df.reset_index(), x='index', y='F1 Score', palette="mako")
plt.title("F1 Scores by LLM Model")
plt.ylabel("F1 Score")
plt.xlabel("LLM Model")
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.tight_layout()
plt.show()

# Figure 2: one line per metric (precision, recall, accuracy) across the models.
plt.figure(figsize=(10, 6))
for metric in ["Precision", "Sensitivity (Recall)", "Accuracy"]:
    plt.plot(results_df.index, results_df[metric], marker='o', label=metric)

plt.title("Model Performance Metrics")
plt.ylabel("Score")
plt.xlabel("LLM Model")
# Upper limit slightly above 1 so that markers at 1.0 are not clipped.
plt.ylim(0, 1.05)
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Print the stored example errors (row label and offending pairs) for every model
def _print_example_rows(entries):
    """Print one indented line per stored example."""
    for entry in entries:
        print(f"  Row {entry['Row']} → {entry['Pairs']}")


for model in model_columns:
    model_examples = examples[model]
    print(f"\nModel: {model}")
    print("False Positives (examples):")
    _print_example_rows(model_examples["False Positives"])
    print("False Negatives (examples):")
    _print_example_rows(model_examples["False Negatives"])


In [None]:
# Display names used in the figures for each result column
new_model_names = {
    'LLM_Response_deepseek_v3': 'Deepseek_v3',
    'LLM_Response_gpt4o': 'GPT-4o',
    'LLM_Response_llama31-70b': 'LLaMA3.1-70b',
    'LLM_Response_llama33-70b': 'LLaMA3.3-70b',
    'NLP':'en_ner_bionlp13cg_md'
}

# Row order of the tables and heatmaps
ordered_models = [
    'LLM_Response_deepseek_v3',
    'LLM_Response_gpt4o',
    'LLM_Response_llama31-70b',
    'LLM_Response_llama33-70b',
    'NLP'
]

# Metric groups: raw confusion counts and derived scores
count_metrics = ["True Positives (TP)", "False Positives (FP)", "False Negatives (FN)", "True Negatives (TN)"]
score_metrics = ["Precision", "Sensitivity (Recall)", "Accuracy","F1 Score"]

# Slice the results into the two tables and relabel their rows.
counts_df, scores_df = (
    results_df.loc[ordered_models, metric_group].copy()
    for metric_group in (count_metrics, score_metrics)
)
for table in (counts_df, scores_df):
    table.index = [new_model_names[label] for label in table.index]

# Colour scale for the count heatmap: every column is expressed relative to
# its own maximum. For error counts the scale is inverted, so fewer errors
# give a higher percentage.
counts_col_percent_df = counts_df.copy()
error_columns = ("False Positives (FP)", "False Negatives (FN)")

for metric_name in counts_col_percent_df.columns:
    values = counts_col_percent_df[metric_name]
    column_max = values.max()
    is_error_column = metric_name in error_columns
    if column_max > 0:
        share = values / column_max
        counts_col_percent_df[metric_name] = ((1 - share) if is_error_column else share) * 100
    elif is_error_column:
        # No errors in any model: every model gets the best value.
        counts_col_percent_df[metric_name] = 100

            
            
###### Side-by-side heatmaps: confusion counts (left) and performance scores (right)
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
confusion_ax, metrics_ax = axes

# Left panel: colours follow the relative percentages, cell labels show the raw counts.
sns.heatmap(
    counts_col_percent_df,
    annot=counts_df,
    fmt=".0f",
    cmap="Greens",
    linewidths=0.5,
    ax=confusion_ax,
    cbar_kws={'label': 'Relative comparison (%)'},
)

# Right panel: the scores themselves drive both colour and label.
sns.heatmap(
    scores_df,
    annot=True,
    fmt=".2f",
    cmap="Blues",
    linewidths=0.5,
    ax=metrics_ax,
    cbar_kws={'label': 'Score'},
)

# Titles, axis labels and tick orientation for both panels
panel_settings = (
    (confusion_ax, "Confusion matrix", "Model / approach"),
    (metrics_ax, "Performance metrics", ""),
)
for panel, panel_title, y_label in panel_settings:
    panel.set_title(panel_title)
    panel.set_xlabel("")
    panel.set_ylabel(y_label)
    panel.tick_params(axis='y', rotation=0)
    # Rotate x-ticks for readability
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
# The file name is relative, so the image lands in the current working directory.
output_path = "final_llm_nlp_human_evaluations.png"
plt.savefig(output_path, dpi=300, bbox_inches='tight')
print(f"Plot saved successfully as '{output_path}'.")
plt.show()