In [None]:
# Input locations. Both are relative paths, so they resolve against the
# kernel's current working directory.
# Reference dataset that the test file is scored against (reported as
# "original" in the mismatch samples further down).
canonical_csv_path = "../../data/csv/fake_canonical.csv"
# Dataset under evaluation — presumably generative-model output, judging by
# the file name; confirm against the data source.
test_csv_path = "../../data/csv/fake_genai.csv"

In [None]:
import pandas as pd

# Load the canonical (reference) and test (model output) datasets. Sorting by
# "id" and resetting the index gives each frame a clean 0..n-1 index, which the
# evaluation cells below rely on for row-by-row access.
canonical_csv = pd.read_csv(canonical_csv_path).sort_values(by="id").reset_index(drop=True)
test_csv = pd.read_csv(test_csv_path).sort_values(by="id").reset_index(drop=True)

# Only columns present in BOTH files can be compared. Selecting every test
# column from the canonical frame raises KeyError as soon as the test file
# carries an extra column, so report such columns and skip them instead.
common_columns = test_csv.columns[test_csv.columns.isin(canonical_csv.columns)]
extra_columns = [col for col in test_csv.columns if col not in canonical_csv.columns]
if extra_columns:
    print(f"⚠️ Ignoring test columns missing from the canonical file: {extra_columns}")

# The evaluation pairs rows purely by position, so both frames must hold the
# same ids in the same order. Drop records that have no counterpart (and say
# so) rather than letting one missing id shift every following comparison.
canonical_unmatched = ~canonical_csv["id"].isin(test_csv["id"])
test_unmatched = ~test_csv["id"].isin(canonical_csv["id"])
if canonical_unmatched.any() or test_unmatched.any():
    print(
        "⚠️ Dropping rows without a counterpart: "
        f"{int(canonical_unmatched.sum())} canonical, {int(test_unmatched.sum())} test"
    )

# Limit canonical to the shared columns and both frames to the shared ids
canonical_csv = canonical_csv.loc[~canonical_unmatched, common_columns].reset_index(drop=True)
test_csv = test_csv.loc[~test_unmatched].reset_index(drop=True)

# Duplicate (or missing) ids can still leave the frames out of step; fail
# loudly here instead of producing a misleading report later.
ids_aligned = len(canonical_csv) == len(test_csv) and bool(
    (canonical_csv["id"].to_numpy() == test_csv["id"].to_numpy()).all()
)
if not ids_aligned:
    raise ValueError(
        "canonical and test ids cannot be aligned row-by-row "
        "(check for duplicate or missing ids)"
    )

print(f"Comparing the following columns: {list(common_columns)}")

In [None]:
# ML Model Output Evaluation Report

# Define comparison function for strict match
def _normalize_field(value):
    """Return the canonical text form of a single cell used for matching."""
    # Missing cells (NaN / None / pd.NA) all mean "no value". Map them to the
    # empty string so they never collide with the literal text "nan"/"none".
    if pd.isna(value):
        return ""
    # pandas upcasts an integer column to float as soon as it contains a blank,
    # so the same number can arrive as 5 in one file and 5.0 in the other.
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    return str(value).strip().lower()


def compare_fields(val1, val2):
    """Return True when two cell values match.

    Matching ignores case and surrounding whitespace. Missing values are
    treated as empty, and integral floats compare equal to the corresponding
    integer (``5.0`` matches ``5``).

    Args:
        val1: Scalar cell value from the canonical dataset.
        val2: Scalar cell value from the test dataset.

    Returns:
        bool: True if the normalized values are identical.
    """
    return _normalize_field(val1) == _normalize_field(val2)

# Setup: containers and counters that the evaluation loop below fills in
accuracy_report = dict()
examples = list()
total_rows = test_csv.shape[0]
# Every shared column except the "id" key is scored
comparison_columns = [name for name in common_columns if not name == "id"]
# One running tally of matching fields per record
row_correct_counts = [0 for _ in range(total_rows)]

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
# Plot defaults for figures created from here on: a [10, 5] default figure
# size and seaborn's "whitegrid" theme.
plt.rcParams.update({"figure.figsize": [10, 5]})
sns.set_theme(style="whitegrid")

# Column-wise accuracy tracking
MAX_EXAMPLES_PER_COLUMN = 5  # cap on the mismatch samples kept per column
total_correct = 0
record_ids = test_csv["id"]
for column in comparison_columns:
    # Look the columns up once per column instead of once per row
    canonical_values = canonical_csv[column]
    test_values = test_csv[column]
    correct = 0
    diffs = []
    for i in range(total_rows):
        # Positional access: rows of the two frames are paired by position
        val1 = canonical_values.iloc[i]
        val2 = test_values.iloc[i]
        if compare_fields(val1, val2):
            correct += 1
            row_correct_counts[i] += 1
        elif len(diffs) < MAX_EXAMPLES_PER_COLUMN:
            diffs.append({
                "id": record_ids.iloc[i],
                # Show missing values as blanks rather than "nan"
                "original": "" if pd.isna(val1) else val1,
                "test": "" if pd.isna(val2) else val2
            })
    total_correct += correct
    accuracy_report[column] = {
        "correct": correct,
        "total": total_rows,
        # An empty test set has nothing to score: report 0.0 instead of
        # raising ZeroDivisionError
        "accuracy": round(correct / total_rows, 3) if total_rows else 0.0,
        "examples": diffs
    }

# Total accuracy score: matching fields over all scored fields. Guarded so an
# empty dataset or an empty column list yields 0.0 rather than a crash.
total_fields = total_rows * len(comparison_columns)
total_accuracy = round(total_correct / total_fields, 3) if total_fields else 0.0
print("\n🧠 Model Accuracy Summary")
print(f"✅ Total Accuracy Across All Fields: {total_accuracy * 100:.1f}%\n")

# Accuracy per column: one row per field, a single "accuracy" column
per_field_accuracy = {}
for field_name, field_stats in accuracy_report.items():
    per_field_accuracy[field_name] = {"accuracy": field_stats["accuracy"]}
summary_df = pd.DataFrame.from_dict(per_field_accuracy, orient="index")
print("📊 Per-Field Accuracy:")
display(summary_df)

# Visual: Barplot of per-column accuracy, drawn on an explicit Axes
fig, ax = plt.subplots()
sns.barplot(x=summary_df.index, y=summary_df["accuracy"], ax=ax)
ax.set_title("Model Accuracy by Field")
ax.set_ylabel("Accuracy")
ax.set_xlabel("Field")
ax.set_ylim(0, 1)  # accuracy is a 0-1 fraction; keep the scale fixed
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Visual: Histogram of row accuracy
fields_per_row = len(comparison_columns)
row_accuracy = pd.DataFrame({
    "id": test_csv["id"],
    "correct_fields": row_correct_counts,
    "total_fields": fields_per_row,
    # With no comparable fields there is nothing to score: report 0.0 rather
    # than raising ZeroDivisionError
    "row_accuracy": [
        round(c / fields_per_row, 3) if fields_per_row else 0.0
        for c in row_correct_counts
    ]
})

plt.figure()
# binrange pins the ten bins to the fixed 0-1 axis. Without it the bins are
# derived from the observed min/max, so the bars drift relative to the axis
# (and get clipped by xlim) when the values are concentrated.
sns.histplot(row_accuracy["row_accuracy"], bins=10, binrange=(0, 1), kde=True)
plt.title("Distribution of Accuracy per Record")
plt.xlabel("Row Accuracy")
plt.ylabel("Number of Records")
plt.xlim(0, 1)
plt.tight_layout()
plt.show()

# Detailed field-level diffs: show the stored mismatch samples for every
# column that has at least one
for col, data in accuracy_report.items():
    mismatches = data["examples"]
    if not mismatches:
        continue
    print(f"\n❌ Sample Mismatches in Column: {col}")
    display(pd.DataFrame(mismatches))



# 🔍 Diff View of Test CSV
from difflib import SequenceMatcher

def similarity(a, b):
    """Return the SequenceMatcher ratio (0-1) between two values.

    Both values are converted with ``str()``, stripped of surrounding
    whitespace and lower-cased before being compared.
    """
    left = str(a).strip().lower()
    right = str(b).strip().lower()
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()

def highlight_diffs(row):
    """Render one record of the test dataset as an HTML ``<tr>`` string.

    Each compared column becomes a ``<td>``. Cells whose value fully matches
    the canonical value get a green background; other cells are shaded from
    yellow (close) to red (very different) and also show the canonical value
    underneath.

    Args:
        row: A row of ``test_csv`` as passed by ``DataFrame.apply(axis=1)``;
            ``row.name`` is used to look up the same row in both frames.

    Returns:
        str: The ``<tr>...</tr>`` markup for the record.
    """
    from html import escape  # local import keeps this definition self-contained

    cells = []
    for col in comparison_columns:
        test_val = test_csv.at[row.name, col]
        canon_val = canonical_csv.at[row.name, col]
        sim = similarity(test_val, canon_val)
        # Cell contents are arbitrary dataset/model text. Escape them so that
        # characters such as "<" or "&" are shown literally instead of being
        # interpreted as markup (which could break the table or inject HTML).
        shown_test = escape(str(test_val))
        if sim == 1:
            style = "background-color: #e6ffe6"  # greenish for match
            display_val = shown_test
        else:
            # yellow to red based on similarity: the green channel falls from
            # 255 to 0 as similarity drops from 1 to 0
            red = int((1 - sim) * 255)
            color = f"#{255:02x}{255 - red:02x}{128:02x}"
            style = f"background-color: {color}"
            shown_canon = escape(str(canon_val))
            display_val = f"{shown_test}<br><small><i>→ {shown_canon}</i></small>"
        cells.append(f'<td style="{style}">{display_val}</td>')
    return f'<tr><td>{escape(str(row["id"]))}</td>' + ''.join(cells) + '</tr>'

print("\n🧾 Diff View of Model Output:")
# Assemble the table from its three parts: header row, one <tr> per record,
# closing tag
header_cells = ''.join(f'<th>{col}</th>' for col in comparison_columns)
body_rows = '\n'.join(test_csv.apply(highlight_diffs, axis=1))
html_table = (
    '<table border="1" style="border-collapse: collapse"><tr><th>ID</th>'
    + header_cells
    + '</tr>'
    + body_rows
    + '</table>'
)
display(HTML(html_table))


In [None]:
# Additional evaluation metrics
# Calculate precision, recall, F1, similarity, and coverage for each field
from difflib import SequenceMatcher
import numpy as np

def _normalize_for_metrics(series):
    """Return ``series`` as comparable text: missing -> "", trimmed, lower-cased.

    Going through ``str()`` (instead of the ``.str`` accessor) keeps numeric
    columns working, and integral floats are rendered as integers so that a
    column upcast to float by a blank cell still matches its integer twin.
    """
    def _as_text(value):
        if pd.isna(value):
            return ""
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        return str(value).strip().lower()

    return series.map(_as_text)


METRIC_NAMES = ("precision", "recall", "f1", "similarity", "coverage")

# Unrounded per-field values; rounding happens only for presentation so the
# macro averages are not computed from already-rounded numbers.
raw_metrics = {}
for col in comparison_columns:
    y_true = _normalize_for_metrics(canonical_csv[col])
    y_pred = _normalize_for_metrics(test_csv[col])
    is_match = y_true == y_pred
    # Count true/false positives/negatives
    tp = int((is_match & (y_true != "")).sum())          # correct non-empty answer
    fp = int((~is_match & (y_pred != "")).sum())         # wrong non-empty answer
    fn = int(((y_true != "") & (y_pred == "")).sum())    # answer exists, model left it blank
    precision = tp / (tp + fp) if (tp + fp) else 0
    recall = tp / (tp + fn) if (tp + fn) else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0
    # Character-level similarity and prediction coverage.
    # NOTE: the result is deliberately NOT stored in a variable called
    # `similarity` — that would overwrite the similarity() function used by
    # the diff view.
    ratios = [SequenceMatcher(None, a, b).ratio() for a, b in zip(y_true, y_pred)]
    mean_similarity = float(np.mean(ratios)) if ratios else 0.0
    coverage = float((y_pred != "").mean()) if len(y_pred) else 0.0
    raw_metrics[col] = {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "similarity": mean_similarity,
        "coverage": coverage,
    }

metrics = {
    col: {name: round(values[name], 3) for name in METRIC_NAMES}
    for col, values in raw_metrics.items()
}

# Macro averages across all fields (unweighted mean of the per-field values)
metrics["macro_avg"] = {
    name: round(float(np.mean([values[name] for values in raw_metrics.values()])), 3)
    if raw_metrics else 0.0
    for name in METRIC_NAMES
}
metrics_df = pd.DataFrame(metrics).T
print("\n🔬 Precision, Recall, F1, Similarity, Coverage:")
display(metrics_df)


# Metric Definitions

**Precision** is the fraction of the model's non-empty answers that exactly match the canonical value (ignoring case): `TP / (TP + FP)`.

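**Recall** is `TP / (TP + FN)`, where a false negative is a record whose canonical value is non-empty but which the model left blank. It captures how many of the true answers the model successfully filled in; wrong non-empty answers count against precision, not recall.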

**F1** is the harmonic mean of precision and recall, `2PR / (P + R)` — a single score balancing the two.

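**Similarity** is the average character-level similarity ratio (0–1, computed with `difflib.SequenceMatcher`) between the predicted text and the true text, so near-misses earn partial credit even when they aren't exactly the same.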

**Coverage** is the fraction (0–1) of records where the model attempted an answer at all, i.e. gave a non-empty value, whether or not it was correct.
