## Manual Prompting Analysis

This notebook analyzes the differences between the ground-truth dataset and the dataset cleaned via manual prompting (MP).

The MP dataset was produced with Google Gemini's web/chat interface (Gemini 3, Thinking) at gemini.google.com. Refer to `/cleaning/manual_prompting.ipynb` for more details.

#### Analysis

In [1]:
# import packages
import numpy as np
import pandas as pd
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    precision_recall_fscore_support,
    precision_score,
    recall_score,
)

In [2]:
# Load the two cleaned FDIC extracts that the rest of the notebook compares
# (only 250 rows for now).
# Naming is easy to misread: df_manual comes from mt_cleaned_fdic.csv and
# df_llm from mp_cleaned_fdic.csv — presumably the ground-truth cleaning and
# the manual-prompting (Gemini) cleaning respectively; TODO confirm.
manual_csv = "../data/fdic/mt_cleaned_fdic.csv"
llm_csv = "../data/fdic/mp_cleaned_fdic.csv"

df_manual = pd.read_csv(manual_csv)
df_llm = pd.read_csv(llm_csv)

In [3]:
# schema integrity: which columns does df_llm lack or add relative to
# df_manual, and how do the row counts differ?
manual_cols = set(df_manual.columns)
llm_cols = set(df_llm.columns)

schema_diff = {
    # sorted() rather than list(): iteration order of a set of strings changes
    # between kernel restarts, which made this printed summary non-reproducible.
    "dropped_columns": sorted(manual_cols - llm_cols),
    "invented_columns": sorted(llm_cols - manual_cols),
    # negative => df_llm has fewer rows than df_manual
    "row_count_diff": len(df_llm) - len(df_manual),
}

print("schema diffs:")
print(schema_diff)

schema diffs:
{'dropped_columns': ['change_code_2', 'change_code_4', 'change_code_3', 'end_effective_date'], 'invented_columns': [], 'row_count_diff': -1}


In [4]:
# Align both frames on the same "primary key" so the column-wise comparisons
# below line up row for row.
KEY_COL = "fdic_certificate_number"

# Only promote the key column to the index if that has not happened yet.
# Re-running this cell used to raise KeyError because the first run had already
# moved the column into the index.
if df_manual.index.name != KEY_COL:
    df_manual = df_manual.set_index(KEY_COL)
if df_llm.index.name != KEY_COL:
    df_llm = df_llm.set_index(KEY_COL)

# keep only the keys present in both frames, in one shared order
commons = df_manual.index.intersection(df_llm.index)

df_manual = df_manual.loc[commons]
df_llm = df_llm.loc[commons]

# The metrics cell compares the two frames positionally. With duplicate keys,
# .loc can return a different number/order of rows per frame, so fail loudly
# here instead of scoring misaligned rows.
assert df_manual.index.equals(df_llm.index), (
    f"{KEY_COL} does not align 1:1 between the two frames (duplicate keys?)"
)

In [5]:
# calculate metrics for all cols
# Each shared column is scored as a multi-class labelling problem: every
# distinct (stringified) cell value is a class, df_manual supplies the true
# labels and df_llm the predicted ones. Scores are appended in df_llm column
# order to the three parallel lists consumed by the averaging cell.
precisions = []
recalls = []
f1s = []

for col in df_llm.columns:

    # a column that exists only in df_llm has nothing to be scored against
    if col not in df_manual.columns:
        continue

    # NOTE(review): astype(str) turns NaN into "nan" and renders 1 and 1.0
    # differently, so dtype drift between the two CSVs may be counted as a
    # mismatch — confirm this is the intended comparison.
    y_true = df_manual[col].astype(str)
    y_pred = df_llm[col].astype(str)

    # One call returns all three weighted scores. The previous three separate
    # calls recomputed the same statistics and re-triggered sklearn's
    # target-type warnings for every metric.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="weighted", zero_division=0
    )

    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)

  type_true = type_of_target(y_true, input_name="y_true")
  type_pred = type_of_target(y_pred, input_name="y_pred")
  ys_types = set(type_of_target(x) for x in ys)
  ys_types = set(type_of_target(x) for x in ys)
  type_true = type_of_target(y_true, input_name="y_true")
  type_pred = type_of_target(y_pred, input_name="y_pred")
  ys_types = set(type_of_target(x) for x in ys)
  ys_types = set(type_of_target(x) for x in ys)
  type_true = type_of_target(y_true, input_name="y_true")
  type_pred = type_of_target(y_pred, input_name="y_pred")
  ys_types = set(type_of_target(x) for x in ys)
  ys_types = set(type_of_target(x) for x in ys)
  type_true = type_of_target(y_true, input_name="y_true")
  type_pred = type_of_target(y_pred, input_name="y_pred")
  ys_types = set(type_of_target(x) for x in ys)
  ys_types = set(type_of_target(x) for x in ys)
  type_true = type_of_target(y_true, input_name="y_true")
  type_pred = type_of_target(y_pred, input_name="y_pred")
  ys_types = set(type_of_target(x) f

In [6]:
# find avgs for all cols (unweighted mean over the per-column scores)
# Guard against an empty run: np.mean([]) only emits a RuntimeWarning and the
# report below would silently print "nan" for every average.
assert precisions and recalls and f1s, "no columns were scored"

avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)
avg_f1 = np.mean(f1s)

print("\navg metrics for all cols:")
print(f"avg precision: {avg_precision:.4f}")
print(f"avg recall:    {avg_recall:.4f}")
print(f"avg F1 score:  {avg_f1:.4f}")


avg metrics fror all cols:
avg precision: 0.9693
avg recall:    0.9671
avg F1 score:  0.9671
