# Human Validation for LLM annotation with 4 Class (Support, Oppose, Amend, and Monitor)

In [1]:
import os
import pandas as pd
from utils import open_csv, save_csv
from sklearn.metrics import classification_report, f1_score

In [2]:
def get_human_validation_df(base_dir="human_validation"):
    bill_position_list = ['support', 'oppose', 'amend', 'monitor']
    df_list = []

    for bill_position in bill_position_list:
        file_path = os.path.join(base_dir, f"human_validation_{bill_position}.csv")
        if os.path.exists(file_path):
            df = pd.read_csv(file_path)
            if bill_position == 'mention':
                df = df[90:200]
                # df = df[:150]
            # print(bill_position,len(df))
            df = df.rename(columns={'final_ground_truth':'label'})
            df_list.append(df)
        else:
            print(f"Warning: File not found: {file_path}")

    if df_list:
        combined_df = pd.concat(df_list, ignore_index=True)
        return combined_df
    else:
        raise FileNotFoundError("No valid human validation CSV files found.")


def get_llm_bill_position_df():
    llm_bill_position_df = open_csv('llm_output/llm_bill_position_final_df.csv')
    return llm_bill_position_df


def compare_labels(llm_df, human_df):
    merged_df = pd.merge(
        llm_df,
        human_df,
        on=["paragraph", "bill_id"],
        suffixes=("_llm", "_human")
    )

    merged_df['label_match'] = merged_df['label_llm'] == merged_df['label_human']

    # calcuate Accuracy 
    total_matches = merged_df['label_match'].sum()
    total = len(merged_df)
    overall_accuracy = total_matches / total if total > 0 else 0

    print(f"Overall Accuracy: {total_matches}/{total} = {overall_accuracy:.2%}")

    label_stats = merged_df.groupby('label_human').apply(
        lambda x: pd.Series({
            'total_samples': len(x),
            'correct_predictions': (x['label_llm'] == x['label_human']).sum(),
            'accuracy': (x['label_llm'] == x['label_human']).mean()
        })
    ).reset_index()
    
    print("\nLabel-wise Accuracy Stats:")
    print(label_stats)

    # calcuate F1 score 
    y_true = merged_df['label_human']
    y_pred = merged_df['label_llm']
    label_order = ['Support', 'Oppose', 'Amend', 'Monitor']

    print("\nClassification Report (includes precision, recall, F1-score):")
    print(classification_report(
        y_true,
        y_pred,
        labels=label_order,
        target_names=label_order,
        digits=4
    ))

    macro_f1 = f1_score(y_true, y_pred, average='macro', labels=label_order)
    micro_f1 = f1_score(y_true, y_pred, average='micro', labels=label_order)
    print(f"Macro F1 Score: {macro_f1:.4f}")
    print(f"Micro F1 Score: {micro_f1:.4f}")

    return


In [3]:
human_validation_df = get_human_validation_df()
llm_bill_position_df = get_llm_bill_position_df()

In [4]:
compare_labels(llm_bill_position_df, human_validation_df)

Overall Accuracy: 379/391 = 96.93%

Label-wise Accuracy Stats:
  label_human  total_samples  correct_predictions  accuracy
0       Amend           98.0                 92.0  0.938776
1     Monitor          103.0                100.0  0.970874
2      Oppose           98.0                 98.0  1.000000
3     Support           92.0                 89.0  0.967391

Classification Report (includes precision, recall, F1-score):
              precision    recall  f1-score   support

     Support     0.9780    0.9674    0.9727        92
      Oppose     0.9899    1.0000    0.9949        98
       Amend     0.9787    0.9388    0.9583        98
     Monitor     0.9524    0.9709    0.9615       103

   micro avg     0.9743    0.9693    0.9718       391
   macro avg     0.9748    0.9693    0.9719       391
weighted avg     0.9744    0.9693    0.9717       391

Macro F1 Score: 0.9719
Micro F1 Score: 0.9718


  label_stats = merged_df.groupby('label_human').apply(
