In [1]:
import os
# Process-wide settings, exported as environment variables before the
# project modules are imported further down.
os.environ.update(
    {
        "DOC_AI_LOCATION": "us",
        "PROJECT_ID": "602280418311",
    }
)

import numpy as np
import pandas as pd

In [2]:

# Load the evaluation output CSV into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='/Users/odeine/PycharmProjects/ilios-DocAI/2024-07-25_15-22-45_output-loan-agreement.csv',
)

In [3]:
# Show (rows, columns) of the freshly loaded frame as a quick sanity check.
df.shape

In [4]:
# Key items that are excluded from the evaluation.
columns_not_to_evaluate = ["Provisions", "Origination Fee"]

# Keep only rows that have both a key item and a file name, then drop the
# excluded key items.
df = df.dropna(subset=["key_item", "file_name"]).loc[
    lambda frame: ~frame["key_item"].isin(columns_not_to_evaluate)
]


In [5]:
from src.pipelines.constants import NOT_PROVIDED_STR
from typing import List
from  sklearn import metrics as sk_metrics
def classification_metrics(y_true: pd.Series, y_pred: pd.Series) -> str:
    """
    Summarise binary classification quality as a human-readable string.

    The report lists the confusion-matrix counts (TP, FP, FN, TN) followed by
    F1, F-beta (beta=0.5), precision and recall, each rounded to two decimals.

    Args:
        y_true: Ground-truth labels, where True marks the positive class.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.

    Returns:
        A multi-line string containing the counts and the scores.
    """
    # Pin the label order so the matrix is always 2x2. Without ``labels``,
    # scikit-learn sizes the matrix from the classes actually present, and
    # input where both series hold a single class (e.g. everything True)
    # yields a 1x1 matrix on which the TP/FP/FN/TN lookups fail.
    tn, fp, fn, tp = sk_metrics.confusion_matrix(
        y_true, y_pred, labels=[False, True]
    ).ravel()

    # ``zero_division=0`` keeps the scores at 0 when they are undefined
    # (no positive predictions or no positive labels) without a warning.
    f1_score = sk_metrics.f1_score(y_true, y_pred, zero_division=0)
    fbeta_score = sk_metrics.fbeta_score(
        y_true, y_pred, beta=0.5, zero_division=0
    )

    precision = sk_metrics.precision_score(y_true, y_pred, zero_division=0)
    recall = sk_metrics.recall_score(y_true, y_pred, zero_division=0)

    return (
        f"TP: {tp}"
        f" FP: {fp}"
        f" FN: {fn}"
        f" TN: {tn}\n"
        f" F1: {np.round(f1_score, 2)}"
        f" Fbeta: {np.round(fbeta_score, 2)}\n"
        f" Precision: {np.round(precision, 2)}"
        f" Recall: {np.round(recall, 2)}"
    )


def get_confusion_matrix_and_true_positive_metric(
    results_df: pd.DataFrame, metrics: List[str]
) -> pd.DataFrame:
    """Build a one-row summary of detection quality and true-positive metrics.

    Rows missing either "Legal Terms" or "Predicted Legal Terms" are ignored.
    A term counts as provided when it differs from ``NOT_PROVIDED_STR``; rows
    where both the expected and the predicted term are provided are the true
    positives.

    Args:
        results_df: Per-row evaluation results; not modified.
        metrics: Names of the metric columns to average.

    Returns:
        A single-row DataFrame holding the mean of each metric over the true
        positives, NaN in "file_name", "key_item" and "Legal Terms", and the
        classification report string in "Predicted Legal Terms".
    """
    labelled = results_df.copy().dropna(
        subset=["Legal Terms", "Predicted Legal Terms"]
    )
    expected_provided = labelled["Legal Terms"] != NOT_PROVIDED_STR
    predicted_provided = labelled["Predicted Legal Terms"] != NOT_PROVIDED_STR

    # True positives: a term was expected and one was predicted.
    both_provided = expected_provided & predicted_provided
    mean_over_true_positives = labelled.loc[both_provided, metrics].mean()
    summary_row: pd.DataFrame = pd.DataFrame(mean_over_true_positives).transpose()

    # Identifier/text columns carry no value on a summary row.
    for placeholder_column in ("file_name", "key_item", "Legal Terms"):
        summary_row[placeholder_column] = np.nan

    # The classification report is stored in the predicted-terms column.
    summary_row["Predicted Legal Terms"] = classification_metrics(
        expected_provided, predicted_provided
    )
    return summary_row

In [6]:
import numpy as np
from src.pipelines.term_extraction.pipeline_config import LoanAgreementPipelineConfig


# Instantiate the loan-agreement pipeline configuration (no arguments passed).
# NOTE(review): `config` is not referenced by any other cell visible here —
# confirm it is still needed.
config = LoanAgreementPipelineConfig()

def rework(df: pd.DataFrame) -> pd.DataFrame:
    """Prepend aggregate metric rows to the per-row evaluation results.

    The returned frame stacks, in this order:
      1. one row with metric means over true positives plus the
         classification report,
      2. one row with metric means over all rows,
      3. one row per ``key_item`` with metric means,
      4. one row per ``file_name`` with metric means,
      5. the original per-row results.

    Only the identifier/text columns and the metric columns are kept.

    Args:
        df: Per-row evaluation results; not modified.

    Returns:
        The stacked DataFrame with a fresh 0..n-1 index.
    """
    metrics = [
        "llm_validation",
        "llm_validation_binary",
        "levenshtein_score",
        "bleu_score",
        "rougeL_f1_score",
        "meteor_score",
        "rouge1_f1_score",
        "rouge1_precision_score",
        "rouge1_recall_score",
        "rougeL_precision_score",
        "rougeL_recall_score",
    ]
    cols = ["file_name", "key_item", "Legal Terms", "Predicted Legal Terms"] + metrics

    per_row = df.copy()

    true_positive_summary = get_confusion_matrix_and_true_positive_metric(
        per_row, metrics
    )

    # Mean over every row; identifiers are left empty on this summary row.
    overall_mean: pd.DataFrame = pd.DataFrame(per_row[metrics].mean()).T.assign(
        file_name=np.nan, key_item=np.nan
    )

    # Means per key item (file name left empty) and per file (key item left empty).
    mean_by_key_item: pd.DataFrame = (
        per_row.groupby("key_item")[metrics].mean().reset_index().assign(file_name=np.nan)
    )
    mean_by_file_name: pd.DataFrame = (
        per_row.groupby("file_name")[metrics].mean().reset_index().assign(key_item=np.nan)
    )

    stacked = pd.concat(
        [
            true_positive_summary,
            overall_mean,
            mean_by_key_item,
            mean_by_file_name,
            per_row,
        ],
        ignore_index=True,
    )
    return stacked[cols]

# Build the combined table: aggregate rows first, then the per-row results.
results_df = rework(df)

In [7]:
# Display the combined results table produced by rework().
results_df

In [8]:
# Write the combined results table to CSV, leaving out the index column.
results_df.to_csv(
    path_or_buf='/Users/odeine/PycharmProjects/ilios-DocAI/2024-07-25_15-22-45_output-loan-agreement-improved-0.78.csv',
    index=False,
)

In [8]:
# Display the table again after it has been written to CSV.
results_df