In [2]:
import os
from pathlib import Path
from typing import List

import numpy as np
import pandas as pd
from langchain.chains import LLMChain
from tqdm import tqdm
from datetime import datetime

from src.gen_ai.gen_ai import get_llm
from src.pipelines.term_extraction.utils import get_project_preview

import warnings

# Non-secret settings exported as environment variables for the rest of the notebook.
os.environ["DOC_AI_LOCATION"] = "us"
os.environ["PROJECT_ID"] = "602280418311"
os.environ["LOCATION"] = "us-west1"

# AWS credentials must never be hardcoded in a notebook: they end up in version
# control and in every shared copy. They are expected to be supplied by the
# environment that launches the kernel (shell export, .env loader, secrets manager).
# SECURITY: the key pair that was previously committed in this cell must be
# treated as compromised and rotated.
REQUIRED_SECRET_ENV_VARS = ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")


def missing_env_vars(names):
    """Return the entries of ``names`` that are unset or empty in ``os.environ``.

    Args:
        names: Iterable of environment variable names to check.

    Returns:
        A list of the names whose value is missing or an empty string, in the
        order they were given.
    """
    return [name for name in names if not os.environ.get(name)]


_missing_secrets = missing_env_vars(REQUIRED_SECRET_ENV_VARS)
if _missing_secrets:
    # Warn instead of raising so cells that do not need AWS access still run.
    warnings.warn(
        "Missing environment variables: "
        + ", ".join(_missing_secrets)
        + ". Export them before starting the kernel; do not hardcode them in the notebook."
    )

In [3]:
from src.pipelines.term_extraction.pipeline_config import PVSystPipelineConfig, PipelineConfig
import pathlib

# Configuration object for the PVSyst pipeline, created with GCS storage enabled.
pipeline_config = PVSystPipelineConfig(use_gcs_storage=True)

# Two directory levels above the current working directory.
# NOTE(review): presumably the repository root when the kernel starts in the
# notebook's own folder — confirm.
main_path = Path.cwd().parent.parent
pipeline_folder = main_path.joinpath(f"src/pipelines/terms/{pipeline_config.pipeline_name}")

# Per-pipeline CSV files: term instructions and term definitions.
instructions_path = pipeline_folder.joinpath("terms-instructions.csv")
terms_and_instructions = pd.read_csv(instructions_path)
terms_and_definitions = pd.read_csv(pipeline_folder.joinpath("terms-definitions.csv"))

In [5]:
def load_legal_terms(pipeline_config: PipelineConfig) -> pd.DataFrame:
    """Stack the "Key Items" / "Value" columns of every project preview.

    Iterates over ``pipeline_config.file_names`` followed by
    ``pipeline_config.few_shot_file_names``, loads each file's project preview
    with ``get_project_preview`` and tags its rows with the originating file.

    Args:
        pipeline_config: Configuration providing the file name lists and the
            project previews path.

    Returns:
        The concatenation of all previews, restricted to the columns
        "Key Items" and "Value" plus an added "file_name" column. Each
        preview keeps its own row index (no re-indexing is performed).
    """
    all_file_names = pipeline_config.file_names + pipeline_config.few_shot_file_names

    preview_frames = []
    for file_name in all_file_names:
        print(f"File: {file_name}")

        preview = get_project_preview(pipeline_config.get_project_previews_path(), file_name)
        selected_columns = preview[["Key Items", "Value"]]
        preview_frames.append(selected_columns.assign(file_name=file_name))

    return pd.concat(preview_frames)


legal_terms = load_legal_terms(pipeline_config)

In [8]:
# The previews are loaded without a "Legal Terms" column, so it is created here
# as an independent copy of "Value".
legal_terms["Legal Terms"] = legal_terms.loc[:, "Value"].copy()

In [9]:
# Display the terms ordered by key item, then by source file.
legal_terms.sort_values(by=["Key Items", "file_name"])

In [10]:
# len() of every non-null "Legal Terms" entry, computed once and reused for
# both the histogram and the mean.
term_lengths = legal_terms["Legal Terms"].dropna().apply(len)
term_lengths.hist(bins=20)
print(term_lengths.mean())

In [11]:
# Number of rows per "Key Items" value, with the index reset into a regular column.
key_items_counts = (
    legal_terms["Key Items"]
    .value_counts()
    .reset_index()
)

In [12]:
# Show the full per-key-item counts table. The commented-out line below is an
# alternative view that keeps only the row(s) whose "count" equals the maximum.
# key_items_counts[key_items_counts["count"] == key_items_counts["count"].max()]
key_items_counts

In [14]:
# Non-null counts per key item, sorted descending, with the "file_name" and
# "Legal Terms" count columns given descriptive names.
legal_terms_with_counts = (
    legal_terms
    .groupby("Key Items")
    .count()
    .sort_values(by=["file_name", "Legal Terms"], ascending=False)
    .rename(columns={"file_name": "Number of files", "Legal Terms": "Number of examples"})
)

# Outer merge keeps key items that appear on only one side (counts or definitions).
(
    legal_terms_with_counts
    .merge(terms_and_definitions, on="Key Items", how="outer")
    .sort_values(by="Number of files")
)

In [15]:
# Rebinds `legal_terms_with_counts` to the per-key-item counts WITHOUT renaming
# the columns, so the count columns stay named "file_name" and "Legal Terms".
legal_terms_with_counts = (
    legal_terms
    .groupby("Key Items")
    .count()
    .sort_values(by=["file_name", "Legal Terms"], ascending=False)
)
legal_terms_with_counts

In [16]:
# Non-null counts per source file, sorted descending.
(
    legal_terms
    .groupby("file_name")
    .count()
    .sort_values(by=["Legal Terms", "Key Items"], ascending=False)
)

In [18]:
# Wide layout: one row per key item, one column per file, cells taken from
# "Legal Terms". The aggregation function hands each group back unchanged.
legal_terms_pivot = legal_terms.pivot_table(
    index="Key Items",
    columns="file_name",
    values="Legal Terms",
    aggfunc=lambda x: x,
)
legal_terms_pivot.to_csv("legal_terms_pivot.csv")

In [19]:
# Per-key-item counts joined with their definitions; the left join keeps every
# key item that occurs in `legal_terms`.
(
    legal_terms
    .groupby("Key Items")
    .count()
    .reset_index()
    .merge(terms_and_definitions, how="left", on="Key Items")
)

In [20]:
# Write the counts joined with the term definitions to CSV, without the index column.
counts_with_definitions = legal_terms_with_counts.merge(terms_and_definitions, how="left", on="Key Items")
counts_with_definitions.to_csv("terms-definitions-counts.csv", index=False)

In [21]:
# Reload the counts/definitions table that an earlier cell wrote to disk.
terms_and_definitions_counts = pd.read_csv("terms-definitions-counts.csv")

# Files whose "Legal Terms" values are attached as example columns.
few_shots_examples = [
    "Site Green - Emerald Garden - Cape Fear.pdf",
    "Site Lease - Novel - Bartel (ES).pdf",
    "Site Lease- SunRaise - Plympton.pdf",
]

# PROJECT_PREVIEWS_PATH is not defined in any visible cell, so on a fresh kernel
# this cell used to fail with NameError. Keep honouring it when it exists and
# otherwise fall back to the path the pipeline configuration provides.
# NOTE(review): confirm the configured previews path contains the files above.
try:
    previews_path = PROJECT_PREVIEWS_PATH
except NameError:
    previews_path = pipeline_config.get_project_previews_path()

for file_name in few_shots_examples:
    # Only these two columns take part in the merge.
    example_terms = get_project_preview(previews_path, file_name)[["Key Items", "Legal Terms"]]
    # When "Legal Terms" already exists on the left side, the incoming column
    # is renamed with the "_example_<file name>" suffix.
    terms_and_definitions_counts = terms_and_definitions_counts.merge(
        example_terms,
        how="left",
        on="Key Items",
        suffixes=("", f"_example_{file_name}"),
    )

terms_and_definitions_counts.to_csv("terms-definitions-examples.csv", index=False)
# Last expression of the cell so the resulting table is actually rendered.
terms_and_definitions_counts