In [18]:
import numpy as np
import pandas as pd
import os
# Document AI location and GCP project id, exported as environment variables
# before the project modules are imported.
os.environ.update(
    DOC_AI_LOCATION="us",
    PROJECT_ID="602280418311",
)

In [19]:
from src.pipelines.term_extraction.pipeline_config import *
from src.pipelines.term_extraction.utils import get_project_preview

# Each *_settings pair couples a pipeline config with the agreement PDFs to load.
ppa_settings = (
    PPAPipelineConfig(use_gcs_storage=True),
    [
        # "Sheridan PPA.pdf",
        # "GLD.PPA.Canton.pdf",
        # "Skyway PPA.pdf",
        # "PP Agreement - Emerald Garden - Mt Kemble.pdf",
        "Skyway PPA.pdf",
        "PP Agreement - Emerald Garden - Mt Kemble.pdf",
        "PP Agreement - RSP - Mt Hope.pdf",
        "PP Agreement - Shine - DuQuoin.pdf",
        "Sunraise.PPA.Enterprise.pdf",
    ],
)

epc_settings = (
    EPCPipelineConfig(use_gcs_storage=True),
    [
        "EPC Agreement - Novel - Bartel.pdf",
        "EPC Agreement - SunRaise - Pequawket Trail Baldwin.pdf",
        "210331 233 Randolph 74 Solar I EPC Agreement executed.pdf",
        "EPC - Blue Sky - Felicita Town Center.pdf",
        "EPC - Emerald Garden - Mt Kimble.pdf",
        "EPC Agreement - Bullrock - Lakeville.pdf",
    ],
)

om_settings = (
    OMPipelineConfig(use_gcs_storage=True),
    [
        "O&M Agreement - Emerald Green - Cape Fear.pdf",
        "O&M- Bullrock - Lakeville.pdf",
        "Enterprise O&M.pdf",
        "O&M - Emerald Garden - Mt Kimble.pdf",
        "O&M Agreement - GLD- Canton.pdf",
    ],
)

# Select which agreement type the rest of the notebook works on.
pipeline_config, FILE_NAMES = om_settings

# Collect the labelled columns of every project preview, tagged with the file
# they came from, then stack them into a single frame.
preview_columns = ["Key Items", "Value", "Legal Terms"]
preview_frames = []
for file_name in FILE_NAMES:
    print(f"File: {file_name}")

    correct_project_preview = get_project_preview(
        pipeline_config.get_project_previews_path(), file_name
    )
    labelled_terms = correct_project_preview[preview_columns].assign(file_name=file_name)
    preview_frames.append(labelled_terms)

legal_terms = pd.concat(preview_frames)

In [20]:
# Order the rows by key item so rows for the same term sit together.
legal_terms = legal_terms.sort_values(by="Key Items")

In [21]:
import logging
from typing import Any, Dict

import pandas as pd
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate
from tqdm import tqdm
from src.gen_ai.gemini import get_llm

from src.pipelines.term_extraction.pipeline_config import PipelineConfig
# from src.prompts.prompts import term_summary_prompt_template

In [22]:
# Keep only rows whose "Legal Terms" is not the "Not provided." placeholder.
has_legal_terms = legal_terms["Legal Terms"].ne("Not provided.")
legal_terms = legal_terms.loc[has_legal_terms]

In [23]:
# Display the current legal_terms frame.
legal_terms

In [24]:
# Build the few-shot pool: one (legal terms, value) pair per labelled row.
# Rows with a missing (NaN) text or value are excluded as well as the
# "Not provided." placeholder: NaN passes a plain `!=` comparison, and such a row
# would produce a "nan" example or fail when the text is sliced later on.
has_text = legal_terms["Legal Terms"].notna()
has_value = legal_terms["Value"].notna() & (legal_terms["Value"] != "Not provided.")
few_shots = legal_terms[has_text & has_value].copy()
few_shots["Example"] = few_shots[["Legal Terms", "Value"]].apply(tuple, axis=1)

# One array of distinct example pairs per key item.
few_shots = few_shots.groupby("Key Items")["Example"].unique()

In [25]:
# Root of the ilios-DocAI checkout. The default is the path this notebook was
# written against; set ILIOS_DOCAI_ROOT to run it from another machine.
repo_root = os.environ.get("ILIOS_DOCAI_ROOT", "/Users/odeine/PycharmProjects/ilios-DocAI")
terms_and_instructions = pd.read_csv(
    f"{repo_root}/src/pipelines/terms/{pipeline_config.pipeline_name}/terms-instructions.csv"
)

In [26]:
# Display the key items listed in the instructions CSV.
terms_and_instructions["Key Items"]

In [27]:
# Left merge: every key item from the instructions file is kept, with its
# examples attached where few_shots has an entry for it.
instruction_key_items = terms_and_instructions[["Key Items"]]
good_check = pd.merge(instruction_key_items, few_shots, on="Key Items", how="left")

In [28]:
# Root of the ilios-DocAI checkout. The default is the path this notebook was
# written against; set ILIOS_DOCAI_ROOT to run it from another machine.
repo_root = os.environ.get("ILIOS_DOCAI_ROOT", "/Users/odeine/PycharmProjects/ilios-DocAI")
terms_and_definitions = pd.read_csv(
    f"{repo_root}/src/pipelines/terms/{pipeline_config.pipeline_name}/terms-definitions.csv"
)

In [29]:
# Attach the columns of the definitions file to each key item (left merge keeps
# every row already in good_check).
good_check = pd.merge(good_check, terms_and_definitions, on="Key Items", how="left")

In [30]:
# Display the merged key item / example / definition table.
good_check

In [31]:
def few_shots_to_prompt(few_shots, max_examples: int = 3, max_chars: int = 200) -> str:
    """Build the short extraction instructions for one key item.

    Args:
        few_shots: Row-like mapping (for example the ``pd.Series`` passed by
            ``DataFrame.apply(..., axis=1)``) with the keys ``"Key Items"``,
            ``"Example"`` and ``"Definitions"``. ``"Example"`` is treated as
            few-shot data only when it is a ``np.ndarray`` of
            ``(legal_terms, value)`` pairs.
        max_examples: Maximum number of example pairs rendered into the prompt.
        max_chars: Number of leading characters of each example text to keep.

    Returns:
        A few-shot prompt ("Text" / "Data retrieved" pairs) when examples are
        available, otherwise a prompt built from the key item and its definition.
    """
    key_item = few_shots["Key Items"]
    examples = few_shots["Example"]

    if not isinstance(examples, np.ndarray):
        # No examples for this key item: fall back to the definition.
        return (
            "Retrieve {key_item} from the text.\n"
            "Definitions: {key_item} - {definition}.\n"
        ).format(key_item=key_item, definition=few_shots["Definitions"])

    # Each piece is formatted exactly once and then joined. Formatting the
    # assembled prompt a second time would re-parse the inserted example text and
    # fail on any "{" or "}" it contains.
    parts = ["Retrieve {key_item} from the text in this format:".format(key_item=key_item)]
    for example_text, value in examples[:max_examples]:
        parts.append(
            "\nText:\n```\n{legal_terms}...\n```\nData retrieved:\n{value}\n\n".format(
                legal_terms=example_text[:max_chars], value=value
            )
        )
    return "".join(parts)


# Replace each row's raw examples with the rendered prompt text.
# NOTE(review): this overwrites "Example" in place, so re-running the cell feeds
# the already-rendered strings back into few_shots_to_prompt.
prompt_inputs = good_check[["Key Items", "Example", "Definitions"]]
good_check["Example"] = prompt_inputs.apply(few_shots_to_prompt, axis="columns")

In [32]:
# Display the rendered prompt for every key item.
good_check["Example"]

In [33]:
import time

# Write the table to a CSV whose name carries the pipeline name and the current
# Unix timestamp, so successive exports do not overwrite each other.
export_timestamp = int(time.time())
good_check.to_csv(
    f"{pipeline_config.pipeline_name}-short-instructions-{export_timestamp}.csv",
    index=False,
)

In [17]:
# Look at the rendered prompt stored under index label 0.
good_check["Example"][0]

In [18]:
# Align the grouped examples with the instruction key items via a left merge, so
# key items without examples are kept (their "Example" is missing).
# NOTE(review): this rebinds `few_shots` from its own previous value, so running
# the cell a second time feeds the already-merged frame back in.
few_shots = terms_and_instructions[["Key Items"]].merge(few_shots.reset_index(), on="Key Items", how="left")

In [19]:
# Display the merged few_shots frame.
few_shots

In [113]:
# Display few_shots again (same expression as the previous cell).
few_shots

In [114]:
# def predict_short_terms(row: pd.Series) -> Dict[str, Any]:
#     short_instructions = few_shots_to_prompt(row["Value"])
#     return chain.invoke(
#         inputs={"legal_terms": row["Legal Terms"],
#                 "instructions": short_instructions,
#             }
#     )

In [115]:
# short_term_instructions = pd.read_csv(
        #     "/Users/odeine/PycharmProjects/ilios-DocAI/src/pipelines/"
        #     "terms/site-lease/terms-instructions-short.csv"
        # )
# Root of the ilios-DocAI checkout. The default is the path this notebook was
# written against; set ILIOS_DOCAI_ROOT to run it from another machine.
repo_root = os.environ.get("ILIOS_DOCAI_ROOT", "/Users/odeine/PycharmProjects/ilios-DocAI")

# The exported prompts live in the "Example" column; downstream code expects
# them under "Instructions".
short_term_instructions = pd.read_csv(
    f"{repo_root}/docai-91-short-terms-improvement/good_check_short-instructions.csv"
).rename(columns={"Example": "Instructions"})


# Prompt layout: the per-term instructions, then the text to analyse inside a
# fenced block, then the cue the model completes.
term_summary_prompt_template = (
    "{instructions}"
    "\nText:\n"
    "```\n"
    "{legal_terms}...\n"
    "```\n"
    "Data retrieved:\n"
)

def _build_chain() -> LLMChain:
    """Create the LLMChain that fills the term-summary prompt template.

    Returns:
        An ``LLMChain`` combining the model from ``get_llm()`` with a prompt
        built from ``term_summary_prompt_template``.
    """
    summary_prompt = PromptTemplate.from_template(term_summary_prompt_template)
    return LLMChain(llm=get_llm(), prompt=summary_prompt)


# Module-level chain instance used by _get_term_summary.
chain = _build_chain()

def _get_term_summary(inputs: Dict[str, str]) -> str:
    """Invoke the module-level chain and return the "text" entry of its result.

    Args:
        inputs: Values passed to the chain as its ``input`` argument.

    Returns:
        The value stored under ``"text"`` in the chain's result.
    """
    return chain.invoke(input=inputs)["text"]

def run(extracted_terms: pd.DataFrame) -> pd.DataFrame:
    """Run the TermSummaryPipeline.

    For every row with usable "Legal Terms" text, the short instructions for the
    row's "Key Items" are looked up in ``short_term_instructions`` and the chain
    is asked for a summary. Rows without usable text get "No." when the key item
    contains "(Y/N)" and "N/A" otherwise.

    Args:
        extracted_terms: Frame with "Key Items" and "Legal Terms" columns.

    Returns:
        The same ``extracted_terms`` object, with a "Term Summary" column added
        in place.

    Raises:
        IndexError: If a row has usable text but its key item has no entry in
            ``short_term_instructions``.
    """
    empty_markers = {'""', "", "Not provided.", "N/A"}
    term_summaries = []
    # iterrows() has no length, so give tqdm the total explicitly.
    for _, row in tqdm(
        extracted_terms.iterrows(),
        total=len(extracted_terms),
        desc="Getting term summaries",
    ):
        key_item = row["Key Items"]
        legal_text = row["Legal Terms"]

        # The isinstance check also covers None, NaN and other non-string cells,
        # which have no .strip().
        has_text = isinstance(legal_text, str) and legal_text.strip() not in empty_markers
        if has_text:
            matches = short_term_instructions.loc[
                short_term_instructions["Key Items"] == key_item, "Instructions"
            ]
            if matches.empty:
                # Same exception type as the bare `.values[0]` lookup, but with
                # the key item that caused it.
                raise IndexError(f"No short instructions found for key item {key_item!r}")
            term_summaries.append(
                _get_term_summary(
                    inputs={
                        "legal_terms": legal_text,
                        "instructions": matches.values[0],
                    }
                )
            )
        elif isinstance(key_item, str) and "(Y/N)" in key_item:
            term_summaries.append("No.")
        else:
            term_summaries.append("N/A")

    extracted_terms["Term Summary"] = term_summaries
    return extracted_terms


In [297]:
from langchain.globals import set_debug
# Pass False to LangChain's global debug switch; flip to True when inspecting
# chain calls.
set_debug(False)

# run(legal_terms[legal_terms["Key Items"].isin(short_term_instructions["Key Items"])].sample(10))

In [298]:
# Print one fully rendered prompt for manual inspection. The literal is split
# into two adjacent strings at the point where it previously contained a raw
# line break (which is not valid inside a single-quoted string); the break is
# now spelled as "\n".
print(
    "Retrieve data from the text in this format:\nText:\n```\nSection 31.b.(x) If this Lease terminates because of Tenant’s default or if the Leasehold Estate is foreclosed, Landlord shall, upon written request from any Leasehold Mortgagee within ninety (90) day...\n```\nData retrieved:\nSee Section 31.b.x\n\n\nText:\n```\n11.02 If the default relates to work (other than the operation of and sale of electricity from theSystem) to be performed by Developer, perform such work or cause it to be performed, for the account o...\n```\nData retrieved:\nOwner can terminate this Lease upon at least 30 days additional written notice to\r\nDeveloper upon occurrence of an Event of Default by Developer\n\n\nText:\n```\n17(a)- Lessor have the right to terminate Lease due to a Lessee's default event only after the expiration of cure period and action to cure default is not commenced within that cure period....\n```\nData retrieved:\n17(a)\n\n\nText:\nText:\n```\n11.02 If the default relates to work (other than the operation of and sale of electricity from theSystem) to be performed by Developer, perform such work or cause it to be performed, for the account of Developer, without waiving such Event of Default, and without liability to Developer for any loss or damage which may result to Developer’s equipment or business by reason of such work, and Developer, on demand shall pay to Owner as a Lease fee hereunder, the cost of such work plus ten percent (10%) thereof as administrative costs; or\r\n(b)Terminate this Lease upon at least thirty (30) days additional written notice toDeveloper; and/or\r\n(c)Take possession and control of the System and operate the same for the furtherance ofthe Parties’ stated intentions and agreements under this Lease. \n"
    "To be certain, where Owner elects this remedy for Developer default, title to the System, electric revenue and all Attributes & Incentives shall remain with Developer, but Developer shall be liable to Owner for reasonable costs of operating and maintaining the System and Owner shall operate and maintain the System in accordance with this Lease and the Interconnection Agreements....\n```\nData retrieved:"
)

In [300]:
# run() writes a "Term Summary" column onto the frame it receives, so hand it an
# explicit copy of the filtered rows instead of a slice of legal_terms (assigning
# to such a slice triggers pandas' SettingWithCopyWarning).
has_short_instructions = legal_terms["Key Items"].isin(short_term_instructions["Key Items"])
ans = run(legal_terms[has_short_instructions].copy())


# ans = ans.rename({"Terms Summary": "Predicted Value"}, axis=1)
# 
# ans = ans[["file_name", "Key Items", "Legal Terms", "Value", "Predicted Value"]]
# ans

In [301]:
# Row and column counts of legal_terms.
legal_terms.shape

In [302]:
# Rename the model output to "Predicted Value" and keep the columns needed to
# compare it with the labelled "Value".
comparison_columns = ["file_name", "Key Items", "Legal Terms", "Value", "Predicted Value"]
ans = ans.rename(columns={"Term Summary": "Predicted Value"})[comparison_columns]
ans

In [63]:
# Display the current legal_terms frame.
legal_terms

In [11]:
# Character length of every non-missing "Value": histogram plus the mean.
value_lengths = legal_terms["Value"].dropna().apply(len)
value_lengths.hist(bins=20)
print(value_lengths.mean())

In [23]:
# Number of rows per key item, turned into a regular frame.
key_item_frequencies = legal_terms["Key Items"].value_counts()
key_items_counts = key_item_frequencies.reset_index()

In [16]:
# Display the per-key-item row counts; the commented line below would instead
# keep only the key items with the highest count.
# key_items_counts[key_items_counts["count"] == key_items_counts["count"].max()]
key_items_counts

In [17]:
# Non-null counts per key item, largest first, with readable column names.
counts_by_key_item = legal_terms.groupby("Key Items").count()
legal_terms_with_counts = counts_by_key_item.sort_values(
    by=["file_name", "Legal Terms"], ascending=False
).rename(columns={"file_name": "Number of files", "Legal Terms": "Number of examples"})

# Outer merge with the pipeline's terms/definitions table keeps key items that
# appear on only one side.
legal_terms_with_counts.merge(
    pipeline_config.get_terms_and_definitions(), on="Key Items", how="outer"
).sort_values(by="Number of files")

In [82]:
# Non-null counts per key item, largest first, under the original column names.
legal_terms_with_counts = (
    legal_terms.groupby("Key Items")
    .count()
    .sort_values(by=["file_name", "Legal Terms"], ascending=False)
)
legal_terms_with_counts

In [37]:
# Non-null counts per source file, largest first.
legal_terms.groupby("file_name").count().sort_values(["Legal Terms", "Key Items"], ascending=False)

In [38]:
# One row per key item, one column per file, holding the legal terms text; the
# identity aggfunc passes each cell's values through unchanged.
legal_terms_pivot = legal_terms.pivot_table(
    index="Key Items",
    columns="file_name",
    values="Legal Terms",
    aggfunc=lambda x: x,
)
legal_terms_pivot.to_csv("legal_terms_pivot.csv")

In [48]:
from src.pipeline.utils import get_terms_and_definitions

# Rebind terms_and_definitions to the result of get_terms_and_definitions() and
# display it.
# NOTE(review): the helper is imported from `src.pipeline.utils`, while the other
# project imports in this notebook use `src.pipelines` - confirm the module path.
terms_and_definitions = get_terms_and_definitions()
terms_and_definitions

In [49]:
# Per-key-item counts left-merged with the terms/definitions table (display only).
legal_terms.groupby("Key Items").count().reset_index().merge(terms_and_definitions, how="left", on="Key Items")

In [50]:
# Save the per-key-item counts together with the terms/definitions columns.
counts_with_definitions = legal_terms_with_counts.merge(
    terms_and_definitions, on="Key Items", how="left"
)
counts_with_definitions.to_csv("terms-definitions-counts.csv", index=False)

In [53]:
terms_and_definitions_counts = pd.read_csv("terms-definitions-counts.csv")

few_shots_examples = [
    "Site Green - Emerald Garden - Cape Fear.pdf",
    "Site Lease - Novel - Bartel (ES).pdf",
    "Site Lease- SunRaise - Plympton.pdf",
]

# Left-merge each example file's "Legal Terms" onto the counts table; a column
# that clashes with an existing name is suffixed with the example's file name.
for file_name in few_shots_examples:
    correct_project_preview = get_project_preview(PROJECT_PREVIEWS_PATH, file_name).assign(
        file_name=file_name
    )
    example_terms = correct_project_preview[["Key Items", "Legal Terms"]]
    terms_and_definitions_counts = terms_and_definitions_counts.merge(
        example_terms,
        on="Key Items",
        how="left",
        suffixes=("", f"_example_{file_name}"),
    )

terms_and_definitions_counts.to_csv("terms-definitions-examples.csv", index=False)