In [3]:
import os
from pathlib import Path
from typing import List

import numpy as np
import pandas as pd
from langchain.chains import LLMChain
from tqdm import tqdm
from datetime import datetime

from src.gen_ai.gen_ai import get_llm
from src.pipelines.term_extraction.utils import get_project_preview

# Non-secret settings exported as environment variables. The original cell
# assigned PROJECT_ID and DOC_AI_LOCATION twice with identical values; each key
# is now set exactly once.
os.environ.update(
    {
        "DOC_AI_LOCATION": "us",
        "PROJECT_ID": "602280418311",
        "LOCATION": "us-west1",
        "DOC_AI_PROCESSOR_ID": "e977fdd46ee23308",
    }
)

# SECURITY: credentials must never be written into a notebook. The AWS and
# Google keys that used to be hardcoded here have to be treated as leaked and
# rotated/revoked. Supply them from outside instead (shell environment, a
# git-ignored .env file, or a secrets manager) before starting the kernel.
_REQUIRED_SECRETS = ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "GOOGLE_API_KEY")
_missing_secrets = [name for name in _REQUIRED_SECRETS if not os.environ.get(name)]
if _missing_secrets:
    # Only warn: other credential mechanisms (e.g. a shared AWS profile or
    # application-default credentials) may still be in effect — TODO confirm.
    print(
        "WARNING: these credentials are not set in the environment: "
        + ", ".join(_missing_secrets)
        + ". Export them before running the LLM / Document AI cells."
    )


## Specify the pipeline configuration

In [16]:
from src.pipelines.term_extraction.pipeline_config import PipelineConfig, OperatingAgreementPipelineConfig
import pathlib

# Pipeline whose term files are processed below.
pipeline_config = OperatingAgreementPipelineConfig(use_gcs_storage=True)

# Two levels above the current working directory (presumably the repository
# root when the notebook is started from its own folder — TODO confirm).
main_path = Path().absolute().parent.parent
pipeline_folder = main_path.joinpath(f"src/pipelines/terms/{pipeline_config.pipeline_name}")

# Term tables of this pipeline: instructions (rewritten at the end of the
# notebook into a timestamped copy) and definitions.
instructions_path = pipeline_folder.joinpath("terms-instructions.csv")
terms_and_instructions = pd.read_csv(instructions_path)
terms_and_definitions = pd.read_csv(pipeline_folder.joinpath("terms-definitions.csv"))

#### Load the few-shot data

In [17]:
def load_few_shots(pipeline_config: "PipelineConfig") -> pd.DataFrame:
    """Collect the legal terms of every few-shot file into a single frame.

    For each name in ``pipeline_config.few_shot_file_names`` the project
    preview is loaded with ``get_project_preview`` and reduced to the
    "Key Items", "Value" and "Legal Terms" columns, tagged with a
    ``file_name`` column.

    Args:
        pipeline_config: Configuration exposing ``few_shot_file_names`` and
            ``get_project_previews_path()``.

    Returns:
        The row-wise concatenation of all previews (original row indexes are
        kept). When no few-shot files are configured, an empty frame with the
        same four columns is returned instead of letting ``pd.concat`` raise
        on an empty list.

    Raises:
        KeyError: If a preview lacks one of the required columns; the message
            names the offending file and the columns that were found.
    """
    required_columns = ["Key Items", "Value", "Legal Terms"]
    file_names = list(pipeline_config.few_shot_file_names)
    if not file_names:
        return pd.DataFrame(columns=required_columns + ["file_name"])

    # The previews location does not depend on the file, so look it up once.
    previews_path = pipeline_config.get_project_previews_path()

    few_shots_legal_terms = []
    for file_name in file_names:
        print(f"File: {file_name}")
        project_preview = get_project_preview(previews_path, file_name)

        missing = [column for column in required_columns if column not in project_preview.columns]
        if missing:
            raise KeyError(
                f"Project preview for {file_name!r} is missing columns {missing}; "
                f"found {list(project_preview.columns)}"
            )

        few_shots_legal_terms.append(project_preview[required_columns].assign(file_name=file_name))

    return pd.concat(few_shots_legal_terms)


# Load the few-shot legal terms for every file listed in the pipeline configuration.
legal_terms = load_few_shots(pipeline_config)

In [18]:
# Order by key item, then keep only rows that actually carry legal terms
# (rows whose text is the "Not provided." placeholder are dropped).
legal_terms = (
    legal_terms
    .sort_values(by=["Key Items"])
    .loc[lambda frame: frame["Legal Terms"].ne("Not provided.")]
)
legal_terms.head()

In [19]:
# One entry per key item: the array of distinct legal-term texts seen for it,
# exposed under the name "Example".
few_shots = (
    legal_terms
    .groupby("Key Items")["Legal Terms"]
    .unique()
    .rename("Example")
)
few_shots.head()

In [20]:
# Display the key items read from terms-instructions.csv; these drive the merge in the next cell.
terms_and_instructions["Key Items"].to_list()

In [21]:
# Attach the example arrays and the definitions to every key item from the
# instructions table. Left joins keep key items that have no examples (their
# "Example" becomes NaN) or no definition.
#
# validate="many_to_one" makes pandas raise MergeError right here if the right
# side has duplicate "Key Items". Without it such duplicates silently multiply
# rows, and the mismatch only surfaces when the instructions are written back
# to the instructions table at the end of the notebook.
#
# NOTE: this cell rebinds `few_shots` from the grouped Series to a DataFrame,
# so it is not re-runnable on its own; re-run from the grouping cell above.
few_shots = terms_and_instructions[["Key Items"]].merge(
    few_shots, on="Key Items", how="left", validate="many_to_one"
)
few_shots = few_shots.merge(
    terms_and_definitions, on="Key Items", how="left", validate="many_to_one"
)


In [22]:
# Inspect the Python type of each "Example" cell: few_shots_to_prompt only builds
# a prompt when the cell is an np.ndarray and returns "" for anything else.
few_shots["Example"].apply(type)

In [51]:
# Module-level LLM client; get_key_words_and_patterns() below calls `llm.invoke`.
# NOTE(review): the comment that used to sit on this line said "Gemini 1.5 works
# best for generating instructions", yet the model requested is "CLAUDE" —
# confirm which model is intended.
llm = get_llm(model_type="CLAUDE")

def get_key_words_and_patterns(few_shots) -> str:
    """Ask the module-level ``llm`` for the keywords found in a row's examples.

    Args:
        few_shots: Mapping-like row whose "Example" entry is a sequence of
            example texts; only the first three are sent.

    Returns:
        The ``content`` attribute of the object returned by ``llm.invoke``.
    """
    quoted_examples = []
    for sample in few_shots["Example"][:3]:
        quoted_examples.append(f"> {sample}")
    examples = "\n".join(quoted_examples)

    prompt_lines = (
        "# Extract keywords listed from the following examples.",
        "## Output only list of keywords. ",
        "* Keyword1",
        "* keyword2",
        f"Examples: {examples}",
    )
    prompt = "\n".join(prompt_lines)

    reply = llm.invoke(prompt)
    return reply.content

In [52]:
def few_shots_to_prompt(few_shots) -> str:
    """Render the retrieval instructions for one key item from its examples.

    Args:
        few_shots: Mapping-like row of the few-shot table. Only the "Example"
            entry is read; it must be an ``np.ndarray`` of example texts for a
            prompt to be produced.

    Returns:
        The instruction text — keywords obtained from
        ``get_key_words_and_patterns`` followed by up to three quoted
        examples — or "" when "Example" is not an ``np.ndarray``.
    """
    if not isinstance(few_shots["Example"], np.ndarray):
        # Presumably a key item without few-shot examples (NaN after the left merge).
        return ""

    keywords_and_patterns = get_key_words_and_patterns(few_shots)
    examples = "\n".join(f"> {example}" for example in few_shots["Example"][:3])

    # The text is returned exactly as assembled. It used to be passed through
    # str.format(key_item=..., definition=...) although it contains no such
    # placeholders; since the LLM output and the legal-text examples are already
    # interpolated, any "{" or "}" in them made str.format raise
    # (KeyError/ValueError/IndexError) or silently collapsed "{{" / "}}".
    # NOTE(review): the same heading is emitted twice — confirm that is intended.
    return (
        "## RETRIEVE FULL SECTION OF SIMILAR TEXT AS IN EXAMPLES:\n"
        "\n"
        "* Look for keywords and Patterns:\n"
        f"    {keywords_and_patterns}\n"
        "    \n"
        "## RETRIEVE FULL SECTION OF SIMILAR TEXT AS IN EXAMPLES:\n"
        "\n"
        f"{examples} \n"
        "    \n"
    )

In [53]:
# def few_shots_to_prompt(few_shots) -> str:
#     ans = """Build a prompt for Claude LLM model to extract this citations of Legal Terms: {key_item} 
#     
# <example_of_instructions>
#     
# ## Finding Parties in a document
# 
# **Prompt:**
# 
# Given a site lease document, identify and extract sections of text that define the involved parties, specifically the Landlord and Tenant (or Lessor and Lessee). Use the following guidelines:
# 
# **Keywords and Patterns:**
# 
# * Look for phrases like:
#     * "by and between"
#     * "this Lease" or "this Agreement"
#     * "Landlord" and "Tenant" (or "Lessor" and "Lessee")
#     * "Party" and "Parties"
# * Identify company names, often followed by identifiers such as:
#     * LLC (Limited Liability Company)
#     * Inc. (Incorporated)
#     * Corporation
#     * Trust
#     * Individual names, possibly followed by titles like Trustee
# * Look for addresses associated with the company or individual names.
# * Pay attention to quotation marks surrounding names or titles, and parentheses often containing additional information like state of incorporation.
# * Look for patterns like:
#     * **[Company Name], a [State] [Company Type], with an address at [Address] ("[Landlord/Tenant/Lessor/Lessee]")**
#     * **between [Company/Individual Name] (the "[Landlord/Lessor]") and [Company/Individual Name] (the "[Tenant/Lessee]")**
# 
# **Example:**
# 
# In the phrase:
# >"**By and between HEELSTONE LAND HOLDINGS, LLC, a Delaware limited liability company (the “Landlord”) and 233 RANDOLPH 74 SOLAR I, LLC, a North Carolina limited liability company (the “Tenant”).**"
# 
# * "By and between" indicates the start of the relevant section.
# * "HEELSTONE LAND HOLDINGS, LLC" and "233 RANDOLPH 74 SOLAR I, LLC" are the company names.
# * "Delaware limited liability company" and "North Carolina limited liability company" identify the company types.
# * "(the “Landlord”)" and "(the “Tenant”)" explicitly label the roles of each party.
# 
# Term to look for: Lessor (Landlord) Entity Name
# Definition of term: The individual or entity who holds title to the property and is leasing to REA controlled entity
# 
# <example_of_instructions/>
# 
#     
#     
#     Term to build instructions for - {key_item}
#     Definition of the term - {definition}
#     Examples of extracted Legal Terms:
#     
# """
#     if isinstance(few_shots["Example"], np.ndarray):
# 
#         ans += """\nExamples of extracted Legal Terms:
# """
# 
#         for legal_terms in few_shots["Example"][:3]:
#             ans += """
# <example_of_extracted_legal_terms>
# {legal_terms}...
# </example_of_extracted_legal_terms>
# 
# Prompt similar to example for extracting of information for {key_item}:
# """.format(legal_terms=legal_terms[:400], key_item=few_shots["Key Items"])
# 
#     return ans.format(key_item=few_shots["Key Items"], definition=few_shots["Definitions"])

In [54]:
def generate_prompts(few_shots: pd.DataFrame) -> List[str]:
    """Build one prompt per row of the few-shot table.

    Args:
        few_shots: Frame with at least the "Key Items", "Example" and
            "Definitions" columns; other columns are ignored.

    Returns:
        The results of ``few_shots_to_prompt`` for each row, in row order.
    """
    relevant_columns = few_shots[["Key Items", "Example", "Definitions"]]
    return [few_shots_to_prompt(row) for _, row in relevant_columns.iterrows()]


# Build one prompt per row of few_shots. Rows that carry an example array trigger
# an `llm.invoke` call (via few_shots_to_prompt -> get_key_words_and_patterns).
prompts = generate_prompts(few_shots)

In [55]:
# Print every generated prompt for manual review (empty strings mark rows without examples).
for prompt in prompts:
    print(prompt)

In [56]:
# Re-creates the same "CLAUDE" client that the keyword-extraction cell already bound to `llm`.
# NOTE(review): the comment that used to sit on this line said "Gemini 1.5 works
# best for generating instructions", yet the model requested is "CLAUDE" —
# confirm which model is intended.
llm = get_llm(model_type="CLAUDE")


def generate_instructions_from_prompts(prompts: pd.Series, llm: LLMChain) -> List[str]:
    """Return the prompts, unchanged, as the list of instructions.

    The prompts are iterated under a ``tqdm`` progress bar and copied one by
    one into a new list.

    NOTE(review): ``llm`` is accepted but never used, so no model call happens
    here — confirm whether each prompt was meant to be sent to the LLM.

    Args:
        prompts: Iterable of prompt strings.
        llm: Unused.

    Returns:
        A new list containing the prompts in their original order.
    """
    return list(tqdm(prompts))


# generate_instructions_from_prompts currently passes the prompts through as-is,
# so `instructions` holds the same strings as `prompts`.
instructions = generate_instructions_from_prompts(prompts, llm)

In [58]:
# Print every instruction for a final manual check before saving.
for instruction in instructions:
    print(instruction)

In [60]:
def save_instructions(instructions: List[str], instructions_path: Path) -> None:
    """Write the instructions into a timestamped copy of the instructions CSV.

    The CSV at ``instructions_path`` is read, its "Instructions" column is
    replaced by the cleaned ``instructions`` (row order must match), and the
    result is saved next to it as ``terms-instructions_<timestamp>.csv``. The
    original file is not modified.

    Args:
        instructions: One instruction text per row of the CSV.
        instructions_path: Path of the existing terms-instructions CSV.

    Raises:
        ValueError: If the number of instructions differs from the number of
            rows in the CSV.
    """
    def postprocess(text: str) -> str:
        # Trim first: a fence preceded or followed by whitespace (e.g. a reply
        # ending in "```\n") would otherwise not be recognised and stay in the text.
        text = text.strip()
        if text.startswith("```"):
            text = text[3:]
        if text.endswith("```"):
            text = text[:-3]
        return text.strip()

    instructions_path = Path(instructions_path)
    cleaned = [postprocess(instruction) for instruction in instructions]

    terms_and_instructions = pd.read_csv(instructions_path)
    if len(cleaned) != len(terms_and_instructions):
        raise ValueError(
            f"Got {len(cleaned)} instructions for {len(terms_and_instructions)} rows "
            f"in {instructions_path}"
        )
    terms_and_instructions["Instructions"] = cleaned

    timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    path_to_save = instructions_path.parent / f"terms-instructions_{timestamp}.csv"
    terms_and_instructions.to_csv(path_to_save, index=False)

# Write a new timestamped CSV next to instructions_path; the source CSV itself is left unchanged.
save_instructions(instructions, instructions_path)