In [1]:
import os
from pathlib import Path
from typing import List

import numpy as np
import pandas as pd
from langchain.chains import LLMChain
from tqdm import tqdm
from datetime import datetime

from src.gen_ai.gen_ai import get_llm
from src.pipelines.term_extraction.utils import get_project_preview

os.environ["DOC_AI_LOCATION"] = "us"
os.environ["PROJECT_ID"] = "602280418311"
os.environ["LOCATION"] = "us-west1"
os.environ["AWS_ACCESS_KEY_ID"] = "AKIA3FLDZKXBZON2YNPB"
os.environ["AWS_SECRET_ACCESS_KEY"] = "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"
os.environ["GOOGLE_API_KEY"]="AIzaSyBs6eyE7JHovx_7Jp0RmO-SasqLaJk5rlI"
os.environ["PROJECT_ID"]="602280418311"
os.environ["DOC_AI_LOCATION"]="us"
os.environ["DOC_AI_PROCESSOR_ID"]="e977fdd46ee23308"


## Specify the pipeline configuration

In [2]:
from src.pipelines.term_extraction.pipeline_config import PipelineConfig, SiteLeasePipelineConfig
import pathlib

pipeline_config = SiteLeasePipelineConfig(use_gcs_storage=True)

main_path = pathlib.Path().absolute().parent.parent
pipeline_folder = Path(main_path / f"src/pipelines/terms/{pipeline_config.pipeline_name}")

instructions_path = pipeline_folder / "terms-instructions.csv"
terms_and_instructions = pd.read_csv(pipeline_folder / "terms-instructions.csv")
terms_and_definitions = pd.read_csv(pipeline_folder / "terms-definitions.csv")

terms_and_instructions = terms_and_instructions[terms_and_instructions["Key Items"] == "Rent Escalator (Amount)"]
terms_and_definitions = terms_and_definitions[terms_and_definitions["Key Items"] == "Rent Escalator (Amount)"]


In [3]:
terms_and_definitions

Unnamed: 0,Key Items,Definitions
18,Rent Escalator (Amount),Amount of increase in price


#### Load the few-shot data

In [4]:
def load_few_shots(pipeline_config: PipelineConfig) -> pd.DataFrame:
    few_shots_legal_terms = []
    print(pipeline_config.few_shot_file_names)
    for file_name in pipeline_config.few_shot_file_names:
        print(f"File: {file_name}")

        correct_project_preview = get_project_preview(pipeline_config.get_project_previews_path(), file_name)
        print(correct_project_preview.head())
        print(correct_project_preview.columns)
        few_shots_legal_terms.append(
            correct_project_preview[["Key Items", "Value", "Legal Terms"]].assign(file_name=file_name))

    return pd.concat(few_shots_legal_terms)


legal_terms = load_few_shots(pipeline_config)

['Site Lease - GLD - Canton (ES).pdf', 'Site Lease - SunRaise - Happy Hollow (ES).pdf', 'Carmen.Site Lease.Blue Sky.pdf', 'Caroline_SiteLease_Updated Ex 2_2018.12.31.pdf', 'Site Green - Emerald Garden - Cape Fear.pdf']
File: Site Lease - GLD - Canton (ES).pdf
                       Key Items  \
0  Lessor (Landlord) Entity Name   
1    Lessee (Tenant) Entity Name   
2                 Effective Date   
3                  Property Size   
4                   Initial Term   

                                               Value  \
0                                PWH Properties, LLC   
1                              Canton GLC Solar, LLC   
2                                      Not provided.   
3                                        ±19.9 acres   
4  20 years beginning on the date the Permission ...   

                                         Legal Terms  
0  By and between PWH Properties, LLC ("Owner"), ...  
1  By and between PWH Properties, LLC ("Owner"), ...  
2                    

In [5]:
legal_terms

Unnamed: 0,Key Items,Value,Legal Terms,file_name
0,Lessor (Landlord) Entity Name,"PWH Properties, LLC","By and between PWH Properties, LLC (""Owner""), ...",Site Lease - GLD - Canton (ES).pdf
1,Lessee (Tenant) Entity Name,"Canton GLC Solar, LLC","By and between PWH Properties, LLC (""Owner""), ...",Site Lease - GLD - Canton (ES).pdf
2,Effective Date,Not provided.,Not provided.,Site Lease - GLD - Canton (ES).pdf
3,Property Size,±19.9 acres,"1. Hereby demises and leases to Tenant, and Te...",Site Lease - GLD - Canton (ES).pdf
4,Initial Term,20 years beginning on the date the Permission ...,"2. The operating term of this Agreement (the ""...",Site Lease - GLD - Canton (ES).pdf
...,...,...,...,...
35,Purchase Options (Y/N),Not provided.,Not provided.,Site Green - Emerald Garden - Cape Fear.pdf
36,Liens,Waived,Landlord hereby waives all rights to distraint...,Site Green - Emerald Garden - Cape Fear.pdf
37,Amendments and/or Estoppels,Yes,First Amendment to Ground Lease Agreement date...,Site Green - Emerald Garden - Cape Fear.pdf
38,Prevailing Party Provision (Y/N),No,Not provided.,Site Green - Emerald Garden - Cape Fear.pdf


In [6]:
legal_terms = legal_terms.sort_values(["Key Items"])
legal_terms = legal_terms[legal_terms["Legal Terms"] != "Not provided."]  # Remove rows with no legal terms
legal_terms.head()

Unnamed: 0,Key Items,Value,Legal Terms,file_name
37,Amendments and/or Estoppels,Yes,First Amendment to Ground Lease Agreement date...,Site Green - Emerald Garden - Cape Fear.pdf
38,Amendments and/or Estoppels,Not provided.,"Yes, Estoppel. No - amendments",Caroline_SiteLease_Updated Ex 2_2018.12.31.pdf
36,Amendments and/or Estoppels,31,Upon the receipt of a request from the other l...,Site Lease - SunRaise - Happy Hollow (ES).pdf
38,Amendments and/or Estoppels,Not provided.,"J. From time to time, upon written request, b...",Carmen.Site Lease.Blue Sky.pdf
24,Assignment by Lessee,Yes,IX. A. Tenant cannot assign this Lease except:...,Carmen.Site Lease.Blue Sky.pdf


In [7]:
few_shots = legal_terms.copy()
few_shots["Example"] = legal_terms["Legal Terms"]  # Use Legal Terms as Example
few_shots = few_shots.groupby("Key Items")["Example"].unique()  # Create set of examples for each Key Item
few_shots.head()

Key Items
Amendments and/or Estoppels    [First Amendment to Ground Lease Agreement dat...
Assignment by Lessee           [IX. A. Tenant cannot assign this Lease except...
Assignment by Lessor           [6b. Owner may assign its rights and obligatio...
Co-terminus with PPA (Y/N)     [N/A - no PPA, 5a.Notwithstanding anything to ...
Default                        [a.\tDefault. Any Party that fails to perform ...
Name: Example, dtype: object

In [8]:
few_shots

Key Items
Amendments and/or Estoppels          [First Amendment to Ground Lease Agreement dat...
Assignment by Lessee                 [IX. A. Tenant cannot assign this Lease except...
Assignment by Lessor                 [6b. Owner may assign its rights and obligatio...
Co-terminus with PPA (Y/N)           [N/A - no PPA, 5a.Notwithstanding anything to ...
Default                              [a.\tDefault. Any Party that fails to perform ...
Effective Date                       [December 31, 2018, THIS GROUND LEASE AGREEMEN...
Eminent Domain                       [In the event the Premises or Property are tra...
Expiration Date                      [2. Expiring on the twentieth (20th) anniversa...
Fee Simple Lessor  (Y/N)             [16i. Owner is the fee owner of the Premises a...
Fee Simple Owner (Y/N)               [12. Lessor respresents, warrants and covenant...
First Payment Due                    [2. Starting on the Commencement Date, Tenant ...
Force Majeure                    

In [9]:
terms_and_instructions[["Key Items"]]

Unnamed: 0,Key Items
18,Rent Escalator (Amount)


In [10]:
terms_and_instructions["Key Items"].to_list()

['Rent Escalator (Amount)']

In [11]:
few_shots = terms_and_instructions[["Key Items"]].merge(few_shots, on="Key Items", how="left")
few_shots = few_shots.merge(terms_and_definitions, on="Key Items", how="left")


In [12]:
few_shots

Unnamed: 0,Key Items,Example,Definitions
0,Rent Escalator (Amount),[3. Annual Rent for Years 3-20: Previous year ...,Amount of increase in price


In [13]:
few_shots["Example"].apply(type)

0    <class 'numpy.ndarray'>
Name: Example, dtype: object

In [14]:
def few_shots_to_prompt(few_shots) -> str:
    ans = """Build a prompt for Claude LLM model to extract this citations of Legal Terms: {key_item} 
    
<example_of_instructions>
    
## Finding Parties in a document

**Prompt:**

Given a site lease document, identify and extract sections of text that define the involved parties, specifically the Landlord and Tenant (or Lessor and Lessee). Use the following guidelines:

**Keywords and Patterns:**

* Look for phrases like:
    * "by and between"
    * "this Lease" or "this Agreement"
    * "Landlord" and "Tenant" (or "Lessor" and "Lessee")
    * "Party" and "Parties"
* Identify company names, often followed by identifiers such as:
    * LLC (Limited Liability Company)
    * Inc. (Incorporated)
    * Corporation
    * Trust
    * Individual names, possibly followed by titles like Trustee
* Look for addresses associated with the company or individual names.
* Pay attention to quotation marks surrounding names or titles, and parentheses often containing additional information like state of incorporation.
* Look for patterns like:
    * **[Company Name], a [State] [Company Type], with an address at [Address] ("[Landlord/Tenant/Lessor/Lessee]")**
    * **between [Company/Individual Name] (the "[Landlord/Lessor]") and [Company/Individual Name] (the "[Tenant/Lessee]")**

**Example:**

In the phrase:
>"**By and between HEELSTONE LAND HOLDINGS, LLC, a Delaware limited liability company (the “Landlord”) and 233 RANDOLPH 74 SOLAR I, LLC, a North Carolina limited liability company (the “Tenant”).**"

* "By and between" indicates the start of the relevant section.
* "HEELSTONE LAND HOLDINGS, LLC" and "233 RANDOLPH 74 SOLAR I, LLC" are the company names.
* "Delaware limited liability company" and "North Carolina limited liability company" identify the company types.
* "(the “Landlord”)" and "(the “Tenant”)" explicitly label the roles of each party.

Term to look for: Lessor (Landlord) Entity Name
Definition of term: The individual or entity who holds title to the property and is leasing to REA controlled entity

<example_of_instructions/>

    
    
    Term to build instructions for - {key_item}
    Definition of the term - {definition}
    Examples of extracted Legal Terms:
    
"""
    if isinstance(few_shots["Example"], np.ndarray):

        ans += """\nExamples of extracted Legal Terms:
"""

        for legal_terms in few_shots["Example"][:3]:
            ans += """
<example_of_extracted_legal_terms>
{legal_terms}...
</example_of_extracted_legal_terms>

Prompt similar to example for extracting of information for {key_item}:
""".format(legal_terms=legal_terms[:400], key_item=few_shots["Key Items"])

    return ans.format(key_item=few_shots["Key Items"], definition=few_shots["Definitions"])

In [15]:
# few_shots = pd.DataFrame(
#     [
#         {
#             "Key Items": "Promissory Note",
#             "Example": [
#                 """FOR VALUE RECEIVED, the undersigned, Canton GLC Solar, LLC, a Maine limited
# liability company (hereinafter referred to as “Borrower”), promises to pay to the order of M1 Bank,
# a Missouri chartered bank, whose address is 7 N. Bemiston Ave., Clayton, Missouri 63105
# (hereinafter referred to as the “Lender”), the principal sum of EIGHT MILLION THREE
# HUNDRED FOURTEEN THOUSAND SEVEN HUNDRED FIFTY AND NO/100 DOLLARS
# ($8,314,750.00), with interest on such amount of principal as may be outstanding from time to
# time as provided herein.""",
#                 """FOR VALUE RECEIVED, the undersigned, 233 Randolph 74 Solar I, LLC, a North Carolina limited liability company (hereinafter referred to as “Borrower”), promises to pay to the order of Crestmark, a division of MetaBank, National Association, whose address is 5480 Corporate Drive, Suite 350, Troy, Michigan 48098 (hereinafter referred to as the “Lender”), the principal sum of THREE MILLION TWO HUNDRED SEVENTY-SIX THOUSAND FIVE HUNDRED THIRTY-EIGHT AND 00/100 DOLLARS ($3,276,538.00), with interest on such amount of principal as may be outstanding from time to time as provided herein."""
#             ],
#             "Definitions": "A promissory note is a financial instrument that contains a written promise by one party (the note's issuer or maker) to pay another party (the note's payee) a definite sum of money, either on demand or at a specified future date."
#         },
#         
#         
#         
#         
#         
#         
#         
#         
#         
#     ]
#     
#     
#     
#     
#     
# )

In [16]:
def generate_prompts(few_shots: pd.DataFrame) -> List[str]:
    prompts = []

    for _, few_shots_row in few_shots[["Key Items", "Example", "Definitions"]].iterrows():
        prompts.append(few_shots_to_prompt(few_shots_row))
    return prompts


prompts = generate_prompts(few_shots)

In [17]:
for prompt in prompts:
    print(prompt)

Build a prompt for Claude LLM model to extract this citations of Legal Terms: Rent Escalator (Amount) 
    
<example_of_instructions>
    
## Finding Parties in a document

**Prompt:**

Given a site lease document, identify and extract sections of text that define the involved parties, specifically the Landlord and Tenant (or Lessor and Lessee). Use the following guidelines:

**Keywords and Patterns:**

* Look for phrases like:
    * "by and between"
    * "this Lease" or "this Agreement"
    * "Landlord" and "Tenant" (or "Lessor" and "Lessee")
    * "Party" and "Parties"
* Identify company names, often followed by identifiers such as:
    * LLC (Limited Liability Company)
    * Inc. (Incorporated)
    * Corporation
    * Trust
    * Individual names, possibly followed by titles like Trustee
* Look for addresses associated with the company or individual names.
* Pay attention to quotation marks surrounding names or titles, and parentheses often containing additional information like st

In [18]:
llm = get_llm(model_type="CLAUDE") # Gemini 1.5 works best for generating instructions


def generate_instructions_from_prompts(prompts: pd.Series, llm: LLMChain) -> List[str]:
    instructions = []
    for prompt in tqdm(prompts):
        instructions.append(llm.invoke(prompt))
    return instructions


instructions = generate_instructions_from_prompts(prompts, llm)

  return InMemoryRateLimiter(
100%|██████████| 1/1 [00:12<00:00, 12.14s/it]


In [19]:
for instruction in instructions:
    print(instruction.content)

**Prompt:**

Given a site lease document, identify and extract sections of text that define the Rent Escalator (Amount), which is the amount of increase in rent price over time. Use the following guidelines:

**Keywords and Patterns:**

* Look for phrases like:
    * "Rent Escalator"
    * "Rent Increase"
    * "Escalation" or "Escalate"
    * "Annual Rent" or "Rent" followed by terms like "increased by" or "plus"
* Identify numerical values, often followed by identifiers such as:
    * Percentage (%) 
    * Dollar amount ($)
* Look for time periods associated with the rent escalation, such as:
    * "per annum" or "annually"
    * "Year" or "Years" followed by a range or specific year numbers
* Pay attention to parentheses often containing additional details or calculations.
* Look for patterns like:
    * **Annual Rent increased by [Percentage] (per annum/annually)**
    * **Rent shall be [Dollar Amount] with an escalator of [Percentage] (per annum/annually)**
    * **[Dollar Amount]

In [16]:
def save_instructions(instructions: List[str], instructions_path: Path) -> None:
    def postprocess(text: str) -> str:
        if text.startswith("```"):
            text = text[3:]
        if text.endswith("```"):
            text = text[:-3]
        return text.strip()

    instructions = [postprocess(instruction.content) for instruction in instructions]
    terms_and_instructions = pd.read_csv(instructions_path)
    terms_and_instructions["Instructions"] = instructions

    path_to_save = instructions_path.parent / f"terms-instructions_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.csv"
    terms_and_instructions.to_csv(path_to_save, index=False)

save_instructions(instructions, instructions_path)

ValueError: Length of values (1) does not match length of index (39)