In [14]:


import os
from pathlib import Path
from typing import List

import numpy as np
import pandas as pd
from langchain.chains import LLMChain
from tqdm import tqdm
from datetime import datetime

from src.gen_ai.gen_ai import get_llm
from src.pipelines.term_extraction.utils import get_project_preview

# Non-secret settings exported through the process environment.
os.environ["DOC_AI_LOCATION"] = "us"
os.environ["PROJECT_ID"] = "602280418311"
os.environ["LOCATION"] = "us-west1"

# SECURITY: AWS credentials must never be hard-coded in a notebook, because the
# file (and its history) is shared and committed. Provide AWS_ACCESS_KEY_ID and
# AWS_SECRET_ACCESS_KEY through the shell environment, an untracked .env file or
# a secrets manager before starting the kernel. Any key pair that was previously
# committed in this cell must be treated as compromised and rotated.
_missing_aws_credentials = [
    name
    for name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
    if not os.environ.get(name)
]
if _missing_aws_credentials:
    print(
        f"WARNING: {', '.join(_missing_aws_credentials)} not set in the environment; "
        "export them (or configure another credential source) before running cells that need AWS access."
    )


## Specify the pipeline configuration

In [15]:
import pathlib
from src.pipelines.term_extraction.pipeline_config import PipelineConfig, SiteLeasePipelineConfig

# Configuration object for the site-lease pipeline, created with use_gcs_storage=True.
pipeline_config = SiteLeasePipelineConfig(use_gcs_storage=True)

# Two directory levels above the current working directory.
main_path = pathlib.Path().absolute().parent.parent
pipeline_folder = main_path / f"src/pipelines/terms/{pipeline_config.pipeline_name}"

# Locations of the per-pipeline CSV tables.
instructions_path = pipeline_folder / "terms-instructions.csv"
definitions_path = pipeline_folder / "terms-definitions.csv"

# Restrict both tables to the single key item this notebook works on.
target_key_item = "Rent Escalator (Amount)"
terms_and_instructions = pd.read_csv(instructions_path).loc[
    lambda frame: frame["Key Items"] == target_key_item
]
terms_and_definitions = pd.read_csv(definitions_path).loc[
    lambda frame: frame["Key Items"] == target_key_item
]

#### Load the few-shot data

In [16]:
def load_few_shots(pipeline_config: PipelineConfig) -> pd.DataFrame:
    """Collect the few-shot rows of every file listed in the pipeline configuration.

    For each name in ``pipeline_config.few_shot_file_names`` the project preview
    is loaded with ``get_project_preview`` and reduced to the ``Key Items``,
    ``Value`` and ``Legal Terms`` columns, plus a ``file_name`` column holding
    the source file.

    Args:
        pipeline_config: Configuration providing ``few_shot_file_names`` and
            ``get_project_previews_path()``.

    Returns:
        The per-file frames concatenated with their original row indices kept
        (so index values repeat across files). When no few-shot files are
        configured, an empty frame with the same four columns is returned.
    """
    preview_columns = ["Key Items", "Value", "Legal Terms"]

    few_shots_legal_terms = []
    for file_name in pipeline_config.few_shot_file_names:
        print(f"File: {file_name}")

        correct_project_preview = get_project_preview(pipeline_config.get_project_previews_path(), file_name)

        few_shots_legal_terms.append(
            correct_project_preview[preview_columns].assign(file_name=file_name))

    if not few_shots_legal_terms:
        # pd.concat raises ValueError("No objects to concatenate") on an empty
        # list; return an empty, correctly shaped frame so downstream cells can
        # fall back to definition-only prompts instead of crashing here.
        return pd.DataFrame(columns=[*preview_columns, "file_name"])

    return pd.concat(few_shots_legal_terms)


# Build the few-shot frame from every file listed in the pipeline configuration.
legal_terms = load_few_shots(pipeline_config)

File: Site Lease - GLD - Canton (ES).pdf
File: Site Lease - SunRaise - Happy Hollow (ES).pdf
File: Carmen.Site Lease.Blue Sky.pdf
File: Caroline_SiteLease_Updated Ex 2_2018.12.31.pdf
File: Site Green - Emerald Garden - Cape Fear.pdf


In [17]:
# Inspect the concatenated few-shot rows (the file_name column identifies the source file).
legal_terms

Unnamed: 0,Key Items,Value,Legal Terms,file_name
0,Lessor (Landlord) Entity Name,"PWH Properties, LLC","By and between PWH Properties, LLC (""Owner""), ...",Site Lease - GLD - Canton (ES).pdf
1,Lessee (Tenant) Entity Name,"Canton GLC Solar, LLC","By and between PWH Properties, LLC (""Owner""), ...",Site Lease - GLD - Canton (ES).pdf
2,Effective Date,Not provided.,Not provided.,Site Lease - GLD - Canton (ES).pdf
3,Property Size,±19.9 acres,"1. Hereby demises and leases to Tenant, and Te...",Site Lease - GLD - Canton (ES).pdf
4,Initial Term,20 years beginning on the date the Permission ...,"2. The operating term of this Agreement (the ""...",Site Lease - GLD - Canton (ES).pdf
...,...,...,...,...
35,Purchase Options (Y/N),Not provided.,Not provided.,Site Green - Emerald Garden - Cape Fear.pdf
36,Liens,Waived,Landlord hereby waives all rights to distraint...,Site Green - Emerald Garden - Cape Fear.pdf
37,Amendments and/or Estoppels,Yes,First Amendment to Ground Lease Agreement date...,Site Green - Emerald Garden - Cape Fear.pdf
38,Prevailing Party Provision (Y/N),No,Not provided.,Site Green - Emerald Garden - Cape Fear.pdf


In [18]:
# Order the rows by key item, then drop every row whose legal terms are just
# the "Not provided." placeholder.
legal_terms = (
    legal_terms
    .sort_values(by=["Key Items"])
    .loc[lambda frame: frame["Legal Terms"] != "Not provided."]
)
legal_terms.head()

Unnamed: 0,Key Items,Value,Legal Terms,file_name
37,Amendments and/or Estoppels,Yes,First Amendment to Ground Lease Agreement date...,Site Green - Emerald Garden - Cape Fear.pdf
38,Amendments and/or Estoppels,Not provided.,"Yes, Estoppel. No - amendments",Caroline_SiteLease_Updated Ex 2_2018.12.31.pdf
36,Amendments and/or Estoppels,31,Upon the receipt of a request from the other l...,Site Lease - SunRaise - Happy Hollow (ES).pdf
38,Amendments and/or Estoppels,Not provided.,"J. From time to time, upon written request, b...",Carmen.Site Lease.Blue Sky.pdf
24,Assignment by Lessee,Yes,IX. A. Tenant cannot assign this Lease except:...,Carmen.Site Lease.Blue Sky.pdf


In [19]:
# Pair each short value with its legal terms as a (Value, Legal Terms) tuple and
# collect the distinct pairs per key item.
few_shots = (
    legal_terms
    .assign(Example=lambda frame: frame[["Value", "Legal Terms"]].apply(tuple, axis=1))
    .groupby("Key Items")["Example"]
    .unique()
)
few_shots.head()

Key Items
Amendments and/or Estoppels    [(Yes, First Amendment to Ground Lease Agreeme...
Assignment by Lessee           [(Yes, IX. A. Tenant cannot assign this Lease ...
Assignment by Lessor           [(Owner can assign without consent, 6b. Owner ...
Co-terminus with PPA (Y/N)     [(Not provided., N/A - no PPA), (Not provided....
Default                        [(16a, a.\tDefault. Any Party that fails to pe...
Name: Example, dtype: object

In [20]:
# List the key items that remain in the filtered instructions table.
terms_and_instructions["Key Items"].to_list()

['Rent Escalator (Amount)']

In [21]:
# Left-join the grouped examples and then the definitions onto the selected key
# items, so key items without examples or definitions are still kept.
# NOTE: this rebinds `few_shots`, so the cell is not safe to re-run on its own
# without first re-running the cell that builds the grouped examples.
few_shots = (
    terms_and_instructions[["Key Items"]]
    .merge(few_shots, on="Key Items", how="left")
    .merge(terms_and_definitions, on="Key Items", how="left")
)


In [22]:
# Check the Python type of each "Example" cell; few_shots_to_prompt branches on
# whether the cell is a numpy array.
few_shots["Example"].apply(type)

0    <class 'numpy.ndarray'>
Name: Example, dtype: object

In [23]:
def few_shots_to_prompt(few_shots, max_examples: int = 3, max_legal_terms_chars: int = 200) -> str:
    """Render the extraction prompt for one key item.

    Args:
        few_shots: Row-like mapping with the keys ``"Key Items"``, ``"Example"``
            and (for the fallback prompt) ``"Definitions"``. ``"Example"`` is
            either a numpy array of ``(value, legal_terms)`` pairs or any other
            object (e.g. NaN after a left merge) when no examples exist.
        max_examples: Maximum number of examples included in the prompt.
        max_legal_terms_chars: Number of leading characters of each example's
            legal terms that are quoted in the prompt.

    Returns:
        The few-shot prompt when examples are available, otherwise a short
        prompt built from the key item and its definition.
    """
    key_item = few_shots["Key Items"]
    examples = few_shots["Example"]

    if not isinstance(examples, np.ndarray):
        return """Retrieve {key_item} from the text.
Definitions: {key_item} - {definition}.
""".format(key_item=key_item, definition=few_shots["Definitions"])

    # NOTE(review): the prompt wording below contains typos ("th", "Leral",
    # "putput", "tha"); it is deliberately left byte-identical here so that
    # generated prompts do not change as a side effect of this fix.
    header = """1. You have a task for due diligence company, working with solar projects. 
* As a due diligence manager you will need to build a project preview for th Solar Project.
* Preview contains Legal Terms and Short Values.
* Legal Terms are the terms that are used in the contract. Direct citations from the contract.
2. Your task is to extract Short Values from those Leral Terms, so that the preview can be built. And it will be easy for other due diligence managers understand main points of the Legal Terms by looking at the Short Values.

3. I want you to putput only the Short Values for the Legal Terms. 
* Avoid any unnecessary comments or explanations.
* You will be provided with couple of examples for each Legal Term. 
* You will need to extract the Short Value from the Legal Terms in the same format as in the examples. 
* Then you will need to provide the Short Value for the Legal Terms for tha actual case.
* Avoid adding to you output such things as: "<actual_case>", "Text:", "```", "Data retrieved:", Legal Terms itself

Retrieve {key_item} from the text in this format:""".format(key_item=key_item)

    example_template = """
<example>
Text:
```
{legal_terms}...
```
Data retrieved:
{value}</example>

"""

    # Each template is formatted exactly once. Contract text is only ever passed
    # as a format *argument* and never re-parsed as a format string, so braces
    # inside the legal terms or values cannot raise or be silently altered.
    rendered_examples = [
        example_template.format(legal_terms=legal_terms[:max_legal_terms_chars], value=value)
        for value, legal_terms in examples[:max_examples]
    ]
    return header + "".join(rendered_examples)

In [24]:
def generate_prompts(few_shots: pd.DataFrame) -> List[str]:
    """Build one prompt per row of the few-shot table.

    Args:
        few_shots: Frame with the columns "Key Items", "Example" and "Definitions".

    Returns:
        The prompts, in the same order as the rows of ``few_shots``.
    """
    prompt_inputs = few_shots[["Key Items", "Example", "Definitions"]]
    return [few_shots_to_prompt(row) for _, row in prompt_inputs.iterrows()]


# Render one prompt per row of the merged few-shot table.
short_instructions = generate_prompts(few_shots)

In [25]:
# Print only the last generated prompt as a visual check (prints nothing if the list is empty).
for prompt in short_instructions[-1:]:
    print(prompt)

1. You have a task for due diligence company, working with solar projects. 
* As a due diligence manager you will need to build a project preview for th Solar Project.
* Preview contains Legal Terms and Short Values.
* Legal Terms are the terms that are used in the contract. Direct citations from the contract.
2. Your task is to extract Short Values from those Leral Terms, so that the preview can be built. And it will be easy for other due diligence managers understand main points of the Legal Terms by looking at the Short Values.

3. I want you to putput only the Short Values for the Legal Terms. 
* Avoid any unnecessary comments or explanations.
* You will be provided with couple of examples for each Legal Term. 
* You will need to extract the Short Value from the Legal Terms in the same format as in the examples. 
* Then you will need to provide the Short Value for the Legal Terms for tha actual case.
* Avoid adding to you output such things as: "<actual_case>", "Text:", "```", "Dat

In [12]:
def save_instructions(instructions: List[str], instructions_path: Path) -> None:
    """Write a timestamped copy of the instructions CSV with a replaced "Instructions" column.

    The CSV at ``instructions_path`` is read in full, its "Instructions" column
    is set to ``instructions`` (one entry per row, in row order) and the result
    is saved next to the source file as
    ``claude-short-instructions_<YYYY-mm-dd_HH-MM-SS>.csv``.

    Args:
        instructions: One instruction per row of the CSV, in row order.
        instructions_path: Path of the source instructions CSV.

    Raises:
        ValueError: If the number of instructions differs from the number of
            rows in the CSV. Nothing is written in that case.
    """
    terms_and_instructions = pd.read_csv(instructions_path)

    # Fail early with an actionable message; otherwise pandas reports only
    # "Length of values (...) does not match length of index (...)".
    if len(instructions) != len(terms_and_instructions):
        raise ValueError(
            f"Got {len(instructions)} instruction(s) for {len(terms_and_instructions)} row(s) in "
            f"{instructions_path}; generate the instructions from the complete, unfiltered "
            "instructions table before saving."
        )

    terms_and_instructions["Instructions"] = instructions

    path_to_save = instructions_path.parent / f"claude-short-instructions_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.csv"
    terms_and_instructions.to_csv(path_to_save, index=False)

# NOTE(review): the recorded output of this cell is a ValueError, because the number
# of instructions passed in did not match the number of rows in the instructions CSV.
# save_instructions re-reads the CSV in full, whereas short_instructions is generated
# from the table filtered to a single key item, so the two lengths differ.
save_instructions(short_instructions, instructions_path)

ValueError: Length of values (0) does not match length of index (8)