In [1]:


import os
from pathlib import Path
from typing import List

import numpy as np
import pandas as pd
# from langchain.chains import LLMChain
# from tqdm import tqdm
from datetime import datetime

# from src.gen_ai.gen_ai import get_llm
from src.pipelines.term_extraction.utils import get_project_preview

import warnings

# Non-secret runtime settings read by the pipeline code imported in this notebook.
os.environ["DOC_AI_LOCATION"] = "us"
os.environ["PROJECT_ID"] = "602280418311"
os.environ["LOCATION"] = "us-west1"

# SECURITY: AWS credentials must never be written into the notebook. They are
# expected to be present in the process environment already (shell export,
# a git-ignored .env file, or a secrets manager). The key pair that used to be
# hard-coded here has to be treated as leaked and rotated.
_missing_aws_vars = [
    name
    for name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
    if not os.environ.get(name)
]
if _missing_aws_vars:
    # Warn instead of raising: nothing visible in this notebook calls AWS
    # directly, so a missing credential only matters for downstream code.
    warnings.warn(
        "Missing environment variable(s): "
        + ", ".join(_missing_aws_vars)
        + ". Export them before starting the kernel; do not hard-code them here."
    )


## Specify the pipeline configuration

In [2]:
import pathlib
from src.pipelines.term_extraction.pipeline_config import PVSystPipelineConfig, PipelineConfig

# Configuration object that names the pipeline and its few-shot files.
pipeline_config = PVSystPipelineConfig(use_gcs_storage=True)

# Two directory levels above the current working directory; the pipeline's
# CSV assets are looked up underneath it.
main_path = pathlib.Path().absolute().parent.parent
pipeline_folder = main_path / f"src/pipelines/terms/{pipeline_config.pipeline_name}"

# The instructions file is read here and read again when the prompts are saved.
instructions_path = pipeline_folder / "terms-instructions.csv"
terms_and_instructions = pd.read_csv(instructions_path)
terms_and_definitions = pd.read_csv(pipeline_folder / "terms-definitions.csv")

#### Load the few-shot data

In [3]:
from src.pipelines.term_extraction.utils import add_units


def load_few_shots(pipeline_config: PipelineConfig) -> pd.DataFrame:
    """Collect the few-shot project previews of a pipeline into one table.

    For every name in ``pipeline_config.few_shot_file_names`` the project
    preview is loaded with ``get_project_preview``, a ``units`` column is
    attached from the notebook-level ``terms_and_instructions`` frame, and
    ``add_units`` is applied row by row. Its result is stored in both the
    ``Value`` and the ``Legal Terms`` column.

    Args:
        pipeline_config: Supplies ``few_shot_file_names`` and
            ``get_project_previews_path()``.

    Returns:
        A DataFrame with the columns ``Key Items``, ``Value``, ``Legal Terms``
        and ``file_name``, one block of rows per few-shot file. When the
        configuration lists no few-shot files, an empty DataFrame with those
        columns is returned (``pd.concat`` would raise ``ValueError`` on an
        empty list).

    Note:
        Reads the global ``terms_and_instructions``; the cell that loads it
        must have been executed first.
    """
    output_columns = ["Key Items", "Value", "Legal Terms"]
    previews = []

    for file_name in pipeline_config.few_shot_file_names:
        print(f"File: {file_name}")

        preview = get_project_preview(pipeline_config.get_project_previews_path(), file_name)
        preview["Legal Terms"] = preview["Value"]
        # NOTE(review): this assignment aligns on the row index, not on
        # "Key Items" -- it assumes the preview rows are in the same order as
        # terms_and_instructions. TODO confirm against get_project_preview.
        preview["units"] = terms_and_instructions["unit"]
        # add_units receives the whole row (axis=1); presumably it combines
        # "Legal Terms" with "units" -- verify in term_extraction.utils.
        preview["Legal Terms"] = preview.apply(add_units, axis=1)
        preview["Value"] = preview["Legal Terms"]

        previews.append(preview[output_columns].assign(file_name=file_name))

    if not previews:
        # Nothing to concatenate: hand back an empty, correctly shaped frame.
        return pd.DataFrame(columns=[*output_columns, "file_name"])

    return pd.concat(previews)


# Load every few-shot preview listed in the pipeline configuration into one frame.
legal_terms = load_few_shots(pipeline_config)

In [4]:
# Order the rows by key item, then drop the rows whose legal terms are the
# "Not provided." placeholder.
legal_terms = (
    legal_terms
    .sort_values(["Key Items"])
    .loc[lambda frame: frame["Legal Terms"] != "Not provided."]
)
legal_terms.head()

In [5]:
# Pair each row's Value with its Legal Terms as a tuple, then keep the distinct
# tuples per key item (one array of examples per "Key Items" value).
few_shots = (
    legal_terms
    .assign(Example=lambda frame: frame[["Value", "Legal Terms"]].apply(tuple, axis=1))
    .groupby("Key Items")["Example"]
    .unique()
)
few_shots.head()

In [6]:
# Inspection only: list the key items defined in the instructions file.
terms_and_instructions["Key Items"].to_list()

In [7]:
# Start from the full list of key items so that items without any example are
# kept (left joins), then attach the examples and the definitions.
# NOTE: this rebinds `few_shots`, so the cell is not safe to run twice in a row.
few_shots = (
    terms_and_instructions[["Key Items"]]
    .merge(few_shots, on="Key Items", how="left")
    .merge(terms_and_definitions, on="Key Items", how="left")
)


In [8]:
# Inspection only: show the Python type of every "Example" entry
# (few_shots_to_prompt branches on whether an entry is an np.ndarray).
few_shots["Example"].apply(type)

In [9]:
# Inspection only: display the "Example" column itself.
few_shots["Example"]

In [10]:
def few_shots_to_prompt(few_shots) -> str:
    """Build the prompt template for a single key item.

    Args:
        few_shots: One row of the few-shot table (anything indexable by
            ``"Key Items"``, ``"Example"`` and ``"Definitions"``). ``"Example"``
            is either an ``np.ndarray`` of ``(value, legal_terms)`` tuples or
            some other object (e.g. NaN) when the key item has no examples.

    Returns:
        The prompt text with the key item (and, without examples, the
        definition) filled in. The literal placeholder ``{text}`` is left in
        place for the caller to fill in later.

    Notes:
        * At most the first three examples are used, and only their value part
          is shown in the prompt.
        * Example values are brace-escaped before the final ``str.format``
          call. Without that, a value containing ``{`` or ``}`` would raise a
          ``KeyError``/``ValueError`` or be substituted by mistake.
        * The legal-terms part of an example is deliberately not touched: it
          is not rendered, and slicing it failed for non-string entries.
        * NOTE(review): the prompt wording is emitted at runtime and is kept
          byte-for-byte, including its spelling mistakes ("th", "Leral",
          "putput", "tha"). Fix them deliberately if prompt changes are OK.
    """
    if isinstance(few_shots["Example"], np.ndarray):
        ans = """1. You have a task for due diligence company, working with solar projects. 
* As a due diligence manager you will need to build a project preview for th Solar Project.
* Preview contains Legal Terms and Short Values.
* Legal Terms are the terms that are used in the contract. Direct citations from the contract.
2. Your task is to extract Short Values from those Leral Terms, so that the preview can be built. And it will be easy for other due diligence managers understand main points of the Legal Terms by looking at the Short Values.
3. I want you to putput only the Short Values for the Legal Terms. 
* Avoid any unnecessary comments or explanations.
* You will be provided with couple of examples for each Legal Term. 
* You will need to extract the Short Value from the Legal Terms in the same format as in the examples. 
* Then you will need to provide the Short Value for the Legal Terms for tha actual case.
* Avoid adding to you output such things as: "<actual_case>", "Text:", "```", "Data retrieved:", Legal Terms itself

Retrieve {key_item} from the text in this format.
Examples of data and format:"""
        for i, (value, _legal_terms) in enumerate(few_shots["Example"][:3]):
            # Double the braces so the final ans.format(...) below renders the
            # value literally instead of parsing it as a replacement field.
            escaped_value = str(value).replace("{", "{{").replace("}", "}}")
            ans += """
<example of format {i}>
{value}
</example of format {i}>

""".format(value=escaped_value, i=i)
    else:
        ans = """Retrieve {key_item} from the text in specified format.
Definitions: {key_item} - {definition}.
"""
    ans += """Retrieve {key_item} from the text in specified format.
<text to retrieve information from>
{text}
</text to retrieve information from>
{key_item} retrieved from the text in specified format:"""

    # text="{text}" re-inserts the placeholder so it survives this format call.
    return ans.format(key_item=few_shots["Key Items"], definition=few_shots["Definitions"], text="{text}")

In [11]:
def generate_prompts(few_shots: pd.DataFrame) -> List[str]:
    """Turn every row of the few-shot table into a prompt template.

    Only the columns "Key Items", "Example" and "Definitions" are handed to
    ``few_shots_to_prompt``; the prompts come back in row order.
    """
    prompt_inputs = few_shots[["Key Items", "Example", "Definitions"]]
    return [few_shots_to_prompt(row) for _, row in prompt_inputs.iterrows()]


# One prompt template per row of few_shots, in row order.
short_instructions = generate_prompts(few_shots)

In [12]:
# Print every generated prompt followed by blank lines as a visual separator.
for instruction_text in short_instructions:
    print(instruction_text, "\n\n", sep="\n")

In [13]:
def save_instructions(instructions: List[str], instructions_path: Path) -> None:
    """Write the prompts next to the instructions CSV as a timestamped copy.

    The CSV at ``instructions_path`` is read, its "Instructions" column is set
    to ``instructions``, and the result is saved in the same folder as
    ``claude-short-instructions_<YYYY-MM-DD_HH-MM-SS>.csv`` without the index.
    The source file itself is not modified.
    """
    instructions_table = pd.read_csv(instructions_path)
    instructions_table["Instructions"] = instructions

    timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    output_path = instructions_path.parent / f"claude-short-instructions_{timestamp}.csv"
    instructions_table.to_csv(output_path, index=False)

# Writes a new timestamped CSV next to the instructions file on every run.
save_instructions(short_instructions, instructions_path)

In [13]:
# Inspection only: the folder the timestamped instructions CSV was written to.
instructions_path.parent