In [12]:
import uuid
from typing import Any, List

import pandas as pd
from langchain.retrievers import EnsembleRetriever
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryByteStore
from langchain.text_splitter import CharacterTextSplitter, TextSplitter
from langchain_community.retrievers import TFIDFRetriever
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import BaseRetriever
from langchain_google_vertexai import VertexAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

from src.preprocessing.doc_ai.file_sequence import FileSequence

import os

# Export the Doc AI location and the GCP project number as process environment
# variables (presumably read by the src.* pipeline code — TODO confirm).
os.environ.update(
    {
        "DOC_AI_LOCATION": "us",
        "PROJECT_ID": "602280418311",
    }
)

In [13]:
import os

# Re-applies the same two environment settings as the cell above.
for _env_name, _env_value in (
    ("DOC_AI_LOCATION", "us"),
    ("PROJECT_ID", "602280418311"),
):
    os.environ[_env_name] = _env_value


from src.pipelines.term_extraction.pipeline_config import (
    EPCPipelineConfig,
    InterconnectionAgreementPipelineConfig,
    OMPipelineConfig,
    PPAPipelineConfig,
    SiteLeasePipelineConfig,
)

In [14]:
# Interconnection-agreement PDFs whose project previews are loaded further down.
FILE_NAMES = [
    "Blue Sky.Interconnection Agreement.Felicita Town Centre.pdf",
    "GLD.Interconnection Agreement.Canton.pdf",
    "Interconnection Agreement - Bullrock - Lakeville.pdf",
    "Interconnection Agreement - Emerald Green - Cape Fear.pdf",
    "Interconnection Agreement - Emerald Green - Mt Kimble BLDG A.pdf",
    "Interconnection Agreement - Neighborhood Power - My Hope.pdf",
    "Interconnection Agreement - Novel - Bartel.pdf",
    "Interconnection Agreement - Shine - DuQuoin.pdf",
    "Interconnection Agreement - SunRaise - Pequawket Trail Baldwin.pdf",
    "Sunraise.Interconnection Agreement.Enterprise.pdf",
]

In [15]:
from src.pipelines.term_extraction.pipeline_config import InterconnectionAgreementPipelineConfig
from src.pipelines.term_extraction.utils import get_project_preview

pipeline_config = InterconnectionAgreementPipelineConfig()

# Collect, per file, the "Key Items" / "Legal Terms" columns of its project preview,
# tagged with the file name, then stack everything into a single frame.
preview_frames = []
for file_name in FILE_NAMES:
    print(f"File: {file_name}")

    correct_project_preview = get_project_preview(
        pipeline_config.get_project_previews_path(),
        file_name,
    )

    file_terms = correct_project_preview[["Key Items", "Legal Terms"]]
    preview_frames.append(file_terms.assign(file_name=file_name))

# pd.concat keeps each preview's own index, so index labels may repeat across files.
legal_terms = pd.concat(preview_frames)

In [16]:
# Names of the terms to generate instructions for.
terms_and_definitions = pipeline_config.get_terms_and_definitions()
key_items = terms_and_definitions["Key Items"]

# Definitions table, read from the pipeline's "terms-definitions.csv".
definitions_csv_path = pipeline_config.get_path("terms-definitions.csv")
definitions = pd.read_csv(definitions_csv_path)

In [23]:
# for key_item in key_items:
#     samples = legal_terms[legal_terms["Key Items"] == key_item]["Legal Terms"].to_list()
#     samples = [sample for sample in samples if sample != "Not provided."]
#     print(key_item)
    
#     print(definitions[definitions["Key Items"] == key_item]["Definitions"].iloc[-1])
#     print()
#     for sample in samples:
#         print(sample)
#         print()
#     print("-"*20)

In [24]:
import vertexai
from vertexai.preview.generative_models import GenerativeModel, Image, GenerationConfig
 
# Read the GCP project from the PROJECT_ID environment variable that the first
# cells of this notebook set, falling back to the same literal, so the project is
# not hard-coded independently in yet another place.
PROJECT_ID = os.environ.get("PROJECT_ID", "602280418311")
REGION = "us-central1"
vertexai.init(project=PROJECT_ID, location=REGION)

# Module-level model handle; generate_prompt() calls model.generate_content().
model = GenerativeModel("gemini-1.5-pro-preview-0409")


def generate_prompt(term, definition, examples, max_output_tokens=8000, verbose=True):
    """Ask the module-level ``model`` for instructions on how to locate a term.

    Builds a prompt from the term name, its definition and the example excerpts,
    sends it through ``model.generate_content`` and returns the text of the first
    part of the first candidate in the response.

    Args:
        term: Name of the term, inserted after ``Term:`` in the prompt.
        definition: Description of the term, inserted after ``Definition:``.
        examples: Iterable of example strings; they are joined with a blank line
            between them. An empty iterable yields an empty examples section.
        max_output_tokens: Value handed to ``GenerationConfig``. Defaults to 8000,
            the limit that used to be hard-coded.
        verbose: When true (the default) the prompt and the answer are printed,
            matching the previous unconditional behaviour.

    Returns:
        The generated instruction text.

    Raises:
        ValueError: If the response has no candidates, or its first candidate has
            no content parts (previously surfaced as a bare ``IndexError``).
    """
    examples_text = "\n\n".join(examples)

    # The prompt wording (including its spacing) is sent to the model verbatim,
    # so it is kept exactly as it was.
    prompt = (
        "Using term name, description and examples provided generate for me set of "
        "instructions of how to find and provide these examples of text from "
        "interconnection agreement  document of solar energy due diligence company. "
        "Describe list of keywords, patterns, sections numbers that I should look for. "
        "Be as precise as possible. No additional tips.\n"
        f"Term: {term}\n"
        f"Definition: {definition}\n"
        "Examples:\n"
        "\n"
        f"{examples_text}\n"
        "    \n"
        "Instructions:"
    )

    if verbose:
        print(prompt)

    response = model.generate_content(
        prompt,
        generation_config=GenerationConfig(max_output_tokens=max_output_tokens),
    )

    # Fail with an explicit message instead of an opaque IndexError when the
    # response carries nothing to read.
    candidates = response.candidates
    if not candidates:
        raise ValueError(f"Model returned no candidates for term {term!r}")
    parts = candidates[0].content.parts
    if not parts:
        raise ValueError(f"Model returned a candidate without content parts for term {term!r}")

    ans = parts[0].text
    if verbose:
        print(ans)

    return ans

In [34]:
import time

# One generated instruction text per key item, in the same order as key_items.
answers = []
for key_item in key_items:
    # Example excerpts for this item, without the "Not provided." placeholder.
    item_examples = legal_terms.loc[legal_terms["Key Items"] == key_item, "Legal Terms"]
    samples = [
        sample_text
        for sample_text in item_examples.to_list()
        if sample_text != "Not provided."
    ]
    print(key_item)

    # When several definition rows match, the last one is used.
    item_definitions = definitions.loc[definitions["Key Items"] == key_item, "Definitions"]
    definition = item_definitions.iloc[-1]

    answer = generate_prompt(key_item, definition, samples)
    print("Answer:")

    print(answer)
    answers.append(answer)
    print("--" * 10)
    # Fixed 12 s pause after every call — presumably request throttling; TODO confirm.
    time.sleep(12)

In [35]:
#A

In [36]:
# Pair every key item with its generated instructions and write the table to a
# CSV named after the pipeline (the default index column is written as well).
instructions_table = pd.DataFrame({"Key Items": key_items, "Instructions": answers})
instructions_table.to_csv(f"terms-instructions-{pipeline_config.pipeline_name}.csv")

In [None]:
# NOTE(review): `config` is not assigned in any of the cells visible above; this cell
# and the next one presumably rely on an object left in the kernel from an earlier
# session (possibly `pipeline_config`) — TODO confirm and define it explicitly.
config

In [3]:
from src.preprocessing.doc_ai.processor import DocAIProcessor
# Doc AI processor configured from the location / project / processor ids on `config`.
# NOTE(review): `config` is not defined in the visible cells — confirm its origin.
processor_settings = {
    "location": config.processor_location,
    "project_id": config.processor_project_id,
    "processor_id": config.processor_id,
}
processor = DocAIProcessor(**processor_settings)

# Work on the second entry of the configured file list.
file_name = config.file_names[1]

from src.preprocessing.doc_ai.file_sequence import FileSequence
# The path is formed by plain string concatenation, so get_documents_path()
# presumably already ends with a separator — TODO confirm.
document_path = config.get_documents_path() + file_name
file_sequence: FileSequence = processor.process_documents([document_path])

# Full text of the processed document.
text = file_sequence.get_all_text()


# Split on newlines into chunks of up to 5000 characters, overlapping by one third.
chunk_size = 5000
chunk_overlap = chunk_size // 3

text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    keep_separator=True,
)

docs = text_splitter.create_documents([text])

# Dump every chunk, each followed by a divider line, for visual inspection.
chunk_divider = "----------------------------------------------"
for doc in docs:
    print(doc.page_content, chunk_divider, sep="\n")