In [1]:
import uuid
from typing import Any, List

import pandas as pd
from langchain.retrievers import EnsembleRetriever
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryByteStore
from langchain.text_splitter import CharacterTextSplitter, TextSplitter
from langchain_community.retrievers import TFIDFRetriever
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import BaseRetriever
from langchain_google_vertexai import VertexAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

from src.preprocessing.doc_ai.file_sequence import FileSequence

import os
# Export the Document AI location and project id as environment variables
# (presumably read by the `src.*` modules — confirm against their settings code).
os.environ.update(
    {
        "DOC_AI_LOCATION": "us",
        "PROJECT_ID": "602280418311",
    }
)

In [2]:
import os
# Same two variables as the first cell, set again ahead of the pipeline_config import
# (presumably that module reads them at import time — confirm).
os.environ.update(DOC_AI_LOCATION="us", PROJECT_ID="602280418311")


from src.pipelines.term_extraction.pipeline_config import (
    EPCPipelineConfig,
    InterconnectionAgreementPipelineConfig,
    OMPipelineConfig,
    PPAPipelineConfig,
    SiteLeasePipelineConfig,
)

In [5]:
from src.pipelines.term_extraction.pipeline_config import SiteLeasePipelineConfig
from src.pipelines.term_extraction.utils import get_project_preview

# Site-lease pipeline configuration built with default arguments; a later cell reads
# `pipeline_config.pipeline_name` to name the exported CSV file.
pipeline_config = SiteLeasePipelineConfig()



In [6]:
import vertexai
from vertexai.preview.generative_models import GenerativeModel, Image, GenerationConfig
 
# Point the Vertex AI SDK at the project/region, then create the Gemini model handle.
PROJECT_ID, REGION = "602280418311", "us-central1"
vertexai.init(
    project=PROJECT_ID,
    location=REGION,
)

model = GenerativeModel("gemini-1.5-pro-preview-0409")

In [8]:
from typing import Any, Dict, Iterator, List, Mapping, Optional

from langchain_core.callbacks.manager import CallbackManagerForLLMRun
from langchain_core.language_models.llms import LLM
from langchain_core.outputs import GenerationChunk

import vertexai
from vertexai.preview.generative_models import GenerativeModel, Image, GenerationConfig
 
# NOTE(review): repeats the Vertex AI initialisation from the earlier cell with the same
# project, region and model name; kept so this cell also runs on its own.
PROJECT_ID, REGION = "602280418311", "us-central1"
vertexai.init(
    project=PROJECT_ID,
    location=REGION,
)

model = GenerativeModel("gemini-1.5-pro-preview-0409")


class CustomLLM(LLM):
    """LangChain ``LLM`` wrapper around a Vertex AI ``GenerativeModel``.

    Every prompt is forwarded to ``self.model.generate_content`` and the text of the
    first part of the first candidate is returned as the completion. Stop sequences
    are not supported: passing ``stop`` raises ``ValueError``.

    Example:

        .. code-block:: python

            llm = CustomLLM(model=GenerativeModel("gemini-1.5-pro-preview-0409"))
            result = llm.invoke("hello")
            results = llm.batch(["hello", "world"])
    """

    # Vertex AI model that produces the completions.
    model: GenerativeModel
    # Upper bound on generated tokens, sent with every request via GenerationConfig.
    max_output_tokens: int = 8000

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Generate a completion for ``prompt`` with the wrapped model.

        Args:
            prompt: The prompt to generate from.
            stop: Must be ``None``; stop sequences are not supported.
            run_manager: Callback manager for the run (unused here).
            **kwargs: Accepted for interface compatibility and ignored.

        Returns:
            The text of the first part of the first candidate in the response.

        Raises:
            ValueError: If ``stop`` is provided.
        """
        if stop is not None:
            raise ValueError("stop kwargs are not permitted.")

        # Use the instance's own model rather than whatever global `model` happens to
        # exist in the notebook, so each CustomLLM talks to the model it was built with.
        response = self.model.generate_content(
            prompt,
            generation_config=GenerationConfig(max_output_tokens=self.max_output_tokens),
        )
        return response.candidates[0].content.parts[0].text

    def _stream(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> Iterator[GenerationChunk]:
        """Stream a completion for ``prompt`` from the wrapped model.

        Requests a streaming response (``stream=True``) and yields one
        ``GenerationChunk`` per partial response that carries text.

        Args:
            prompt: The prompt to generate from.
            stop: Must be ``None``; stop sequences are not supported.
            run_manager: Callback manager notified of every new chunk.
            **kwargs: Accepted for interface compatibility and ignored.

        Yields:
            ``GenerationChunk`` objects holding successive pieces of model output.

        Raises:
            ValueError: If ``stop`` is provided (consistent with ``_call``).
        """
        if stop is not None:
            raise ValueError("stop kwargs are not permitted.")

        responses = self.model.generate_content(
            prompt,
            generation_config=GenerationConfig(max_output_tokens=self.max_output_tokens),
            stream=True,
        )
        for response in responses:
            candidates = response.candidates
            # Skip partial responses that carry no content parts, so indexing below
            # cannot fail on an empty chunk.
            if not candidates or not candidates[0].content.parts:
                continue

            chunk = GenerationChunk(text=candidates[0].content.parts[0].text)
            if run_manager:
                run_manager.on_llm_new_token(chunk.text, chunk=chunk)

            yield chunk

    @property
    def _identifying_params(self) -> Dict[str, Any]:
        """Return a dictionary of identifying parameters."""
        return {
            # The model name allows users to specify custom token counting
            # rules in LLM monitoring applications (e.g., in LangSmith users
            # can provide per token pricing for their model and monitor
            # costs for the given LLM.)
            "model_name": "CustomChatModel",
        }

    @property
    def _llm_type(self) -> str:
        """Return the type tag of this language model; used for logging purposes only."""
        return "custom"

In [10]:
# Wrap the Vertex AI `model` handle created above in the LangChain-compatible CustomLLM.
llm = CustomLLM(model=model)

In [15]:
# Smoke-test the wrapper on two prompts, one request at a time. Concurrency is capped via
# the RunnableConfig `max_concurrency` key; the previous `max_paralellism` keyword is not a
# recognised option and was silently ignored.
llm.batch(["Hallo", "Good bay"], config={"max_concurrency": 1})

In [9]:
# NOTE(review): `key_items`, `legal_terms` and `generate_prompt` are not defined in the
# cells visible here — confirm they exist before a fresh-kernel run.
answers = []

for key_item in key_items:
    # Legal-term samples recorded for this key item, without the "Not provided." placeholder.
    rows_for_item = legal_terms[legal_terms["Key Items"] == key_item]
    samples = [
        term
        for term in rows_for_item["Legal Terms"].to_list()
        if term != "Not provided."
    ]

    print(key_item)
    answer = generate_prompt(samples)
    print("Answer:", answer, sep="\n")

    answers.append(answer)
    print("--" * 10)

In [10]:
#A

In [11]:
# Write one row per key item with its generated instructions; the file name carries the
# pipeline name taken from `pipeline_config`.
instructions_csv_path = f"terms-instructions-{pipeline_config.pipeline_name}.csv"
instructions_frame = pd.DataFrame(
    {
        "Key Items": key_items,
        "Instructions": answers,
    }
)
instructions_frame.to_csv(instructions_csv_path)

In [None]:
# Display the active configuration object.
# NOTE(review): `config` is not assigned in any cell visible here (only `pipeline_config`
# is) — confirm where it comes from before relying on a fresh-kernel run.
config

In [3]:
from src.preprocessing.doc_ai.file_sequence import FileSequence
from src.preprocessing.doc_ai.processor import DocAIProcessor

# NOTE(review): `config` is not assigned in the cells visible here — confirm its origin.
processor = DocAIProcessor(
    location=config.processor_location,
    project_id=config.processor_project_id,
    processor_id=config.processor_id,
)

# Process only the second configured file and pull out its full text.
file_name = config.file_names[1]
document_paths = [config.get_documents_path() + file_name]
file_sequence: FileSequence = processor.process_documents(document_paths)
text = file_sequence.get_all_text()

# Newline-separated splitting with chunk_size=5000 and an overlap of one third of that.
chunk_size = 5000
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=chunk_size,
    chunk_overlap=chunk_size // 3,
    keep_separator=True,
)
docs = text_splitter.create_documents([text])

# Print every chunk followed by a divider line for visual inspection.
for doc in docs:
    print(doc.page_content, "----------------------------------------------", sep="\n")