In [None]:
# Installation & Authentication
# Install LangChain, Vertex AI LLM SDK, ChromaDB, and related libraries.

In [9]:
%pip install -q google-cloud-aiplatform==1.36.0 langchain==0.0.327 unstructured chromadb 

Note: you may need to restart the kernel to use updated packages.


In [10]:
# Get Libraries & Classes
# Reference Libraries

# In this section, we will identify all the library classes that will be referenced in the code.

In [11]:
from langchain.embeddings import VertexAIEmbeddings
from langchain.llms import VertexAI
from langchain.document_loaders import GCSDirectoryLoader

# Chroma DB as Vector Store Database
from langchain.vectorstores import Chroma

# Using Vertex AI
import vertexai
from google.cloud import aiplatform

print(f"Vertex AI SDK version: {aiplatform.__version__}")

2023-11-12 15:06:15.736875: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-12 15:06:16.675626: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2023-11-12 15:06:16.675778: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/loca

Vertex AI SDK version: 1.36.0


In [12]:
# Initialize Vertex AI
# We will need a project id and location where the Vertex compute and embedding will be hosted

In [13]:
PROJECT_ID = "genai-387907"  # @param {type:"string"}

LOCATION = "us-central1"  # @param {type:"string"}

# Initialize Vertex AI SDK
vertexai.init(project=PROJECT_ID, location=LOCATION)

In [None]:
# Ingest the Contracts to build the context for the LLM
# Load all the Procurement Contract Documents

In [14]:
loader = GCSDirectoryLoader(
    project_name=PROJECT_ID, bucket="contractunderstandingatticusdataset"
)
documents = loader.load()

In [15]:
# split the documents into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
docs = text_splitter.split_documents(documents)
print(f"# of documents = {len(docs)}")

# of documents = 2149


In [16]:
# Structuring the ingested documents in a vector space using a Vector Database
# Create an embedding vector engine for all the text in the contract documents that have been ingested

In [17]:
# Define Text Embeddings model
embedding = VertexAIEmbeddings()


embedding

VertexAIEmbeddings(project=None, location='us-central1', request_parallelism=5, max_retries=6, stop=None, model_name='textembedding-gecko', client=<vertexai.preview.language_models._PreviewTextEmbeddingModel object at 0x7f8fedeeb970>, temperature=0.0, max_output_tokens=128, top_p=0.95, top_k=40, credentials=None, n=1, streaming=False)

In [18]:
contracts_vector_db = Chroma.from_documents(docs, embedding)

In [19]:
# Obtain handle to the retriever
# We will use the native retriever provided by Chroma DB to perform similarity search within the contracts document vector store among the different document chunks so as to return that document chunk which has the lowest vectoral "distance" with the incoming user query.

In [20]:
# Expose index to the retriever
retriever = contracts_vector_db.as_retriever(
    search_type="similarity", search_kwargs={"k": 2}
)

In [21]:
# Define a Retrieval QA Chain to use retriever

In [22]:
# Create chain to answer questions
from langchain.chains import RetrievalQA

llm = VertexAI(
    model_name="text-bison-32k",
    max_output_tokens=256,
    temperature=0.1,
    top_p=0.8,
    top_k=40,
    verbose=True,
)

# Uses LLM to synthesize results from the search index.
# We use Vertex PaLM Text API for LLM
qa = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True
)

In [23]:
# Leverage LLM to search from retriever

In [24]:
query = "Who all entered into agreement with Sagebrush?"
result = qa({"query": query})
print(result)

{'query': 'Who all entered into agreement with Sagebrush?', 'result': ' The Partnership, all of the Partners except Alpha Mariah (Prime), Inc. and Beta Mariah (Prime) Inc., and Manager', 'source_documents': [Document(page_content='Each party cooperated and participated in the drafting and preparation of this Agreement and the documents referred to herein, and any and all drafts relating thereto exchanged among the parties shall be deemed the work product of all of the parties and may not be construed against any party by reason of its drafting or preparation. Accordingly, any rule of law or any legal decision that would require interpretation of any ambiguities in this Agreement against any party that drafted or prepared it is of no application and is hereby expressly waived by each of the parties hereto, and any controversy over interpretations of this Agreement shall be decided without regards to events of drafting or preparation.\n\n[Signature Pages Follow]   7\n\nIN WITNESS WHEREOF

In [25]:
## Build a Front End
#Enable a simple front end so users can query against contract documents and obtain intelligent answers with grounding information that references the base documents that was used to respond to user query

In [26]:
%pip install -q gradio

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ydata-profiling 4.6.0 requires pydantic<2,>=1.8.1, but you have pydantic 2.4.2 which is incompatible.[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [27]:
from google.cloud import storage
import gradio as gr
import markdown


def chatbot(inputtext):
    result = qa({"query": inputtext})

    return (
        result["result"],
        get_public_url(result["source_documents"][0].metadata["source"]),
        result["source_documents"][0].metadata["source"],
    )


def get_public_url(uri):
    """Returns the public URL for a file in Google Cloud Storage."""
    # Split the URI into its components
    components = uri.split("/")

    # Get the bucket name
    bucket_name = components[2]

    # Get the file name
    file_name = components[3]

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(file_name)
    return blob.public_url


print("Launching Gradio")

iface = gr.Interface(
    fn=chatbot,
    inputs=[gr.Textbox(label="Query")],
    examples=[
        "Who are parties to ADMA agreement",
        "What is the agreement between MICOA & Stratton Cheeseman",
        "What is the commission % that Stratton Cheeseman will get from MICOA and how much will they get if MICOA's revenues are $100",
    ],
    title="Contract Analyst",
    outputs=[
        gr.Textbox(label="Response"),
        gr.Textbox(label="URL"),
        gr.Textbox(label="Cloud Storage URI"),
    ],
    theme=gr.themes.Soft,
)

iface.launch(share=False)

ImportError: cannot import name 'RootModel' from 'pydantic' (/opt/conda/lib/python3.10/site-packages/pydantic/__init__.cpython-310-x86_64-linux-gnu.so)