This demo is a collaboration with Nicolas (x-machina), made for David from https://www.intuitecht.com/
For more details, see: https://docs.google.com/document/d/1hm5J3sREaap3G5HcK1QK50tyBTDkmtpn6TXhZ2mxmo4/edit

In [34]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from vertexai.preview.generative_models import (
    FunctionDeclaration,
    GenerativeModel,
    Tool,
)

from langchain_core.documents.base import Document

from pydantic import BaseModel
from typing import List
import os
from dotenv import load_dotenv
import vertexai

In [35]:
# Load variables from a local .env file (when one is found) into the process
# environment so the os.environ lookups in the settings cell can see them.
load_dotenv()

True

In [36]:
# Runtime settings, each overridable through an environment variable.

# Google Cloud project / region / model used for Vertex AI.
PROJECT_ID = os.getenv("PROJECT_ID", "atbv-sb-datascience")
LOCATION_ID = os.getenv("LOCATION_ID", "northamerica-northeast1")
DEFAULT_MODEL = os.getenv("DEFAULT_MODEL", "gemini-1.5-pro-001")

# Google search credentials; these have no default and are None when unset.
GOOGLE_SEARCH_API = os.getenv("GOOGLE_SEARCH_API")
GOOGLE_CSE_ID = os.getenv("GOOGLE_CSE_ID")

# Logging verbosity name; falls back to "DEBUG".
LOGGING_MODE = os.getenv("LOGGING_MODE", "DEBUG")

In [None]:
# Point the Vertex AI SDK at the configured project and region.
vertexai.init(project=PROJECT_ID, location=LOCATION_ID)

In [15]:
def load_pdf(filepath: str, chunk_size: int = 2000, chunk_overlap: int = 100):
    """Load a PDF and split it into overlapping text chunks.

    Args:
        filepath: Path of the PDF file to read.
        chunk_size: Maximum size of each chunk handed to
            ``RecursiveCharacterTextSplitter`` (default 2000, the value
            previously hard-coded).
        chunk_overlap: Overlap between consecutive chunks (default 100, the
            value previously hard-coded).

    Returns:
        The list of documents produced by ``split_documents`` on the pages
        loaded by ``PyPDFLoader``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    # Images are skipped; only the text of each page is extracted.
    loader = PyPDFLoader(filepath, extract_images=False)
    pages = loader.load()
    return text_splitter.split_documents(pages)

In [17]:
# Chunk the demo PDF once; later cells reuse pdf_docs.
pdf_docs = load_pdf('hydraulic.pdf')

In [32]:
# Sanity check: inspect the type of one chunk (index 9 requires that at least
# ten chunks were produced).
type(pdf_docs[9])

langchain_core.documents.base.Document

In [None]:
def make_prompt(question: str, docs: "List[Document]") -> str:
    """Build the prompt: instructions, the question, then every document.

    Args:
        question: The question the model must answer.
        docs: Reference documents to embed in the prompt. Each item must
            expose ``page_content`` and a ``metadata`` mapping; the page
            number is read from ``metadata["page"]``.

    Returns:
        The full prompt text. With an empty ``docs`` only the instruction
        header is returned.
    """
    sections = [
        f"""You are an expert scientist that processes questions about hydraulic pumps and returns an expert answer. 
    You are tasked with with giving an answer to the following question: 
    {question}.
    You are given the following documents to use as a reference.
    Your answer should ONLY rely on these sources and not on any prior knowledge.
    Here are the documents:
    """
    ]

    # Iterate over the ``docs`` argument (not a notebook global) and actually
    # interpolate each chunk's page number and text into the prompt.
    for doc in docs:
        # NOTE(review): PyPDFLoader page numbers are presumably zero-based -
        # confirm before showing them to users.
        page = doc.metadata.get("page", "unknown")
        sections.append(
            f"""
        page: {page}
        content: {doc.page_content}
        """
        )

    return "".join(sections)

In [25]:
class Function(BaseModel):
    """Pydantic model holding a function's name, description and parameters.

    NOTE(review): no other cell visible here references this model - confirm
    whether it is still needed.
    """

    name: str  # function identifier
    description: str  # human-readable description of the function
    # presumably a JSON-schema-style dict like the one passed to
    # FunctionDeclaration - TODO confirm
    parameters: dict
    # instruction: str | None = None

In [29]:
# Function declaration offered to the model: it should reply by "calling"
# answerQuestion with the answer text and the page it came from.
answer_question_func = FunctionDeclaration(
    name="answerQuestion",
    description="Answers a question with references to relevant pages from a source document",
    parameters={
        "type": "object",
        "properties": {
            "page": {
                "type": "integer",
                "description": "The relevant page in the source document",
            },
            "answer": {
                "type": "string",
                "description": "The answer to the question asked",
            },
        },
        # Must name keys that exist under "properties"; the previous list
        # (title/summary/source) referred to fields this schema never defined.
        "required": ["page", "answer"],
    },
)

In [30]:
# Wrap the declaration in a Tool so it can be passed via `tools=` to
# generate_content.
tool = Tool(function_declarations=[answer_question_func])

In [None]:
# NOTE(review): 'sdfsdfdsf' looks like a placeholder question - replace it
# with a real query before relying on the output.
prompt = make_prompt(question='sdfsdfdsf', docs=pdf_docs)

In [None]:
# Use the configured model name (env var DEFAULT_MODEL, defaulting to
# "gemini-1.5-pro-001") instead of repeating the literal here.
model = GenerativeModel(DEFAULT_MODEL)

In [None]:
# Send the prompt with the answerQuestion tool attached and keep the raw
# response for inspection.
model_response = model.generate_content(
    prompt,
    tools=[tool],
)

### DOING IT WITH RAG

In [None]:
# RAG PORTION

def get_retriever(
    documents,
    embedding,
    session,
    retriver_db,
    search_topk=15,
    supabase_url=None,
    supabase_key=None,
    supabase_chunksize=500,
):
    """Index ``documents`` in a vector store and return a similarity retriever.

    Supabase is used when ``retriver_db == "SUPABASE"``; any other value
    selects Firestore (collection ``ai_studio_documents``). If inserting the
    documents fails, the existing store is opened without writing to it and
    the failure is logged as a warning.

    Args:
        documents: Documents to embed and insert into the store.
        embedding: Embedding object handed to the vector store.
        session: Session identifier. Used for the Firestore
            ``metadata.session_id`` filter and for the ``filter`` search
            kwarg of the returned retriever.
        retriver_db: Backend selector (see above).
        search_topk: Number of results requested per query (``k``).
        supabase_url: URL passed to ``create_client`` (Supabase only).
        supabase_key: Key passed to ``create_client`` (Supabase only).
        supabase_chunksize: ``chunk_size`` passed to
            ``SupabaseVectorStore.from_documents``.

    Returns:
        The retriever returned by the store's ``as_retriever``.

    Raises:
        Whatever ``create_client`` raises when the Supabase client cannot be
        created, and whatever the fallback store constructor raises.
    """
    import logging  # local import: keeps this block self-contained

    logger = logging.getLogger(__name__)

    if retriver_db != "SUPABASE":
        ## Joseph 20240506: Change default vector storage to firestore
        ## (previously Chroma).
        ## TODO: Need create index before deployment:
        # gcloud alpha firestore indexes composite create \
        # --project=atbv-sb-datascience \
        # --collection-group=ai_studio_documents \
        # --query-scope=COLLECTION \
        # --field-config field-path=embedding,vector-config='{"dimension":"768", "flat": "{}"}' \
        # --database=test
        session_filter = FieldFilter("metadata.session_id", "==", f"{session}")
        try:
            contracts_vector_db = FirestoreVectorStore.from_documents(
                client=FIRESTORE_CLIENT,
                collection="ai_studio_documents",
                documents=documents,
                embedding=embedding,
                filters=session_filter,
            )
        except Exception:
            # Best-effort fallback: query whatever is already stored.
            logger.warning(
                "Inserting documents into Firestore failed; "
                "falling back to the existing collection.",
                exc_info=True,
            )
            contracts_vector_db = FirestoreVectorStore(
                client=FIRESTORE_CLIENT,
                collection="ai_studio_documents",
                embedding=embedding,
                filters=session_filter,
            )
    else:
        # Create the client outside the try block: the fallback below needs
        # it, and a failure here should surface as the real error rather than
        # an unbound-variable error inside the except handler.
        supabase: Client = create_client(supabase_url, supabase_key)
        try:
            contracts_vector_db = SupabaseVectorStore.from_documents(
                documents,
                embedding,
                client=supabase,
                table_name="documents",
                query_name="match_documents",
                chunk_size=supabase_chunksize,
            )
        except Exception:
            # Best-effort fallback: query whatever is already stored.
            logger.warning(
                "Inserting documents into Supabase failed; "
                "falling back to the existing table.",
                exc_info=True,
            )
            contracts_vector_db = SupabaseVectorStore(
                client=supabase,
                embedding=embedding,
                table_name="documents",
                query_name="match_documents",
            )

    return contracts_vector_db.as_retriever(
        search_type="similarity",
        search_kwargs={
            "k": search_topk,
            "filter": {"session_id": session},
        },  # Filter search result by session id
    )



def embed_retriver(
    documents,
    retriver_db,
    session,
    supabase_url=None,
    supabase_key=None,
    embedding_model="textembedding-gecko@latest",
    search_topk=15,
):
    """Create a Vertex AI embedding model and build a retriever with it.

    Args:
        documents: Documents forwarded to ``get_retriever``.
        retriver_db: Backend selector forwarded to ``get_retriever``.
        session: Session identifier forwarded to ``get_retriever``.
        supabase_url: Forwarded to ``get_retriever``.
        supabase_key: Forwarded to ``get_retriever``.
        embedding_model: Model name given to ``VertexAIEmbeddings``. For
            ``textembedding-gecko`` the API accepts a maximum of 3,072 input
            tokens and outputs 768-dimensional vector embeddings.
        search_topk: Number of results per query, forwarded to
            ``get_retriever``.

    Returns:
        The retriever returned by ``get_retriever``.

    Raises:
        Any exception from ``VertexAIEmbeddings`` or ``get_retriever``
        propagates unchanged; the former ``try/except`` only re-raised it.
    """
    embedding = VertexAIEmbeddings(model_name=embedding_model)
    return get_retriever(
        documents=documents,
        embedding=embedding,
        session=session,
        retriver_db=retriver_db,
        search_topk=search_topk,
        supabase_url=supabase_url,
        supabase_key=supabase_key,
    )
    
    
def invoke_retriver(llm, retriever, question):
    """Answer ``question`` through a history-aware retrieval chain.

    Two chat prompts are built from ``prompt_template`` (one to contextualize
    the question, one to answer it), wired into a retrieval chain around
    ``retriever``, and the chain is invoked once.

    Args:
        llm: Language model used for both contextualizing and answering.
        retriever: Retriever supplying the documents.
        question: The user question, passed as ``input``.

    Returns:
        The result of ``rag_chain.invoke``.
    """

    def _chat_prompt(system_text):
        # Both prompts share the same layout: system text, history, user input.
        return ChatPromptTemplate.from_messages(
            [
                ("system", system_text),
                MessagesPlaceholder("chat_history"),
                ("human", "{input}"),
            ]
        )

    rephrase_prompt = _chat_prompt(prompt_template.contextualize_q_system_prompt)
    retriever_with_history = create_history_aware_retriever(
        llm, retriever, rephrase_prompt
    )

    answer_prompt = _chat_prompt(prompt_template.qa_system_template)
    answer_chain = create_stuff_documents_chain(llm, answer_prompt)

    chain = create_retrieval_chain(retriever_with_history, answer_chain)

    ## Joseph 20240521: Chat history not being populated in this version to
    ## minimize hallucination, so an empty history is always sent.
    return chain.invoke({"input": question, "chat_history": []})
