In [None]:
! pip install google-cloud-aiplatform langchain pandas datasets google-api-python-client chromadb faiss-cpu faiss-cpu transformers config google-cloud-documentai tiktoken pypdf2 --upgrade

In [None]:
! sudo mkdir /share
! sudo mount -t cifs -o username=[username],password=[password],vers=3.0 [UNC PATH TO SHARE] /share

In [148]:
#
# Forgive me Guido
# 
# This notebook is a demo of a simple Q&A fueled by PaLM and enriched with context through embeddings.
# 
# The embeddings are created by processing 3K+ old video game manual PDFs with DocAI and feeding them to Vertex AI embeddings.
# This specific version adds a touch of real life - the manuals are situated on an MS Windows file share (on a VM in the same VPC)
# and are read-only, and the files are not copied or stored anywhere else but in memory. Sad but true.
# 
# 
#
# Utils
import io
import math
import time
from os import listdir
from typing import List
from typing import Optional

import numpy as np

# Langchain
import langchain
from pydantic import BaseModel
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

#PyPDF2
from PyPDF2 import PdfReader, PdfWriter

# Vertex AI
from google.cloud import aiplatform
from langchain.chat_models import ChatVertexAI
from langchain.embeddings import VertexAIEmbeddings
from langchain.llms import VertexAI
from langchain.schema import HumanMessage, SystemMessage

# DocAI
from google.api_core.client_options import ClientOptions
import google.cloud.documentai_v1beta3 as documentai

# Variables - obviously, replace these with your own!

# DocAI
# NOTE(review): `processor_version_id` and `field_mask` are defined here, but the ingest
# loop in this notebook calls process_document() without passing them - confirm intent.
project_number = 'PROJECT_NUMBER' # project number, not id
location = 'eu' # Format is 'us' or 'eu'
processor_id = 'fe9efa246ac573db' #  Create the processor before running
processor_version_id = 'pretrained-ocr-v1.0-2020-09-23' # Processor version to use
mime_type = 'application/pdf' # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types
field_mask = "text"  # Optional. The fields to return in the Document object.

# Response Processing
max_pages = 12 # DocAI in online mode has two limits, 15 pages and 20MB; 12 was picked because it stayed consistently under both
docs = [] # Module-level list that collects the text returned by DocAI (plus path strings); appended to by the ingest loop
dir = 'a' # Directory that contains the files. NOTE(review): this name shadows the builtin `dir`

In [None]:
# Stolen from: https://github.com/GoogleCloudPlatform/generative-ai/blob/main/language/examples/langchain-intro/intro_langchain_palm_api.ipynb
#
# Utility functions for Embeddings API with rate limiting
def rate_limit(max_per_minute):
    """Generator that paces its caller to at most `max_per_minute` steps per minute.

    Each `next()` call returns None. When the generator is resumed it measures
    how long the caller took since the previous yield and, if that was shorter
    than 60 / max_per_minute seconds, prints a "." and sleeps for the remainder.
    "Waiting" is printed once, on the first `next()` call.
    """
    min_interval = 60 / max_per_minute
    print("Waiting")
    while True:
        yielded_at = time.time()
        yield
        # Time still owed to reach the minimum interval; zero or negative means no wait.
        remaining = min_interval - (time.time() - yielded_at)
        if remaining > 0:
            print(".", end="")
            time.sleep(remaining)


class CustomVertexAIEmbeddings(VertexAIEmbeddings, BaseModel):
    """VertexAIEmbeddings variant that embeds documents in small, rate-limited batches."""

    # Pace handed to rate_limit() for the embedding requests.
    requests_per_minute: int
    # Number of texts sent to the embeddings client per request.
    num_instances_per_batch: int

    def embed_documents(self, texts: List[str]):
        """Embed `texts` batch by batch and return one vector (`.values`) per text.

        The texts are sent `num_instances_per_batch` at a time (the original note
        says the API accepts at most 5 documents per request), and the rate
        limiter is advanced after every request.
        """
        throttle = rate_limit(self.requests_per_minute)
        pending = list(texts)
        embedded = []

        while pending:
            batch = pending[: self.num_instances_per_batch]
            pending = pending[self.num_instances_per_batch :]
            embedded.extend(self.client.get_embeddings(batch))
            next(throttle)

        return [item.values for item in embedded]

In [None]:
# Stolen from: https://github.com/GoogleCloudPlatform/generative-ai/blob/main/language/examples/langchain-intro/intro_langchain_palm_api.ipynb
# 
#  I increased the temperature a little; I haven't experimented with top-k and top-p too much
#
# LLM model
# PaLM text model used to synthesize answers.
# No `safety_setting` is passed: the previous value was built from `glm` and `safety_types`,
# names this notebook never imports (and a misspelled `HARM_CATEGORY_VIOLANT`), so the
# cell failed with NameError before the model was ever created.
llm = VertexAI(
    model_name="text-bison@001",
    max_output_tokens=256,
    temperature=0.5,
    top_p=0.8,
    top_k=40,
    verbose=True,
)

# Embedding client: paced at EMBEDDING_QPM requests per minute,
# sending EMBEDDING_NUM_BATCH texts per request.
EMBEDDING_NUM_BATCH = 5
EMBEDDING_QPM = 60
embeddings = CustomVertexAIEmbeddings(
    num_instances_per_batch=EMBEDDING_NUM_BATCH,
    requests_per_minute=EMBEDDING_QPM,
    # Raised from the default by the author.
    # NOTE(review): unclear whether this setting has any effect on embeddings - confirm.
    max_output_tokens=1024,
)

In [151]:
# Straight out of our docs but I changed it a bit to allow for binary input (instead of a file)
#
# 
def process_document(
    response_byte_stream: bytes,
    project_number: str,
    location: str,
    processor_id: str,
    mime_type: str,
    field_mask: Optional[str] = None,
    processor_version_id: Optional[str] = None,
) -> str:
    """Send an in-memory document to a Document AI processor and return its text.

    Args:
        response_byte_stream: Raw bytes of the document (passed as RawDocument content).
        project_number: Project number used to build the processor resource name.
        location: Processor location; also used to build the API endpoint.
        processor_id: ID of the processor to call.
        mime_type: MIME type of the document bytes.
        field_mask: Optional field mask forwarded to the ProcessRequest.
        processor_version_id: Optional processor version; when falsy the
            processor itself is addressed instead of a specific version.

    Returns:
        The document text with every run of two or more spaces collapsed
        to a single space.
    """
    import re  # local import keeps this block self-contained

    # You must set the `api_endpoint` if you use a location other than "us".
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    if processor_version_id:
        # projects/{project_number}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}
        name = client.processor_version_path(
            project_number, location, processor_id, processor_version_id
        )
    else:
        # projects/{project_number}/locations/{location}/processors/{processor_id}
        name = client.processor_path(project_number, location, processor_id)

    # Wrap the binary payload and build the request.
    raw_document = documentai.RawDocument(
        content=response_byte_stream, mime_type=mime_type
    )
    request = documentai.ProcessRequest(
        name=name, raw_document=raw_document, field_mask=field_mask
    )

    result = client.process_document(request=request)

    # For a full list of `Document` object attributes, reference this page:
    # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
    document = result.document

    # Ad hoc clean-up: collapse every run of spaces, however long, to one space.
    # Two chained replace('  ', ' ') calls left runs of five or more spaces behind.
    return re.sub(r" {2,}", " ", document.text)

In [153]:
# If you are asking yourself - shouldn't this be a function? It probably should! Or combined with the process_document function.
# On the other hand, I'm not using it anywhere else and this just a demo. 
#
# Basically, what I am doing here is listing the documents in a share (which is mounted from a Windows host) and loading them as bytes.
# I use PyPDF2 to count the pages and put in some logic to create batches because we can't go over 15 pages.
# I was too lazy to also check for file size - if the PDF is under 12 pages it gets sent to DocAI immediately, else the PDF is split.
# I use a list to store all the text strings returned by DocAI.
#
# Maybe it's the old-school metadata fanboy in me, but I also store the full_path of the file and the directory name in the docs list.
#
# Processing like this is not really fast. It's actually quite slow. This probably belongs in a data processing pipeline of sorts
# or could be changed into parallel processes, but I don't know how to do that yet in Python.
#
# OCR every file in `dir` with DocAI and collect the returned text in `docs`.
# The directory name and each file's path are appended to `docs` as well (kept as metadata).
# NOTE(review): process_document() is called without `field_mask` / `processor_version_id`,
# so the values configured at the top of the notebook are not used - confirm intent.
list_dir = listdir(dir)
docs.append(dir)
for file_name in list_dir:
    full_path = dir + '/' + file_name
    # Read the whole file into memory; nothing is written to local disk.
    with open(full_path, "rb") as fh:
        bytes_stream = io.BytesIO(fh.read())
    reader = PdfReader(bytes_stream)
    page_number = len(reader.pages)
    if page_number >= max_pages:
        # `reader.pages` is 0-indexed, so the indices must run from 0 to page_number - 1.
        # Starting at 1 silently dropped the first page of every split PDF.
        page_array = np.arange(page_number)
        # floor(n / max_pages) + 1 near-equal batches keeps every batch at or below max_pages.
        batches = math.floor(page_number / max_pages) + 1
        batch_array = np.array_split(page_array, batches)
        for batch in batch_array:
            # Build an in-memory PDF holding only this batch's pages.
            pdf_writer = PdfWriter()
            for page_num in batch:
                pdf_writer.add_page(reader.pages[page_num.item()])
            response_bytes_stream = io.BytesIO()
            pdf_writer.write(response_bytes_stream)
            result = process_document(response_bytes_stream.getvalue(), project_number, location, processor_id, mime_type)
            docs.append(result)
    else:
        # Small enough to send in a single request.
        result = process_document(bytes_stream.getvalue(), project_number, location, processor_id, mime_type)
        docs.append(result)
    docs.append(full_path)

In [154]:
# The max token size for outputs from embeddings is 1024, same as the max input token size for Palm.
# That leaves no room for a prompt, so I'm using the recursive textsplitter to make smaller chunks. 
# Might be interesting to see the results with even smaller chunks
#
# Split every string collected in `docs` into chunks of 300 tokens
# with a 50-token overlap, measured with the tiktoken encoder.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=50,
    chunk_size=300,
)
documents = text_splitter.create_documents(docs)

In [159]:
# Store docs in local vectorstore as index
# it may take a while since API is rate limited
# Also found this somewhere, added persistence for the db

# Embed the chunks produced by the text splitter (`documents`) and store them in a
# Chroma index persisted under "index". The name `texts` used before was never
# defined in this notebook, so the cell failed on a fresh kernel.
db = Chroma.from_documents(documents, embeddings, persist_directory="index")
db.persist()

Waiting
......................................

In [156]:
# Passing k as a search argument gives us some room to experiment with what works best when using embeddings.
#
#
# Similarity-search retriever over the index; k=1 is passed as the search argument.
retriever = db.as_retriever(
    search_kwargs={"k": 1},
    search_type="similarity",
)

In [157]:
# Uses LLM to synthesize results from the search index.
# We use Vertex PaLM Text API for LLM
# Create three query types to be able to test the differences
#

# "stuff" chain; the only one configured to also return the source documents.
qa1 = RetrievalQA.from_chain_type(
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
    llm=llm,
)
# "map_reduce" chain over the same retriever and model.
qa2 = RetrievalQA.from_chain_type(
    retriever=retriever,
    chain_type="map_reduce",
    llm=llm,
)
# "refine" chain over the same retriever and model.
qa3 = RetrievalQA.from_chain_type(
    retriever=retriever,
    chain_type="refine",
    llm=llm,
)

In [158]:
# I'm sure I haven't mastered the art of prompt engineering just yet, but I like this prompt for now. I only replace the question
# at the end and pick qa1/2/3 
# Persona preamble followed by the actual question; swap qa1 for qa2/qa3 to compare chains.
query = "You are Mr. Robot. You know everything about video games.Argos no Juujiken (Japan)?"

result = qa1(dict(query=query))
print(result)

{'query': 'You are Mr. Robot. You know everything about video games.Argos no Juujiken (Japan)?', 'result': 'Argos no Juujiken (Japan) is the Japanese name for Anticipation.', 'source_documents': [Document(page_content='EmuMovies\nNintendo\nNINTENDO OF AMERICA INC. PO BOX 957 REDMOND) WA 98073-0957 USA\nPRINTED IN JAPAN\nNES-AP USA\nAnticipation\n●\nA B C D E F G H I J K L M N O P Q R S T U V W X Y Z\nINSTRUCTION BOOKLET\nLook for this seal on all software and accessories\nfor your Nintendo Entertainment System. It repre-\nsents Nintendo\'s commitment to bringing you only\nthe highest quality products. Items not carrying\nthis seal have not been approved by\nNintendo, and are not guaranteed\nto meet our standards of\nexcellence in workmanship,\nreliability and most of all,\nentertainment value.\n20\nTHIS SEAL IS\nYOUR ASSURANCE THAT\nCONFE\nNintendo\nHMS APPROVED AND\nGUARANTEED THE\nQUALITY OF THIS\nPRODUCT.\nThank you for selecting the Nintendo Entertainment System ANTICIPATION™* Pak.