In [None]:
# Base system dependencies: the Tesseract OCR engine and its development headers.
!sudo apt -y -qq install tesseract-ocr libtesseract-dev

# Extra system packages for PDF / document text extraction (per the original note:
# required by PyPDF2 for page count and other PDF utilities).
# NOTE(review): `poppler-utils` is listed twice, and the command line appears to be
# cut off in this export — confirm the full package list against the original notebook.
!sudo apt-get -y -qq install poppler-utils python-dev libxml2-dev libxslt1-dev antiword unrtf poppler-utils pstotext tesse

In [None]:
# Install the packages
import os

# USER is "--user" unless the IS_TESTING environment variable is set to a
# non-empty value, in which case it is the empty string.
# NOTE(review): presumably interpolated into the pip command below; that line is
# cut off in this export, so confirm USER is actually used.
if not os.getenv("IS_TESTING"):
    USER = "--user"
else:
    USER = ""
# Install Vertex AI LLM SDK, langchain and dependencies (versions are pinned).
! pip install google-cloud-aiplatform langchain==0.0.229 chromadb==0.3.26 pydantic==1.10.8 typing-inspect==0.8.0 ty

In [None]:
import os
import urllib
import urllib.parse
import warnings
from pathlib import Path as p
from pprint import pprint
from typing import Optional

import pandas as pd
from langchain import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import VertexAIEmbeddings
from langchain.llms import VertexAI
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

In [None]:
# Vertex AI text-generation model and text-embedding model used by the notebook.
vertex_llm_text = VertexAI(model_name="text-bison@001")
vertex_embeddings = VertexAIEmbeddings(model_name="textembedding-gecko@001")

# The cells below refer to these objects as `llm` and `embeddings`, names that are
# otherwise never bound in the visible notebook (NameError on a fresh kernel).
# Bind them here so Restart Kernel -> Run All works.
llm = vertex_llm_text
embeddings = vertex_embeddings

In [None]:
# Straight out of our docs and some online examples. Some examples cheated by reading the results immediately and dumping
# it into some big string in the function but I'm not doing that
#
#
def batch_process_documents(
    project_number: str,
    location: str,
    processor_id: str,
    gcs_input_uri: str,
    gcs_output_uri: str,
    processor_version_id: Optional[str] = None,
    input_mime_type: Optional[str] = None,
    field_mask: Optional[str] = None,
    timeout: int = 9999,
):
    """Submit a Document AI batch-processing request and wait for it to finish.

    Results are written by the service under ``gcs_output_uri``; nothing is
    read back or returned here.

    Args:
        project_number: Project component of the processor resource name.
        location: Processor location; also used to build the API endpoint
            ``{location}-documentai.googleapis.com``.
        processor_id: Processor component of the resource name.
        gcs_input_uri: Either a single object URI (contains a ``.`` and does
            not end with ``/``) or a prefix under which every document is
            processed.
        gcs_output_uri: Cloud Storage URI for the output directory.
        processor_version_id: When given, the request targets that processor
            version instead of the processor itself.
        input_mime_type: MIME type attached to the document when a single
            object URI is given; unused for prefixes.
        field_mask: Passed through to the GCS output configuration.
        timeout: Passed to ``operation.result``; deliberately large because
            many files are processed at once.

    Raises:
        ValueError: If the operation metadata does not report ``SUCCEEDED``.
            ``RetryError`` / ``InternalServerError`` raised while waiting are
            printed, not re-raised, and the state check still runs afterwards.
    """
    # A location-specific endpoint is required for any location other than "us".
    docai_client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(
            api_endpoint=f"{location}-documentai.googleapis.com"
        )
    )

    # Input: one explicit file, or everything under a prefix.
    targets_single_file = "." in gcs_input_uri and not gcs_input_uri.endswith("/")
    if targets_single_file:
        single_document = documentai.GcsDocument(
            gcs_uri=gcs_input_uri, mime_type=input_mime_type
        )
        batch_input = documentai.BatchDocumentsInputConfig(
            gcs_documents=documentai.GcsDocuments(documents=[single_document])
        )
    else:
        batch_input = documentai.BatchDocumentsInputConfig(
            gcs_prefix=documentai.GcsPrefix(gcs_uri_prefix=gcs_input_uri)
        )

    # Output: where the service writes its results.
    batch_output = documentai.DocumentOutputConfig(
        gcs_output_config=documentai.DocumentOutputConfig.GcsOutputConfig(
            gcs_uri=gcs_output_uri, field_mask=field_mask
        )
    )

    # Resource name, e.g.
    #   projects/{project_number}/locations/{location}/processors/{processor_id}
    # optionally followed by /processorVersions/{processor_version_id}.
    if processor_version_id:
        resource_name = docai_client.processor_version_path(
            project_number, location, processor_id, processor_version_id
        )
    else:
        resource_name = docai_client.processor_path(
            project_number, location, processor_id
        )

    # batch_process_documents returns a long-running operation (LRO).
    lro = docai_client.batch_process_documents(
        documentai.BatchProcessRequest(
            name=resource_name,
            input_documents=batch_input,
            document_output_config=batch_output,
        )
    )

    # Block until the operation completes or `timeout` elapses.
    # (operation.add_done_callback(...) is the asynchronous alternative.)
    try:
        print(f"Waiting for operation {lro.operation.name} to complete...")
        lro.result(timeout=timeout)
    except (RetryError, InternalServerError) as err:
        # The operation did not finish cleanly; report it and fall through to
        # the state check below.
        print(err.message)

    # The final state is reported through the operation metadata.
    job_metadata = documentai.BatchProcessMetadata(lro.metadata)
    if job_metadata.state == documentai.BatchProcessMetadata.State.SUCCEEDED:
        return
    raise ValueError(f"Batch Process Failed: {job_metadata.state_message}")

In [None]:
# Run the Document AI batch job. Every argument is forwarded from a notebook-level
# variable of the same name, which must already be defined when this cell runs.
batch_process_documents(
    project_number=project_number,
    location=location,
    processor_id=processor_id,
    gcs_input_uri=gcs_input_uri,
    gcs_output_uri=gcs_output_uri,
    input_mime_type=input_mime_type,
    field_mask=field_mask,
)

In [None]:
# Read the Document AI output files back from GCS (everything under "pdf_output/").
# Each JSON blob is parsed with documentai.Document so its extracted text can be
# taken directly; blobs with any other content type are reported and skipped.
#
# The blob paths are kept as metadata of sorts: they are joined into one string
# and appended to `docs` as a final entry.
# NOTE(review): the paths are joined with no separator, so they run together in
# that final entry — confirm this is intended.
client = storage.Client()
output_blobs = client.list_blobs(gcs_bucket_name, prefix="pdf_output/")
docs = []
paths = []
for blob in output_blobs:
    if blob.content_type == "application/json":
        paths.append(blob.name)
        document = documentai.Document.from_json(
            blob.download_as_bytes(), ignore_unknown_fields=True
        )
        docs.append(document.text)
    else:
        print(f"Skipping non-supported file: {blob.name} - Mimetype: {blob.content_type}")
paths = "".join(paths)
docs.append(paths)

In [None]:
# Chunking rationale (author's figures, not verified here): the embedding model's
# limit is taken to be 1024 tokens, the same as the PaLM input limit, which would
# leave no room for a prompt. The recursive splitter is therefore used to produce
# smaller chunks of 800 tokens with a 50-token overlap.
# It might be interesting to compare results with even smaller chunks.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=50,
    chunk_size=800,
)
texts = text_splitter.create_documents(docs)

In [None]:
# Embed the chunks and index them in a local Chroma vector store, persisted to
# the "index_ninkasi" directory.
# This is very slow: the embedding API is rate limited.
db = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    persist_directory="index_ninkasi",
)
db.persist()

In [None]:
# Load every CSV file in `dir` and collect all Documents the loader returns.
from langchain.document_loaders import CSVLoader

# NOTE(review): `dir` is not assigned anywhere above this cell in the visible
# notebook (it is only set in the later text-loading cell), so on a fresh kernel
# it is still the builtin `dir` and the listing below fails. Set it to the CSV
# directory before running this cell.
docs = []
for file_name in sorted(os.listdir(dir)):
    # os.path.join avoids a doubled "/" when `dir` already ends with a slash.
    full_path = os.path.join(dir, file_name)
    if not os.path.isfile(full_path):
        # Sub-directories cannot be opened as CSV files; skip them.
        continue
    docs.extend(CSVLoader(full_path).load())


In [None]:
# Re-chunk the loaded Documents: 800-token chunks with a 50-token overlap.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=50,
    chunk_size=800,
)
texts = text_splitter.split_documents(docs)


In [None]:
# Embed the chunks and index them in a local Chroma vector store, persisted to
# the "index_ninkasi" directory (the same directory the other indexing cells use).
# This is very slow: the embedding API is rate limited.
db = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    persist_directory="index_ninkasi",
)
db.persist()

In [None]:
# Load every text file in `dir` and collect all Documents the loader returns.
from langchain.document_loaders import TextLoader

# NOTE(review): `dir` shadows the Python builtin; the name is kept because other
# cells read it.
dir='gcs/txt/'

docs = []
for file_name in sorted(os.listdir(dir)):
    # os.path.join avoids the doubled "/" that plain concatenation produced,
    # since `dir` already ends with a slash.
    full_path = os.path.join(dir, file_name)
    if not os.path.isfile(full_path):
        # Sub-directories cannot be opened as text files; skip them.
        continue
    docs.extend(TextLoader(full_path).load())


In [None]:
# Re-chunk the loaded Documents: 800-token chunks with a 50-token overlap.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=50,
    chunk_size=800,
)
texts = text_splitter.split_documents(docs)


In [None]:
# Embed the chunks and index them in a local Chroma vector store, persisted to
# the "index_ninkasi" directory (the same directory the other indexing cells use).
# This is very slow: the embedding API is rate limited.
db = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    persist_directory="index_ninkasi",
)
db.persist()

In [None]:
# Similarity retriever over the current vector store. `k` caps the number of
# chunks returned per query (5 here), leaving room to experiment with what
# works best.
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=5),
)

In [None]:
from langchain import PromptTemplate
from langchain.chains import LLMChain

# Prompt for recipe generation. It is filled with a retrieved context passage
# and the requested beer type.
prompt_template = """Use the context below create a recipe of max 1000 words with special ingredients for a beer with type below and translate into modern english:
    Context: {context}
    Type: {type}
    recipe:"""

PROMPT = PromptTemplate(
    input_variables=["context", "type"],
    template=prompt_template,
)

# LLM chain that renders PROMPT and sends it to the text model.
chain = LLMChain(prompt=PROMPT, llm=llm)

In [None]:
! pip install pgvector
! pip install psycopg2-binary

In [None]:
from langchain.vectorstores.pgvector import PGVector
from langchain.docstore.document import Document

# Postgres connection settings. Each value can be supplied through an environment
# variable so real credentials never have to be written into the notebook; the
# fallbacks are the original placeholder values.
user = os.getenv("PGVECTOR_USER", "postgres")
host = os.getenv("PGVECTOR_HOST", "IP")
port = os.getenv("PGVECTOR_PORT", "5432")
dbname = os.getenv("PGVECTOR_DATABASE", "postgres")
password = os.getenv("PGVECTOR_PASSWORD", "Password")

# User name and password are URL-escaped: characters such as "@", ":" or "/" in a
# password would otherwise corrupt the connection URL.
CONNECTION_STRING = (
    f"postgresql+psycopg2://{urllib.parse.quote_plus(user)}:{urllib.parse.quote_plus(password)}"
    f"@{host}:{port}/{dbname}?sslmode=require"
)

COLLECTION_NAME = "test"

# Embed the current chunks and store them in the Postgres/pgvector collection.
# Note that this rebinds `db`, replacing the Chroma store used earlier.
db = PGVector.from_documents(
    documents=texts,
    embedding=embeddings,
    connection_string=CONNECTION_STRING,
    collection_name=COLLECTION_NAME,
)


In [None]:
import pprint

def generate_recipe(type):
    """Generate a beer recipe for ``type`` from the most similar stored passage.

    The single best match for ``type`` is fetched from the notebook-level
    vector store ``db`` and passed, together with ``type``, through the
    notebook-level ``chain``.

    Args:
        type: Beer type to search for and to name in the prompt. The parameter
            name shadows the builtin but is kept for existing callers.

    Returns:
        The list produced by ``chain.apply`` (one entry per retrieved
        passage). It is also pretty-printed.
    """
    docs = db.similarity_search(type, k=1)
    inputs = [{"context": doc.page_content, "type": type} for doc in docs]
    # Invoke the chain exactly once. The previous version called it twice (two
    # LLM requests, possibly two different answers), discarded the first result
    # and pretty-printed the chain object instead of its output.
    results = chain.apply(inputs)
    pprint.pprint(results)
    return results

In [None]:
# Example run: generate a recipe for the "Summit" type.
generate_recipe("Summit")

In [None]:
# Show the best match for "beer" together with its score. The vector store
# method is `similarity_search_with_score`; `similarity_search_with` does not
# exist and raised AttributeError.
print(db.similarity_search_with_score("beer", k=1))