# Preliminaries + Installs

These instructions are for Python 3.10
### Fix Pillow
* `cd /usr/lib/python3/dist-packages`
* `sudo rm -rf pillow-10.2.0.egg-info`
* `python3.10 -m pip install Pillow --user`
### Install Ollama
* `cd /tmp`
* `curl -fsSL https://ollama.com/install.sh | sh`
* Optional test (2GB download): run `ollama run llama3.2`, then type `/bye` when done
### Install Langchain
* `python3.10 -m pip install langchain langchain_community langchain_chroma langchain_ollama llama-index-legacy langchain-unstructured --user`
* `python3.10 -m pip install pypdf "unstructured[pdf]" --user` (These had to be installed in a separate step from the command above; the cause was not determined.)
### Install SQLite ( >= 3.35.0 required; the steps below install 3.46 )
* `sudo apt install libreadline-dev python3.10-dev`
* `wget https://sqlite.org/2024/sqlite-autoconf-3460100.tar.gz`
* `tar -xvf sqlite-autoconf-3460100.tar.gz && cd sqlite-autoconf-3460100`
* `./configure`
* `make`
* `sudo make install`
* `python3.10 -m pip uninstall pysqlite3`
* `python3.10 -m pip install pysqlite3-binary --user`

# Build/Expand Document Database + Embeddings

## State

In [1]:
from os import environ, path
import time, sys, json
now = time.time

from utils import RAG_State


# Path of the saved RAG state file; kept in the environment so later cells read the same value
environ["_RAG_STATE_PATH"] = "data/state.json"

# State object used throughout the notebook (later cells read/write `RAGstate.docPaths` and `RAGstate.pgImgs`)
RAGstate = RAG_State.load_state( environ["_RAG_STATE_PATH"] )

## Copy PDFs

In [2]:
import os
from os import path, makedirs
# from aa_scrape_PDF import copy_pdfs

# Session configuration. Everything is stored as a string in the environment so that every cell
# reads the same values; an empty string acts as "False" for the flag-like entries.
environ["_RAG_PDF_SOURCE"] = "/media/james/FILEPILE/$_Robust_Planning/Literature/References/storage"
environ["_RAG_PDF_DESTIN"] = "data/input/pdf"
environ["_RAG_PDF_ERROR"]  = "data/input/BAD_PDF"
environ["_RAG_PAGE_DESTN"] = "data/input/pages"
environ["_RAG_VERBOSE"]    =    ""
environ["_RAG_DOC_ADD"]    =   "400" #"25" #"200"
environ["_RAG_DOC_LIMIT"]  = "20000"
environ["_RAG_DOC_DBASE"]  = "lit_pdf"
environ["_RAG_RUN_QUERY"]  = "" #"True"

# Create the working directories up front: the PDF input directory is listed by a later cell and the
# error directory receives PDFs that fail to parse. `exist_ok` avoids the check-then-create race.
for _dirKey in ( "_RAG_PDF_DESTIN", "_RAG_PDF_ERROR" ):
    makedirs( environ[_dirKey], exist_ok = True )


## Determine if more docs will be loaded this session

In [3]:
%%capture
# Loading the vector store sometimes spews warnings

# Register `pysqlite3` under the name `sqlite3` BEFORE chromadb is imported, so that any later
# `import sqlite3` resolves to the pip-installed build (the install notes call for SQLite >= 3.35.0).
__import__('pysqlite3')
sys.modules['sqlite3'] = sys.modules.pop( 'pysqlite3' )
import chromadb
from langchain_chroma import Chroma


# Open the persistent Chroma client (no path given, so the library default is used) and fetch or
# create the collection that stores one record per PDF page.
persistent_client = chromadb.PersistentClient()
collection        = persistent_client.get_or_create_collection( environ["_RAG_DOC_DBASE"] )

pdfs_drct   = environ["_RAG_PDF_DESTIN"]
eror_drct   = environ["_RAG_PDF_ERROR" ]
# Match on the actual ".pdf" suffix (case-insensitive); splitting on '.' would also accept a file named "pdf"
fNames      = [item for item in os.listdir( pdfs_drct ) if str( item ).lower().endswith( '.pdf' )]

environ["_RAG_PDF_COUNT"   ] = str( len( fNames ) )
environ["_RAG_DOCDB_COUNT" ] = str( collection.count() )
# Number of files to copy this session: the per-session allowance, capped by the room left under the
# overall limit, and never negative (the value is later handed to `copy_pdfs` as a count).
_roomLeft = int(environ["_RAG_DOC_LIMIT"]) - len(RAGstate.docPaths)
environ["_RAG_DOCDB_REMAIN"] = str( max( 0, min( _roomLeft, int(environ["_RAG_DOC_ADD"]) ) ) )


In [4]:
from utils import copy_pdfs

# Report the counts computed by the previous cell, then copy the next batch of PDFs into the input directory
nPDFs   = environ['_RAG_PDF_COUNT']
nRecs   = environ['_RAG_DOCDB_COUNT']
nToCopy = environ['_RAG_DOCDB_REMAIN']

print( f"{nPDFs} input PDFs exist!" )
print( f"{nRecs} vector records exist!" )
print( f"{nToCopy} files will be copied!" )

copy_pdfs(
    environ["_RAG_PDF_SOURCE"],
    environ["_RAG_PDF_DESTIN"],
    int( nToCopy ),
    verbose = environ["_RAG_VERBOSE"],
)

10514 input PDFs exist!
225139 vector records exist!
400 files will be copied!
........................................................................................................................................................................................................................


## Load PDFs by page chunks

### Reference: https://python.langchain.com/docs/how_to/document_loader_pdf/

* `python3.10 -m pip install pypdf langchain-unstructured "unstructured[pdf]" --user`
* `apt install tesseract-ocr`

In [5]:

def collect_unique_metadata_by_key( key ):
    """ Return a list of unique metadata values from the vector store by `key`

    Reads the module-level `collection`. Only the metadata column is requested, so the page text of
    every record is not loaded just to be thrown away. Records that have no metadata, or whose
    metadata lacks `key`, are skipped. The order of the returned list is arbitrary.
    """
    global collection
    metadatas = collection.get( include = ['metadatas'] )['metadatas'] or []
    return list( { meta[key] for meta in metadatas if meta and (key in meta) } )


def find_unread_PDFs_at_input():
    """ Return the PDF paths recorded as `source` in the vector store that are NOT listed in `RAGstate.docPaths`

    Uses `RAGstate.docPaths`, the same attribute the loading and populating cells read and append to.
    """
    global RAGstate
    knownPaths = set( RAGstate.docPaths ) # Build once; membership tests are then O(1)
    return [pdf for pdf in collect_unique_metadata_by_key( 'source' ) if pdf not in knownPaths]
            

In [6]:
import os, shutil
from collections import deque
from langchain_community.document_loaders import PyPDFLoader

pages = deque() # Fast append


# needToParse = find_unread_PDFs_at_input()
# fNames.extend( needToParse )

print( f"There are {len(fNames)} candidate files!" )

if len( fNames ) > 0:
    bgn    = now()
    lastLn = 0
    Nexist = 0
    
    for i, fNam in enumerate( fNames ):
        # file_path = str( path.join( pdfs_drct, fNam ) ) # fNam # str( path.join( pdfs_drct, fNam ) )
        file_path = fr"{path.join( pdfs_drct, fNam )}" # fNam # str( path.join( pdfs_drct, fNam ) )
        # file_path = file_path.replace( ' ', '\\ ' )
        # file_path = file_path.replace( ' ', '\\ ' )
        if file_path not in RAGstate.docPaths:
            try:
                loader    = PyPDFLoader( file_path )
                async for page in loader.alazy_load():
                    pages.append( page )
                print( f"{i+1}:{len(pages)-lastLn}:{len(pages)}", end = ', ', flush = True )
                lastLn = len(pages)
            except Exception as e:
                print( f"ERROR:{e}", end = ', ', flush = True )
                try:
                    # errr_path = str(path.join( eror_drct, fNam ))
                    errr_path = fr"{path.join( eror_drct, fNam )}"
                    # errr_path = errr_path.replace( ' ', '\\ ' )
                    shutil.move( file_path, errr_path )
                except Exception as e:
                    print( f"FAILED to move {file_path} --to-> {path.join( eror_drct, fNam )}" )
            except asyncio.CancelledError as e:
                print( f"Load operation cancelled by user" )
                raise e
        else:
            Nexist += 1
                    
    print()
    pages = list( pages )
    print( f"Read {len(pages)} pages in {(now()-bgn)/60.0:.2f} minutes! (Skipped {Nexist} existing PDFs.)" )

RAGstate.save_state( environ["_RAG_STATE_PATH"] )

There are 10514 candidate files!
65:41:41, 241:7:48, 246:11:59, 402:6:65, 408:14:79, 442:2:81, 507:15:96, 

Advanced encoding /KSCms-UHC-H not implemented yet
Advanced encoding /KSCms-UHC-H not implemented yet
Advanced encoding /90ms-RKSJ-H not implemented yet
Advanced encoding /KSCms-UHC-H not implemented yet
Advanced encoding /90ms-RKSJ-H not implemented yet


588:7:103, 764:8:111, 766:11:122, 836:3:125, 868:21:146, 1077:17:163, 1186:27:190, 1194:129:319, 1218:3:322, 

Advanced encoding /KSCms-UHC-H not implemented yet
Advanced encoding /90ms-RKSJ-H not implemented yet
Advanced encoding /KSCms-UHC-H not implemented yet


1248:4:326, 1350:1:327, 1417:75:402, 1418:135:537, 1484:1:538, 1526:3:541, 1531:3:544, 1664:9:553, 1691:1:554, 1697:1:555, 1735:95:650, 1797:27:677, 1855:6:683, 1871:2:685, 1940:1:686, 1984:3:689, 2013:7:696, 2043:3:699, 2069:4:703, 2176:1:704, 2181:13:717, 2291:376:1093, 2455:1:1094, 2639:7:1101, 2650:17:1118, 2829:20:1138, 2860:66:1204, 2957:20:1224, 3035:16:1240, 3112:15:1255, 3167:45:1300, 3424:6:1306, 3425:21:1327, 3468:23:1350, 3544:2:1352, 3571:60:1412, 3573:72:1484, 3631:585:2069, 3792:7:2076, 3827:196:2272, 3857:1:2273, 3879:2:2275, 3885:5:2280, 3886:24:2304, 3988:204:2508, 4036:8:2516, 

Advanced encoding /KSCms-UHC-H not implemented yet


4086:5:2521, 4162:135:2656, 4238:7:2663, 4269:9:2672, 4370:30:2702, 4375:3:2705, 4385:12:2717, 4428:20:2737, 4493:44:2781, 4519:14:2795, 4577:33:2828, 4675:5:2833, 4679:9:2842, 4719:2:2844, 

could not convert string to float: b'0.00-71968505' : FloatObject (b'0.00-71968505') invalid; use 0.0 instead


4778:17:2861, 4836:20:2881, 4881:365:3246, 4989:13:3259, 5048:13:3272, 5418:6:3278, 5479:266:3544, 5600:5:3549, 5668:9:3558, 5733:7:3565, 5759:7:3572, 5865:9:3581, 5939:1:3582, 5950:11:3593, 5961:10:3603, 6051:8:3611, 6199:24:3635, 6270:2:3637, 6353:18:3655, 

Advanced encoding /KSCms-UHC-H not implemented yet
Advanced encoding /90ms-RKSJ-H not implemented yet
Advanced encoding /KSCms-UHC-H not implemented yet
Advanced encoding /90ms-RKSJ-H not implemented yet
Advanced encoding /KSCms-UHC-H not implemented yet
Advanced encoding /KSCms-UHC-H not implemented yet
Advanced encoding /90ms-RKSJ-H not implemented yet


6405:4:3659, 6442:8:3667, 6519:13:3680, 

Ignoring wrong pointing object 19 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 305 0 (offset 0)


6714:76:3756, 6715:89:3845, 6743:10:3855, 6826:59:3914, 6833:38:3952, 6846:4:3956, 6874:7:3963, 6965:19:3982, 7036:7:3989, 7151:7:3996, 7331:10:4006, 7358:15:4021, 7532:150:4171, 7542:118:4289, 7552:3:4292, 7741:12:4304, 7762:2:4306, 7801:6:4312, 7837:4:4316, 7844:2:4318, 

Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 202 0 (offset 0)
Ignoring wrong pointing object 345 0 (offset 0)


7853:54:4372, 8064:35:4407, 8065:279:4686, 8076:8:4694, 8300:21:4715, 

Advanced encoding /KSCms-UHC-H not implemented yet
Advanced encoding /90ms-RKSJ-H not implemented yet
Advanced encoding /KSCms-UHC-H not implemented yet
Advanced encoding /90ms-RKSJ-H not implemented yet
Advanced encoding /KSCms-UHC-H not implemented yet
Advanced encoding /90ms-RKSJ-H not implemented yet
Advanced encoding /KSCms-UHC-H not implemented yet
Advanced encoding /90ms-RKSJ-H not implemented yet
Advanced encoding /KSCms-UHC-H not implemented yet
Advanced encoding /90ms-RKSJ-H not implemented yet
Advanced encoding /KSCms-UHC-H not implemented yet
Advanced encoding /90ms-RKSJ-H not implemented yet
Advanced encoding /KSCms-UHC-H not implemented yet
Advanced encoding /90ms-RKSJ-H not implemented yet
Advanced encoding /KSCms-UHC-H not implemented yet
Advanced encoding /90ms-RKSJ-H not implemented yet
Advanced encoding /KSCms-UHC-H not implemented yet
Advanced encoding /90ms-RKSJ-H not implemented yet
Advanced encoding /KSCms-UHC-H not implemented yet
Advanced encoding /90ms-RKSJ-H 

8309:14:4729, 8362:6:4735, 8406:488:5223, 8411:10:5233, 8463:83:5316, 8486:1:5317, 8489:1:5318, 8506:6:5324, 

Advanced encoding /KSCms-UHC-H not implemented yet
Advanced encoding /90ms-RKSJ-H not implemented yet
Advanced encoding /KSCms-UHC-H not implemented yet
Advanced encoding /KSCms-UHC-H not implemented yet


8522:4:5328, 8525:4:5332, 8752:1:5333, 8765:2:5335, 8799:2:5337, 8971:14:5351, 9053:11:5362, 9287:8:5370, 9333:11:5381, 9414:8:5389, 9485:24:5413, 9519:10:5423, 9538:10:5433, 9694:1:5434, 9848:7:5441, 9981:6:5447, 9991:21:5468, 10427:39:5507, 10442:17:5524, 10443:20:5544, 10451:19:5563, 10471:132:5695, 10498:7:5702, 
Read 5702 pages in 3.47 minutes! (Skipped 10360 existing PDFs.)


In [7]:
# print(f"{pages[0].metadata}\n")
# print(pages[0].page_content)

## Load the text embedding model

In [8]:
import sys, os, time
now = time.time

from utils import pull_ollama_model

from langchain_ollama import OllamaEmbeddings

# Fetch the embedding model with the project helper, then wrap the same model for LangChain
embedModelName = "all-minilm"

pull_ollama_model( embedModelName )
local_embeddings = OllamaEmbeddings( model = embedModelName )

About to save 'all-minilm'.
This will spew a lot of text on the first run...


[?25lpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest 
pulling 797b70c4edf8... 100% ▕████████████████▏  45 MB                         
pulling c71d239df917... 100% ▕████████████████▏  11 KB                         
pulling 85011998c600... 100% ▕████████████████▏   16 B                         
pulling 548455b72658... 100% ▕████████████████▏  407 B                         
verifying sha256 digest 
writing manifest 
success [?25h


## Populate document vector database (of pages)

In [9]:
import base64
import io

import fitz, pymupdf
from PIL import Image


def pdf_page_to_base64( pdf_path: str, page_number: int ):
    """ Render one page of the PDF at `pdf_path` and return it as a base64-encoded PNG string

    `page_number` is ONE-indexed: page 1 is the first page of the document.
    The page is rasterised at 1.5x zoom in both directions.
    The document is opened in a `with` block so its handle is released after every call; this
    function is called once per page, so an unclosed document would otherwise pile up per page.
    """
    zoom = 1.5 # Same zoom horizontally and vertically
    mat  = pymupdf.Matrix( zoom, zoom )

    with fitz.open( pdf_path ) as pdf_document:
        page = pdf_document.load_page( page_number - 1 )  # input is one-indexed
        pix  = page.get_pixmap( matrix = mat )
        img  = Image.frombytes( "RGB", [pix.width, pix.height], pix.samples )

    buffer = io.BytesIO()
    img.save( buffer, format = "PNG" )

    return base64.b64encode( buffer.getvalue() ).decode( "utf-8" )

In [10]:
import pickle

from utils import gen_ID

# Directory that receives one pickled page image per vector record; `exist_ok` replaces the
# separate existence check, so there is no gap between checking and creating.
makedirs( environ["_RAG_PAGE_DESTN"], exist_ok = True )


def get_page_meta_key( source, page ):
    """ Build a sortable (not guaranteed unique) page key: file name of `source` without spaces, '_', then `page` """
    fileName = str( source ).rsplit( '/', 1 )[-1]
    return f"{fileName.replace( ' ', '' )}_{page!s}"


# Add every freshly loaded page to the vector store and save a rendered image of each page to disk
if int(environ["_RAG_DOCDB_REMAIN"]) > 0:
    bgn      = now()
    docIDs   = [str( gen_ID() ) for _ in range( len(pages) )]
    dcmnts   = [str( pg.page_content ) for pg in pages]
    metaDt   = list()
    d        = 50 # Print one progress dot every `d` pages
    # Sources already recorded in the state; a set keeps the per-page membership test O(1)
    knownSrc = set( RAGstate.docPaths )

    for i, (id_i, pg) in enumerate( zip( docIDs, pages ) ):

        # Save Text Metadata # (the page's own metadata dict is extended in place)
        mDct = pg.metadata
        src  = mDct['source']
        pgNo = mDct['page']
        mDct['metakey'] = get_page_meta_key( src, pgNo )
        mDct['docID'  ] = id_i
        metaDt.append( mDct )

        # Record each source document once
        if src not in knownSrc:
            knownSrc.add( src )
            RAGstate.docPaths.append( src )

        # Save PDF Page image #
        try:
            pkl_i = path.join( environ["_RAG_PAGE_DESTN"], f"{id_i}.pkl" )
            # The loader's `page` metadata counts from 0, while `pdf_page_to_base64` takes a
            # one-indexed page number; without the +1 the image of the PREVIOUS page is stored.
            pgPic = pdf_page_to_base64( src, pgNo + 1 )

            with open( pkl_i, 'wb' ) as f:
                pickle.dump( pgPic, f )
            # Register the image only after it has actually been written
            RAGstate.pgImgs[ id_i ] = str( pkl_i )
        except Exception as e:
            print( f"Could NOT save image ID {id_i}!, {e}" )

        if (i % d == 0):
            print( '.', end='', flush = True )
    print()

    if metaDt:
        collection.add(
            ids       = docIDs, 
            metadatas = metaDt,
            documents = dcmnts
        )
        print( f"Added {len(dcmnts)} documents in {(now()-bgn)/60.0:.2f} minutes!" )
    else:
        print( f"NO documents to add!" )

RAGstate.save_state( environ["_RAG_STATE_PATH"] )



.........................................................................MuPDF error: syntax error: could not parse color space (921 0 R)

MuPDF error: syntax error: could not parse color space (315 0 R)

MuPDF error: syntax error: could not parse color space (612 0 R)

MuPDF error: syntax error: could not parse color space (765 0 R)

MuPDF error: syntax error: could not parse color space (857 0 R)

MuPDF error: syntax error: could not parse color space (889 0 R)

..........................................


Exception occurred invoking consumer for subscription 7ce98a0fa87b4056a2d6ac405daef0c1to topic persistent://default/default/a84553ac-5e8e-4481-8051-a5e59bf2af75 'utf-8' codec can't encode character '\ud835' in position 2720: surrogates not allowed


Added 5702 documents in 10.83 minutes!


# Create vector store

In [11]:
# Wrap the existing Chroma collection in a LangChain vector store that embeds with the local model
bgn = now()
storeArgs = dict(
    client             = persistent_client,
    collection_name    = environ["_RAG_DOC_DBASE"],
    embedding_function = local_embeddings,
)
vector_store_from_client = Chroma( **storeArgs )
elapsed = now() - bgn
print( f"Built vector store in {elapsed:.4f} seconds!" )

Built vector store in 0.0027 seconds!


# Load VLM

In [12]:
from langchain_ollama import ChatOllama

# The vision-language model is only pulled and built when queries are enabled; otherwise `llm` is None
runQuery = bool( environ["_RAG_RUN_QUERY"] )

if not runQuery:
    llm = None
else:
    pull_ollama_model( "llava" )
    llm = ChatOllama( model = "llava" )

# Setup LangChain

In [13]:
# Switch off both LangChain tracing flags for this session
os.environ["LANGCHAIN_TRACING_V1"] = "false"
os.environ["LANGCHAIN_TRACING_V2"] = "false"

# The retrieval chain is only built when queries are enabled (non-empty `_RAG_RUN_QUERY`)
if environ["_RAG_RUN_QUERY"]:
    # from langchain import hub
    from langchain_core.runnables import RunnablePassthrough
    from langchain_core.prompts import PromptTemplate
    from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
    
    # Retriever over the PDF-page vector store, with the default `as_retriever()` settings
    retriever = vector_store_from_client.as_retriever()
    
    # Prompt built with the `PromptTemplate` constructor; `{docData}` and `{userQuery}` are filled by the chain below
    # NOTE(review): the template text reads "generate a response to based on" -- likely a typo, left unchanged here
    prompt1 = PromptTemplate( 
        template = """You are an expert assistant capable of interpreting textual information to provide accurate 
                      and detailed responses. You are provided with the following data:
                      Context: {docData}
                      Text query: {userQuery}
                      Use your understanding of the provided context to generate a response to based on 
                      relevant, up-to-date information. Ensure your answer is factually accurate, detailed, and leverages academic 
                      sources where possible. If additional context is required for clarification, request it from the user.""",
        input_variables = ["docData","userQuery"],
    )
    
    
    def format_docs(docs):
        """ Join the `page_content` of every document in `docs`, separated by one blank line """
        return "\n\n".join(doc.page_content for doc in docs)
    
    # Chain: the input question is retrieved against and formatted into `docData`, and passed through
    # unchanged as `userQuery`; the filled prompt goes to `llm` and the reply is parsed to a string.
    rag_chain1 = (
        { "docData": retriever | format_docs, 
          "userQuery": RunnablePassthrough()}
        | prompt1
        | llm
        | StrOutputParser()
    )

In [14]:
def ask_with_sources( q ):
    """ Answer question `q` from retrieved pages and return the answer together with those pages

    Retrieval runs once: the retrieved documents are formatted and fed straight to the prompt,
    instead of calling the retriever here and then a second time inside `rag_chain1`.
    The returned `sources` are therefore exactly the documents the answer was generated from.

    Returns a dict with keys 'response' (the generated answer string) and 'sources' (retrieved documents).
    """
    retrieved_docs = retriever.invoke( q )
    answer_chain   = prompt1 | llm | StrOutputParser()
    generated_ansr = answer_chain.invoke( {
        "docData"   : format_docs( retrieved_docs ),
        "userQuery" : q,
    } )
    return {
        'response' : generated_ansr,
        'sources'  : retrieved_docs,
    }

In [15]:
def fetch_local_pages( sourceList ):
    """ Return the stored page image for every item of `sourceList` that has one

    Each item must expose a `metadata` dict; its 'docID' entry is looked up in `RAGstate.pgImgs`
    to find the pickle file holding that page's image. Items without a 'docID', with an unknown
    ID, or whose pickle file cannot be read are reported and skipped instead of aborting the fetch.
    """
    global RAGstate
    rtnObjs = list()
    for source in sourceList:
        docID = source.metadata.get( 'docID' )
        if (docID is None) or (docID not in RAGstate.pgImgs):
            print( f"No page with ID {docID}" )
            continue
        pklPath = RAGstate.pgImgs[ docID ]
        try:
            with open( pklPath, 'rb' ) as f:
                # NOTE: unpickling is only safe for trusted files; these are the ones this notebook wrote itself
                rtnObjs.append( pickle.load( f ) )
        except (OSError, EOFError, pickle.UnpicklingError) as e:
            print( f"Could NOT load page image {pklPath}!, {e}" )
    return rtnObjs

In [16]:
from pprint import pprint
from IPython.display import Image as IPImage
from IPython.display import display
from langchain_core.messages import HumanMessage

if environ["_RAG_RUN_QUERY"]:
    def deep_doc_ask( q ):
        """ Answer `q` from the retrieved page text, then put the same question to the VLM for each page image

        Prints the text-based answer and how long it took. Then, for every retrieved source that has a
        stored page image: displays the image, sends the question plus the image to `llm`, and prints
        that response and its timing. Nothing is returned.
        """
        bgn = now()
        res = ask_with_sources( q ) 
        pprint( res['response'] )
        print( f"Initial LLM summary took {now()-bgn:.2f} seconds to process!" )
        
        pag = fetch_local_pages( res['sources'] )
    
        for p in pag:
            display( IPImage( data = base64.b64decode( p ) ) )
            message = HumanMessage(
                content=[
                    {"type": "text", "text": q},
                    {
                        "type": "image_url",
                        # Page images are stored as base64-encoded PNG, so the data URL is labelled PNG
                        "image_url": {"url": f"data:image/png;base64,{p}"},
                    },
                ],
            )
            bgn = now()
            response = llm.invoke( [message] )
            print( f"LLM query took {now()-bgn:.2f} seconds to process!" )
            pprint( response.content )

In [17]:
# Example query; only runs when queries are enabled (non-empty `_RAG_RUN_QUERY`)
if environ["_RAG_RUN_QUERY"]:
    deep_doc_ask( "How do I estimate the running time of a robot plan based on confidence in the current state estimate?" )