# Preliminaries + Installs

These instructions are for Python 3.10
### Install Ollama
* `cd /tmp`
* `curl -fsSL https://ollama.com/install.sh | sh`
* Test, Optional (2GB download): `ollama run llama3.2`, Type `/bye` when done
### Install Langchain
* `python3.10 -m pip install langchain langchain_community langchain_chroma langchain_ollama llama-index-legacy pypdf langchain-unstructured "unstructured[pdf]" --user`
### Install SQLite ( >= 3.35.0 required, This will install 3.46 )
* `sudo apt install libreadline-dev python3.10-dev`
* `wget https://sqlite.org/2024/sqlite-autoconf-3460100.tar.gz`
* `tar -xvf sqlite-autoconf-3460100.tar.gz && cd sqlite-autoconf-3460100`
* `./configure`
* `make`
* `sudo make install`
* `python3.10 -m pip uninstall pysqlite3`
* `python3.10 -m pip install pysqlite3-binary --user`

# Build/Expand Document Database + Embeddings

## State

In [1]:
from os import environ, path
import time, sys, json
now = time.time

environ["_RAG_STATE_PATH"] = "data/state.json"

RAGstate = {
    'libDocs' : list()
}

def save_state():
    global RAGstate
    with open( environ["_RAG_STATE_PATH"], 'w' ) as f:
        json.dump( RAGstate, f, indent = 4 )

def load_state():
    global RAGstate
    try:
        with open( environ["_RAG_STATE_PATH"], 'r' ) as f:
            RAGstate = json.load(f)
    except FileNotFoundError as e:
        print( f"Could not load {environ['_RAG_STATE_PATH']}!\n{e}" )

load_state()

Could not load data/state.json!
[Errno 2] No such file or directory: 'data/state.json'


## Helper Functions

In [2]:
from uuid import uuid4

def safe_str( data ):
    """Filters out invalid UTF-8 characters from a string."""
    return str( data ).encode( 'utf-8', 'ignore' ).decode( 'utf-8' )

def gen_ID():
    """ Generate a unique ID """
    return safe_str( uuid4() )

def pull_ollama_model( modelStr ):
    """ Pull a named model from Ollama and store it wherever """
    print( f"About to save '{modelStr}'.\nThis will spew a lot of text on the first run..." )
    os.system( f"ollama pull {modelStr}" )



## Copy PDFs

In [3]:
from aa_scrape_PDF import copy_pdfs

environ["_RAG_PDF_SOURCE"] = "/media/james/FILEPILE/$_Robust_Planning/Literature/References/storage"
environ["_RAG_PDF_DESTIN"] = "data/input/pdf"
environ["_RAG_VERBOSE"]    =    ""
environ["_RAG_DOC_ADD"]    =  "50"
environ["_RAG_DOC_LIMIT"]  = "250"
environ["_RAG_DOC_DBASE"]  = "lit_pdf"
environ["_RAG_VEC_DBASE"]  = "lit_vec"

## Determine if more docs will be loaded this session

In [4]:
__import__('pysqlite3')
sys.modules['sqlite3'] = sys.modules.pop( 'pysqlite3' )
import chromadb
from langchain_chroma import Chroma


persistent_client = chromadb.PersistentClient()
collection        = persistent_client.get_or_create_collection( environ["_RAG_DOC_DBASE"] )

environ["_RAG_DOCDB_COUNT" ] = str( collection.count() )
environ["_RAG_DOCDB_REMAIN"] = str( min( int(environ["_RAG_DOC_LIMIT"])-collection.count(), int(environ["_RAG_DOC_ADD"]) ) )

print( f"{environ['_RAG_DOCDB_REMAIN']} files will be copied!" )

copy_pdfs()

50 files will be copied!
.

## Load PDFs by page chunks

### https://python.langchain.com/docs/how_to/document_loader_pdf/

* `python3.10 -m pip install pypdf langchain-unstructured "unstructured[pdf]" --user`
* `apt install tesseract-ocr`

In [5]:
import os
from collections import deque
from langchain_community.document_loaders import PyPDFLoader

if int(environ["_RAG_DOCDB_REMAIN"]) > 0:
    bgn = now()
    pdfs_drct = environ["_RAG_PDF_DESTIN"]
    fNames    = [item for item in os.listdir( pdfs_drct ) if (str( item ).split('.')[-1].lower() == 'pdf')]
    print( f"Copied {len(fNames)} files!" )
    pages = deque() # Fast append
    
    for i, fNam in enumerate( fNames ):
        file_path = str( path.join( pdfs_drct, fNam ) )
        if file_path not in RAGstate['libDocs']:
            loader    = PyPDFLoader( file_path )
            async for page in loader.alazy_load():
                pages.append( page )
            print( f"{i+1}:{len(pages)}", end = ', ', flush = True )
            RAGstate['libDocs'].append( file_path )
    print()
    pages = list( pages )
    print( f"Read {len(pages)} pages in {(now()-bgn)/60.0} minutes!" )

save_state()

Copied 50 files!
1:15, 2:22, 3:27, 4:39, 5:52, 6:66, 7:77, 8:87, 9:106, 10:127, 11:135, 12:152, 13:196, 14:218, 15:250, 16:283, 17:309, 18:326, 19:336, 20:348, 21:361, 22:379, 23:387, 24:399, 25:424, 26:435, 27:458, 28:517, 29:523, 30:541, 31:555, 32:567, 33:575, 34:585, 35:605, 36:626, 37:639, 38:666, 39:674, 

Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 23 0 (offset 0)
Ignoring wrong pointing object 25 0 (offset 0)
Ignoring wrong pointing object 122 0 (offset 0)
Ignoring wrong pointing object 127 0 (offset 0)
Ignoring wrong pointing object 132 0 (offset 0)
Ignoring wrong pointing object 134 0 (offset 0)
Ignoring wrong pointing object 143 0 (offset 0)
Ignoring wrong pointing object 148 0 (offset 0)
Ignoring wrong pointing object 153 0 (offset 0)
Ignoring wrong pointing object 155 0 (offset 0)
Ignoring wrong pointing object 223 0 (offset 0)
Ignoring wrong pointing object 225 0 (offset 0)
Ignoring wrong pointing object 227 0 (offset 0)
Ignoring wrong pointing object 229 0 (offset 0)
Ignoring wrong pointing object 231 0 (offset 0)
Ignoring wrong pointing object 233 0 (offset 0)
Ignoring wrong pointing object 235 0 (offset 0)
Ignoring wrong pointing object 237 0 (offset 0)
Ignoring wrong pointing object 239 0 (offset

40:715, 41:727, 42:738, 43:1107, 44:1125, 45:1160, 46:1179, 47:1194, 48:1217, 49:1228, 50:1237, 
Read 1237 pages in 0.7850842396418254 minutes!


In [6]:
# print(f"{pages[0].metadata}\n")
# print(pages[0].page_content)

## Load the text embedding model

In [7]:

import sys, os, time
now = time.time


# from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
# from langchain_community import embeddings

pull_ollama_model( "nomic-embed-text" )

local_embeddings = OllamaEmbeddings( model = "nomic-embed-text" )

About to save 'nomic-embed-text'.
This will spew a lot of text on the first run...


[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest 
pulling 970aa74c0a90... 100% ▕████████████████▏ 274 MB                         
pulling c71d239df917... 100% ▕████████████████▏  11 KB                         
pulling ce4a164fc046... 100% ▕████████████████▏   17 B                         
pulling 31df23ea7daa... 100% ▕████████████████▏  420 B                         
verifying sha256 digest 
writing manifest 
success [?25h


## Populate document database (of pages)

In [8]:
def get_page_meta_key( source, page ):
    """ Generate a (probably not) unique page key with useful data that can also be used for sorting """
    return str( source ).split('/')[-1].replace(' ','') + '_' + str( page )

if int(environ["_RAG_DOCDB_REMAIN"]) > 0:
    bgn = now()
    docIDs = [str( gen_ID() ) for _ in range( len(pages) )]
    dcmnts = [str( pg.page_content ) for pg in pages]
    metaDt = list()

    for pg in pages:
        mDct = pg.metadata
        mDct['metakey'] = get_page_meta_key( pg.metadata['source'], pg.metadata['page'] )
        metaDt.append( mDct )
    
    collection.add(
        ids       = docIDs, 
        metadatas = metaDt,
        documents = dcmnts
    )
    print( f"Added {len(dcmnts)} documents in {(now()-bgn)/60.0} minutes!" )



Added 1237 documents in 0.824593722820282 minutes!


# Create vector store

In [9]:
bgn = now()
vector_store_from_client = Chroma(
    client             = persistent_client,
    collection_name    = environ["_RAG_DOC_DBASE"],
    embedding_function = local_embeddings,
)
print( f"Built vector store in {(now()-bgn)} seconds!" )

Built vector store in 0.003161907196044922 seconds!


# Load VLM

In [10]:
from langchain_ollama import ChatOllama

pull_ollama_model( "llava" )

llm = ChatOllama(
    model="llava",
)

About to save 'llava'.
This will spew a lot of text on the first run...


[?25lpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest 
pulling 170370233dd5... 100% ▕████████████████▏ 4.1 GB                         
pulling 72d6f08a42f6... 100% ▕████████████████▏ 624 MB                         
pulling 43070e2d4e53... 100% ▕████████████████▏  11 KB                         
pulling c43332387573... 100% ▕████████████████▏   67 B                         
pulling ed11eda7790d... 100% ▕████████████████▏   30 B                         
pulling 7c658f9561e5... 100% ▕████████████████▏  564 B                         
verifying sha256 digest 
writing manifest 
success [?25h


In [11]:
import base64
import io

import fitz
from PIL import Image


def pdf_page_to_base64(pdf_path: str, page_number: int):
    pdf_document = fitz.open(pdf_path)
    page = pdf_document.load_page(page_number - 1)  # input is one-indexed
    pix = page.get_pixmap()
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

    buffer = io.BytesIO()
    img.save(buffer, format="PNG")

    return base64.b64encode(buffer.getvalue()).decode("utf-8")

In [12]:
from IPython.display import Image as IPImage
from IPython.display import display

base64_image = pdf_page_to_base64(file_path, 11)
display( IPImage( data = base64.b64decode( base64_image ) ) )

ValueError: page not in document

In [None]:
from langchain_core.messages import HumanMessage

query = "What can be said about the data composition?"

message = HumanMessage(
    content=[
        {"type": "text", "text": query},
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
        },
    ],
)
bgn = now()
response = llm.invoke( [message] )
print( f"LLM query took {now()-bgn} seconds to process!" )
print( response.content )