## Copy PDFs

In [1]:
from os import environ, path
import time, sys
now = time.time
from aa_scrape_PDF import copy_pdfs

environ["_RAG_PDF_SOURCE"] = "/media/james/FILEPILE/$_Robust_Planning/Literature/References/storage"
environ["_RAG_PDF_DESTIN"] = "data/input/pdf"
environ["_RAG_VERBOSE"]    =    ""
environ["_RAG_DOC_ADD"]    =  "50"
environ["_RAG_DOC_LIMIT"]  = "250"
environ["_RAG_DOC_DBASE"]  = "lit_pdf"
environ["_RAG_VEC_DBASE"]  = "lit_vec"
copy_pdfs()

.....


## Check existence of the database and Determine if more docs will be loaded this session

In [2]:
__import__('pysqlite3')
sys.modules['sqlite3'] = sys.modules.pop( 'pysqlite3' )
import chromadb
from langchain_chroma import Chroma
from uuid import uuid4 as gen_id

persistent_client = chromadb.PersistentClient()
collection        = persistent_client.get_or_create_collection( environ["_RAG_DOC_DBASE"] )

if environ["_RAG_DOC_DBASE"] in [c.name for c in persistent_client.list_collections()]:
    environ["_RAG_DOCDB_EXISTS"] = "1"
else:
    environ["_RAG_DOCDB_EXISTS"] = ""
    


## Load PDFs by page chunks

### https://python.langchain.com/docs/how_to/document_loader_pdf/

* `python3.10 -m pip install pypdf langchain-unstructured "unstructured[pdf]" --user`
* `apt install tesseract-ocr`

In [3]:
import os
from collections import deque
from langchain_community.document_loaders import PyPDFLoader

if not environ["_RAG_DOCDB_EXISTS"]:
    bgn = now()
    pdfs_drct = environ["_RAG_PDF_DESTIN"]
    fNames    = [item for item in os.listdir( pdfs_drct ) if (str( item ).split('.')[-1].lower() == 'pdf')]
    print( f"Copied {len(fNames)} files!" )
    pages = deque() # Fast append
    
    for i, fNam in enumerate( fNames ):
        file_path = path.join( pdfs_drct, fNam )
        loader    = PyPDFLoader( file_path )
        async for page in loader.alazy_load():
            pages.append( page )
        print( f"{i+1}:{len(pages)}", end = ', ', flush = True )
    print()
    pages = list( pages )
    print( f"Read {len(pages)} pages in {(now()-bgn)/60.0} minutes!" )

In [4]:
# print(f"{pages[0].metadata}\n")
# print(pages[0].page_content)

## Load the text embedding model

In [5]:
def pull_ollama_model( modelStr ):
    """ Pull a named model from Ollama and store it wherever """
    print( f"About to save '{modelStr}'.\nThis will spew a lot of text on the first run..." )
    os.system( f"ollama pull {modelStr}" )

In [6]:

import sys, os, time
now = time.time


# from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
# from langchain_community import embeddings

pull_ollama_model( "nomic-embed-text" )

local_embeddings = OllamaEmbeddings( model = "nomic-embed-text" )

About to save 'nomic-embed-text'.
This will spew a lot of text on the first run...


[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠏ [?25h[?25l[2K[1Gpulling manifest 
pulling 970aa74c0a90... 100% ▕████████████████▏ 274 MB                         
pulling c71d239df917... 100% ▕████████████████▏  11 KB                         
pulling ce4a164fc046... 100% ▕████████████████▏   17 B                         
pulling 31df23ea7daa... 100% ▕████████████████▏  420 B                         
verifying sha256 digest 
writing manifest 
success [?25h


## Populate document database (of pages)

In [7]:

if not environ["_RAG_DOCDB_EXISTS"]:
    bgn = now()
    docIDs = [str( gen_id() ) for _ in range( len(pages) )]
    dcmnts = [str( pg.page_content ) for pg in pages]
    collection.add(
        ids       = docIDs, 
        documents = dcmnts
    )
    print( f"Added {len(dcmnts)} documents in {(now()-bgn)/60.0} minutes!" )



# Create vector store

In [8]:
bgn = now()
vector_store_from_client = Chroma(
    client             = persistent_client,
    collection_name    = environ["_RAG_DOC_DBASE"],
    embedding_function = local_embeddings,
)
print( f"Built vector store in {(now()-bgn)} seconds!" )

Built vector store in 0.00527191162109375 seconds!


In [9]:
# vectorstore      = Chroma.from_documents( documents = pages, embedding = local_embeddings )
# print( f"Built vector store in {now()-bgn} seconds!" )

### 2024-10-18: Langchain's API Key fails

In [10]:
# import getpass
# import os

# _UNSTRUCT_KEY_LOC = "secrets/Langchain_Unstructured.txt"

# with open( _UNSTRUCT_KEY_LOC, 'r' ) as f:
#     os.environ["UNSTRUCTURED_API_KEY"] = str( f.read() ).strip()
    

In [11]:
# from langchain_unstructured import UnstructuredLoader

# loader = UnstructuredLoader(
#     file_path = file_path,
#     strategy  = "hi_res",
#     partition_via_api = True,
#     coordinates = True,
# )
# docs = []
# for doc in loader.lazy_load():
#     docs.append( doc )

### https://python.langchain.com/docs/how_to/document_loader_pdf/#local-parsing

In [12]:
from langchain_unstructured import UnstructuredLoader

loader_local = UnstructuredLoader(
    file_path = file_path,
    strategy  = "fast", #"hi_res",
)
docs_local = []
for doc in loader_local.lazy_load():
    docs_local.append( doc )

NameError: name 'file_path' is not defined

In [None]:
len( docs_local )

In [None]:
from langchain_ollama import ChatOllama

pull_ollama_model( "llava" )

llm = ChatOllama(
    model="llava",
)

In [None]:
import base64
import io

import fitz
from PIL import Image


def pdf_page_to_base64(pdf_path: str, page_number: int):
    pdf_document = fitz.open(pdf_path)
    page = pdf_document.load_page(page_number - 1)  # input is one-indexed
    pix = page.get_pixmap()
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

    buffer = io.BytesIO()
    img.save(buffer, format="PNG")

    return base64.b64encode(buffer.getvalue()).decode("utf-8")

In [None]:
from IPython.display import Image as IPImage
from IPython.display import display

base64_image = pdf_page_to_base64(file_path, 11)
display( IPImage( data = base64.b64decode( base64_image ) ) )

In [None]:
from langchain_core.messages import HumanMessage

query = "What can be said about the data composition?"

message = HumanMessage(
    content=[
        {"type": "text", "text": query},
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
        },
    ],
)
bgn = now()
response = llm.invoke( [message] )
print( f"LLM query took {now()-bgn} seconds to process!" )
print( response.content )