<a href="https://colab.research.google.com/github/lcbjrrr/genai/blob/main/04_RAG_Doc_Chunks_GCP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RAG: OCR and Chunking

Chunking is the process of breaking down a large document (like a PDF after OCR) into smaller, manageable pieces or 'chunks'. This is done so that when you query your vector database, you retrieve relevant, smaller sections of text rather than entire documents, leading to more precise and relevant answers. OCR (Optical Character Recognition) is used to convert scanned images of text (e.g., from a PDF) into machine-readable text. This text is then processed, often chunked, and finally embedded into numerical representations (vectors) that are stored in a vector database. When a query is made, it's also converted into a vector, and the database finds the most similar document chunks based on their vector proximity.

![](https://pbs.twimg.com/media/G5bI9D4WAAAznDv?format=jpg&name=medium)

In [None]:
!pip install langchain langchain-core langchain-google-genai langchain-chroma



In [None]:
from langchain_core.documents import Document
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma
import os
from google.colab import userdata
# Read the Gemini API key from Colab's secret store (entry GOOGLE_API_KEY)
# instead of pasting it into the notebook source.
API_KEY = userdata.get('GOOGLE_API_KEY')
os.environ["GOOGLE_API_KEY"] = API_KEY

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
# embed_documents takes a list of texts and returns one vector per item,
# so the single sentence is wrapped in a list.
e = embeddings.embed_documents(["Python is the best to do LLM"])
print(e[-1])
len(e[-1])

[0.026136338710784912, -0.02872188203036785, -0.06689122319221497, -0.0079328129068017, 0.06634596735239029, 0.016866765916347504, 0.01718820258975029, -0.023990225046873093, 0.002822413807734847, 0.03302762657403946, -0.01597127504646778, 0.031227611005306244, -0.029169045388698578, 0.04163304716348648, -0.00040752653148956597, -0.009566799737513065, 0.009417647495865822, 0.018642636016011238, 0.028313888236880302, -0.022992653772234917, 0.01731102541089058, 0.011708144098520279, -0.007833445444703102, 0.0008992083603516221, 0.025473080575466156, -0.0059366244822740555, -0.007776164915412664, -0.07772943377494812, -0.046668414026498795, 0.01961865834891796, -0.057121392339468, 0.012875753454864025, -0.06017376482486725, 0.016606176272034645, 0.005525827873498201, -0.02592640556395054, -0.010746434330940247, 0.0030842397827655077, -0.00869656726717949, 0.0263503585010767, 0.02141505666077137, -0.01877114549279213, -0.043773502111434937, 0.004614102654159069, 0.009283085353672504, -0.00

768

In [None]:
import os
# Collect the PDFs that sit next to the notebook. os.listdir gives no ordering
# guarantee, so sort the names to keep the processing order reproducible.
files = sorted(name for name in os.listdir('.') if name.endswith('.pdf'))
if not files:
    raise FileNotFoundError("No .pdf files found in the current directory")

files[0]

'0808.1508v1.pdf'




```
Warning: No languages specified, defaulting to English.
223
Document(metadata={'source': '2206.05480v2.pdf', 'coordinates': {'points': ((60.1199, 120.24720397999988), (60.1199, 169.6914855199999), (551.545185, 169.6914855199999), (551.545185, 120.24720397999988)), 'system': 'PixelSpace', 'layout_width': 612.0, 'layout_height': 792.0}, 'filename': '2206.05480v2.pdf', 'last_modified': '2025-11-05T03:12:20', 'page_number': 1, 'languages': ['eng'], 'filetype': 'application/pdf', 'parent_id': '3b3906ea80f0c91098c4d7a13276c4ab', 'category': 'UncategorizedText', 'element_id': '848976ef18c16e154bc75624052d8f9c'}, page_content='Qiang Hu1*, Yuejun Guo2*, Xiaofei Xie3, Maxime Cordy1, Mike Papadakis1, Lei Ma4,5 and Yves Le Traon1 1University of Luxembourg, Luxembourg 2Luxembourg Institute of Science and Technology, Luxembourg 3Singapore Management University, Singapore')
```


In [None]:
!pip install "unstructured[pdf]" langchain_community langchain-text-splitters

In [None]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

def load_pdf(filename):
    """Load the PDF at `filename` with UnstructuredPDFLoader in "elements" mode.

    Returns whatever the loader's ``load()`` call produces.
    """
    return UnstructuredPDFLoader(filename, mode="elements").load()

def chunk(filename,chunks_all,chunk_size=1000,chunk_overlap=50,pdf_dir='.',verbose=False):
  """Split one PDF into text chunks and append them to `chunks_all`.

  Args:
    filename: name of the PDF file inside `pdf_dir`.
    chunks_all: list that is extended in place with the resulting chunks.
    chunk_size: chunk size handed to RecursiveCharacterTextSplitter.
    chunk_overlap: chunk overlap handed to RecursiveCharacterTextSplitter.
    pdf_dir: directory that contains `filename`.
    verbose: when True, also print the full list of chunks (very long output).

  Returns:
    None; the result is delivered through `chunks_all`.
  """
  print('Processing',filename)
  elements = load_pdf(os.path.join(pdf_dir, filename))
  # Re-wrap every element with a copy of its metadata so the documents handed
  # to the splitter do not share metadata dicts with the loader's objects.
  docs = [Document(page_content=e.page_content, metadata=e.metadata.copy()) for e in elements]
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=chunk_overlap)
  chunks = text_splitter.split_documents(docs)
  if verbose:
    print(chunks)
  chunks_all.extend(chunks)
  print('------ Processed',len(docs),len(chunks))

# Gather the chunks of every discovered PDF into one flat list.
all_chunks = list()
for file in files:
  chunk(filename=file, chunks_all=all_chunks)



Processing 0808.1508v1.pdf
------ Processed 647 648
Processing 1211.6411v1.pdf
[Document(metadata={'source': './1211.6411v1.pdf', 'coordinates': {'points': ((72.84, 115.09577999999999), (72.84, 154.74110399999995), (519.4258094999999, 154.74110399999995), (519.4258094999999, 115.09577999999999)), 'system': 'PixelSpace', 'layout_width': 595.0, 'layout_height': 842.0}, 'file_directory': '.', 'filename': '1211.6411v1.pdf', 'last_modified': '2025-11-10T23:26:18', 'page_number': 1, 'languages': ['eng'], 'filetype': 'application/pdf', 'category': 'Title', 'element_id': '2f176568fce3d4db3c50da33678d4785'}, page_content='New Heuristics for Interfacing Human Motor System using Brain Waves'), Document(metadata={'source': './1211.6411v1.pdf', 'coordinates': {'points': ((134.82, 175.06495999999993), (134.82, 185.43024000000003), (455.368125, 185.43024000000003), (455.368125, 175.06495999999993)), 'system': 'PixelSpace', 'layout_width': 595.0, 'layout_height': 842.0}, 'file_directory': '.', 'filena

In [None]:
from langchain_community.vectorstores.utils import filter_complex_metadata
# Show the chunk count before filtering, then again (as the cell result) after
# filter_complex_metadata has processed the list.
chunk_count_before = len(all_chunks)
print(chunk_count_before)
all_chunks = filter_complex_metadata(all_chunks)
len(all_chunks)

733


733

In [None]:
# Spot-check the text of one chunk to see what the extraction produced.
sample_chunk = all_chunks[10]
print(sample_chunk.page_content)

This report describes experimental results for a set of benchmarks on program veriﬁcation. It compares the capabilities of CPBVP “Constraint Programming framework for Bounded Program Veriﬁcation” [4] with the following frameworks: ESC/Java, CBMC, Blast, EUREKA and Why.




```
page_content='a higher impact on the model than others, and 4) pre-trained bimodal models are relatively more resistant to distribution shifts.' metadata={'source': '2206.05480v2.pdf', 'coordinates': {'points': ((48.60037399999989, 205.47873931999993), (48.60037399999989, 375.52649427999984), (300.5855234663385, 375.52649427999984), (300.5855234663385, 205.47873931999993)), 'system': 'PixelSpace', 'layout_width': 612.0, 'layout_height': 792.0}, 'filename': '2206.05480v2.pdf', 'last_modified': '2025-11-05T03:12:20', 'page_number': 1, 'languages': ['eng'], 'filetype': 'application/pdf', 'parent_id': '471a9e59fa30198d2d4a6444c1ce710c', 'category': 'NarrativeText', 'element_id': 'd1583868f9ca2609a5fd02f7dfebad8a'}

223 249

Abstract—Distribution shift has been a longstanding challenge for the reliable deployment of deep learning (DL) models due to unexpected accuracy degradation. Although DL has been becoming a driving force for large-scale source code analysis in the big code era, limited progress has been made on distribution shift analysis and benchmarking for source code tasks. To ﬁll this gap, this paper initiates to propose CodeS, a distribution shift benchmark dataset, for source code learning. Speciﬁcally, CodeS supports two programming languages (Java and Python) and ﬁve shift types (task, programmer, time-stamp, token, and concrete syntax tree). Extensive experiments based on CodeS reveal that 1) out-of-distribution detectors from other domains (e.g., computer vision) do not generalize to source code, 2) all code classiﬁcation models suffer from distribution shifts, 3) representation-based shifts have a higher impact on the model than others, and 4) pre-trained bimodal models are relatively more resistant to distribution shifts.
```



In [None]:
!pip install langchain-chroma



In [None]:
# Build the embedding model in this cell and hand that same object to Chroma,
# so indexing does not depend on a variable left over from another cell.
embed_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vectorstore = Chroma(embedding_function=embed_model,persist_directory='./vectordb')

# The loop variable is not called `chunk`, which would overwrite the chunk()
# function defined earlier and break a re-run of the chunking cell.
# NOTE(review): no explicit ids are passed, so running this cell again
# presumably stores the same chunks a second time; confirm before re-running.
failed = 0
for i, doc in enumerate(all_chunks):
    print(i,'=>',doc.metadata.get('filename'),doc.metadata.get('element_id'))
    try:
       # One document per call: a failing chunk is reported and skipped
       # without aborting the rest of the indexing run.
       vectorstore.add_documents([doc])
    except Exception as err:
       failed += 1
       print('#### Error ',err)
print('Indexed', len(all_chunks) - failed, 'of', len(all_chunks), 'chunks')

0 => 0808.1508v1.pdf 6de100ee1a0b502b2c6bc21e8526c5e9
1 => 0808.1508v1.pdf 2c2f7ed4fa08fdeef5a39ee50ed96a3b
2 => 0808.1508v1.pdf 4efb041398d69879b1b1ef147f15c545
3 => 0808.1508v1.pdf 7c0dc1b739893b3f67ed97937dd92ef3
4 => 0808.1508v1.pdf 19b4e1bda3c0c6386d4879f6fcf78b22
5 => 0808.1508v1.pdf 33fee360e9a8c58795d1a2fd01a26ed1
6 => 0808.1508v1.pdf af65f932af3452074d86a1605d5fde00
7 => 0808.1508v1.pdf e08033b67c2df8871fb381609814b898
8 => 0808.1508v1.pdf bd7c69d04e6c4522bddb3fe9a6ff3bd3
9 => 0808.1508v1.pdf 6c9ff06724dc2646df741b73df95eef2
10 => 0808.1508v1.pdf aaec7b0e9c626406f1eeebea86524fa1
11 => 0808.1508v1.pdf dfdde719e343431f5b93c3c7d11fd2b5
12 => 0808.1508v1.pdf b02ec7039fca42679567950b0f793fec
13 => 0808.1508v1.pdf 56d4ecb60ed6bfa4402c77da0ebd4d8b
14 => 0808.1508v1.pdf a4eab46b756d558a0ebaf9a0794395cc
15 => 0808.1508v1.pdf df3e986584b09116a6fe5498b20f65c9
16 => 0808.1508v1.pdf 2725af51f9ea06d408535ff826ad7e0c
17 => 0808.1508v1.pdf fb91082734d992fd6274e3170efb3a4a
18 => 0808.1508v1.pd

In [None]:
!pip install google-genai

In [None]:
from google import genai
from google.genai import types
import os
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma

import getpass

# Reuse a key that is already in the environment (e.g. set by an earlier cell)
# and only prompt when none is present; never overwrite it with a placeholder.
API_KEY = os.environ.get("GOOGLE_API_KEY") or getpass.getpass('GOOGLE_API_KEY: ')
os.environ["GOOGLE_API_KEY"] = API_KEY
GCP_MODEL = 'gemini-2.5-flash'
# Same model id as the one used when the chunks were embedded and stored, so
# query vectors and stored vectors come from the same model.
embed_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vectorstore = Chroma(embedding_function=embed_model,persist_directory='./vectordb')

def query_vector_db(db,q,k,sec=None,th=None):
  """Run a scored similarity search, optionally restricted by metadata.

  Args:
    db: vector store exposing ``similarity_search_with_score``.
    q: query text.
    k: maximum number of hits to request.
    sec: optional value the ``section`` metadata field must equal.
    th: optional value the ``theme`` metadata field must equal.

  Returns:
    The list of ``(document, score)`` pairs returned by the store, or an
    empty list when nothing matched, so callers can iterate it directly.
  """
  # Only include the conditions that were actually supplied; a single
  # condition is passed as-is and "$and" is used only when both are present.
  conditions = []
  if sec:
    conditions.append({"section": sec})
  if th:
    conditions.append({"theme": th})
  if len(conditions) > 1:
    filter_criteria = {"$and": conditions}
  elif conditions:
    filter_criteria = conditions[0]
  else:
    filter_criteria = None

  # Pass `filter` only when there is one, matching the unfiltered call shape.
  search_kwargs = {"filter": filter_criteria} if filter_criteria else {}
  results = db.similarity_search_with_score(q, k=k, **search_kwargs)
  print(f"Filtered hybrid search results for query: '{q}' (k={k}) with filter {filter_criteria}")
  if not results:
    print("No documents found matching the filter and query.")
    return []
  for doc, score in results:
    print("-" * 50)
    print(f'Score: {score}')
    print(f"Document Content: {doc.page_content}")
    print(f"Document Metadata: {doc.metadata}")
  return results

question = input('Enter your question/prompt: ')
# query_vector_db can hand back None when nothing matched; normalise that to
# an empty list so the loops below never iterate over None.
resp = query_vector_db(vectorstore,question,3) or []
if not resp:
  print('Warning: no context retrieved; the answer will not be grounded in the documents.')

client = genai.Client()
config = types.GenerateContentConfig(
    system_instruction='You are scientific research assistant',
    temperature=0.9)
chat = client.chats.create(model=GCP_MODEL,config=config)

PROMPT='''
What would be a promising line of resarch with a resarch question like {q}.
Considering the followin as a start point:
{context}
'''
print('=========================')
print(resp)
print('=========================')
for doc, _score in resp:
  print('----->',doc)
# Concatenate the retrieved passages (each followed by a space) as the context.
cont = ''.join(doc.page_content+' ' for doc, _score in resp)
p =PROMPT.format(q=question,context=cont)
print(p)
response = chat.send_message(p)
print(response.text)
#How LeoTask can be evolved?

Enter your question/prompt: motor
Filtered hybrid search results for query: 'motor' (k=3) with filter None
--------------------------------------------------
Score: 0.16414135694503784
Document Content: 1
Document Metadata: {'last_modified': '2025-11-10T23:26:18', 'category': 'UncategorizedText', 'file_directory': '.', 'source': './0808.1508v1.pdf', 'element_id': 'af65f932af3452074d86a1605d5fde00', 'filename': '0808.1508v1.pdf', 'parent_id': '33fee360e9a8c58795d1a2fd01a26ed1', 'filetype': 'application/pdf', 'page_number': 1}
--------------------------------------------------
Score: 0.16414135694503784
Document Content: 1
Document Metadata: {'source': './0808.1508v1.pdf', 'page_number': 19, 'last_modified': '2025-11-10T23:26:18', 'filetype': 'application/pdf', 'element_id': '799c80bd557f30593ef6bdf7145ed6b5', 'filename': '0808.1508v1.pdf', 'file_directory': '.', 'category': 'UncategorizedText', 'parent_id': 'f6c1bdc003dd4f4be3b5e1a384ba347f'}
--------------------------------------------



```

```





```

```





```

```



In [None]:
#LOAD
from langchain_core.documents import Document
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma
import os
#GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')
import getpass

# Reuse a key that is already in the environment and only prompt when none is
# present; never overwrite it with a placeholder string.
API_KEY = os.environ.get("GOOGLE_API_KEY") or getpass.getpass('GOOGLE_API_KEY: ')
os.environ["GOOGLE_API_KEY"] = API_KEY

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
# embed_documents takes a list of texts and returns one vector per item,
# so the single sentence is wrapped in a list.
e=embeddings.embed_documents(["Python is the best to do LLM"])
print(e[-1])
print(len(e[-1]))

import os
# Collect the PDFs under ./pdfs/. os.listdir gives no ordering guarantee, so
# sort the names to keep the processing order reproducible.
files = sorted(name for name in os.listdir('./pdfs/') if name.endswith('.pdf'))
if not files:
    raise FileNotFoundError("No .pdf files found in ./pdfs/")

print(files[0])

from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
def load_pdf(filename):
    """Load the PDF at `filename` with UnstructuredPDFLoader in "elements" mode.

    Returns whatever the loader's ``load()`` call produces.
    """
    pdf_loader = UnstructuredPDFLoader(filename, mode="elements")
    loaded = pdf_loader.load()
    return loaded

def chunk(filename,chunks_all,chunk_size=1000,chunk_overlap=50,pdf_dir='./pdfs/',verbose=True):
  """Split one PDF into text chunks and append them to `chunks_all`.

  Args:
    filename: name of the PDF file inside `pdf_dir`.
    chunks_all: list that is extended in place with the resulting chunks.
    chunk_size: chunk size handed to RecursiveCharacterTextSplitter.
    chunk_overlap: chunk overlap handed to RecursiveCharacterTextSplitter.
    pdf_dir: directory that contains `filename`.
    verbose: when True (default), print every loaded element and every chunk;
      pass False to keep only the two progress lines.

  Returns:
    None; the result is delivered through `chunks_all`.
  """
  print('Processing',filename)
  elements = load_pdf(os.path.join(pdf_dir, filename))
  docs=[]
  for e in elements:
    # Build the copy once and reuse it for both the list and the debug print.
    doc = Document(page_content=e.page_content, metadata=e.metadata.copy())
    docs.append(doc)
    if verbose:
      print('------- \n',doc)
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=chunk_overlap)
  chunks = text_splitter.split_documents(docs)
  if verbose:
    print('====================== chunks ======================')
    for c in chunks:
      print('--------\n',c)
  chunks_all.extend(chunks)
  print('------ Processed',len(docs),len(chunks))

# Gather the chunks of every discovered PDF into one flat list.
all_chunks = list()
for file in files:
  chunk(filename=file, chunks_all=all_chunks)

from langchain_community.vectorstores.utils import filter_complex_metadata
# Report the chunk count before and after filter_complex_metadata. Both counts
# are printed explicitly: a bare expression in the middle of a cell shows nothing.
print(len(all_chunks))
all_chunks=filter_complex_metadata(all_chunks)
print(len(all_chunks))

# Build the embedding model here and hand that same object to Chroma, rather
# than creating `embed_model` and then indexing with a different variable.
embed_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vectorstore = Chroma(embedding_function=embed_model,persist_directory='./vectordb')

# The loop variable is not called `chunk`, which would overwrite the chunk()
# function defined above.
# NOTE(review): no explicit ids are passed, so running this again against the
# same ./vectordb presumably stores the chunks a second time; confirm.
failed = 0
for i, doc in enumerate(all_chunks):
    print(i,'=>',doc.metadata.get('filename'),doc.metadata.get('element_id'))
    try:
       # One document per call: a failing chunk is reported and skipped
       # without aborting the rest of the indexing run.
       vectorstore.add_documents([doc])
    except Exception as err:
       failed += 1
       print('#### Error ',err)
print('Indexed', len(all_chunks) - failed, 'of', len(all_chunks), 'chunks')

In [None]:
from google import genai
from google.genai import types
import os
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma

import getpass

# Reuse a key that is already in the environment and only prompt when none is
# present; never overwrite it with a placeholder string.
API_KEY = os.environ.get("GOOGLE_API_KEY") or getpass.getpass('GOOGLE_API_KEY: ')
os.environ["GOOGLE_API_KEY"] = API_KEY
GCP_MODEL = 'gemini-2.5-flash'

# Same model id as the one used when the chunks were embedded and stored, so
# query vectors and stored vectors come from the same model.
embed_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vectorstore = Chroma(embedding_function=embed_model,persist_directory='./vectordb')

def query_vector_db(db,q,k,sec=None,th=None):
  """Run a scored similarity search, optionally restricted by metadata.

  Args:
    db: vector store exposing ``similarity_search_with_score``.
    q: query text.
    k: maximum number of hits to request.
    sec: optional value the ``section`` metadata field must equal.
    th: optional value the ``theme`` metadata field must equal.

  Returns:
    The list of ``(document, score)`` pairs returned by the store, or an
    empty list when nothing matched, so callers can iterate it directly.
  """
  # Only include the conditions that were actually supplied; a single
  # condition is passed as-is and "$and" is used only when both are present.
  conditions = []
  if sec:
    conditions.append({"section": sec})
  if th:
    conditions.append({"theme": th})
  if len(conditions) > 1:
    filter_criteria = {"$and": conditions}
  elif conditions:
    filter_criteria = conditions[0]
  else:
    filter_criteria = None

  # Pass `filter` only when there is one, matching the unfiltered call shape.
  search_kwargs = {"filter": filter_criteria} if filter_criteria else {}
  results = db.similarity_search_with_score(q, k=k, **search_kwargs)
  print(f"Filtered hybrid search results for query: '{q}' (k={k}) with filter {filter_criteria}")
  if not results:
    print("No documents found matching the filter and query.")
    return []
  for doc, score in results:
    print("-" * 50)
    print(f'Score: {score}')
    print(f"Document Content: {doc.page_content}")
    print(f"Document Metadata: {doc.metadata}")
  return results

question = input('Enter your question/prompt: ')
# query_vector_db can hand back None when nothing matched; normalise that to
# an empty list so the loops below never iterate over None.
resp = query_vector_db(vectorstore,question,3) or []
if not resp:
  print('Warning: no context retrieved; the answer will not be grounded in the documents.')

client = genai.Client()
config = types.GenerateContentConfig(
    system_instruction='You are scientific research assistant',
    temperature=0.9)
chat = client.chats.create(model=GCP_MODEL,config=config)

PROMPT='''
What would be a promising line of resarch with a resarch question like {q}.
Considering the followin as a start point:
{context}
'''
print('=========================')
print(resp)
print('=========================')
for doc, _score in resp:
  print('----->',doc)
# Concatenate the retrieved passages (each followed by a space) as the context.
cont = ''.join(doc.page_content+' ' for doc, _score in resp)
p =PROMPT.format(q=question,context=cont)
print(p)
response = chat.send_message(p)
print(response.text)