In [24]:
from langchain_community.document_loaders import PyPDFLoader

# Step 1: Data ingestion (load PDF)
loader = PyPDFLoader("Ikigai.pdf")
pages = loader.load_and_split()
len(pages)

119

In [None]:
import os 

os.environ["HF_TOKEN"] = os.getenv("HUGGINGFACE_TOKEN")
os.environ["HF_HOME"] = os.getenv("HF_HOME")

os.environ["HF_HOME"]

In [26]:
# Step 2: Data split 

from langchain_text_splitters import RecursiveCharacterTextSplitter

recursive_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = recursive_splitter.split_documents(pages)
len(splits)

280

In [27]:
# Step 3: Data embedding (vector embedding)

from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [28]:
# Input text embedding 
query = "Wabi sabi"
query_result = embeddings.embed_query(query)
query_result

[-0.05371760204434395,
 0.11107083410024643,
 -0.07964161783456802,
 0.052210353314876556,
 -0.01343531720340252,
 0.010918653570115566,
 0.08037737756967545,
 -0.03682626411318779,
 0.027485646307468414,
 0.011231781914830208,
 0.04044893756508827,
 -0.09674616158008575,
 0.15219959616661072,
 0.003483816282823682,
 -0.009455897845327854,
 0.07023947685956955,
 0.01462798472493887,
 0.014244487509131432,
 -0.025932103395462036,
 -0.012699201703071594,
 -0.048773132264614105,
 0.046907272189855576,
 -0.013568388298153877,
 -0.03468267619609833,
 0.007025490049272776,
 -0.010639209300279617,
 -0.0003482998290564865,
 0.046070396900177,
 -0.04149552062153816,
 -0.0816008523106575,
 0.05641748011112213,
 0.09883572161197662,
 0.02175694890320301,
 0.022110817953944206,
 0.013082596473395824,
 0.012914290651679039,
 -0.06274289637804031,
 0.04149998724460602,
 0.020118992775678635,
 -0.03309468552470207,
 0.026330821216106415,
 -0.014529346488416195,
 0.06227346137166023,
 0.04690901190042

In [29]:
# Convert documents into vectors and save as Chroma store

from langchain_chroma import Chroma
vector_store = Chroma.from_documents(documents=splits, embedding=embeddings, persist_directory="./chroma_store")

In [30]:
# Load from chroma store
query = "Wabi sabi"
chroma_store = Chroma(
    persist_directory="./chroma_store",
    embedding_function=embeddings
)

results = chroma_store.similarity_search(query, k=5)

for doc in results:
    print(doc.page_content)

surround	us.
“All	things	human	are	short-lived	and	perishable,”	Seneca	tells	us.
2
The	temporary,	ephemeral,	and	impermanent	nature	of	the	world	is	central	to
every	Buddhist	discipline.	Keeping	this	always	in	mind	helps	us	avoid	excessive
pain	in	times	of	loss.
Wabi-sabi
	and	
ichi-go	ichi-e
Wabi-sabi
	is	a	Japanese	concept	that	shows	us	the	beauty	of	the	fleeting,
changeable,	and	imperfect	nature	of	the	world	around	us.	Instead	of	searching	for
beauty	in	perfection,	we	should	look	for	it	in	things	that	are	flawed,	incomplete.
This	is	why	the	Japanese	place	such	value,	for	example,	on	an	irregular	or
surround	us.
“All	things	human	are	short-lived	and	perishable,”	Seneca	tells	us.
2
The	temporary,	ephemeral,	and	impermanent	nature	of	the	world	is	central	to
every	Buddhist	discipline.	Keeping	this	always	in	mind	helps	us	avoid	excessive
pain	in	times	of	loss.
Wabi-sabi
	and	
ichi-go	ichi-e
Wabi-sabi
	is	a	Japanese	concept	that	shows	us	the	beauty	of	the	fleeting,
changeable,	and	imperfec