In [2]:
import textwrap
import chromadb
import numpy as np
import pandas as pd

import google.generativeai as genai
import google.ai.generativelanguage as glm
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from chromadb import Documents, EmbeddingFunction, Embeddings

import os
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load environment variables from a .env file. This call is the cell's last
# expression, so its return value is what the cell displays.
load_dotenv()

# Explicit google.generativeai configuration, currently disabled:
# gemini_api_key = os.getenv("GOOGLE_API_KEY")

# genai.configure(api_key=gemini_api_key)

True

Let's work on chunking the PDFs

In [4]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.retrievers import ParentDocumentRetriever

from langchain.storage import InMemoryStore, LocalFileStore
from langchain.storage._lc_store import create_kv_docstore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings

In [5]:
# Collect the documents loaded from every PDF in the pdfs/ folder into one
# flat list (`docs`), which the retriever cells below index.
docs = []

# os.listdir() returns names in arbitrary order; sorting keeps the order of
# `docs` (and everything indexed from it) stable between runs and machines.
for file in sorted(os.listdir("pdfs")):
    if file.endswith(".pdf"):
        loader = PyPDFLoader("pdfs/" + file)
        docs.extend(loader.load())

In [6]:
# Google's embedding-001 model produces 768-dimensional vectors.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [7]:
# Experiment 1: embed small child chunks, but hand back the whole parent
# document. No parent_splitter is passed to the retriever in this cell.

# This text splitter is used to create the child documents
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

# The vectorstore to use to index the child chunks. It reuses the shared
# `embeddings` object rather than constructing a second, identical client.
vectorstore = Chroma(
    collection_name="genai_full_documents",
    embedding_function=embeddings,
    persist_directory="vdb",
)

# The storage layer for the parent documents.
# NOTE(review): this store is in-memory while the child vectors are written
# under "vdb", so parents presumably cannot be looked up again after a kernel
# restart -- confirm, and prefer a file-backed store if persistence matters.
store = InMemoryStore()
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
)

In [9]:
# Display the docstore object to confirm which store class is in use.
store

<langchain.storage.in_memory.InMemoryBaseStore at 0x28c310190>

In [12]:
# Index the loaded PDFs through the retriever configured above.
# NOTE(review): the vectorstore is persisted under "vdb"; re-running this cell
# presumably embeds and appends the same chunks again -- confirm before re-running.
retriever.add_documents(docs)

In [6]:
# Re-open the "genai_full_documents" collection stored under vdb/ so it can
# be queried directly, without going through the parent-document retriever.
vectorstore = Chroma(
    persist_directory="vdb",
    collection_name="genai_full_documents",
    embedding_function=GoogleGenerativeAIEmbeddings(model="models/embedding-001"),
)

<langchain_community.vectorstores.chroma.Chroma at 0x28a477a50>

In [7]:
# Query the child-chunk index directly (bypassing the parent lookup) and keep
# the five closest chunks.
sub_docs = vectorstore.similarity_search("VA Loan Limitations", k=5)

# End on the bare name so the cell actually renders the hits; an assignment
# on its own produces no cell output.
sub_docs

[Document(page_content='1. What is the maximum amount I can borrow with a V A loan?\nThe ma ximum  amount  you  can  borrow  with  a  V A  loan  is  not  directly\ndetermined  by  the  V A . Instead , it’ s  set  by  your  lender , who  typically\nadheres  to  guidelines  such  as  the  conforming  loan  limit  established\nby the  Federal  Housing  Finance  Agency  ( FHF A ). The  cap  on  loan', metadata={'doc_id': 'b657749f-34de-4c0c-9727-bfe6e4b03e35', 'page': 54, 'source': 'pdfs/Your VA Loan Blueprint_ Dodge Pitfalls, Create Life-changing Wealth - Kyle Petitt.pdf'}),
 Document(page_content='How  The  VA  Loan  Works\n \nThe V A  home  loan  is  a  highly  beneﬁcial  mortgage  program .\n \nIt is important  to  note  that  the  V A  does  not  grant  the  mortgage\napproval  itself .\n \nInstead , they  establish  the  guidelines  and  provide  mortgage\ninsurance  in  case  of  default .\n \nThe actual  funding  is  provided  by  mortgage  lenders .', metadata={'doc_id': 'e34d3a8e

In [51]:
# Same query, but through the retriever, which returns parent documents from
# the docstore rather than the child chunks stored in the vector index.
# `invoke` is the current retriever entry point; `get_relevant_documents` is
# deprecated in recent LangChain releases.
retrieved_docs = retriever.invoke("VA Loan Limitations")

In [52]:
# Character count of the first document the retriever returned.
len(retrieved_docs[0].page_content)

2056

In [53]:
# Display the full list of documents the retriever returned for the query.
retrieved_docs

[Document(page_content='1. What is the maximum amount I can borrow with a V A loan?\nThe ma ximum  amount  you  can  borrow  with  a  V A  loan  is  not  directly\ndetermined  by  the  V A . Instead , it’ s  set  by  your  lender , who  typically\nadheres  to  guidelines  such  as  the  conforming  loan  limit  established\nby the  Federal  Housing  Finance  Agency  ( FHF A ). The  cap  on  loan\namounts  is  also  inﬂuenced  b y  factors  such  as  your  debt-to-income\nratio  and  credit  score . Howev er , your  entitlement , which  is  the  portion\nof the  loan  that  the  V A  guarantees , is  determined  by  the  V A .\nIf you  have  full  entitlement , there  is  no  speciﬁc  limit  on  t he  amount\nyou can  borrow  for  your  hom e  loan . This  information  comes  directly\nfrom  the  V A ’ s  website .\nY ou mi ght  be  wondering  what  happens  if  you  don’t  have  enough\nentitlement  left . In  such  case s , you  can  still  purchase  a  house , but  it\nwill require  

Let's try retrieving larger chunks

In [54]:
# Experiment 2: two-level splitting.
# The parent splitter (chunk_size=2000) produces the parent documents; the
# child splitter (chunk_size=400) must produce pieces smaller than a parent.
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

# Storage layer for the parent documents.
store = InMemoryStore()

# Vector index for the child chunks. No persist_directory is passed for this
# collection.
vectorstore = Chroma(
    embedding_function=GoogleGenerativeAIEmbeddings(model="models/embedding-001"),
    collection_name="genai_split_parents",
)

In [55]:
# Assemble the retriever from the pieces built in the previous cell. Both
# splitters are supplied this time, so parents are split as well as children.
retriever = ParentDocumentRetriever(
    parent_splitter=parent_splitter,
    child_splitter=child_splitter,
    docstore=store,
    vectorstore=vectorstore,
)

In [56]:
# Index the loaded PDFs with the parent/child splitting configured above.
retriever.add_documents(docs)

In [57]:
# Query the child-chunk index directly; no `k` is passed, so the number of
# hits is whatever similarity_search returns by default.
sub_docs = vectorstore.similarity_search("VA Loan Limitations")

In [58]:
# Print the text of the first chunk returned by the similarity search.
print(sub_docs[0].page_content)

VA Loans  Made  Easy
 
By: Carlos  Scarpero


In [59]:
# Run the query through the retriever to get parent documents back.
# `invoke` is the current retriever entry point; `get_relevant_documents` is
# deprecated in recent LangChain releases.
retrieved_docs = retriever.invoke("VA Loan Limitations")

In [60]:
# Character count of the first document the retriever returned.
len(retrieved_docs[0].page_content)

43

In [61]:
# Print the text of the first document the retriever returned.
print(retrieved_docs[0].page_content)

VA Loans  Made  Easy
 
By: Carlos  Scarpero


In [1]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain.storage import InMemoryStore, LocalFileStore
from langchain.storage._lc_store import create_kv_docstore
from langchain_community.vectorstores import Chroma
from langchain.retrievers import ParentDocumentRetriever
import os
from dotenv import load_dotenv


# Load environment variables from a .env file before any client is created.
load_dotenv()

# Collect the documents loaded from every PDF in the pdfs/ folder into one
# flat list (`docs`), which is indexed further down.
docs = []

# os.listdir() returns names in arbitrary order; sorting keeps the order of
# `docs` (and everything indexed from it) stable between runs and machines.
for file in sorted(os.listdir("pdfs")):
    if file.endswith(".pdf"):
        loader = PyPDFLoader("pdfs/" + file)
        docs.extend(loader.load())


# This text splitter is used to create the parent documents
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
# This text splitter is used to create the child documents
# It should create documents smaller than the parent
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

# The storage layer for the parent documents: a file-backed store under
# ./parent_child_directory, wrapped as a docstore.
fs = LocalFileStore("./parent_child_directory")
store = create_kv_docstore(fs)

# The vectorstore that indexes the child chunks, persisted under
# `persist_directory`.
# NOTE(review): "genai_full_documents" in "vdb" is the same collection and
# directory used by the earlier full-document experiment; child vectors left
# there by that run would have no parents in this docstore. Consider a fresh
# collection name or clearing vdb/ first -- confirm.
persist_directory = 'vdb'
vectorstore = Chroma(
    collection_name="genai_full_documents",
    embedding_function=GoogleGenerativeAIEmbeddings(model="models/embedding-001"),
    persist_directory=persist_directory,
)

retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

# Both stores live on disk, so an unconditional add_documents() would embed
# and append the same chunks again on every run. Index only while the parent
# docstore is still empty; to force a rebuild, delete ./parent_child_directory
# and the vdb/ directory together.
if next(iter(store.yield_keys()), None) is None:
    retriever.add_documents(docs)

  from .autonotebook import tqdm as notebook_tqdm
