In [None]:
! pip install langchain langchain-community sentence-transformers langchain_chroma 

In [None]:
import os

import torch
from google.colab import userdata
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import OpenAI
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_text_splitters import RecursiveCharacterTextSplitter


In [None]:
# Read the Gemini API key from Colab's secret storage and export it through the
# process environment in one step, so the key never appears in the notebook.
GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"] = userdata.get("GOOGLE_API_KEY")

In [None]:
# One TextLoader per source text file; the files are read in a later cell.
loaders = [
    TextLoader(path)
    for path in (
        "/content/data/paul_graham_essay.txt",
        "/content/data/state_of_the_union.txt",
    )
]

In [None]:
# Flatten the documents from every loader into a single list.
# `loader.load()` returns a list of documents, so appending its result would
# produce a list of lists, which `retriever.add_documents` cannot process.
docs = []

for loader in loaders:
    docs.extend(loader.load())

In [None]:
# Splitter for the small chunks (size limit 400); it is handed to the
# ParentDocumentRetriever as `child_splitter` further down.
child_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
)

In [None]:
# Splitter for the larger chunks (size limit 2000); it is handed to the
# ParentDocumentRetriever as `parent_splitter` further down.
parent_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
)

Specify the embedding model (either Hugging Face sentence-transformers or Gemini)

In [None]:
# Sentence-transformers model used for the local (Hugging Face) embedding option.
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"

# Use the GPU when the runtime has one; otherwise fall back to CPU so this cell
# still runs on a CPU-only runtime instead of failing on a hard-coded "cuda".
model_kwargs = {"device": "cuda" if torch.cuda.is_available() else "cpu"}

embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    model_kwargs=model_kwargs,
)

Or, alternatively, use Gemini embeddings

In [None]:
# Gemini-hosted embedding model; authenticates through the GOOGLE_API_KEY
# environment variable exported earlier in the notebook.
gemini_embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
)

In [None]:
# Chat model used both for this smoke test and for the QA chain.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro")

# Quick sanity check that the API key and model name work.
result = llm.invoke("Write a ballad about Langchain")
# Print only the generated text; printing the message object itself would dump
# its full repr (metadata included) instead of the readable answer.
print(result.content)

In [None]:
# Chroma collection backing the retriever.
vector_store = Chroma(
    collection_name="full_documents",
    embedding_function=gemini_embeddings,  # Or embeddings from Huggingface
)

In [None]:
# In-memory store passed to the ParentDocumentRetriever as its `docstore`;
# its keys are listed in a later cell after the documents are added.
store = InMemoryStore()

In [None]:
# Wire the two splitters and the two stores together into one retriever.
retriever = ParentDocumentRetriever(
    # Splitters: large parent chunks, smaller child chunks.
    parent_splitter=parent_splitter,
    child_splitter=child_splitter,
    # Storage backends.
    vectorstore=vector_store,
    docstore=store,
)

In [None]:
# Index the loaded documents; `ids=None` leaves ID generation to the retriever.
retriever.add_documents(documents=docs, ids=None)

In [None]:
# Show every key currently held in the docstore.
[*store.yield_keys()]

In [None]:
# Sample question reused by the retrieval and QA cells.
query_0 = (
    "What did the president say about Ketanji Brown Jackson?"
)

In [None]:
# Run the sample question through the retriever and display what it returns.
retriever.invoke(input=query_0)

In [None]:
# Query the vector store directly, for comparison with the retriever's output.
# `similarity_search` requires the query text; calling it with no arguments
# raises a TypeError.
vector_store.similarity_search(query_0)

In [None]:
# Ask the chat model for free-form text (no retrieval involved).
result_1 = llm.invoke(input="Write a ballad about Langchain")

In [None]:
# Print just the `content` attribute of the model response.
print(result_1.content)

In [None]:
# Question-answering chain: the retriever supplies context and the chat model
# answers, using the "stuff" chain type.
qa = RetrievalQA.from_chain_type(
    chain_type="stuff",
    retriever=retriever,
    llm=llm,
)

In [None]:
# Answer the sample question with the QA chain. `Chain.run` is deprecated in
# favour of `invoke`; RetrievalQA takes its input under "query" and returns the
# answer under "result", so this displays the same string `run` produced.
qa.invoke({"query": query_0})["result"]