LangChain implements a Document abstraction, which is intended to represent a unit of text and its associated metadata. It has two attributes:

page_content: a string representing the content.

metadata: a Dict containing arbitrary metadata.

The metadata attribute can capture information about the source of the document, its relationship to other documents, and other information. Note that an individual Document object often represents a chunk of a larger document.

Let's generate some sample docs.

In [9]:
from langchain_core.documents import Document

# Sample corpus as (page_content, source) pairs; several entries share a source on purpose.
sample_texts = [
    ("Dogs are great pets, known for their loyalty and ability to provide companionship.", "dog_info_doc"),
    ("Cats are independent creatures, known for their aloofness and ability to hunt mice.", "cat_info_doc"),
    ("Birds are known for their ability to fly and sing.", "bird_info_doc"),
    ("Fish are known for their ability to swim and breathe underwater.", "fish_info_doc"),
    ("Dogs are known for their ability to bark and bite.", "dog_info_doc"),
    ("Cats are known for their ability to meow and purr.", "cat_info_doc"),
]

# One Document per pair, each with its own metadata dict recording where the text came from.
documents = [
    Document(page_content=text, metadata={"source": source})
    for text, source in sample_texts
]

In [2]:
# Echo the list to confirm the six sample documents and their metadata.
documents

[Document(metadata={'source': 'dog_info_doc'}, page_content='Dogs are great pets, known for their loyalty and ability to provide companionship.'),
 Document(metadata={'source': 'cat_info_doc'}, page_content='Cats are independent creatures, known for their aloofness and ability to hunt mice.'),
 Document(metadata={'source': 'bird_info_doc'}, page_content='Birds are known for their ability to fly and sing.'),
 Document(metadata={'source': 'fish_info_doc'}, page_content='Fish are known for their ability to swim and breathe underwater.'),
 Document(metadata={'source': 'dog_info_doc'}, page_content='Dogs are known for their ability to bark and bite.'),
 Document(metadata={'source': 'cat_info_doc'}, page_content='Cats are known for their ability to meow and purr.')]

In [3]:
## vector store
from langchain_chroma import Chroma


In [None]:
import os

from dotenv import load_dotenv
from langchain_groq import ChatGroq

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

groq_api_key = os.getenv("GROQ_API_KEY")
huggingface_token = os.getenv("HUGGINGFACE_API_KEY")

# Fail fast with an actionable message instead of handing `None` to the client
# and getting a less obvious error later.
if not groq_api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set. Add it to your environment or .env file "
        "before creating the ChatGroq client."
    )

# The HuggingFace token is optional: export it as HF_TOKEN only when it exists.
if huggingface_token:
    os.environ["HF_TOKEN"] = huggingface_token
    print("HuggingFace token loaded successfully")
else:
    print("HuggingFace token not found - some features may be limited")

# Chat model used by the RAG chain further down.
# NOTE: the error recorded at the end of this notebook names `llama3-8b-8192`,
# which Groq reports as decommissioned. After changing the model here, re-run
# this cell and every cell that uses `llm` so outputs match the code.
llm = ChatGroq(
    model_name="llama-3.1-8b-instant",
    api_key=groq_api_key,
    temperature=0.5,
    max_tokens=8192,
    top_p=1,
    frequency_penalty=0.0,
    presence_penalty=0.0,
)

HuggingFace token loaded successfully


In [5]:
from langchain_huggingface import HuggingFaceEmbeddings

# Use the canonical Hub ID (lowercase "v" in "v2"); the previous "V2" spelling
# does not match the published model name.
embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2"  # produces 384-dimensional embeddings
)

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
## vector store
from langchain_chroma import Chroma

# Embed the sample documents and index them in a Chroma vector store.
vector_store = Chroma.from_documents(documents=documents, embedding=embeddings)
vector_store

<langchain_chroma.vectorstores.Chroma at 0x1fbb150b470>

In [12]:
# Return the stored documents most similar to the query string "cat".
vector_store.similarity_search("cat")


[Document(id='7b22b3c8-7415-4e89-a35b-80e8ecc32903', metadata={'source': 'cat_info_doc'}, page_content='Cats are known for their ability to meow and purr.'),
 Document(id='9f1c6050-fcac-47be-8428-7bdf1a009a34', metadata={'source': 'cat_info_doc'}, page_content='Cats are independent creatures, known for their aloofness and ability to hunt mice.'),
 Document(id='472fa54a-e8e6-4c9d-a17e-4cca5cd6e5b0', metadata={'source': 'dog_info_doc'}, page_content='Dogs are great pets, known for their loyalty and ability to provide companionship.'),
 Document(id='c667f365-080a-4d34-b92c-8d2c0f3788eb', metadata={'source': 'dog_info_doc'}, page_content='Dogs are known for their ability to bark and bite.')]

In [14]:

## Async query
# Same search issued through the async API; the notebook cell awaits it directly at top level.
await vector_store.asimilarity_search("cat")

[Document(id='7b22b3c8-7415-4e89-a35b-80e8ecc32903', metadata={'source': 'cat_info_doc'}, page_content='Cats are known for their ability to meow and purr.'),
 Document(id='9f1c6050-fcac-47be-8428-7bdf1a009a34', metadata={'source': 'cat_info_doc'}, page_content='Cats are independent creatures, known for their aloofness and ability to hunt mice.'),
 Document(id='472fa54a-e8e6-4c9d-a17e-4cca5cd6e5b0', metadata={'source': 'dog_info_doc'}, page_content='Dogs are great pets, known for their loyalty and ability to provide companionship.'),
 Document(id='c667f365-080a-4d34-b92c-8d2c0f3788eb', metadata={'source': 'dog_info_doc'}, page_content='Dogs are known for their ability to bark and bite.')]

In [15]:

# Each result is a (Document, score) pair. The score here behaves like a distance:
# in the output below, the closest matches carry the smallest values.
vector_store.similarity_search_with_score("cat")


[(Document(id='7b22b3c8-7415-4e89-a35b-80e8ecc32903', metadata={'source': 'cat_info_doc'}, page_content='Cats are known for their ability to meow and purr.'),
  0.8909776210784912),
 (Document(id='9f1c6050-fcac-47be-8428-7bdf1a009a34', metadata={'source': 'cat_info_doc'}, page_content='Cats are independent creatures, known for their aloofness and ability to hunt mice.'),
  1.0181410312652588),
 (Document(id='472fa54a-e8e6-4c9d-a17e-4cca5cd6e5b0', metadata={'source': 'dog_info_doc'}, page_content='Dogs are great pets, known for their loyalty and ability to provide companionship.'),
  1.4765703678131104),
 (Document(id='c667f365-080a-4d34-b92c-8d2c0f3788eb', metadata={'source': 'dog_info_doc'}, page_content='Dogs are known for their ability to bark and bite.'),
  1.572890281677246)]

In [16]:
## Retrievers
# LangChain VectorStore objects do not subclass Runnable, so they cannot be plugged directly into
# LCEL chains.

# LangChain Retrievers are Runnables, so they implement a standard set of methods (e.g., synchronous and
# asynchronous invoke and batch operations) and are designed to be incorporated into LCEL chains.

# We can create a simple version of this ourselves, without subclassing Retriever. Once
# we choose which method we wish to use to retrieve documents, we can create a Runnable easily.
# Below we will build one around the similarity_search method.


In [18]:
from typing import List

from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda

# Wrap the vector store's similarity_search in a Runnable, then pin k=2 so
# every query returns its two closest documents.
similarity_runnable = RunnableLambda(vector_store.similarity_search)
retriever = similarity_runnable.bind(k=2)

# Run both queries in a single batched call; each query gets its own result list.
retriever.batch(["cat", "dog"])

[[Document(id='7b22b3c8-7415-4e89-a35b-80e8ecc32903', metadata={'source': 'cat_info_doc'}, page_content='Cats are known for their ability to meow and purr.'),
  Document(id='9f1c6050-fcac-47be-8428-7bdf1a009a34', metadata={'source': 'cat_info_doc'}, page_content='Cats are independent creatures, known for their aloofness and ability to hunt mice.')],
 [Document(id='472fa54a-e8e6-4c9d-a17e-4cca5cd6e5b0', metadata={'source': 'dog_info_doc'}, page_content='Dogs are great pets, known for their loyalty and ability to provide companionship.'),
  Document(id='c667f365-080a-4d34-b92c-8d2c0f3788eb', metadata={'source': 'dog_info_doc'}, page_content='Dogs are known for their ability to bark and bite.')]]

In [19]:
# 2nd technique


In [20]:
# Vector stores implement an "as_retriever" method that will generate a Retriever, specifically a
# VectorStoreRetriever. These retrievers include specific "search_type" and "search_kwargs" attributes
# that identify which methods of the underlying vector store to call, and how to parameterize them.
# For instance, the following builds the same kind of similarity retriever (here with k=1):

In [22]:
# Build a retriever from the vector store that runs a plain similarity search
# and keeps only the single best match per query (k=1).
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 1})

# Run both queries in one batched call; each query yields its own result list.
retriever.batch(["dog", "cat"])

[[Document(id='472fa54a-e8e6-4c9d-a17e-4cca5cd6e5b0', metadata={'source': 'dog_info_doc'}, page_content='Dogs are great pets, known for their loyalty and ability to provide companionship.')],
 [Document(id='7b22b3c8-7415-4e89-a35b-80e8ecc32903', metadata={'source': 'cat_info_doc'}, page_content='Cats are known for their ability to meow and purr.')]]

In [24]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

# Prompt text: the user's question first, then the retrieved context the model must rely on.
message = """Answer this question using the provided context only.

{question}

Context:
{context}
"""

prompt = ChatPromptTemplate.from_messages([("human", message)])

# The incoming question feeds both entries: the retriever turns it into context
# documents, while RunnablePassthrough forwards it unchanged as the question.
chain_inputs = {"context": retriever, "question": RunnablePassthrough()}
rag_chain = chain_inputs | prompt | llm

In [26]:
# Ask the chain a question and show only the text of the model's reply.
question = "Tell me about dogs"
response = rag_chain.invoke(question)
print(response.content)

BadRequestError: Error code: 400 - {'error': {'message': 'The model `llama3-8b-8192` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', 'type': 'invalid_request_error', 'code': 'model_decommissioned'}}