In [1]:
## Vector Stores and Retrievers

# Vector store and retriever abstractions
# These abstractions are designed to support retrieval of data -- from vector databases and other sources --
# for integration with LLM workflows. They are important for applications that fetch data to be reasoned over as part of model inference, as
# in the case of retrieval-augmented generation.


In [None]:
# Documents
#     LangChain implements a Document abstraction, which is intended to represent a unit of text and associated metadata. It has two attributes:

#   page_content: a string representing the content
#   metadata: a dict containing arbitrary metadata
#   The metadata attribute can capture information about the source of the document, its relationship to other documents, and other information. 
# Note that an individual Document object often represents a chunk of a larger document


In [2]:
from langchain_core.documents import Document

In [3]:
# Sample corpus: one Document per pet fact. Each entry pairs the text
# (page_content) with the name of the source it came from (metadata["source"]).
documents = [
    Document(page_content=text, metadata={"source": source})
    for text, source in [
        (
            "Dogs are great companions, known for their loyalty and friendliness.",
            "mammal-pets-doc",
        ),
        (
            "Cats are independent pets that often enjoy their own space.",
            "mammal-pets-doc",
        ),
        (
            "Goldfish are popular pets for beginners, requiring relatively simple care.",
            "fish-pets-doc",
        ),
        (
            "Parrots are intelligent birds capable of mimicking human speech.",
            "bird-pets-doc",
        ),
        (
            "Rabbits are social animals that need plenty of space to hop around.",
            "mammal-pets-doc",
        ),
    ]
]

In [4]:
# Display the list to confirm the Document objects were created as expected
documents

[Document(metadata={'source': 'mammal-pets-doc'}, page_content='Dogs are great companions, known for their loyalty and friendliness.'),
 Document(metadata={'source': 'mammal-pets-doc'}, page_content='Cats are independent pets that often enjoy their own space.'),
 Document(metadata={'source': 'fish-pets-doc'}, page_content='Goldfish are popular pets for beginners, requiring relatively simple care.'),
 Document(metadata={'source': 'bird-pets-doc'}, page_content='Parrots are intelligent birds capable of mimicking human speech.'),
 Document(metadata={'source': 'mammal-pets-doc'}, page_content='Rabbits are social animals that need plenty of space to hop around.')]

In [None]:
# Summary

#  Load libraries & env vars > read documents > initialize the Groq LLM >
# load the HuggingFace embedding model > use Chroma DB as the vector store
# > use Chroma DB, the embedding model and the documents to store embeddings >
# use the vector store to search and perform other actions as well
# Use retrievers (built with two different methods)
# Use a chain (RAG chain) to integrate the pieces: retriever | prompt | llm

In [5]:
import os
from dotenv import load_dotenv
load_dotenv()
from langchain_groq import ChatGroq
# API key for the Groq-hosted chat model (None when GROQ_API_KEY is not set).
groq_api_key = os.getenv("GROQ_API_KEY")

# os.environ only accepts str values, so assigning the None that os.getenv
# returns for an unset variable would raise TypeError. Re-export the token
# only when it is actually present.
hf_token = os.getenv("HF_TOKEN")
if hf_token is not None:
    os.environ["HF_TOKEN"] = hf_token

In [7]:
# Chat model served through Groq.
# NOTE(review): the original note here mentioned "Gemma2-9b-It", presumably an
# alternative model id to try -- confirm.
llm = ChatGroq(
    groq_api_key=groq_api_key,
    model="Llama3-8b-8192",
)
llm

ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x0000019A60EAD930>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x0000019A60EDC370>, model_name='Llama3-8b-8192', groq_api_key=SecretStr('**********'))

In [8]:
# The embedding model is downloaded to your local machine the first time it is used, before it starts working

from langchain_huggingface import HuggingFaceEmbeddings
# Sentence-embedding model used to turn text into vectors; the repr shown
# below reports a 384-dimensional word embedding.
embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
)
embeddings


  from tqdm.autonotebook import tqdm, trange


HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [10]:
## Vector Stores (Embedding --> store in database)
from langchain_chroma import Chroma

# Give every document a stable, position-based id. Without explicit ids the
# store generates fresh random ones on each call, so re-running this cell in
# the same kernel would insert the same documents again and searches would
# start returning duplicates. With fixed ids a re-run overwrites the existing
# entries instead, which keeps the cell idempotent.
document_ids = [f"pet-doc-{index}" for index in range(len(documents))]

# Embed each document with `embeddings` and store the vectors in Chroma.
vectorstore = Chroma.from_documents(
    documents,
    embedding=embeddings,
    ids=document_ids,
)

In [11]:
# Display the vector store object to confirm it was created
vectorstore

<langchain_chroma.vectorstores.Chroma at 0x19a7d4d1ed0>

In [12]:
# Return the stored documents most similar to the query "cat"
# (the output below lists four documents, with the cat document first)
vectorstore.similarity_search("cat")

[Document(metadata={'source': 'mammal-pets-doc'}, page_content='Cats are independent pets that often enjoy their own space.'),
 Document(metadata={'source': 'mammal-pets-doc'}, page_content='Dogs are great companions, known for their loyalty and friendliness.'),
 Document(metadata={'source': 'mammal-pets-doc'}, page_content='Rabbits are social animals that need plenty of space to hop around.'),
 Document(metadata={'source': 'bird-pets-doc'}, page_content='Parrots are intelligent birds capable of mimicking human speech.')]

In [13]:
# Async query: the same search through the async API, awaited directly at the
# top level of the notebook cell
await vectorstore.asimilarity_search("cat")

[Document(metadata={'source': 'mammal-pets-doc'}, page_content='Cats are independent pets that often enjoy their own space.'),
 Document(metadata={'source': 'mammal-pets-doc'}, page_content='Dogs are great companions, known for their loyalty and friendliness.'),
 Document(metadata={'source': 'mammal-pets-doc'}, page_content='Rabbits are social animals that need plenty of space to hop around.'),
 Document(metadata={'source': 'bird-pets-doc'}, page_content='Parrots are intelligent birds capable of mimicking human speech.')]

In [14]:
# Same search, but each result comes back as a (Document, score) pair.
# NOTE(review): the score looks like a distance -- the best match in the output
# below has the smallest value -- confirm against the vector store docs.
vectorstore.similarity_search_with_score("cat")

[(Document(metadata={'source': 'mammal-pets-doc'}, page_content='Cats are independent pets that often enjoy their own space.'),
  0.9351057410240173),
 (Document(metadata={'source': 'mammal-pets-doc'}, page_content='Dogs are great companions, known for their loyalty and friendliness.'),
  1.5740898847579956),
 (Document(metadata={'source': 'mammal-pets-doc'}, page_content='Rabbits are social animals that need plenty of space to hop around.'),
  1.5956902503967285),
 (Document(metadata={'source': 'bird-pets-doc'}, page_content='Parrots are intelligent birds capable of mimicking human speech.'),
  1.6657923460006714)]

In [15]:
## Retriever 
# LangChain VectorStore objects don't subclass Runnable, so they can't be integrated directly into LangChain Expression Language (LCEL) chains.
# LangChain retrievers are Runnables, so they implement a standard set of methods (e.g. sync and async invoke and batch operations) and are designed to be incorporated into LCEL chains.


In [17]:
from typing import List

from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda

In [None]:
# Here we will create a retriever in two ways:
# 1. RunnableLambda
# 2. as_retriever method (preferred)

In [18]:
# Method 1 - wrap the vector store's search function in a RunnableLambda

# bind(k=1) fixes the keyword argument k=1 for every call, so each query
# returns only its single top result
retriever = (
    RunnableLambda(vectorstore.similarity_search)
    .bind(k=1)
)
retriever.batch(["cat", "dog"])

[[Document(metadata={'source': 'mammal-pets-doc'}, page_content='Cats are independent pets that often enjoy their own space.')],
 [Document(metadata={'source': 'mammal-pets-doc'}, page_content='Dogs are great companions, known for their loyalty and friendliness.')]]

In [20]:
# Method 2 - build the retriever directly from the vector store

# Similarity search limited to the single best match per query (k=1)
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 1})

# Run both queries in one batch call; one result list comes back per query
retriever.batch(["cat", "dog"])


[[Document(metadata={'source': 'mammal-pets-doc'}, page_content='Cats are independent pets that often enjoy their own space.')],
 [Document(metadata={'source': 'mammal-pets-doc'}, page_content='Dogs are great companions, known for their loyalty and friendliness.')]]

In [21]:
# RAG

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

# Prompt text: {question} and {context} are filled in when the chain runs.
message = """
Answer this question using the provided context only.
{question}

context: {context}
"""

# Single human-role message built from the template above.
prompt = ChatPromptTemplate.from_messages([("human", message)])

# Pipeline: the retriever supplies `context`, the incoming question is passed
# through unchanged as `question`, the prompt is filled in, and the result is
# sent to the LLM.
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
)

response = rag_chain.invoke("tell me  about dogs")
print(response.content)

According to the context, dogs are great companions, known for their loyalty and friendliness.
