# LangChain process

In [2]:
from langchain.vectorstores import Chroma
from chromadb import Client as ChromaClient
from chromadb.config import Settings


from langchain_anthropic import ChatAnthropic
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.vectorstores import Chroma
from chromadb import Client as ChromaClient
from chromadb.config import Settings
from dotenv import load_dotenv
from voyageai import Client
from langchain.embeddings.base import Embeddings
from langchain.schema import Document
import logging
import os

# Set up logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Load environment variables
load_dotenv()
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
VO_API_KEY = os.getenv("VO_API_KEY")

# Initialize VoyageAI client
vo = Client(api_key=VO_API_KEY)

# Define a custom embedding class
class CustomVoyageEmbeddings(Embeddings):
    def embed_query(self, query: str):
        try:
            embedding = vo.embed(query, model="voyage-3-large", input_type="document", output_dimension=1024).embeddings[0]
            logging.info(f"Generated embedding for query: {query}")
            return embedding
        except Exception as e:
            logging.error(f"Error generating query embedding: {e}")
            raise

    def embed_documents(self, texts: list[str]):
        try:
            embeddings = [
                vo.embed(text, model="voyage-3-large", input_type="document", output_dimension=1024).embeddings[0]
                for text in texts
            ]
            logging.info(f"Generated embeddings for {len(texts)} documents.")
            return embeddings
        except Exception as e:
            logging.error(f"Error generating document embeddings: {e}")
            raise

# Initialize custom embedding class
voyage_embeddings = CustomVoyageEmbeddings()

# Initialize Chroma client and store
from langchain_chroma import Chroma  # Updated import

def initialize_chroma(persist_directory="./chroma_data"):
    try:
        # Ensure the persistence directory exists
        if not os.path.exists(persist_directory):
            os.makedirs(persist_directory)
            logging.info(f"Created persistence directory: {persist_directory}")

        # Initialize Chroma vector store with persistence
        chroma_store = Chroma(
            collection_name="qa_embeddings_voyage-3-large_1024",
            embedding_function=voyage_embeddings,
            persist_directory=persist_directory,  # Enable persistence
        )
        logging.info(f"Chroma vector store initialized with persistence at: {persist_directory}")
        return chroma_store
    except Exception as e:
        logging.error(f"Error initializing Chroma vector store: {e}")
        raise


# Initialize Anthropic client
def initialize_anthropic():
    try:
        anthropic_llm = ChatAnthropic(
            model="claude-3-haiku-20240307",
            anthropic_api_key=ANTHROPIC_API_KEY
        )
        logging.info("Anthropic client initialized.")
        return anthropic_llm
    except Exception as e:
        logging.error(f"Error initializing Anthropic client: {e}")
        raise

# Define Retrieval-based QA chain
def initialize_qa_chain(chroma_store, anthropic_llm, n_neighbors=5):
    # Define a retriever with the specified number of neighbors
    retriever = chroma_store.as_retriever(search_kwargs={"k": n_neighbors})
    
    # Define a prompt template
    prompt_template = """You are a helpful assistant. Use the following retrieved documents to answer the question, be very specific and use only data attached:
    {context}

    Question: {question}

    Answer:"""
    
    qa_prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

    # Define a custom chain to combine retrieved documents
    def combine_documents(docs):
        """Combine the content of retrieved documents."""
        return "\n\n".join([doc.page_content for doc in docs])

    class CustomRetriever:
        """Custom retriever to integrate document joining."""
        def __init__(self, retriever):
            self.retriever = retriever

        def get_relevant_documents(self, query):
            """Retrieve documents and combine them."""
            docs = self.retriever.get_relevant_documents(query)
            context = combine_documents(docs)
            return context, docs

    custom_retriever = CustomRetriever(retriever)

    # Define a custom QA chain
    class CustomRetrievalQA:
        """Custom Retrieval QA to integrate combined context."""
        def __init__(self, llm, retriever, prompt_template):
            self.llm = llm
            self.retriever = retriever
            self.prompt_template = prompt_template

        def __call__(self, inputs):
            query = inputs["query"]
            context, source_documents = self.retriever.get_relevant_documents(query)
            prompt = self.prompt_template.format(context=context, question=query)
            answer = self.llm(prompt)
            return {"result": answer, "source_documents": source_documents}

    qa_chain = CustomRetrievalQA(
        llm=anthropic_llm,
        retriever=custom_retriever,
        prompt_template=prompt_template
    )
    logging.info("Custom Retrieval-based QA chain initialized.")
    return qa_chain

# Main function to ask questions
def ask_question(qa_chain, query):
    logging.info(f"Asking question: {query}")
    result = qa_chain({"query": query})
    print("Answer:", result["result"])
    print("\nRetrieved Documents:")
    for doc in result["source_documents"]:
        print(f"Context: {doc.page_content}")
        print(f"Metadata: {doc.metadata}")

# Initialize components
chroma_store = initialize_chroma()
anthropic_llm = initialize_anthropic()
qa_chain = initialize_qa_chain(chroma_store, anthropic_llm, n_neighbors=10)

# Example query
query = 'VH1 declared what song the "Greatest song of the 2000s?"'
result = ask_question(qa_chain, query)


2025-01-19 19:09:42,413 - INFO - Chroma vector store initialized with persistence at: ./chroma_data
2025-01-19 19:09:42,414 - INFO - Anthropic client initialized.
2025-01-19 19:09:42,414 - INFO - Custom Retrieval-based QA chain initialized.
2025-01-19 19:09:42,415 - INFO - Asking question: VH1 declared what song the "Greatest song of the 2000s?"
2025-01-19 19:09:42,675 - INFO - Generated embedding for query: VH1 declared what song the "Greatest song of the 2000s?"
2025-01-19 19:09:43,675 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


Answer: content='According to the information provided, VH1 declared Beyoncé\'s debut single "Crazy in Love" as the "Greatest Song of the 2000s".' additional_kwargs={} response_metadata={'id': 'msg_01D5sLvZYu5JBneojqeRmyCo', 'model': 'claude-3-haiku-20240307', 'stop_reason': 'end_turn', 'stop_sequence': None, 'usage': {'cache_creation_input_tokens': 0, 'cache_read_input_tokens': 0, 'input_tokens': 1950, 'output_tokens': 39}} id='run-2cf5aa96-dec8-4c72-93e5-8f04f98ae2fa-0' usage_metadata={'input_tokens': 1950, 'output_tokens': 39, 'total_tokens': 1989, 'input_token_details': {'cache_read': 0, 'cache_creation': 0}}

Retrieved Documents:
Context: On February 6, 2016, one day before her performance at the Super Bowl, Beyoncé released a new single exclusively on music streaming service Tidal called "Formation".
Metadata: {'context': 'On February 6, 2016, one day before her performance at the Super Bowl, Beyoncé released a new single exclusively on music streaming service Tidal called "Forma

In [None]:
all_records = chroma_store._collection.get(include=["documents", "metadatas", "embeddings"])
print(all_records)