In [59]:
# Load environment variables from a local .env file before any client is built.
# NOTE(review): presumably this supplies the OpenAI API key used by the later
# cells — confirm against the project's .env file.
from dotenv import load_dotenv
load_dotenv()

True

In [65]:
"""Hypothetical Document Embeddings.

https://arxiv.org/abs/2212.10496
"""

from __future__ import annotations

from typing import Any, Dict, List, Optional

import numpy as np
from langchain_core.callbacks import CallbackManagerForChainRun
from langchain_core.embeddings import Embeddings
from langchain_core.language_models import BaseLanguageModel
from langchain_core.prompts import BasePromptTemplate
from langchain_core.pydantic_v1 import Extra

from langchain.chains.base import Chain
from langchain.chains.hyde.prompts import PROMPT_MAP
from langchain.chains.llm import LLMChain


class HypotheticalDocumentEmbedder(Chain, Embeddings):
    """Generate hypothetical document for query, and then embed that.

    Based on https://arxiv.org/abs/2212.10496

    The object is both a ``Chain`` (it forwards calls to ``llm_chain``) and an
    ``Embeddings`` implementation: documents are embedded directly with
    ``base_embeddings``, while a query is first expanded by ``llm_chain`` into
    one or more generated passages whose embeddings are averaged.

    This notebook copy additionally prints the prompt and every generated
    passage so the intermediate HyDE steps are visible in the cell output.
    """

    # Embedding model applied to real documents and to the generated passages.
    base_embeddings: Embeddings
    # Chain that turns a query into one or more hypothetical passages.
    llm_chain: LLMChain

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid
        arbitrary_types_allowed = True

    @property
    def input_keys(self) -> List[str]:
        """Input keys for Hyde's LLM chain."""
        return self.llm_chain.input_keys

    @property
    def output_keys(self) -> List[str]:
        """Output keys for Hyde's LLM chain."""
        return self.llm_chain.output_keys

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed ``texts`` with the base embeddings, without any LLM expansion.

        Args:
            texts: The documents to embed.

        Returns:
            Whatever ``base_embeddings.embed_documents`` returns for ``texts``.
        """
        return self.base_embeddings.embed_documents(texts)

    def combine_embeddings(self, embeddings: List[List[float]]) -> List[float]:
        """Combine several embeddings into one by element-wise averaging.

        Args:
            embeddings: Equal-length vectors, one per generated passage.

        Returns:
            The element-wise mean as a list of built-in Python floats.

        Raises:
            ValueError: If ``embeddings`` is empty, since there is nothing to
                average.
        """
        if len(embeddings) == 0:
            raise ValueError(
                "Cannot combine an empty list of embeddings; the LLM chain "
                "produced no hypothetical documents."
            )
        # ``tolist()`` converts NumPy scalars to native floats so the result is
        # a plain ``List[float]`` as declared.
        return np.array(embeddings).mean(axis=0).tolist()

    def embed_query(self, text: str) -> List[float]:
        """Generate hypothetical document(s) for ``text`` and embed them.

        Args:
            text: The user query, passed to the first input variable of
                ``llm_chain``.

        Returns:
            The averaged embedding of all generated passages.
        """
        var_name = self.llm_chain.input_keys[0]
        result = self.llm_chain.generate([{var_name: text}])
        # One input was submitted, so only the first generation list is used.
        documents = [generation.text for generation in result.generations[0]]
        # Show every generated passage so the HyDE step is visible in the
        # notebook output.
        for ii, doc in enumerate(documents):
            print(f"### Hyde Document {ii+1} ###\n{doc}\n")
        embeddings = self.embed_documents(documents)
        return self.combine_embeddings(embeddings)

    def _call(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> Dict[str, str]:
        """Call the internal llm chain.

        Args:
            inputs: Values for the LLM chain's input keys.
            run_manager: Callback manager of the enclosing run; a no-op
                manager is used when omitted.

        Returns:
            The dictionary produced by ``llm_chain``.
        """
        _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
        # ``invoke`` replaces the deprecated ``Chain.__call__`` form; callbacks
        # are passed through the run config.
        return self.llm_chain.invoke(
            inputs, config={"callbacks": _run_manager.get_child()}
        )

    @classmethod
    def from_llm(
        cls,
        llm: BaseLanguageModel,
        base_embeddings: Embeddings,
        prompt_key: Optional[str] = None,
        custom_prompt: Optional[BasePromptTemplate] = None,
        **kwargs: Any,
    ) -> HypotheticalDocumentEmbedder:
        """Load and use LLMChain with either a specific prompt key or custom prompt.

        Args:
            llm: Language model that writes the hypothetical passages.
            base_embeddings: Embedding model for documents and passages.
            prompt_key: Key into ``PROMPT_MAP``; ignored when ``custom_prompt``
                is given.
            custom_prompt: Prompt to use instead of a ``PROMPT_MAP`` entry.
            **kwargs: Extra keyword arguments forwarded to the constructor.

        Returns:
            A configured ``HypotheticalDocumentEmbedder``.

        Raises:
            ValueError: If no ``custom_prompt`` is given and ``prompt_key`` is
                missing or not a key of ``PROMPT_MAP``.
        """
        if custom_prompt is not None:
            prompt = custom_prompt
        elif prompt_key is None:
            raise ValueError(
                f"Must specify prompt_key if custom_prompt not provided. Should be one "
                f"of {list(PROMPT_MAP.keys())}."
            )
        elif prompt_key not in PROMPT_MAP:
            # A key was supplied but is not recognised; report that instead of
            # claiming it is missing.
            raise ValueError(
                f"Unknown prompt_key {prompt_key!r}. Should be one "
                f"of {list(PROMPT_MAP.keys())}."
            )
        else:
            prompt = PROMPT_MAP[prompt_key]

        llm_chain = LLMChain(llm=llm, prompt=prompt)
        # Show the selected prompt in the notebook output.
        print(f"### Hyde Propmt ###\n{prompt}\n")
        return cls(base_embeddings=base_embeddings, llm_chain=llm_chain, **kwargs)

    @property
    def _chain_type(self) -> str:
        """Identifier string for this chain type."""
        return "hyde_chain"

In [71]:
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI, OpenAIEmbeddings

# Embedding model that HyDE wraps; later cells reuse this object.
base_embeddings = OpenAIEmbeddings()

# First demo: request a single completion per query.
multi_llm = OpenAI(best_of=1, n=1)

# Build the HyDE embedder from the packaged "web_search" prompt.
embeddings = HypotheticalDocumentEmbedder.from_llm(
    llm=multi_llm,
    base_embeddings=base_embeddings,
    prompt_key="web_search",
)

# Embedding a query prints the generated passage and returns its embedding.
result = embeddings.embed_query("Where is the Taj Mahal?")

### Hyde Propmt ###
input_variables=['QUESTION'] template='Please write a passage to answer the question \nQuestion: {QUESTION}\nPassage:'

### Hyde Document 1 ###
 The majestic Taj Mahal is located in Agra, a city in the northern state of Uttar Pradesh in India. It is situated on the banks of the Yamuna River and is approximately 200 kilometers south of the capital city of New Delhi. The iconic monument stands as a symbol of love and is one of the most famous attractions in the world, drawing millions of visitors each year. It was built by the Mughal emperor Shah Jahan in the 17th century as a mausoleum for his beloved wife, Mumtaz Mahal. The Taj Mahal is a UNESCO World Heritage Site and is considered a masterpiece of Mughal architecture, featuring intricate marble work, beautiful gardens, and a symmetrical design. It is a must-visit destination for anyone traveling to India and is a testament to the enduring power of love and beauty. 



In [72]:
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import CharacterTextSplitter

# Read the speech with an explicit encoding so the result does not depend on
# the platform's locale default (assumes the file is UTF-8 — confirm).
with open("data/state_of_the_union.txt", encoding="utf-8") as f:
    state_of_the_union = f.read()

# Split the raw text into chunks (chunk_size=1000, no overlap) for indexing.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_text(state_of_the_union)

In [79]:
# Custom HyDE prompt: ask the model to answer as if it had read the speech.
prompt_template = (
    "Please answer the user's question about the most recent state of the union address\n"
    "Question: {question}\n"
    "Answer:"
)
prompt = PromptTemplate(template=prompt_template, input_variables=["question"])

# Two completions per query; embed_query averages the embeddings of all
# generated passages.
multi_llm = OpenAI(best_of=2, n=2)
llm_chain = LLMChain(prompt=prompt, llm=multi_llm)

# Construct the embedder directly from the hand-built chain.
embedding_fn = HypotheticalDocumentEmbedder(
    base_embeddings=base_embeddings,
    llm_chain=llm_chain,
)

# NOTE(review): the retrieval step below is disabled. The recorded cell output
# looks like it came from running these lines — confirm, then re-enable or
# clear the stale output.
# docsearch = Chroma.from_texts(texts, embedding_fn, persist_directory="./data/.chroma_db")
# query = "What did the president say about Ketanji Brown Jackson"
# docs = docsearch.similarity_search(query)

### Hyde Document 1 ###
 In the most recent state of the union address, President Joe Biden praised Ketanji Brown Jackson for her dedication and achievements as a judge. He also announced his nomination of her to the U.S. Court of Appeals for the District of Columbia, which if confirmed, would make her the first Black woman to serve on that court. 

### Hyde Document 2 ###
 During the most recent state of the union address, President Joe Biden praised the historic confirmation of Ketanji Brown Jackson as the first Black woman to serve on the U.S. Court of Appeals for the D.C. Circuit, calling her a "trailblazing jurist" and a "brilliant legal mind." He also urged the Senate to swiftly confirm her to the Supreme Court, should a vacancy arise.

