In [1]:
# from langchain_openai import ChatOpenAI, OpenAIEmbeddings  # Updated import
# from langchain.text_splitter import CharacterTextSplitter
# from langchain.embeddings import CacheBackedEmbeddings
# from langchain.document_loaders import TextLoader
# from langchain.vectorstores import FAISS
# from langchain.prompts import ChatPromptTemplate
# from langchain.schema.runnable import RunnablePassthrough, RunnableLambda
# from langchain.storage import LocalFileStore

# llm = ChatOpenAI(
#     temperature=0.1,
#     model_name="gpt-4o-mini",
# )

# cache_dir = LocalFileStore("./.cache/")

# splitter = CharacterTextSplitter.from_tiktoken_encoder(
#     separator="\n",
#     chunk_size=600,
#     chunk_overlap=100,
# )
# loader = TextLoader("./files/chapter_one.txt")

# docs = loader.load_and_split(text_splitter=splitter)

# embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

# vectorstore = FAISS.from_documents(docs, cached_embeddings)

# retriever = vectorstore.as_retriever()


# map_doc_prompt = ChatPromptTemplate.from_messages(
#     [
#         (
#             "system",
#             """
#             Use the following portion of a long document to see if any of the text is relevant to answer the question. Return any relevant text verbatim. If there is no relevant text, return : ''
#             -------
#             {context}
#             """,
#         ),
#         ("human", "{question}"),
#     ]
# )

# map_doc_chain = map_doc_prompt | llm


# def map_docs(inputs):
#     documents = inputs["documents"]
#     question = inputs["question"]
#     return "\n\n".join(
#         map_doc_chain.invoke(
#             {"context": doc.page_content, "question": question}
#         ).content
#         for doc in documents
#     )


# map_chain = {
#     "documents": retriever,
#     "question": RunnablePassthrough(),
# } | RunnableLambda(map_docs)

# final_prompt = ChatPromptTemplate.from_messages(
#     [
#         (
#             "system",
#             """
#             Given the following extracted parts of a long document and a question, create a final answer. 
#             If you don't know the answer, just say that you don't know. Don't try to make up an answer.
#             ------
#             {context}
#             """,
#         ),
#         ("human", "{question}"),
#     ]
# )

# chain = {"context": map_chain, "question": RunnablePassthrough()} | final_prompt | llm

# chain.invoke("What do you know about Victory Mansion?")


In [2]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings  # Updated import
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import CacheBackedEmbeddings
from langchain.document_loaders import TextLoader
from langchain.vectorstores import FAISS
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda
from langchain.storage import LocalFileStore

# Single chat model instance; it is piped into both map_doc_chain and
# final_chain further down in this cell.
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0.1)

# Byte store that backs the embedding cache below.
cache_dir = LocalFileStore("./.cache/")

# Newline-separated chunks with overlap between neighbours; sizes are passed to
# the tiktoken-based splitter constructor (presumably counted in tokens).
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)
loader = TextLoader("./files/chapter_one.txt")

docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Namespace the cache by embedding model. Without a namespace the cache key is
# derived from the text alone, so changing `model` above while reusing the same
# cache directory would silently serve vectors produced by the previous model.
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings,
    cache_dir,
    namespace=embeddings.model,
)

vectorstore = FAISS.from_documents(docs, cached_embeddings)

retriever = vectorstore.as_retriever()

# System text for the map step. The template is filled with one document
# portion ({context}) and asks the model to return the relevant text verbatim.
# The leading indentation is part of the string that is sent to the model.
MAP_DOC_SYSTEM_TEMPLATE = """
            Use the following portion of a long document to see if any of the text is relevant to answer the question. Return any relevant text verbatim. If there is no relevant text, return : ''
            -------
            {context}
            """

map_doc_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", MAP_DOC_SYSTEM_TEMPLATE),
        ("human", "{question}"),
    ]
)

# Prompt -> model; invoked with {"context": ..., "question": ...}.
map_doc_chain = map_doc_prompt | llm

def map_docs(inputs):
    """Run the map step over every retrieved document and merge the results.

    Args:
        inputs: Mapping with two keys:
            "documents" - iterable of objects exposing ``page_content``.
            "question"  - the question forwarded to every map call.

    Returns:
        str: The ``content`` of each map response, in document order,
        separated by a blank line. Empty string when there are no documents.
    """
    question = inputs["question"]
    map_inputs = [
        {"context": doc.page_content, "question": question}
        for doc in inputs["documents"]
    ]
    # The per-document requests are independent, so hand them to the chain as
    # one batch instead of blocking on one invoke at a time. batch() returns
    # its results in the same order as the inputs.
    responses = map_doc_chain.batch(map_inputs)
    return "\n\n".join(response.content for response in responses)

# The chain input feeds both branches: the retriever supplies "documents" and
# the passthrough supplies "question"; map_docs then receives that dict.
map_chain = {
    "documents": retriever,
    "question": RunnablePassthrough(),
} | RunnableLambda(map_docs)

# Prompt for the final answer: {context} is filled with the text extracted by
# the map step, {question} with the original question.
final_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "Given the following extracted parts of a long document and a "
            "question, create a final answer. If you don't know the answer, "
            "just say that you don't know. Don't try to make up an answer. "
            "\n\n {context}",
        ),
        ("human", "{question}"),
    ]
)

# Full pipeline: the question is passed through as-is and also run through
# map_chain to build the context, then both fill final_prompt for the model.
final_chain = (
    {"question": RunnablePassthrough(), "context": map_chain}
    | final_prompt
    | llm
)

final_chain.invoke("Describe where Winston lives.")

AIMessage(content='Winston lives in a place called Victory Mansions, which is described as grimy and situated in a landscape filled with rotting nineteenth-century houses. The buildings around him are shored up with timber, patched with cardboard, and have roofs made of corrugated iron. The area is marked by bombed sites, plaster dust swirling in the air, and overgrown with willow-herb, as well as sordid colonies of wooden dwellings resembling chicken-houses. Inside his flat, which is located seven flights up, there is a small kitchen with dark-colored bread and a bottle of colorless liquid labeled VICTORY GIN, and a living room with a small table and a telescreen. The overall atmosphere of his environment is bleak and oppressive, reflecting the totalitarian regime in which he lives.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 162, 'prompt_tokens': 524, 'total_tokens': 686, 'completion_tokens_details': {'accepted_prediction_tokens': 0,