In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

# 🤖 Chat model shared by every step of the pipeline (low temperature = low randomness)
llm = ChatOpenAI(temperature=0.1)

# 💾 Local file store backing the embedding cache, so embeddings can be reused
cache_dir = LocalFileStore("./.cache/")

# 📄 Step 1: Load the text file and split it into overlapping chunks
#            (split on newlines; chunk size and overlap are given to the tiktoken-based splitter)
loader = UnstructuredFileLoader("./files/chapter_one.txt")
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n", chunk_size=600, chunk_overlap=100
)
docs = loader.load_and_split(text_splitter=splitter)

# 🔍 Step 2: Wrap the OpenAI embedder so its results are cached in `cache_dir`
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

# 🧠 Step 3: Build the FAISS vector store from the chunks using the cached embedder
vectorstore = FAISS.from_documents(docs, cached_embeddings)

print("✅ Vector store created with", len(docs), "chunks")

# 🔎 Step 4: Expose the vector store as a retriever for use inside chains
retriever = vectorstore.as_retriever()

# Map-step prompt: for ONE chunk of the document ({context}), ask the model to
# return verbatim any text that is relevant to the user's {question}.
# `from_messages` is the constructor that builds a template from (role, text)
# pairs; `format_messages` is an instance method for filling in an existing
# template and cannot be used to create one.
map_doc_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            Use the following portion of a long document to see if any of the text is relevant to answer the question. Return any relevant text verbatim.
            ------
            {context}
            """),
        # The role name must be "human" to be recognised as the user message.
        ("human", "{question}")
    ]
)

# Map-step chain: the filled-in prompt is piped straight into the chat model.
map_doc_chain = map_doc_prompt | llm

def map_docs(inputs):
    """Run the map step over every retrieved document and merge the results.

    Args:
        inputs: Mapping with a 'documents' entry (an iterable of objects that
            expose ``page_content``) and a 'question' entry that is forwarded
            to the map prompt unchanged.

    Returns:
        A single string: the ``.content`` of each ``map_doc_chain`` response,
        in document order, separated by a blank line.
    """
    print(inputs)
    retrieved = inputs['documents']
    question = inputs['question']
    # One model call per document; each call sees only that document's text.
    extracts = [
        map_doc_chain.invoke(
            {"context": doc.page_content, "question": question}
        ).content
        for doc in retrieved
    ]
    return "\n\n".join(extracts)

# Map chain: the input question goes to the retriever (as "documents") and is
# also passed through untouched (as "question"); both are handed to `map_docs`.
map_chain = (
    {"documents": retriever, "question": RunnablePassthrough()}
    | RunnableLambda(map_docs)
)

# Reduce-step prompt: {context} receives the extracted passages and {question}
# the user's question; the model is asked to write the final answer from them.
final_prompt = ChatPromptTemplate.from_messages([
    ("system",
     """
     Given the following extracted parts of a long document and a question, create a final answer.
     If you don't know the answer, just say that you don't know. Don't try to make up a n answer.
     ------
     {context}
     """
     ),
    # A (role, text) pair is a tuple, so the two items are separated by a
    # comma; a colon here is a syntax error.
    ("human", "{question}")
])


# Full pipeline: `map_chain` produces {context}, the original question is
# passed through as {question}, and the filled-in final prompt goes to the model.
chain = (
    {"context": map_chain, "question": RunnablePassthrough()}
    | final_prompt
    | llm
)


# ❓ Try the complete pipeline with a sample question
chain.invoke("Describe Victory Mansions")


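## How the map-reduce LCEL chain works :)

# 1. Retrieve: get the list of docs relevant to the question.

# 2. Map: for each doc in the list of docs, run `prompt | llm` to extract the relevant text.

# 3. Combine: for each response in the list of LLM responses, put them all together into one final doc.

# 4. Reduce: run `final doc | prompt | llm` to produce the final answer.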

SyntaxError: invalid syntax (3656495009.py, line 70)