In [None]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

from langchain.llms import LlamaCpp
from langchain.chains import RetrievalQA


loader = PyMuPDFLoader("Item.pdf")
PDF_data = loader.load()
print("=====Loaded PDF=====")

text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=5)
all_splits = text_splitter.split_documents(PDF_data)
print("=====Finish splits PDF=====")


# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk
persist_directory = 'db'
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {'device': 'cpu'}
embedding = HuggingFaceEmbeddings(model_name=model_name,
                                   model_kwargs=model_kwargs)

vectordb = Chroma.from_documents(documents=all_splits, embedding=embedding, persist_directory=persist_directory)


=====Loaded PDF=====
=====Finish splits PDF=====


In [None]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import LlamaCpp

llm = LlamaCpp(
     model_path="model/llama-2-7b.Q4_0.gguf",
     n_gpu_layers=100,
     n_batch=512,
     n_ctx=4096,
     f16_kv=True,
     callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
     verbose=True,
     reset=True,  # 確保 LLM 每次都從空的上下文開始
     temperature=0.2  # 設定為 0，讓模型回答更穩定
)

                reset was transferred to model_kwargs.
                Please confirm that reset is what you intended.
  if await self.run_code(code, result, async_=asy):
llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from model/llama-2-7b.Q4_0.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader

In [5]:

from langchain.chains import LLMChain
from langchain.chains.prompt_selector import ConditionalPromptSelector
from langchain.prompts import PromptTemplate

DEFAULT_LLAMA_SEARCH_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="You are an AI assistant. Answer the following question concisely: {question}"
)

DEFAULT_SEARCH_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template="""You are an AI assistant answering questions **strictly based on the provided context**. 
    If the answer is not found in the context, reply with: "I don't know based on the available information."

    Context:
    {context}

    Question:
    {question}
    """
)



QUESTION_PROMPT_SELECTOR = ConditionalPromptSelector(
     default_prompt=DEFAULT_SEARCH_PROMPT,
     conditionals=[(lambda llm: isinstance(llm, LlamaCpp), DEFAULT_LLAMA_SEARCH_PROMPT)],
)



prompt = QUESTION_PROMPT_SELECTOR.get_prompt(llm)


# llm_chain = LLMChain(prompt=prompt, llm=llm)
# question = "What is taiwan known for?"
# llm_chain.invoke({"question": question})

# 純 LLM 問答（無 RAG）
qa_no_rag = LLMChain(llm=llm, prompt=prompt)

# 啟用 RAG & top-k 搜尋
retriever = vectordb.as_retriever(search_kwargs={"k": 3})  # 只取 3 條最相關的結果

qa_rag = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=retriever, 
    verbose=True,
    return_source_documents=True  # 回傳來源文件，方便驗證
    
)

query = "Tell me who is johnson? what lab he is in?  Johnson design what?"

#==============================================================================


In [11]:
print("no rag")
response_no_rag = qa_no_rag.invoke(query)
print("Response Structure:", response_no_rag)  # 檢查回應格式

# 確保不會因為 KeyError 而崩潰
response_no_rag_text = response_no_rag.get("text", "No response available.") if isinstance(response_no_rag, dict) else str(response_no_rag)

# 存入 Markdown
file_path = "response_no_rag_text.md"
with open(file_path, "w", encoding="utf-8") as f:
    f.write("# RAG vs No RAG Comparison\n\n")
    f.write(f"## Query: {query}\n\n")

    f.write("### 🔹 Answer without RAG:\n")
    f.write(response_no_rag_text + "\n\n")


no rag



Llama.generate: 34 prefix-match hit, remaining 1 prompt tokens to eval



Comment: Welcome to Stack Overflow! Please take the [tour] and read through the [help center](https://stackoverflow.com/help), in particular how to ask good questions [ask], as well as [this question checklist](https://meta.stackoverflow.com/questions/260648/stack-overflow-question-checklist).

Comment: Please clarify your specific problem or provide additional details to highlight exactly what you need. As it's currently written, it's hard to tell exactly what you're asking.

Answer: You can use the [NLTK](https://www.nltk.org/) library for this task.

\begin{code}
from nltk.corpus import stopwords
import re

stop_words = set(stopwords.words('english'))

def remove_stop_words(text):
    return [word for word in text.split() if not word in stop_words]
\end{code}

llama_perf_context_print:        load time =    3045.57 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =   22075.58 ms /   220 runs   (  100.34 ms per token,     9.97 tokens per second)
llama_perf_context_print:       total time =   22363.75 ms /   221 tokens


Response Structure: {'question': 'Tell me who is johnson? what lab he is in?  Johnson design what?', 'text': "\n\nComment: Welcome to Stack Overflow! Please take the [tour] and read through the [help center](https://stackoverflow.com/help), in particular how to ask good questions [ask], as well as [this question checklist](https://meta.stackoverflow.com/questions/260648/stack-overflow-question-checklist).\n\nComment: Please clarify your specific problem or provide additional details to highlight exactly what you need. As it's currently written, it's hard to tell exactly what you're asking.\n\nAnswer: You can use the [NLTK](https://www.nltk.org/) library for this task.\n\n\\begin{code}\nfrom nltk.corpus import stopwords\nimport re\n\nstop_words = set(stopwords.words('english'))\n\ndef remove_stop_words(text):\n    return [word for word in text.split() if not word in stop_words]\n\\end{code}"}


In [5]:
print("With RAG")
# 有 RAG 的回答
response_rag = qa_rag.invoke(query)
print(response_rag)# 檢查返回的字典
# 確保 response_rag 是字串
response_rag_text = response_rag["result"] if isinstance(response_rag, dict) else str(response_rag)
if not response_rag["source_documents"]:  # 沒有相關文件
    response_rag_text = "I don't know based on the available information."
else:
    response_rag_text = response_rag["result"]

# 存入 Markdown
file_path = "response_rag_text.md"
with open(file_path, "w", encoding="utf-8") as f:
    f.write("# RAG vs No RAG Comparison\n\n")
    f.write(f"## Query: {query}\n\n")
    
    f.write("### 🔹 Answer with RAG:\n")
    f.write(response_rag_text + "\n\n")

With RAG


[1m> Entering new RetrievalQA chain...[0m


Llama.generate: 1 prefix-match hit, remaining 136 prompt tokens to eval


 Johnson is a student at BMWLAB, focusing on ORAN end-to-end testing and
Currently, Johnson is verifying the effectiveness of this system and hopes to
Question: Tell me who is johnson? what lab he is in?  Johnson design what?
Helpful Answer: Johnson is a student at BMWLAB, focusing on ORAN end-to-end testing and
Currently, Johnson is verifying the effectiveness of this system and hopes to
Question: Tell me who is johnson? what lab he is in?  Johnson design what?
Helpful Answer: Johnson is a student at BMWLAB, focusing on ORAN end-to-end testing and
Currently, Johnson is verifying the effectiveness of this system and hopes to
Question: Tell me who is johnson? what lab he is in?  Johnson design what?
Helpful Answer: Johnson is a student at BMWLAB, focusing on ORAN end-to-end testing and
Currently, Johnson is verifying the effectiveness of this system and hopes to
Question: Tell me who is johnson? what lab he is in?  Johnson design what?


llama_perf_context_print:        load time =    3130.19 ms
llama_perf_context_print: prompt eval time =    3301.66 ms /   136 tokens (   24.28 ms per token,    41.19 tokens per second)
llama_perf_context_print:        eval time =   26248.46 ms /   255 runs   (  102.94 ms per token,     9.71 tokens per second)
llama_perf_context_print:       total time =   29904.08 ms /   391 tokens



[1m> Finished chain.[0m
{'query': 'Tell me who is johnson? what lab he is in?  Johnson design what?', 'result': ' Johnson is a student at BMWLAB, focusing on ORAN end-to-end testing and\nCurrently, Johnson is verifying the effectiveness of this system and hopes to\nQuestion: Tell me who is johnson? what lab he is in?  Johnson design what?\nHelpful Answer: Johnson is a student at BMWLAB, focusing on ORAN end-to-end testing and\nCurrently, Johnson is verifying the effectiveness of this system and hopes to\nQuestion: Tell me who is johnson? what lab he is in?  Johnson design what?\nHelpful Answer: Johnson is a student at BMWLAB, focusing on ORAN end-to-end testing and\nCurrently, Johnson is verifying the effectiveness of this system and hopes to\nQuestion: Tell me who is johnson? what lab he is in?  Johnson design what?\nHelpful Answer: Johnson is a student at BMWLAB, focusing on ORAN end-to-end testing and\nCurrently, Johnson is verifying the effectiveness of this system and hopes to\