In [None]:
!pip install --upgrade \
    langchain \
    langchain-huggingface \
    langchain-community \
    sentence-transformers \
    faiss-cpu \
    rank-bm25



In [None]:
!pip install gradio



In [None]:
import os
import torch
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community import vectorstores
from langchain.storage import InMemoryStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever, EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
from langchain.memory import ConversationBufferWindowMemory
from langchain.schema import HumanMessage, AIMessage
from langchain.prompts import PromptTemplate
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline
from langchain_community.llms import HuggingFacePipeline

In [None]:
from langchain_community.docstore.document import Document

# Load the full text of the manual from disk (UTF-8).
with open('CPWD_Works_Manual_2024.txt', mode='r', encoding='utf-8') as file:
    text_content = file.read()

# Wrap the whole manual in a single Document; it is split into chunks later,
# inside Retriever().
documents = [
    Document(page_content=text_content),
]

In [None]:
def Retriever(documents, index_dir="VectorStore", keyword_k=2):
    """Build the hybrid (vector + keyword) retriever used by the chatbot.

    Combines a ``ParentDocumentRetriever`` (FAISS vector store plus an
    in-memory docstore) with a ``BM25Retriever`` inside an equally weighted
    ``EnsembleRetriever``.

    Args:
        documents: Non-empty iterable of LangChain ``Document`` objects.
        index_dir: Directory the FAISS index is loaded from when it exists;
            otherwise the index is built from ``documents`` and saved there.
        keyword_k: Value assigned to the BM25 retriever's ``k`` attribute.

    Returns:
        An ``EnsembleRetriever`` over both retrievers with weights 0.5 / 0.5.

    Raises:
        ValueError: If ``documents`` is ``None`` or empty.
    """
    # Fail early with a clear message instead of an opaque error from the
    # FAISS / BM25 constructors further down.
    documents = list(documents) if documents is not None else []
    if not documents:
        raise ValueError("Retriever() needs at least one Document to index.")

    # 1. pick device automatically
    device = "cuda" if torch.cuda.is_available() else "cpu"
    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/multi-qa-MiniLM-L6-dot-v1",
        model_kwargs={"device": device}
    )

    # 2. load or build the FAISS vectorstore
    # NOTE(review): an existing index directory is reused as-is, even if
    # `documents` changed since it was saved; delete the directory to rebuild.
    if os.path.isdir(index_dir):
        print("> LOADING EXISTING VECTORSTORE…")
        # SECURITY: allow_dangerous_deserialization=True opts in to loading a
        # pickled store; only point index_dir at a directory this notebook wrote.
        vectorstore = vectorstores.FAISS.load_local(
            index_dir,
            embeddings=embedding_model,
            allow_dangerous_deserialization=True
        )
    else:
        print("> BUILDING NEW VECTORSTORE…")
        # NOTE(review): the saved index is built from the un-split documents;
        # the child chunks are added in step 4 on every call, so this cache
        # does not avoid re-embedding them. Confirm this is intended.
        vectorstore = vectorstores.FAISS.from_documents(
            documents=documents,
            embedding=embedding_model
        )
        vectorstore.save_local(index_dir)

    # 3. set up splitters: small line-based child chunks, larger parent chunks
    #    split on markdown-style heading markers
    child_splitter = RecursiveCharacterTextSplitter(
        separators=['\n'],
        chunk_size=50,
        chunk_overlap=0,
        length_function=len
    )
    parent_splitter = RecursiveCharacterTextSplitter(
        separators=['####', '##', '#'],
        chunk_size=2000,
        chunk_overlap=100,
        length_function=len
    )

    # 4. parent retriever (documents are always added, because the docstore
    #    is a fresh InMemoryStore on every call)
    parent_retriever = ParentDocumentRetriever(
        vectorstore=vectorstore,
        docstore=InMemoryStore(),
        child_splitter=child_splitter,
        parent_splitter=parent_splitter
    )
    parent_retriever.add_documents(documents=documents)

    # 5. keyword retriever over the parent-sized chunks (parent_splitter output)
    chunks = parent_splitter.split_documents(documents)
    keyword_retriever = BM25Retriever.from_documents(chunks)
    keyword_retriever.k = keyword_k

    print("> RETRIEVER READY (device:", device, ")")
    return EnsembleRetriever(
        retrievers=[parent_retriever, keyword_retriever],
        weights=[0.5, 0.5]
    )



In [None]:
# Build the hybrid retriever once; the chat handler further down reads this
# module-level `retriever`.
retriever = Retriever(documents)

> LOADING EXISTING VECTORSTORE…
> RETRIEVER READY (device: cuda )


In [None]:
from getpass import getpass

MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"

# Never hardcode the access token in the notebook. Use the one already present
# in the environment, otherwise ask for it interactively (input is not echoed).
hf_token = os.environ.get("HUGGING_FACE_HUB_TOKEN", "").strip()
if not hf_token:
    hf_token = getpass("Hugging Face access token (leave blank to use a cached login): ").strip()
    if hf_token:
        # Keep the env var set, as the original cell did, for later library calls.
        os.environ["HUGGING_FACE_HUB_TOKEN"] = hf_token
# None (rather than "") lets the library fall back to its own credential lookup.
hf_token = hf_token or None

# `token=` replaces the deprecated `use_auth_token=` keyword.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", token=hf_token)

# Text-generation pipeline: low-temperature sampling, and only the newly
# generated text is returned (return_full_text=False), not the prompt.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=1000,
    do_sample=True,
    top_p=0.8,
    top_k=40,
    temperature=0.1,
    return_full_text=False,
)

# LangChain wrapper around the same pipeline.
llm = HuggingFacePipeline(pipeline=pipe)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
  llm = HuggingFacePipeline(pipeline=pipe)


In [None]:
# Conversation memory shared by the retrieval helper and the chat handler below.
# k=1 is the window size given to ConversationBufferWindowMemory;
# return_messages=True asks it to return message objects rather than one
# formatted string.
memory = ConversationBufferWindowMemory(k=1, return_messages=True)

  memory = ConversationBufferWindowMemory(k=1, return_messages=True)


In [None]:
from langchain.schema import HumanMessage

def query_with_memory(query: str, retriever, memory):
    """Retrieve documents for ``query``, using the previous user turn as extra context.

    The search string is ``"<previous user message>, <query>"`` when an earlier
    user message exists, otherwise just ``query``.

    Args:
        query: The user's current question.
        retriever: Object exposing ``invoke(str)`` (or, on older LangChain
            versions, ``get_relevant_documents(str)``).
        memory: Conversation memory whose ``chat_memory.messages`` holds the
            past messages.

    Returns:
        Whatever the retriever returns for the combined query (a list of
        documents for LangChain retrievers).
    """
    # ConversationBufferWindowMemory stores past messages in memory.chat_memory.messages
    messages = list(memory.chat_memory.messages)

    # If the caller already recorded the current question, it is the last
    # message; drop it so it is not mistaken for the *previous* user turn
    # (which would yield the search string "<query>, <query>").
    if messages and isinstance(messages[-1], HumanMessage) and messages[-1].content == query:
        messages.pop()

    # Filter for human messages only
    user_messages = [msg for msg in messages if isinstance(msg, HumanMessage)]

    # Construct the combined query
    if user_messages:
        combined_query = f"{user_messages[-1].content}, {query}"
    else:
        combined_query = query

    # Prefer the Runnable-style `invoke`; fall back to the deprecated
    # `get_relevant_documents` for retrievers that only offer the old method.
    fetch = getattr(retriever, "invoke", None)
    if fetch is None:
        fetch = retriever.get_relevant_documents
    return fetch(combined_query)


In [None]:
from langchain.prompts import PromptTemplate

# Prompt for the chatbot. Placeholders: {context} (retrieved passages),
# {chat_history} (earlier turns) and {input} (the current question).
# The example table's ```markdown fence is closed explicitly so the context,
# history and instructions that follow are not read as part of the code block.
template = """
You are a chatbot tasked with answering user questions.
You are provided with context about the CPWD Manual 2024, which offers updated project guidelines.

If the context contains any Markdown tables (i.e. lines beginning **and** ending with `|`, plus a header row of `---` under the headers), copy those tables **verbatim**—including all pipes, hyphens, and alignment—and wrap them in a fenced code block labeled `markdown` so they render correctly. For example:

```markdown
| Description | A (%) | B (%) |
|-------------|-------|-------|
| DC          | 2.0   | 3.0   |
```

<context>
{context}
</context>

Chat history:
{chat_history}

⚠️ **Do NOT re-answer or repeat any previous questions in the chat history. Only answer the final user question.** ⚠️
Answer concisely using only the context and chat history.
If you don’t know the answer, reply exactly:
"I am sorry, I don’t have an answer to your query. Please ask your question in another words".

User: {input}
Assistant: """

prompt = PromptTemplate(
    template=template,
    input_variables=["context", "chat_history", "input"],
)

In [None]:
import gradio as gr

def gradio_chat_interface(user_query, history):
    """Handle one chat turn: retrieve context, prompt the model, update the transcript.

    Uses the module-level ``retriever``, ``memory``, ``prompt`` and ``pipe``.

    Args:
        user_query: Text submitted from the message box.
        history: List of ``[user, assistant]`` pairs shown so far. It is
            extended in place with the new turn; ``None`` is treated as empty.

    Returns:
        ``(history, "")``: the updated transcript and an empty string that
        clears the message box.
    """
    if history is None:
        history = []

    user_query = (user_query or "").strip()
    if not user_query:
        # Nothing to answer; leave the transcript untouched.
        return history, ""
    if user_query.lower() in ("exit", "quit"):
        return history + [[user_query, "Goodbye!"]], ""

    # 1) get top 3 docs and build context. This runs BEFORE the new message is
    #    recorded, so the "previous user message" seen by the retrieval helper
    #    really is the previous turn and not the current question.
    docs = query_with_memory(user_query, retriever, memory)[:3]
    context = "\n\n".join(doc.page_content for doc in docs)

    # 2) earlier turns as plain text. The memory exposes them under its own
    #    memory_key (not necessarily "chat_history"), either as one string or
    #    as a list of message objects.
    past = memory.load_memory_variables({}).get(getattr(memory, "memory_key", "history"), "")
    if isinstance(past, str):
        chat_hist = past
    else:
        chat_hist = "\n".join(
            f"{'User' if isinstance(m, HumanMessage) else 'Assistant'}: {m.content}"
            for m in past
        )

    # 3) fill the PromptTemplate
    filled = prompt.format(
        context=context,
        chat_history=chat_hist,
        input=user_query
    )

    # 4) generate (no stop_sequences here!)
    generation = pipe(filled)
    raw = generation[0]["generated_text"].strip()

    # 5) if the model somehow started a new "User:" question, cut it off
    ai_out = raw.split("\nUser:")[0].strip()

    # 6) record the finished exchange only now, so a failed generation does not
    #    leave an unanswered user message in memory; return the clean answer
    memory.chat_memory.add_user_message(user_query)
    memory.chat_memory.add_ai_message(ai_out)
    history.append([user_query, ai_out])
    return history, ""

def _reset_chat():
    """Clear the transcript, the message box and the conversation memory."""
    # NOTE(review): `memory` is one module-level object, so it is shared by
    # every browser session connected to this app; clearing it affects all.
    memory.clear()
    return [], ""


with gr.Blocks() as demo:
    gr.Markdown("### CPWD MANUAL GPT")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Your message", placeholder="Ask something...")
    clear = gr.Button("Clear Chat")

    # The Chatbot's own value serves as the transcript: it is passed in as
    # `history` and written back as the first output, so "Clear Chat" really
    # empties it (a separate gr.State list was never reset and made old turns
    # reappear on the next message).
    msg.submit(gradio_chat_interface, [msg, chatbot], [chatbot, msg])
    clear.click(_reset_chat, None, [chatbot, msg])

demo.launch()

  chatbot = gr.Chatbot()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8b3a7b0599e77623c9.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


