In [1]:
# --- Step 0: Download and Prepare Knowledge Base ---
!pip install -q PyPDF2 sentence-transformers faiss-cpu transformers gradio

import os
from PyPDF2 import PdfReader

# üîπ Replace this link with the *raw* link to your PDF in GitHub
pdf_url = "https://raw.githubusercontent.com/<your-username>/<your-repo-name>/main/Human-Resource-HR-Policy-.pdf"
pdf_path = "Human-Resource-HR-Policy-.pdf"

# Download the PDF from GitHub
os.system(f"wget -O {pdf_path} {pdf_url}")

# Convert PDF to text
reader = PdfReader(pdf_path)
knowledge_text = ""
for page in reader.pages:
    text = page.extract_text()
    if text:
        knowledge_text += text + "\n"

# Save as a text file for reference
with open("my_knowledge.txt", "w", encoding="utf-8") as f:
    f.write(knowledge_text)

print("‚úÖ PDF downloaded and converted to text successfully!")

# --- Step 1: Load the knowledge text ---
with open("my_knowledge.txt", "r") as f:
    knowledge_text = f.read()

# --- Step 2: Chunking ---
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=20,
    length_function=len
)
chunks = text_splitter.split_text(knowledge_text)
print(f"‚úÖ We have {len(chunks)} chunks.")

# --- Step 3: Embeddings ---
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
chunk_embeddings = model.encode(chunks)

# --- Step 4: Vector Store with FAISS ---
import faiss, numpy as np
d = chunk_embeddings.shape[1]
index = faiss.IndexFlatL2(d)
index.add(np.array(chunk_embeddings).astype('float32'))
print(f"‚úÖ FAISS index created with {index.ntotal} vectors.")

# --- Step 5: Load Generator Model ---
from transformers import pipeline
generator = pipeline('text2text-generation', model='google/flan-t5-small')

# --- Step 6: Initialize Conversation History ---
conversation_history = []

# --- Step 7: Define RAG Function with Memory + Citation ---
def answer_question(query):
    # Embed query
    query_embedding = model.encode([query]).astype('float32')
    k = 2
    distances, indices = index.search(query_embedding, k)
    retrieved_chunks = [chunks[i] for i in indices[0]]
    context = "\n\n".join(retrieved_chunks)

    # Include last 3 exchanges in context
    past_context = "\n\n".join(
        [f"User: {h['query']}\nBot: {h['answer']}" for h in conversation_history[-3:]]
    )

    prompt_template = f"""
    Use only the provided context to answer the user's question.
    If the answer isn't found, say "I don't have that information."

    Previous conversation:
    {past_context}

    Context:
    {context}

    Question:
    {query}

    Answer:
    """

    # Generate answer
    response = generator(prompt_template, max_length=150)[0]['generated_text']

    # Save conversation
    conversation_history.append({"query": query, "answer": response})

    # Add citation
    final_answer = f"{response}\n\n(Source: Human-Resource-HR-Policy-.pdf)"
    return final_answer

# --- Step 8: Test it ---
query_1 = "What is the WFH policy?"
print(f"Q: {query_1}")
print(answer_question(query_1))

# --- Step 9: Gradio Interface ---
import gradio as gr

def chatbot_interface(query):
    answer = answer_question(query)
    if "(Source:" in answer:
        main_answer, source = answer.split("(Source:", 1)
        source = "(Source:" + source
    else:
        main_answer, source = answer, "No source found"
    return main_answer.strip(), source.strip()

iface = gr.Interface(
    fn=chatbot_interface,
    inputs=gr.Textbox(label="Ask about company policy"),
    outputs=[
        gr.Textbox(label="Chatbot Answer"),
        gr.Textbox(label="Source")
    ],
    title="Company Policy Chatbot (RAG with Memory + Citation)",
    description="Retrieves answers from company policy PDF and cites sources. Remembers recent conversation."
)

iface.launch(share=True)


[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m232.6/232.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m31.4/31.4 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25h

FileNotFoundError: [Errno 2] No such file or directory: 'Human-Resource-HR-Policy-.pdf'