# MSA Handbook RAG Chatbot App

This notebook creates a Retrieval-Augmented Generation (RAG) chatbot app using Streamlit and OpenAI's GPT-4o. It processes a PDF handbook, builds a FAISS index, and deploys an interactive QA interface via Streamlit.

In [None]:
!pip install streamlit openai faiss-cpu sentence-transformers python-dotenv numpy

In [None]:
%%writefile rag_pipeline.py
import faiss
import fitz  # PyMuPDF
import numpy as np
import json
from sentence_transformers import SentenceTransformer

def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page in a PDF as one string.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The text of each page, in page order, joined by a newline character.
    """
    # The context manager closes the document (and its file handle) even if
    # a page fails to extract; previously the document was never closed.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text("text") for page in doc)

def chunk_text(text, chunk_size=500, chunk_overlap=100):
    """Split text into fixed-size character windows that overlap.

    Each window starts ``chunk_size - chunk_overlap`` characters after the
    previous one, so consecutive chunks share ``chunk_overlap`` characters.

    Args:
        text: String to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        chunk_overlap: Number of characters shared by consecutive chunks;
            must be smaller than ``chunk_size``.

    Returns:
        List of substrings in document order; empty when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if
            ``chunk_overlap >= chunk_size`` (the window would never advance
            and the loop would not terminate).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    step = chunk_size - chunk_overlap
    if step <= 0:
        raise ValueError("chunk_overlap must be smaller than chunk_size")
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

def store_embeddings(chunks, model_name="all-MiniLM-L6-v2", index_path="faiss_index"):
    """Embed text chunks, build an inner-product FAISS index, and save it.

    Each embedding is scaled to unit length before it is added, so the inner
    product between two stored/normalised vectors is their cosine similarity.

    Args:
        chunks: Sequence of strings to embed; row ``i`` of the index
            corresponds to ``chunks[i]``.
        model_name: SentenceTransformer model used to encode the chunks.
        index_path: File path the index is written to.

    Returns:
        The populated ``faiss.IndexFlatIP`` instance.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    # Fail fast, before the (slow) model load, with a clear message.
    if len(chunks) == 0:
        raise ValueError("chunks must contain at least one text to embed")

    embedder = SentenceTransformer(model_name)
    # FAISS expects a C-contiguous float32 matrix.
    embeddings = np.ascontiguousarray(embedder.encode(chunks), dtype=np.float32)

    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    # An all-zero embedding would otherwise turn into a row of NaNs (0/0).
    norms[norms == 0] = 1.0
    embeddings = embeddings / norms

    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    faiss.write_index(index, index_path)
    return index

# Build the index only when this file is executed as a script
# (`python rag_pipeline.py`). Importing the module to reuse its helpers does
# not trigger PDF parsing, model loading, or file writes.
if __name__ == "__main__":
    text = extract_text_from_pdf("MSA_2025_Handbook.pdf")
    chunks = chunk_text(text)
    index = store_embeddings(chunks)

    # Persist the chunk texts in the order they were embedded so a row number
    # returned by the index can be mapped back to its text.
    with open("chunks.json", "w") as f:
        json.dump(chunks, f)

    print(f"Stored {len(chunks)} chunks.")


In [None]:
%%writefile tinyllama_inference.py
from openai import OpenAI
import json
import faiss
import numpy as np
import streamlit as st
from sentence_transformers import SentenceTransformer

# Read the OpenAI API key from Streamlit's secrets store and create the client
# that generate_response uses for chat completions.
api_key = st.secrets["OPENAI_API_KEY"]
client = OpenAI(api_key=api_key)

# Load the saved FAISS index and the chunk texts; both paths are relative to
# the current working directory. retrieve_context maps row numbers returned by
# the index to entries of `chunks`.
index = faiss.read_index("faiss_index")
with open("chunks.json", "r") as f:
    chunks = json.load(f)

# Model used to embed queries in retrieve_context.
# NOTE(review): this should be the same model the index was built with;
# rag_pipeline.store_embeddings defaults to this name — confirm if that changes.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

def retrieve_context(query, k=3):
    """Return the text of the chunks most similar to the query.

    The query is embedded, scaled to unit length, and searched against the
    module-level FAISS ``index``; matching rows are looked up in ``chunks``.

    Args:
        query: Question text to embed.
        k: Number of nearest neighbours requested from the index.

    Returns:
        The matching chunk texts, in the order returned by the search, joined
        by blank lines. Holds fewer than ``k`` chunks when the index returns
        fewer valid rows.
    """
    query_embedding = np.asarray(embedder.encode([query]), dtype=np.float32)
    norm = np.linalg.norm(query_embedding, axis=1, keepdims=True)
    # Avoid 0/0 -> NaN if the encoder ever returns an all-zero vector.
    norm[norm == 0] = 1.0
    query_embedding = query_embedding / norm

    _, indices = index.search(query_embedding, k)
    # FAISS fills missing neighbours with -1. Without the lower bound, -1
    # would pass the check and silently select chunks[-1] (the last chunk).
    return "\n\n".join(
        chunks[int(i)] for i in indices[0] if 0 <= i < len(chunks)
    )

def generate_response(query):
    """Answer a question with GPT-4o, grounded in retrieved handbook text.

    Context for the question is fetched with ``retrieve_context`` and placed
    in the user message ahead of the question itself.

    Args:
        query: The user's question.

    Returns:
        The content of the first completion choice, or the string
        ``"Error: <exception>"`` if the API call or response access raises.
    """
    retrieved = retrieve_context(query)
    user_message = (
        "Use the context below to answer the question.\n\n"
        f"Context:\n{retrieved}\n\n"
        f"Question: {query}\nAnswer:"
    )
    system_message = (
        "You are a helpful assistant that answers questions based only on "
        "the provided context from the MSA 2025 Handbook."
    )
    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]

    try:
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=conversation,
            temperature=0.2,
            max_tokens=300,
        )
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as exc:
        # Failures are reported to the caller as text instead of being raised.
        return f"Error: {exc}"


In [None]:
%%writefile app.py
import streamlit as st
from tinyllama_inference import generate_response, retrieve_context

# Page chrome: browser tab title/icon, styled heading, and a short prompt.
st.set_page_config(page_title="MSA Handbook Assistant", page_icon="📘")
st.markdown("<h1 style='color:#2F4F4F;'>📘 MSA 2025 Handbook Assistant</h1>", unsafe_allow_html=True)
st.write("Ask questions about the MSA 2025 Handbook.")

query = st.text_input("Enter your question:")

# An empty input is falsy, so nothing is generated until a question is entered.
if query:
    with st.spinner("Thinking... generating response..."):
        # generate_response retrieves its own context. The separate
        # retrieve_context(query) call that used to follow repeated the
        # embedding and index search and its result was never used.
        response = generate_response(query)

    st.subheader("💬 Answer")
    st.write(response)


In [None]:
%%writefile .streamlit/secrets.toml
# SECURITY: never store a real API key in the notebook or commit it to a repo.
# The key that was previously embedded here has been exposed and must be
# revoked in the OpenAI dashboard. Put your own key below on your machine only
# and keep .streamlit/secrets.toml out of version control.
# TOML string values must be quoted.
OPENAI_API_KEY = "REPLACE_WITH_YOUR_OPENAI_API_KEY"



In [None]:
%%writefile requirements.txt
streamlit
openai
faiss-cpu
sentence-transformers
python-dotenv
numpy
# Provides the `fitz` module imported by rag_pipeline.py.
pymupdf


In [None]:
# Smoke test: send one sample question through retrieval and generation.
from tinyllama_inference import generate_response

sample_question = "What is the attendance policy?"
generate_response(sample_question)


## ✅ Submission Notes

- All files are generated from this notebook and available in the repo.
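- To run locally (first put your own OpenAI API key in `.streamlit/secrets.toml`):
  ```bash
  pip install -r requirements.txt
  # Only if faiss_index / chunks.json are missing; needs PyMuPDF (pip install pymupdf):
  python rag_pipeline.py
  streamlit run app.py
  ```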
- The app uses GPT-4o and a FAISS-based retrieval system.
