In [None]:
# ===============================
# RAG INGESTION NOTEBOOK
# ===============================

!pip install -q langchain langchain-community faiss-cpu sentence-transformers

In [None]:
import os
import json
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS


In [None]:
def flatten_json(data, prefix="", source=""):
    """Flatten nested JSON-like data into one ``Document`` per scalar leaf.

    Dict keys extend a dotted key path (``parent.child``). Lists are walked
    element by element without adding an index to the path, so every item of a
    list shares its parent's path.

    Args:
        data: Decoded JSON value (dict, list, or scalar).
        prefix: Dotted key path accumulated so far; empty at the root.
        source: Value stored under ``metadata["source"]`` of every Document.

    Returns:
        list[Document]: One Document per non-empty leaf. ``page_content`` has
        the form ``"<path>: <value>"``, or just ``"<value>"`` when the leaf has
        no key path. ``None`` leaves and blank strings produce no Document.
    """
    if isinstance(data, dict):
        docs = []
        for key, value in data.items():
            child_prefix = f"{prefix}.{key}" if prefix else key
            docs.extend(flatten_json(value, child_prefix, source))
        return docs

    if isinstance(data, list):
        docs = []
        for item in data:
            docs.extend(flatten_json(item, prefix, source))
        return docs

    # JSON null carries no information; str(None) would otherwise put the
    # literal text "None" into the corpus.
    if data is None:
        return []

    text = str(data).strip()
    if not text:
        return []

    # A scalar at the root has no key path, so avoid a dangling ": " prefix.
    content = f"{prefix}: {text}" if prefix else text
    return [Document(page_content=content, metadata={"source": source})]


In [None]:
DATA_DIR = "/content/drive/MyDrive/new_rag"

documents = []

# os.listdir() returns entries in arbitrary order; sorting keeps the document
# order reproducible from one run to the next.
for file in sorted(os.listdir(DATA_DIR)):
    if not file.endswith(".json"):
        continue
    path = os.path.join(DATA_DIR, file)
    with open(path, "r", encoding="utf-8") as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError as exc:
            # Name the offending file; the decoder's own message does not.
            raise ValueError(f"Invalid JSON in {path}: {exc}") from exc
    # The bare file name (not the full path) is recorded as each Document's source.
    documents.extend(flatten_json(data, source=file))

print("Raw documents:", len(documents))


In [None]:
# Chunking parameters, kept as named constants so they are easy to tune.
# NOTE(review): the embedding model configured later in this notebook
# (all-mpnet-base-v2) presumably has a limited input length — confirm that
# chunks of this size are not truncated at embedding time.
CHUNK_SIZE = 2000
CHUNK_OVERLAP = 200

splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# Split the flattened JSON documents; `splitter` is reused later for scraped pages.
chunks = splitter.split_documents(documents)
print("Final chunks:", len(chunks))


In [None]:
# Department slugs grouped by the faculty subdomain that hosts their
# "faculty-members" page. Order matters: it fixes the order of the Faculty URLs.
_FACULTY_DEPARTMENTS = {
    "fse": [
        "computer-science-engineering",
        "electrical-electronic-engineering",
        "electronics-communications-engineering",
        "genetic-engineering-biotechnology",
        "pharmacy-department",
        "civil-engineering",
        "mathematical-physical-science",
    ],
    "fbe": [
        "business-administration",
        "economics-department",
    ],
    "flass": [
        "english-department",
        "law-department",
        "social-relations-department",
        "information-studies-library-management",
        "sociology-department",
    ],
}

# Category label -> list of live pages to scrape for that category.
DYNAMIC_URLS = {
    "Admission deadlines": [
        "https://ewubd.edu/undergraduate-dates-deadline",
        "https://ewubd.edu/graduate-dates-deadline",
    ],
    "Events": ["https://ewubd.edu/events"],
    "Faculty": [
        f"https://{subdomain}.ewubd.edu/{department}/faculty-members"
        for subdomain, departments in _FACULTY_DEPARTMENTS.items()
        for department in departments
    ],
    "Grading": ["https://www.ewubd.edu/grades-rules-and-regulations"],
    "Tuition fees": ["https://ewubd.edu/undergraduate-tuition-fees"],
}


In [None]:
import requests
from bs4 import BeautifulSoup
from langchain_core.documents import Document

def scrape_dynamic_docs(urls_dict):
    """Download each page in ``urls_dict`` and wrap its visible text in a Document.

    Scraping is best-effort: a URL that fails to download or parse is reported
    on stdout and skipped, so one bad page never aborts the whole run.

    Args:
        urls_dict: Mapping of category label -> list of URLs to fetch.

    Returns:
        list[Document]: One Document per page that yielded text. The content is
        prefixed with a "LATEST <CATEGORY> INFO ..." banner naming the URL, and
        the metadata carries ``source`` (the URL) and ``category``.
    """
    docs = []
    print("🔍 Scraping live EWU website data...")

    # One session for the whole run so connections to the same host are reused.
    with requests.Session() as session:
        session.headers.update({"User-Agent": "Mozilla/5.0"})

        for category, urls in urls_dict.items():
            for url in urls:
                try:
                    response = session.get(url, timeout=15)
                    response.raise_for_status()

                    # Parse the raw bytes so the parser can detect the page's own
                    # charset; response.text may fall back to ISO-8859-1 when the
                    # server omits a charset and garble UTF-8 pages.
                    soup = BeautifulSoup(response.content, "html.parser")

                    # Drop non-visible content before extracting text.
                    for tag in soup(["script", "style"]):
                        tag.decompose()

                    text = soup.get_text(separator="\n", strip=True)
                    if not text:
                        print(f"⚠️ No text found at {url}; skipped")
                        continue

                    docs.append(Document(
                        page_content=f"LATEST {category.upper()} INFO from official site ({url}):\n{text}",
                        metadata={"source": url, "category": category}
                    ))
                    print(f"✅ Scraped: {url}")
                except Exception as e:
                    # Deliberately broad: report and move on to the next URL.
                    print(f"⚠️ Failed to scrape {url}: {e}")

    print(f"\n🌐 Total dynamic pages scraped: {len(docs)}")
    return docs

# Scrape the live pages, chunk them with the same splitter used for the JSON
# documents, and merge both sets into the corpus that will be embedded.
dynamic_docs = scrape_dynamic_docs(DYNAMIC_URLS)
chunked_dynamic_docs = splitter.split_documents(dynamic_docs)
all_docs = chunks + chunked_dynamic_docs
print(f"📚 Total documents (JSON + live): {len(all_docs)}")

In [None]:
# Fail early with a clear message rather than attempting to index nothing.
if not all_docs:
    raise ValueError(
        "No documents to index: JSON ingestion and live scraping both produced nothing."
    )

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    # Ask the encoder to return normalized embeddings.
    encode_kwargs={"normalize_embeddings": True}
)

# Embed every chunk (JSON + scraped) and build the FAISS index in one pass.
vectorstore = FAISS.from_documents(all_docs, embeddings)


In [None]:
# NOTE(review): /content is presumably the Colab runtime's ephemeral disk —
# confirm, and copy the saved index elsewhere if it must outlive the session.
SAVE_PATH = "/content/vectorstore"
vectorstore.save_local(SAVE_PATH)
print("✅ Vectorstore saved at:", SAVE_PATH)
