<a href="https://colab.research.google.com/github/happymondaynkanta/Web-RAG-Pipeline-Intelligent-Website-Knowledge-Assistant/blob/main/LLM_RAG_Project_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
!pip -q install -U langchain-community langchain-text-splitters chromadb sentence-transformers beautifulsoup4


In [5]:
from langchain_community.document_loaders import RecursiveUrlLoader
from bs4 import BeautifulSoup
from pathlib import Path
import re

# Crawl configuration: the entry-point URL and the depth limit handed to the loader.
START_URL: str = "https://www.educosys.com"
MAX_DEPTH: int = 2  # raise to 3–4 to go deeper

def bs4_extractor(html: str) -> str:
    """Convert an HTML page into plain text with one non-empty line per row.

    Script, style, noscript, header, footer and nav elements are removed
    before the text is extracted; every remaining line is stripped of
    surrounding whitespace and blank lines are dropped.

    Args:
        html: Raw HTML markup of a page.

    Returns:
        The cleaned page text, lines joined with a single newline.
    """
    parsed = BeautifulSoup(html, "html.parser")

    # Drop elements that carry code or site chrome rather than page content.
    boilerplate_tags = ["script","style","noscript","header","footer","nav"]
    for node in parsed.find_all(boilerplate_tags):
        node.decompose()

    kept_lines = []
    for raw_line in parsed.get_text(separator="\n").splitlines():
        cleaned = raw_line.strip()
        if cleaned:
            kept_lines.append(cleaned)
    return "\n".join(kept_lines)

# URLs that point at binary assets rather than HTML pages. The optional
# trailing group also catches asset links that carry a query string or fragment
# (e.g. ".../logo.png?v=3"), which a bare "$" anchor would let through.
ASSET_URL_RE = re.compile(r"\.(pdf|zip|jpg|jpeg|png|svg|gif|ico)(?:[?#].*)?$", re.I)

# Pages with this many words or fewer are discarded as having too little content.
MIN_PAGE_WORDS = 30

loader = RecursiveUrlLoader(
    url=START_URL,
    max_depth=MAX_DEPTH,
    extractor=bs4_extractor,
    prevent_outside=True,
    use_async=False,             # important in notebooks
    timeout=30,
    check_response_status=True,
    # exclude_dirs entries are compared against each absolute URL as a prefix,
    # so a relative "/logout" or a bare extension such as ".pdf" never matches.
    # Asset extensions are filtered after loading via ASSET_URL_RE instead.
    exclude_dirs=[f"{START_URL}/logout"],
)

docs_raw = loader.load()

# Keep only on-site HTML pages that have enough text to be worth indexing.
docs = [
    d for d in docs_raw
    if d.metadata.get("source", "").startswith(START_URL)
    and not ASSET_URL_RE.search(d.metadata.get("source", ""))
    and len(d.page_content.split()) > MIN_PAGE_WORDS
]
print(f"Kept {len(docs)} pages")


Kept 19 pages


In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the kept pages into overlapping chunks, trying the listed separators
# in the order given.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n","\n"," ",""],
    chunk_size=1200,
    chunk_overlap=150,
)
chunks = splitter.split_documents(docs)
print(f"{len(chunks)} chunks")


209 chunks


In [7]:
# Preview the first three chunks to sanity-check the split.
for position in (0, 1, 2):
    print(chunks[position])

page_content='Educosys
Join a passionate community
Unlock Your Potential
with
Keerti Purswani
Login
Signup
Checkout Educosys Courses
Free Content
Professional courses
Courses that make you Skilled, Confident, and Industry-Ready
Hands-On Generative AI
Duration: 7 Weeks
Learn more
Bengaluru GenAI Offline Weekend B..
Duration: 2 Days
Learn more
Hands-On High Level System Design
Duration: 8 Weeks
Learn more
High Level Design
Duration: 5 Weeks
Learn more
Low Level Design
Duration: 5 Weeks
Learn more
Data Structures & Algorithms
Duration: 6 Weeks
Learn more
Advanced C++
Duration: 2 Weeks
Learn more
Advanced C++ with Networking
Duration: 6 Weeks
Learn more
Bundle Courses
Smart Bundles with Multiple Courses
More Value, One Consistent Teaching Style
All Courses
DSA, HLD, LLD, HHLD, GenAI, Advanced C++
Learn more
Enroll
Interview Prep
DSA, HLD & LLD
Learn more
Enroll
System Design
LLD & HLD
Learn more
Enroll
HLD - Interview Prep and Hands-on
HLD & HHLD
Learn more
Enroll
Advanced C++
Learn more
E

In [8]:
# Report how many chunks are currently held in `chunks`.
print(f"{len(chunks)} chunks")

209 chunks


In [3]:
!pip -q install -U langchain-community langchain-text-splitters langchain-chroma sentence-transformers chromadb beautifulsoup4


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m56.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m111.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m91.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m447.5/447.5 kB[0m [31m38.2 MB/s[0m eta [36m0:00:

In [9]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# Choose the embedding device at run time so this cell works on both GPU and
# CPU-only runtimes; a hard-coded "cuda" fails when no GPU is attached.
# (In Colab a GPU is enabled via Runtime → Change runtime type.)
import torch  # already present as a dependency of sentence-transformers

EMBED_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

emb = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": EMBED_DEVICE},
)

# Give every chunk a deterministic id (source URL + position). The collection
# is persisted to disk, so without stable ids each re-run of this cell would
# add a second copy of every chunk and retrieval would return duplicates.
# NOTE(review): relies on Chroma upserting rows whose id already exists — confirm
# against the installed langchain/chromadb versions.
chunk_ids = [
    f"{chunk.metadata.get('source', '')}#{index}"
    for index, chunk in enumerate(chunks)
]

vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=emb,
    ids=chunk_ids,
    collection_name="educosys",
    persist_directory="chroma_db",
)
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})


In [15]:
# Works only inside Google Colab
from google.colab import ai

def rag_answer(question, k=5, max_chars=1200):
    """Answer ``question`` from the indexed pages via retrieve-then-generate.

    Args:
        question: Natural-language question to answer.
        k: Number of chunks to retrieve from the vector store.
        max_chars: Maximum number of characters of each chunk copied into
            the prompt.

    Returns:
        A ``(reply, docs)`` tuple: the value returned by ``ai.generate_text``
        and the list of retrieved documents, in the same order as their
        ``[n]`` labels in the prompt.
    """
    # Query the vector store directly so the caller's ``k`` is honoured.
    # Previously the shared ``retriever`` (fixed at k=5) was used and this
    # argument was silently ignored.
    hits = vectorstore.similarity_search(question, k=k)

    # Number each chunk and prefix it with its source URL so the model can
    # refer back to individual passages.
    context = "\n\n".join(
        f"[{i+1}] {d.metadata.get('source','')}\n{d.page_content[:max_chars]}"
        for i, d in enumerate(hits)
    )
    prompt = (
        "You are a helpful assistant. Answer using ONLY the context.\n\n"
        f"Context:\n{context}\n\nQuestion: {question}\n\nAnswer:"
    )
    reply = ai.generate_text(prompt)  # no API key needed on Colab Pro/Pro+
    return reply, hits

# Run one example question through the pipeline, then list the source URL
# of every retrieved chunk.
answer, sources = rag_answer("Is neural networks part of the courses offered?")
print(answer)
print("\nSources:")
for src_doc in sources:
    print("-", src_doc.metadata.get("source",""))


Yes, the course covers the basics of neural networks. [3, 5]


Sources:
- https://www.educosys.com/course/genai
- https://www.educosys.com/course/genai
- https://www.educosys.com/course/genai
- https://www.educosys.com/course/hld
- https://www.educosys.com/course/genai
