In [1]:
pip install langchain-community

Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
import time
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from urllib.parse import urlparse
from datetime import datetime
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.schema import Document
from langchain_hyperbrowser import HyperbrowserCrawlTool

# 📦 Load environment variables
# Read settings from a local .env file and fail fast when the Hyperbrowser
# key is missing, before any network work is attempted.
load_dotenv()
api_key = os.getenv("HYPERBROWSER_API_KEY")
if not api_key:
    raise EnvironmentError("❌ Missing HYPERBROWSER_API_KEY in .env")
os.environ["HYPERBROWSER_API_KEY"] = api_key

# 🔧 Toggle crawling
# False -> use the hand-maintained URL list below.
# True  -> discover URLs through the Hyperbrowser crawl tool.
use_crawling = False

# 🔗 Step 1: Define URLs
if use_crawling:
    print("🌐 Crawling Techify website...")
    crawl_tool = HyperbrowserCrawlTool()
    crawl_result = crawl_tool.invoke({
        "url": "https://techifysolutions.com/industries/",
        "max_pages": 3
    })

    urls = []
    if isinstance(crawl_result, dict) and "data" in crawl_result:
        # NOTE(review): assumes each entry exposes a `.metadata` mapping with a
        # "url" key - confirm against the crawl tool's actual return shape.
        for page in crawl_result["data"]:
            url = page.metadata.get("url")
            if url:
                urls.append(url)
    else:
        print("❌ Unexpected crawl result:", crawl_result)

    if not urls:
        print("❌ No URLs found. Exiting.")
        # `exit()` is an interactive-shell helper and terminates the whole
        # Jupyter kernel; SystemExit stops this cell with a failure status
        # while leaving the kernel usable.
        raise SystemExit(1)
else:
    # 📝 Manual URLs
    urls = [
        "https://techifysolutions.com/industries/",
        "https://techifysolutions.com/industries/information-technology/",
        "https://techifysolutions.com/industries/real-estate/",
        "https://techifysolutions.com/industries/retail/",
        "https://techifysolutions.com/industries/automobiles/",
        "https://techifysolutions.com/industries/healthcare-and-wellness/",
    ]

print(f"🔗 Using {len(urls)} URLs:")
for u in urls:
    print("  •", u)

# 🧹 Step 2: Scrape with enriched metadata
def scrape_content(urls, delay=1, snippet_len=500):
    """Download each URL and wrap its heading and paragraph text in a Document.

    Args:
        urls: Sequence of page URLs to fetch (``len(urls)`` is used for the
            progress output).
        delay: Seconds to pause between consecutive requests.
        snippet_len: Number of leading characters of each page's text that is
            printed as a preview.

    Returns:
        A list of ``Document`` objects, one per successfully fetched page, in
        input order. Pages whose request fails are reported and skipped.
    """
    documents = []

    # The context manager closes the session's pooled connections when done.
    with requests.Session() as session:
        session.headers.update({"User-Agent": "Mozilla/5.0"})

        for idx, url in enumerate(urls, start=1):
            # Pause before every request except the first: failed fetches are
            # throttled too, and no time is wasted after the final page.
            if idx > 1:
                time.sleep(delay)

            print(f"\n[{idx}/{len(urls)}] Scraping: {url}")
            try:
                resp = session.get(url, timeout=10)
                resp.raise_for_status()
            except requests.RequestException as e:
                print(f"  ❌ Error fetching {url}: {e}")
                continue

            soup = BeautifulSoup(resp.text, "html.parser")
            title_tag = soup.find("h1")
            title = title_tag.get_text(strip=True) if title_tag else "No Title"
            paragraphs = soup.find_all("p", limit=50)
            body = "\n".join(p.get_text(strip=True) for p in paragraphs).strip()
            # Strip the assembled text as a whole so a page without paragraphs
            # does not end in dangling blank lines.
            content = (title + "\n\n" + body).strip()

            print("📄 Snippet:", content[:snippet_len].replace("\n", " "), "...\n")

            metadata = {
                "source": url,
                "title": title,
                "domain": urlparse(url).netloc,
                # datetime() without arguments raises TypeError; now() yields
                # the actual scrape time.
                "scraped_at": datetime.now().isoformat()
            }

            documents.append(Document(page_content=content, metadata=metadata))

    return documents

# 🏁 Scrape all pages
docs = scrape_content(urls, delay=2)
print(f"\n🧩 Preparing {len(docs)} documents for vector storage...")

# ✂️ Step 3: Chunk content
splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
chunks = splitter.split_documents(docs)
if not chunks:
    # Every fetch failed or the pages had no text; stop with a clear message
    # instead of failing inside the vector store.
    raise RuntimeError("No content was scraped; nothing to store in ChromaDB.")

# 🧠 Step 4: Embedding model
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 🗃️ Step 5: Store in ChromaDB
persist_directory = "./chroma_techify"
# The query cell of this notebook opens the collection "techify_collection";
# name it explicitly here so both cells address the same collection rather
# than the vector store's default one.
collection_name = "techify_collection"
vectordb = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    collection_name=collection_name,
    persist_directory=persist_directory,
)
vectordb.persist()

print(f"✅ Stored {len(chunks)} chunks in ChromaDB   at: {persist_directory}")

# 🔍 Step 6: View first 5 stored embeddings and metadata
print("\n🔢 Fetching first 5 stored vector embeddings...\n")
# Use the public accessor and fetch only the rows that are displayed, rather
# than pulling the whole collection through the private `_collection` handle.
preview_count = 5
stored_data = vectordb.get(
    limit=preview_count,
    include=["embeddings", "documents", "metadatas"],
)

for i in range(min(preview_count, len(stored_data['ids']))):
    print(f"📄 Document {i + 1}")
    print(f"🧠 Embedding (first 10 dims): {stored_data['embeddings'][i][:10]}")
    content_snippet = stored_data['documents'][i][:150].replace('\n', ' ')
    print(f"📑 Content Snippet: {content_snippet}")
    print(f"🔗 Metadata: {stored_data['metadatas'][i]}")
    print("-" * 80)


In [2]:
pip install agno chromadb

Note: you may need to restart the kernel to use updated packages.


In [17]:
from agno.agent import Agent
from agno.models.groq import Groq
from agno.vectordb.chroma import ChromaDb
from agno.embedder.huggingface import HuggingfaceCustomEmbedder
import os
from dotenv import load_dotenv


# ✅ Load environment variables
# Reads the .env file, then looks up the Groq key under the variable name
# "GROQ_API_KEY_temp_2"; the cell aborts if that variable is unset or empty.
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY_temp_2")
if not GROQ_API_KEY:
    raise EnvironmentError("❌ Missing GROQ_API_KEY_temp_2 in .env")


# ✅ Set your Chroma vector DB path and collection name
# KB_PATH is the same directory string the ingestion cell passes as
# `persist_directory`.
# NOTE(review): verify that the ingestion cell stores its chunks under this
# exact collection name; otherwise this store opens a different (possibly
# empty) collection.
KB_PATH = "./chroma_techify"
COLLECTION_NAME = "techify_collection"  # Should match the name used during embedding


# ✅ Create embedding model (must match one used in ingestion)
# Same model id string as the HuggingFaceEmbeddings instance in the ingestion cell.
# NOTE(review): presumably this embedder produces vectors compatible with the
# ones stored at ingestion time - confirm against the Agno embedder docs.
embedder = HuggingfaceCustomEmbedder(
    id="sentence-transformers/all-MiniLM-L6-v2"
)


# ✅ Set up ChromaDb vector store
# Rebinds the name `vectordb` (the ingestion cell also assigns it) to an Agno
# ChromaDb pointed at KB_PATH / COLLECTION_NAME with the embedder above.
vectordb = ChromaDb(
    collection=COLLECTION_NAME,
    path=KB_PATH,
    persistent_client=True,
    embedder=embedder
)


# ✅ Minimal wrapper to add `validate_filters()` for Agno compatibility
class ChromaKnowledgeBase:
    """Thin adapter that exposes a vector store through a knowledge-base style API.

    Attributes are plain instance fields: the wrapped store (`vectordb`), a
    display `name`, and a short `description`.
    """

    def __init__(self, vectordb):
        """Keep a reference to the vector store and set the fixed name/description."""
        self.name = "Techify Solutions KB"
        self.description = "Knowledge base scraped from Techify website pages."
        self.vectordb = vectordb

    def query(self, query: str, filters=None, top_k: int = 4):
        """Run a search on the wrapped store and return whatever it returns.

        `filters` is accepted for signature compatibility but is not forwarded;
        only `query` and `top_k` reach the store's `search` call.
        """
        hits = self.vectordb.search(query, top_k=top_k)
        return hits

    def validate_filters(self, filters):
        """Return a 2-tuple: the given filters (or an empty dict when falsy) and an empty list."""
        # No filters currently used; pass-through
        accepted = filters if filters else {}
        return accepted, []


# ✅ Wrap Chroma DB
# The wrapper instance is what gets handed to the agent as its knowledge source.
kb = ChromaKnowledgeBase(vectordb)


# ✅ Define the Agent using Groq model and Knowledge base
# The agent is configured with a Groq-hosted model (authenticated with
# GROQ_API_KEY), the wrapped knowledge base, a role description, and
# instructions restricting answers to the knowledge base content.
# NOTE(review): `markdown` and `show_tool_calls` presumably control output
# rendering and tool-call display - confirm against the Agno Agent docs.
agent = Agent(
    model=Groq(id="deepseek-r1-distill-llama-70b", api_key=GROQ_API_KEY),
    knowledge=kb,
    description="You are a domain expert on Techify Solutions' industries and services.",
    instructions=[
        "Only use the provided knowledge base to answer questions.",
        "If no relevant information is found, say you don't know.",
        "Cite source URLs from the metadata when applicable."
    ],
    markdown=True,
    show_tool_calls=True
)


# ✅ Interactive CLI Query Loop
# Reads questions until the user types 'exit', closes the input stream, or
# interrupts the cell. Blank lines are ignored instead of being sent to the model.
if __name__ == "__main__":
    print("💡 Ask me anything about Techify Solutions. Type 'exit' to stop.\n")
    while True:
        try:
            query = input("🧠 Question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or a kernel interrupt ends the session cleanly
            # rather than surfacing a traceback.
            print("\n👋 Exiting. Goodbye!")
            break
        if not query:
            # Nothing typed: prompt again without calling the agent.
            continue
        if query.lower() == "exit":
            print("👋 Exiting. Goodbye!")
            break
        agent.print_response(query, stream=True)
        print("\n" + "-" * 80 + "\n")


  from .autonotebook import tqdm as notebook_tqdm


💡 Ask me anything about Techify Solutions. Type 'exit' to stop.

[?25l[32m▰▱▱▱▱▱▱[0m Thinking...
[36m┌─[0m[36m Message [0m[36m──────────────────────────────────────────────────────────────────[0m[36m─┐[0m
[36m│[0m                                                                             [36m│[0m
[36m│[0m [32mWhat IT services does Techify offer for the healthcare sector?[0m              [36m│[0m
[36m│[0m                                                                             [36m│[0m
[2K[1A[2K[1A[2K[1A[2K[1A[2K[1A[2K[32m▰▰▱▱▱▱▱[0m Thinking...───────────┘[0m
[36m┌─[0m[36m Message [0m[36m──────────────────────────────────────────────────────────────────[0m[36m─┐[0m
[36m│[0m                                                                             [36m│[0m
[36m│[0m [32mWhat IT services does Techify offer for the healthcare sector?[0m              [36m│[0m
[36m│[0m                                                             