# LangChain Web Scraper to Chroma Vector Store (Manual Setup, No ONNX)
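This notebook loads webpage content with `WebBaseLoader`, splits it into chunks, embeds it with HuggingFace, and stores it in Chroma. The HuggingFace embedding model is passed to the vector store explicitly rather than relying on Chroma's built-in default embedder. Note that `chromadb` itself still declares `onnxruntime` as a dependency (see the `pip show` output below), so `onnxruntime` must be installed and importable for the Chroma cells to run.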

In [1]:
!pip install langchain chromadb sentence-transformers --quiet

In [3]:
!pip show chromadb
!pip show onyxruntime

Name: chromadb
Version: 0.6.3
Summary: Chroma.
Home-page: 
Author: 
Author-email: Jeff Huber <jeff@trychroma.com>, Anton Troynikov <anton@trychroma.com>
License: 
Location: C:\ProgramData\anaconda3\Lib\site-packages
Requires: bcrypt, build, chroma-hnswlib, fastapi, grpcio, httpx, importlib-resources, kubernetes, mmh3, numpy, onnxruntime, opentelemetry-api, opentelemetry-exporter-otlp-proto-grpc, opentelemetry-instrumentation-fastapi, opentelemetry-sdk, orjson, overrides, posthog, pydantic, pypika, PyYAML, rich, tenacity, tokenizers, tqdm, typer, typing_extensions, uvicorn
Required-by: langchain-chroma




In [5]:
# Step 1: Load webpage using WebBaseLoader
import os

# USER_AGENT must be set *before* langchain_community's web loader is imported:
# the loader module reads the variable at import time and emits the
# "USER_AGENT environment variable not set" warning if it is missing.
os.environ['USER_AGENT'] = 'automated-code-review/1.0'

from langchain_community.document_loaders import WebBaseLoader

url = "https://google.github.io/styleguide/pyguide.html"  # Replace with your own
loader = WebBaseLoader(url)
documents = loader.load()  # network fetch; returns a list of Document objects
print(f"Loaded {len(documents)} document(s)")

USER_AGENT environment variable not set, consider setting it to identify your requests.


Loaded 1 document(s)


In [7]:
# Step 2: Cut the loaded document(s) into smaller, overlapping chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Consecutive chunks share a 50-unit overlap so text that straddles a
# boundary still appears intact in at least one chunk.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
docs = splitter.split_documents(documents)

chunk_total = len(docs)
print(f"Split into {chunk_total} chunks")

Split into 251 chunks


In [9]:
# Step 3: Embed using HuggingFace and save to Chroma (manual client setup)
#
# NOTE: chromadb declares onnxruntime as a hard requirement. If this cell raises
# "The onnxruntime python package is not installed", the environment is broken
# (onnxruntime missing or failing to import) and must be repaired with
# `%pip install onnxruntime`; it cannot be worked around from this cell.
from chromadb import PersistentClient
from chromadb.config import Settings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
persist_directory = "./chroma_langchain_store"

# Build the client by hand so storage is explicitly on-disk.
# On chromadb >= 0.4, Client(Settings(persist_directory=...)) yields an
# in-memory client; PersistentClient is what actually writes to `path`.
client_settings = Settings(anonymized_telemetry=False)
client = PersistentClient(path=persist_directory, settings=client_settings)

# Wrap the client in a LangChain vector store, supplying our own embedding model.
# The client already owns the storage location, so persist_directory is not repeated here.
vectordb = Chroma(
    collection_name="web_docs",
    embedding_function=embedding,
    client=client,
)

# NOTE(review): re-running this cell appends the same chunks again under fresh
# IDs, since the store now survives restarts — delete the directory or pass
# stable `ids=` if duplicates matter.
vectordb.add_documents(docs)

# No explicit persist() call: a PersistentClient flushes writes to disk itself.
print("Chroma vector store created and persisted.")

ValueError: The onnxruntime python package is not installed. Please install it with `pip install onnxruntime`

In [None]:
# Step 4: Run a similarity search against the store and print the top matches
query = "What are Python naming conventions?"
results = vectordb.similarity_search(query, k=3)

separator = "-" * 80
for rank, hit in enumerate(results, start=1):
    # One header line, the chunk text, then a rule between results.
    print(f"Result {rank}:")
    print(hit.page_content)
    print(separator)