# Load Webpage Data into Chroma Vector Store using LangChain
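This notebook demonstrates how to use `WebBaseLoader` from LangChain to load a webpage, split its text into chunks, embed the chunks with a HuggingFace sentence-transformer model, store them in a Chroma vector store, and query that store with a similarity search.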

In [34]:
!pip install langchain chromadb sentence-transformers --quiet
!pip install langchain-huggingface --quiet

In [50]:
!pip show chromadb
!pip show onnxruntime

Name: chromadb
Version: 0.6.3
Summary: Chroma.
Home-page: 
Author: 
Author-email: Jeff Huber <jeff@trychroma.com>, Anton Troynikov <anton@trychroma.com>
License: 
Location: C:\ProgramData\anaconda3\Lib\site-packages
Requires: bcrypt, build, chroma-hnswlib, fastapi, grpcio, httpx, importlib-resources, kubernetes, mmh3, numpy, onnxruntime, opentelemetry-api, opentelemetry-exporter-otlp-proto-grpc, opentelemetry-instrumentation-fastapi, opentelemetry-sdk, orjson, overrides, posthog, pydantic, pypika, PyYAML, rich, tenacity, tokenizers, tqdm, typer, typing_extensions, uvicorn
Required-by: langchain-chroma
Name: onnxruntime
Version: 1.21.0
Summary: ONNX Runtime is a runtime accelerator for Machine Learning models
Home-page: https://onnxruntime.ai
Author: Microsoft Corporation
Author-email: onnxruntime@microsoft.com
License: MIT License
Location: C:\ProgramData\anaconda3\Lib\site-packages
Requires: coloredlogs, flatbuffers, numpy, packaging, protobuf, sympy
Required-by: chromadb


In [None]:
!pip uninstall onnxruntime onnx onnxruntime-tools

In [36]:
# Step 1: Load webpage content using LangChain
import os

# USER_AGENT has to be in the environment *before* the loader is imported: the
# loader module reads it when it is first imported to build its default request
# headers, so setting it afterwards has no effect on the requests that are sent.
os.environ['USER_AGENT'] = 'automated-code-review/1.0'

# NOTE(review): newer LangChain releases expose this loader from
# `langchain_community.document_loaders` — confirm against the installed version.
from langchain.document_loaders import WebBaseLoader

PYTHON_STYLES_WEB_SOURCE = "https://google.github.io/styleguide/pyguide.html"

# WebBaseLoader fetches the URL and returns the page text as LangChain documents.
loader = WebBaseLoader(PYTHON_STYLES_WEB_SOURCE)
documents = loader.load()
print(f"Loaded {len(documents)} documents")

Loaded 1 documents


In [38]:
import json

# Keep a JSON snapshot of the raw pages (text plus metadata) so the fetched
# content can be inspected without hitting the network again.
data = []
for page in documents:
    data.append({"content": page.page_content, "metadata": page.metadata})

with open("web_documents.json", mode="w", encoding="utf-8") as snapshot_file:
    json.dump(data, snapshot_file, indent=2)


In [40]:
# Step 2: Break the loaded page(s) into overlapping chunks ready for embedding
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Tunable splitter settings, passed straight to the splitter below.
TEXT_SPLITTER_CHUNK_SIZE = 1000
TEXT_SPLITTER_CHUNK_OVERLAP = 200

splitter = RecursiveCharacterTextSplitter(
    chunk_size=TEXT_SPLITTER_CHUNK_SIZE,
    chunk_overlap=TEXT_SPLITTER_CHUNK_OVERLAP,
)

# `docs` is the chunk list consumed by the vector-store cell further down.
docs = splitter.split_documents(documents)
print(f"Split into {len(docs)} chunks")

Split into 122 chunks


In [46]:
# Step 3: Embed the chunks and store them in a persistent Chroma collection
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores.chroma import Chroma
from chromadb import PersistentClient
from chromadb.config import Settings

# 1. Set up an on-disk Chroma client.
#    On chromadb >= 0.4 a plain `Client(Settings(persist_directory=...))` is an
#    in-memory client, so nothing would be written to disk; `PersistentClient`
#    is the supported way to get a store that survives a kernel restart.
persist_directory = "./chroma_langchain_store"
client_settings = Settings(anonymized_telemetry=False)
client = PersistentClient(path=persist_directory, settings=client_settings)

# 2. Create the embedding model (sentence-transformers, from the
#    langchain-huggingface package installed at the top of the notebook).
embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# 3. Open (or create) the collection through the client. The client already
#    owns the storage location, so `persist_directory` is not passed again.
vectordb = Chroma(
    collection_name="web_docs",
    embedding_function=embedding,
    client=client,
)

# Stable ids make this cell idempotent: re-running it upserts the same chunks
# instead of appending a duplicate copy of every chunk under fresh random ids.
chunk_ids = [f"web_docs-{position}" for position in range(len(docs))]
vectordb.add_documents(docs, ids=chunk_ids)

# No explicit `vectordb.persist()` call: a persistent client writes to disk
# automatically, and `persist()` is deprecated / unavailable on current versions.
print(f"Stored {len(docs)} chunks in Chroma collection 'web_docs' at {persist_directory}")


ValueError: The onnxruntime python package is not installed. Please install it with `pip install onnxruntime`

In [None]:
# Step 4: Ask the vector store for the chunks most similar to a question
query = "What are Python naming conventions?"
results = vectordb.similarity_search(query, k=3)

# Print each hit with a 1-based rank, followed by a divider line.
separator = "-" * 80
for rank, match in enumerate(results, start=1):
    print(f"Result {rank}:")
    print(match.page_content)
    print(separator)