In [None]:
pip install langchain


In [None]:
pip install langchain-huggingface sentence-transformers chromadb

In [None]:
"""
embed_to_chroma.py
- Embeds a list of texts using sentence-transformers/all-MiniLM-L6-v2
- Stores them into a Chroma vector store (persisted)
"""

# Robust imports (tries a few common LangChain/Chroma package layouts)
try:
    # newer partner package
    from langchain_huggingface import HuggingFaceEmbeddings
except Exception:
    try:
        from langchain.embeddings import HuggingFaceEmbeddings
    except Exception:
        # fallback (older community package)
        from langchain_community.embeddings import HuggingFaceEmbeddings

# Chroma import fallbacks
try:
    from langchain_chroma import Chroma
except Exception:
    try:
        from langchain_chroma import Chroma
    except Exception:
        from langchain_community.vectorstores import Chroma

# Example usage
def create_and_persist_chroma(
    texts,
    metadatas=None,
    persist_dir="chroma_db",
    collection_name="my_collection",
    model_name="sentence-transformers/all-MiniLM-L6-v2",
):
    """
    texts: list[str]
    metadatas: list[dict] or None
    """
    # 1) init embeddings (runs locally using sentence-transformers)
    hf_emb = HuggingFaceEmbeddings(model_name=model_name)

    # 2) create Chroma vectorstore from raw texts (and persist to disk)
    # Some LangChain versions expect param name `embedding` vs `embedding_function`
    try:
        vectordb = Chroma.from_texts(
            texts,
            embedding=hf_emb,
            metadatas=metadatas,
            collection_name=collection_name,
            persist_directory=persist_dir,
        )
    except TypeError:
        # fallback param name
        vectordb = Chroma.from_texts(
            texts,
            embedding_function=hf_emb,
            metadatas=metadatas,
            collection_name=collection_name,
            persist_directory=persist_dir,
        )

    # try to force a persist if the wrapper exposes it
    try:
        if hasattr(vectordb, "persist"):
            vectordb.persist()
    except Exception as e:
        print("Warning: vectordb.persist() raised:", e)

    return vectordb, hf_emb


if __name__ == "__main__":
    sample_texts = [
        "LangChain is a framework for building applications with LLMs.",
        "Chroma is an open-source vector database for embeddings and semantic search.",
        "all-MiniLM-L6-v2 is a fast sentence-transformer for embeddings."
    ]
    sample_metas = [{"source": "doc1"}, {"source": "doc2"}, {"source": "doc3"}]

    vectordb, embeddings = create_and_persist_chroma(
        sample_texts, metadatas=sample_metas, persist_dir="chroma_db", collection_name="demo"
    )

    # Quick test: similarity search
    hits = vectordb.similarity_search("What is Chroma?", k=2)
    for i, doc in enumerate(hits):
        print(f"Hit {i}:")
        print("  text:", doc.page_content)
        print("  metadata:", doc.metadata)
        print("-" * 40)


  from .autonotebook import tqdm as notebook_tqdm


Hit 0:
  text: Chroma is an open-source vector database for embeddings and semantic search.
  metadata: {'source': 'doc2'}
----------------------------------------
Hit 1:
  text: LangChain is a framework for building applications with LLMs.
  metadata: {'source': 'doc1'}
----------------------------------------


  vectordb.persist()


In [9]:
pip install -U langchain-chroma

Collecting langchain-chroma
  Downloading langchain_chroma-0.2.6-py3-none-any.whl.metadata (1.1 kB)
Downloading langchain_chroma-0.2.6-py3-none-any.whl (12 kB)
Installing collected packages: langchain-chroma
Successfully installed langchain-chroma-0.2.6
Note: you may need to restart the kernel to use updated packages.


In [10]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# same embedding model you used when creating the DB
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# load the persisted Chroma collection
vectordb = Chroma(
    collection_name="demo",          # must match what you used before
    persist_directory="chroma_db",   # must match the folder used before
    embedding_function=embeddings
)

# test query
docs = vectordb.similarity_search("Tell me about embeddings", k=2)
for i, doc in enumerate(docs, 1):
    print(f"Result {i}: {doc.page_content}")
    print("Metadata:", doc.metadata)
    print("-" * 40)


Result 1: Chroma is an open-source vector database for embeddings and semantic search.
Metadata: {'source': 'doc2'}
----------------------------------------
Result 2: all-MiniLM-L6-v2 is a fast sentence-transformer for embeddings.
Metadata: {'source': 'doc3'}
----------------------------------------
