In [2]:
import pandas

# Let text cells show up to MAX_COLWIDTH characters before pandas truncates them.
MAX_COLWIDTH = 100
pandas.set_option('display.max_colwidth', MAX_COLWIDTH)

In [3]:
# Load the sample dataset; the path is relative to the notebook's working directory.
sample_csv = "../data/sample.csv"
df = pandas.read_csv(sample_csv)

# Show (rows, columns) as a quick sanity check on the load.
df.shape

(50, 2)

In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,text,category
0,Meditation apps are gaining popularity among students,Health
1,Regular morning walks can help reduce stress levels,Health
2,Doctors recommend balanced diets for heart patients,Health
3,Yoga retreats are attracting working professionals,Health
4,A new skincare brand launched its eco-friendly range,Fashion


In [5]:
# Create sentence embeddings from the text column.
from sentence_transformers import SentenceTransformer

# Name of the pretrained sentence-embedding model to load.
model = "sentence-transformers/all-MiniLM-L6-v2"
encoder = SentenceTransformer(model)

# Pass a plain list of strings rather than the pandas Series: encode() is
# documented for str / list[str] input, and a list is always indexed by
# position, so the result does not depend on the DataFrame's index labels.
# This also matches how the documents are handed to Chroma (df["text"].tolist()).
vectors = encoder.encode(df["text"].tolist())

  from .autonotebook import tqdm as notebook_tqdm
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [6]:
# Sanity check: expect one embedding row per input text, i.e. (n_texts, embedding_dim).
vectors.shape

(50, 384)

In [8]:
import chromadb

# In-memory client (everything stored in RAM)
client = chromadb.Client()

# Create the collection, or reuse it if it already exists (e.g. this cell was
# re-run in the same kernel). create_collection() would raise in that case,
# so get_or_create_collection() keeps the cell safe to execute repeatedly.
collection = client.get_or_create_collection(name="news_collection")


In [9]:
# Add documents + embeddings

# Each row must have a unique ID (strings required)
ids = [str(i) for i in range(len(df))]

# Use upsert() rather than add() so the cell is idempotent: new IDs are
# inserted and existing IDs are overwritten with the current text, embedding
# and metadata, instead of being skipped or rejected when the cell is re-run.
collection.upsert(
    ids=ids,
    embeddings=vectors.tolist(),
    documents=df["text"].tolist(),
    metadatas=[{"category": c} for c in df["category"]]
)

# Report the total number of records now stored in the collection.
print(f"Added {collection.count()} documents to ChromaDB")

Added 50 documents to ChromaDB


In [10]:
# Query Chroma

query = "What are the benefits of doing regular exercise?"

# Embed the question with the same encoder and convert it to a plain list for Chroma.
query_embedding = encoder.encode(query)
query_vec = query_embedding.tolist()

# Ask for the three nearest stored documents.
results = collection.query(query_embeddings=[query_vec], n_results=3)

# Results are nested per query embedding; we sent one, so take index 0.
top_docs = results["documents"][0]
top_metas = results["metadatas"][0]

print("Query:", query)
for doc, meta in zip(top_docs, top_metas):
    print(f"-> {doc} [{meta['category']}]")


Query: What are the benefits of doing regular exercise?
-> Cycling is being promoted as a healthy lifestyle [Health]
-> Tokyo Olympics inspired new fitness programs [Sports]
-> Regular morning walks can help reduce stress levels [Health]


In [11]:
# Filter by metadata

# Reuse query_vec, but only consider records whose "category" metadata is "Health".
health_filter = {"category": "Health"}
results_health = collection.query(
    query_embeddings=[query_vec],
    where=health_filter,
    n_results=3,
)

# One query embedding was sent, so the matches live at index 0.
health_docs = results_health["documents"][0]
health_metas = results_health["metadatas"][0]

print("\nFiltered to only Health category:")
for doc, meta in zip(health_docs, health_metas):
    print(f"-> {doc} [{meta['category']}]")


Filtered to only Health category:
-> Cycling is being promoted as a healthy lifestyle [Health]
-> Regular morning walks can help reduce stress levels [Health]
-> Yoga retreats are attracting working professionals [Health]
