In [2]:
from datasets import load_dataset

# Hub identifier of the corpus to load (other examples: 'cnn_dailymail', 'xsum').
dataset_name = "reuters21578"

# Second positional argument is the dataset configuration; only the
# 'test' split is requested.
dataset = load_dataset(
    dataset_name,
    "ModApte",
    split="test",
)

# Show the feature names and the row count of the loaded split.
print(dataset)


Dataset({
    features: ['text', 'text_type', 'topics', 'lewis_split', 'cgis_split', 'old_id', 'new_id', 'places', 'people', 'orgs', 'exchanges', 'date', 'title'],
    num_rows: 3299
})


In [3]:
# Build a pandas DataFrame from the loaded dataset so it can be inspected
# and sliced with the usual DataFrame tooling.

import pandas as pd

df = pd.DataFrame(data=dataset)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,text,text_type,topics,lewis_split,cgis_split,old_id,new_id,places,people,orgs,exchanges,date,title
0,Mounting trade friction between the\nU.S. And ...,"""NORM""",[trade],"""TEST""","""TRAINING-SET""","""3809""","""14826""","[hong-kong, usa, japan, taiwan, malaysia, sout...",[],[],[],8-APR-1987 01:03:47.52,ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT
1,A survey of 19 provinces and seven cities\nsho...,"""NORM""",[grain],"""TEST""","""TRAINING-SET""","""3811""","""14828""",[china],[],[],[],8-APR-1987 01:19:17.29,CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STOCKS
2,The Ministry of International Trade and\nIndus...,"""NORM""","[crude, nat-gas]","""TEST""","""TRAINING-SET""","""4356""","""14829""",[japan],[],[],[],8-APR-1987 01:22:17.25,JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWARDS
3,Thailand's trade deficit widened to 4.5\nbilli...,"""NORM""","[trade, grain, rice, corn, sugar, tin, rubber]","""TEST""","""TRAINING-SET""","""3815""","""14832""",[thailand],[],[],[],8-APR-1987 01:45:09.09,THAI TRADE DEFICIT WIDENS IN FIRST QUARTER
4,Indonesia expects crude palm oil (CPO)\nprices...,"""NORM""","[veg-oil, palm-oil]","""TEST""","""TRAINING-SET""","""3816""","""14833""","[indonesia, malaysia]",[],[],[],8-APR-1987 01:48:20.11,INDONESIA SEES CPO PRICE RISING SHARPLY


In [4]:
# Let's create (or reopen) a persistent Chroma collection and ingest the test
# dataset into it.

import chromadb

chroma_client = chromadb.PersistentClient(path="./chroma_reuters21578/")

collection = chroma_client.get_or_create_collection(name="test")

# Record IDs are the DataFrame index values, converted to strings.
doc_ids = [str(row_id) for row_id in df.index.to_list()]
documents = df.text.to_list()

# Every column except the document text is stored as metadata. All values are
# converted to strings, so list-valued columns (topics, places, ...) end up as
# their string representation.
metadata_strings = df.drop(columns=["text"]).astype(str).to_dict(orient="records")

# Ingest in batches and skip IDs that are already stored. Adding everything in
# one call gives no way to resume after an interruption; with batches, the cell
# can simply be re-run and only the missing records are embedded and added.
BATCH_SIZE = 256
for start in range(0, len(doc_ids), BATCH_SIZE):
    batch_ids = doc_ids[start:start + BATCH_SIZE]
    # include=[] asks Chroma for the matching IDs only (no documents/metadata).
    already_stored = set(collection.get(ids=batch_ids, include=[])["ids"])
    missing = [
        position
        for position, doc_id in enumerate(batch_ids, start=start)
        if doc_id not in already_stored
    ]
    if not missing:
        continue
    collection.add(
        documents=[documents[position] for position in missing],
        ids=[doc_ids[position] for position in missing],
        metadatas=[metadata_strings[position] for position in missing],
    )


KeyboardInterrupt: 

In [None]:
# Number of records currently stored in the collection.
collection.count()

3299

In [6]:
# query

import chromadb

def chroma_query(path, collection_name, query, n_results=2):
    """Run a single text query against a persistent Chroma collection.

    Args:
        path: Directory of the persistent Chroma store to open.
        collection_name: Name of the collection to search.
        query: Query text; it is submitted as the only entry of ``query_texts``.
        n_results: Number of matches to request. Defaults to 2, the value
            that was previously hard-coded.

    Returns:
        The result of ``collection.query``, unmodified.
    """
    chroma_client = chromadb.PersistentClient(path=path)
    # NOTE(review): get_or_create_collection creates the collection when the
    # name does not exist, so a misspelt name presumably queries a fresh, empty
    # collection instead of failing -- confirm this is the intended behaviour.
    collection = chroma_client.get_or_create_collection(name=collection_name)
    return collection.query(query_texts=[query], n_results=n_results)

In [7]:
# Example question to look up in the ingested collection.
query = "Which company acquired a 50% stake in Steuart Petroleum Co?"

# Search the persisted "test" collection for this question.
results = chroma_query(
    path="./chroma_reuters21578/",
    collection_name="test",
    query=query,
)

# Display the raw query result.
results

{'ids': [['1677', '1685']],
 'distances': [[0.5689047574996948, 0.5837762951850891]],
 'metadatas': [[{'cgis_split': '"TRAINING-SET"',
    'date': ' 2-JUN-1987 08:15:26.19',
    'exchanges': '[]',
    'lewis_split': '"TEST"',
    'new_id': '"17958"',
    'old_id': '"7403"',
    'orgs': '[]',
    'people': '[]',
    'places': "['italy', 'usa']",
    'text_type': '"NORM"',
    'title': "ITALY'S AGIP PETROLI BUYS STAKE IN U.S. COMPANY",
    'topics': "['acq']"},
   {'cgis_split': '"TRAINING-SET"',
    'date': ' 2-JUN-1987 08:54:46.11',
    'exchanges': '[]',
    'lewis_split': '"TEST"',
    'new_id': '"17968"',
    'old_id': '"7413"',
    'orgs': '[]',
    'people': '[]',
    'places': "['usa', 'italy']",
    'text_type': '"NORM"',
    'title': 'ENI UNIT AGIP PETROLI BUYS STAKE IN U.S. COMPANY',
    'topics': "['acq']"}]],
 'embeddings': None,
 'documents': [['State oil firm Agip Petroli Spa said it has\nacquired a 50 pct stake in Steuart Petroleum Co, an independent\nU.S. Oil products co