# Chroma DB Tutorial

## 1. Instantiate

In [1]:
import chromadb

In [2]:
# Create a Chroma client with default settings
# (presumably an in-memory, non-persistent client -- TODO confirm for the installed chromadb version).
# NOTE: this binding is replaced by the PersistentClient created in the next cell.
chroma_client = chromadb.Client()

In [3]:
# Rebind `chroma_client` to a client that keeps its data under ./vector_db.
# The path is relative, so it is resolved against the kernel's current working directory.
chroma_client = chromadb.PersistentClient(path="./vector_db")

In [4]:
# Open the "documents" collection, creating it on first use,
# so this cell can be re-run without failing on an already existing collection.
collection = chroma_client.get_or_create_collection(name="documents")


In [5]:
# Insert two toy documents; `ids` and `documents` are matched by position.
# No embeddings are supplied here, so Chroma is expected to compute them itself
# (the recorded output below shows an embedding model being downloaded).
collection.add(
    documents=[
        "This is a document about pineapple",
        "This is a document about oranges"
    ],
    ids=["id1", "id2"]
)

C:\Users\lolen\.cache\chroma\onnx_models\all-MiniLM-L6-v2\onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:20<00:00, 4.03MiB/s]


## 2. Query

In [6]:
from pprint import pprint
# Semantic search over the toy collection, restricted to documents whose text
# contains "pineapple"; because of that filter fewer than `n_results` hits may come back.
results = collection.query(
    query_texts=["This is a query document about hawaii"], # Chroma will embed this for you
    n_results=2, # how many results to return
    where_document={"$contains": "pineapple"}, # filter by document content
)
# pprint prints the result dict one key per line, which is easier to read than the raw repr
pprint(results)


{'data': None,
 'distances': [[1.0404009819030762]],
 'documents': [['This is a document about pineapple']],
 'embeddings': None,
 'ids': [['id1']],
 'included': ['metadatas', 'documents', 'distances'],
 'metadatas': [[None]],
 'uris': None}


## 3. Work with data

In [7]:
from pathlib import Path
import pandas as pd
import os

# Absolute location of the dataset; a relative name is resolved against the working directory
file_path = Path("Articles.csv").resolve()

# Fail fast with an explicit message when the dataset is missing
if not file_path.exists():
    raise FileNotFoundError(f"File not found at: {file_path}")

# The CSV is read as ISO-8859-1 (Latin-1) text
articles = pd.read_csv(file_path, encoding="ISO-8859-1")
# 1-based row number of each article; used further down to build the Chroma record ids
articles["row_id"] = articles.index + 1
print("File successfully loaded.")

File successfully loaded.


In [8]:
# Peek at the first three rows to check the columns that were loaded
articles.head(3)

Unnamed: 0,Article,Date,Heading,NewsType,row_id
0,KARACHI: The Sindh government has decided to b...,1/1/2015,sindh govt decides to cut public transport far...,business,1
1,HONG KONG: Asian markets started 2015 on an up...,1/2/2015,asia stocks up in new year trad,business,2
2,HONG KONG: Hong Kong shares opened 0.66 perce...,1/5/2015,hong kong stocks open 0.66 percent lower,business,3


In [9]:
# Keep the demo small: work on a fixed-size random subset of the articles.
# `random_state` pins the draw so the same rows are selected on every run.
# NOTE: `articles` is rebound here, so the full frame is no longer available afterwards.
N = 50
articles = articles.sample(n=N, random_state=1)

In [10]:
# Sanity check: number of rows left after sampling
len(articles)

50

In [13]:
# Repeated row-count check (same expression as the previous cell)
len(articles)

50

In [14]:
# NOTE(review): this cell repeats the sampling step from the earlier cell.
# `articles` already holds N rows at this point (see the counts above), so drawing
# N rows again keeps every row and only changes their order.
# Candidate for removal so the notebook has a single sampling step.
N= 50
articles = articles.sample(N, random_state=1)

In [15]:
# Row count after the repeated sampling cell
len(articles)

50

In [16]:
# Preview the sampled articles. Show only the first rows instead of dumping the
# whole frame: the long `Article` texts make a full display hard to read.
articles.head(10)

Unnamed: 0,Article,Date,Heading,NewsType,row_id
1039,CAPE TOWN: Poor weather brought a limp end to ...,1/6/2016,England hold on for draw as weather has final say,sports,1040
598,ISLAMABAD: The World Bank (WB) has approved a ...,5/3/2016,WB approves 1 billion to support economic refo...,business,599
231,Brussels: The EU on Tuesday announced a free t...,8/4/2015,eu announces free trade deal with vietnam in p...,business,232
1417,DHARAMSALA: Oman captain Sultan Ahmed won the ...,3/13/2016,Bangladesh sent into bat by Oman in knockout g,sports,1418
1945,LONDON: Jonny Bairstow continued a brilliant y...,6/11/2016,Sri Lanka hit back after Bairstow steers Engla...,sports,1946
2387,BIRMINGHAM: Sohail Khan marked his return to T...,8/3/2016,Pakistan dismiss England for 297 on 1st day of...,sports,2388
559,strong>WASHINGTON: The US government posted a ...,4/12/2016,US posts 108 billion budget deficit in Mar,business,560
1782,strong>PARIS: Amelie Mauresmo ended her coach...,5/21/2016,Andy Murray Amelie Maur,sports,1783
2663,strong>BEIJING: The Xinjiang Uygur autonomous ...,3/17/2017,CPEC China approves huge infrastructure projec...,business,2664
660,strong>LONDON: British Prime Minister David Ca...,5/22/2016,UK PM Cameron warns Brexit drive up food pri,business,661


### Create embeddings using the default model (all-MiniLM-L6-v2)

In [18]:
from chromadb.utils import embedding_functions
# Embedding function based on the "all-MiniLM-L6-v2" model.
# It is called further down with a list of texts and returns one vector per text.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)





In [19]:
# Smoke test: embed only the first sampled article.
# The embedding function takes a list of texts, hence the one-element list;
# the result is a list holding one vector for that text.
sentence_transformer_ef([articles['Article'].iloc[0]])

[array([-3.41819674e-02,  5.46282120e-02,  1.54261040e-02,  9.80661064e-02,
         1.06982738e-01,  2.10012626e-02,  3.64807504e-03,  3.36985216e-02,
        -6.35368153e-02,  8.65028873e-02, -9.37567949e-02, -8.71905610e-02,
         3.10266130e-02,  1.44060617e-02,  1.06339920e-02, -5.04973670e-03,
        -7.82416984e-02, -1.13809995e-01, -5.17362058e-02,  1.36661595e-02,
        -1.77428443e-02,  7.69995227e-02, -5.72778061e-02,  1.29597997e-02,
         6.85508996e-02, -7.25407433e-03, -3.60377170e-02,  2.77856681e-02,
         8.59170407e-03,  3.69019471e-02,  5.84067218e-02,  8.22000485e-03,
         6.12862706e-02, -1.41577898e-02, -3.34187783e-02, -6.10702299e-02,
         4.31840718e-02,  6.29868999e-04, -1.11442460e-02, -6.74751699e-02,
         4.11755331e-02, -6.75475737e-03,  1.24304732e-02, -6.33606911e-02,
         3.14845406e-02, -1.36434464e-02,  3.12366374e-02,  2.54803635e-02,
        -9.46241245e-03, -2.93892473e-02,  6.83730748e-03,  5.90335317e-02,
        -8.9

In [20]:
# Plain Python list of the article texts, in the row order of `articles`
articles_list = articles["Article"].tolist()
# Embed all texts in one call; `vectors` holds one embedding per entry of `articles_list`
vectors = sentence_transformer_ef(articles_list)

In [21]:
# Sanity check: there should be one embedding per sampled article
len(vectors)

50

In [22]:
# Build one string id per article ("id" followed by its row_id), keeping the row
# order of `articles` so that ids line up with `articles_list` and `vectors`.
ids = articles["row_id"].map("id{}".format).tolist()
ids

['id1040',
 'id599',
 'id232',
 'id1418',
 'id1946',
 'id2388',
 'id560',
 'id1783',
 'id2664',
 'id661',
 'id1655',
 'id29',
 'id331',
 'id5',
 'id734',
 'id315',
 'id1129',
 'id2280',
 'id18',
 'id1801',
 'id731',
 'id621',
 'id1727',
 'id786',
 'id1952',
 'id1864',
 'id1533',
 'id1422',
 'id1593',
 'id1657',
 'id1986',
 'id1803',
 'id1194',
 'id2660',
 'id2394',
 'id1540',
 'id2474',
 'id443',
 'id7',
 'id430',
 'id1514',
 'id1855',
 'id1738',
 'id1614',
 'id439',
 'id283',
 'id1013',
 'id2291',
 'id2362',
 'id813']

In [23]:
# Optional reset: uncomment the next line to drop a previously created "articles"
# collection before the cells below recreate and refill it.
# chroma_client.delete_collection(name="articles")

In [24]:
# Open (or create) the "articles" collection and attach the same embedding function
# that is used to compute `vectors`. Without it, text queries (`query_texts=...`)
# would be embedded by Chroma's default function instead of `sentence_transformer_ef`,
# so query vectors and stored vectors would not be guaranteed to come from the same model.
# NOTE: if an "articles" collection was already persisted with a different embedding
# function, drop it first (see the commented-out delete_collection cell above).
collection = chroma_client.get_or_create_collection(
    name="articles",
    embedding_function=sentence_transformer_ef,
)

In [25]:
# Store the article texts together with their precomputed embeddings.
# The three lists are matched by position, so they must have the same length and order.
collection.add(
    documents=articles_list,
    ids=ids,
    embeddings=vectors,
)

In [26]:
# Number of records now stored in the collection (expected to match the sample size)
collection.count()

50

In [27]:
# Search by vector: embed the query text ourselves with the same function that
# produced the stored article embeddings, then pass the resulting vector to Chroma.
query = "public transport fares by 7 per cent"
query_embedding = sentence_transformer_ef([query])

collection.query(
    query_embeddings=query_embedding,
    n_results=5, # how many results to return
)

{'ids': [['id661', 'id29', 'id7', 'id2664', 'id734']],
 'embeddings': None,
   'ISLAMABAD:  Federal Minister for Finance Ishaq Dar on Saturday announced a five percent increase in the General Sales Tax (GST) on petroleum products.Dar said that the increment would enable a recovery of 12 billion rupees.The minister, however, went on to say that the ministry would still face a loss of 40 billion rupees.Earlier today, Prime Minister Nawaz Sharif announced a decrease in the price of petroleum products.Petrol has been decreased by Rs 7.99, Hi-Octane by Rs 11.82, Light Diesel by Rs 9.56 and kerosene oil by Rs 10.48 per litre. \r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n',
   'KARACHI: Strong bulls on Friday pulled the benchmark KSE-100 Index at Karachi Stock Exchange (KSE) and taking it across the psychological barrier of 33,000 points with a single day\x92s gain of 207 points.The investors remained active right from the opening bells of today\x92s trade, contributing 207 points to the major

In [28]:
# pprint was already imported in the "Query" section; repeated so this cell runs on its own
from pprint import pprint
# Same search as in the previous cell, but passing raw text: Chroma embeds the
# query itself, using the embedding function associated with the collection.
results = collection.query(
    query_texts=["public transport fares by 7 per cent"], # Chroma will embed this for you
    n_results=2, # how many results to return
)
pprint(results)

{'data': None,
 'distances': [[1.3120756149291992, 1.3165346384048462]],
 'documents': [['strong>LONDON: British Prime Minister David Cameron warned '
                'voters on Sunday that they would face higher grocery bills if '
                'the country decides to leave the European Union at a June 23 '
                'referendum, citing a potential drop in the value of '
                'sterling.</strongCameron is leading the push to keep Britain '
                'inside the European Union ahead of the referendum, the '
                'outcome of which will have far-reaching consequences for the '
                "country's economy, its role in world trade and its global "
                'diplomatic status.His comments mark a shift in campaign '
                "tactics by the 'In' side: a push to make explicit the direct "
                'link between the macroeconomic risks that have dominated the '
                "Brexit debate so far, and their potential impact on Br