<a href="https://colab.research.google.com/github/kimdesok/Learning/blob/main/chroma.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# New Section

In [2]:
!pip install chromadb tiktoken
!pip install python-dotenv


Collecting chromadb
  Downloading chromadb-1.0.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi==0.115.9 (from chromadb)
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.23.0-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.21.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_2

In [3]:
from dotenv import load_dotenv
import os
import openai, chromadb

# Load .env file
load_dotenv()

# Retrieve the key
OPENAI_API_TOKEN = os.getenv("OPENAI_API_KEY")
#print(OPENAI_API_TOKEN)

## Create or get Collection

In [29]:
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

# Create a persistant client
db_path = "./data"
client = chromadb.PersistentClient(path=db_path)

# Create a netflix_title collection using the OpenAI Embedding function
collection = client.get_or_create_collection(
    name="netflix_titles",
    embedding_function=OpenAIEmbeddingFunction(
        model_name="text-embedding-3-small",
        api_key=OPENAI_API_TOKEN
    )
)

In [5]:
# List the collections
print(client.list_collections())


[Collection(name=netflix_titles)]


## Dataset loading to populate Collection with the ids & documents

In [30]:
import csv

ids = []
documents = []

with open('./netflix_titles_1000.csv') as csvfile:
  reader = csv.DictReader(csvfile)
  for i, row in enumerate(reader):
    ids.append(row['show_id'])
    text = f"Title: {row['title']} ({row['type']})\nDescription: {row['description']}\nCategories: {row['listed_in']}"
    documents.append(text)

##Retrieve collection

In [33]:
import os
import sys
import contextlib

# Assuming you already have the client and collection loaded
collection = client.get_collection("netflix_titles")

# Silent add block
with open(os.devnull, 'w') as f, contextlib.redirect_stdout(f):
    collection.add(
        ids=ids,
        documents=documents
    )

result = collection.get()
#print(result)

In [8]:
# Fetch all documents (or as many as you need)
all_items = collection.get()  # This returns a dict with keys like "documents", "ids", etc.
#print(len(all_items), all_items.keys(), all_items)

# Extract just the texts into a list
documents = all_items["documents"]

# Optional: preview
print("📄 Number of documents:", collection.count())

print("🔹 First document:", collection.peek(3))

collection.delete(ids=[ids[0], ids[1]])

📄 Number of documents: 1000
🔹 First document: {'ids': ['s1', 's2', 's3'], 'embeddings': array([[-0.10890239, -0.04683724, -0.07819176, ...,  0.00722835,
         0.02060966, -0.04477739],
       [-0.09149237,  0.0402753 , -0.0268563 , ...,  0.03136477,
         0.00951984, -0.01715871],
       [-0.00504848,  0.00746851, -0.05256566, ..., -0.01810597,
         0.01158333, -0.04438727]]), 'documents': ['Title: Dick Johnson Is Dead (Movie)\nDescription: As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.\nCategories: Documentaries', 'Title: Blood & Water (TV Show)\nDescription: After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth.\nCategories: International TV Shows, TV Dramas, TV Mysteries', 'Title: Ganglands (TV Show)\nDescription: To protect his family from a powerful drug lord, skilled thief M

## Query

In [9]:
result = collection.query(
    query_texts = ["movies where people sing a lot"],
    n_results=3
)

In [10]:
print(result.keys())
print(result['ids'])
print(result['distances'])
print(result['documents'][0]) #for i in range(3)
print(result['embeddings'])

dict_keys(['ids', 'embeddings', 'documents', 'uris', 'included', 'data', 'metadatas', 'distances'])
[['s293', 's376', 's851']]
[[1.0587867498397827, 1.140680193901062, 1.1417112350463867]]
['Title: Quartet (Movie)\nDescription: To save their posh retirement home, former opera stars plan a gala recital — until the biggest diva among them refuses to sing.\nCategories: Comedies, Dramas, Independent Movies', 'Title: Resort to Love (Movie)\nDescription: Reeling from a broken heart and career meltdown, a singer takes a gig at a swanky island resort — where her ex-fiancé happens to be getting married.\nCategories: Comedies, Romantic Movies', 'Title: 99 Songs (Movie)\nDescription: Challenged to compose 100 songs before he can marry the girl he loves, a tortured but passionate singer-songwriter embarks on a poignant musical journey.\nCategories: Dramas, International Movies, Music & Musicals']
None


## Updating

In [11]:
new_data = [{"id": "s1001", "document": "Title: Cats & Dogs (Movie)\nDescription: A look at the top-secret, high-tech espionage war going on between cats and dogs, of which their human owners are blissfully unaware."},
 {"id": "s6884", "document": 'Title: Goosebumps 2: Haunted Halloween (Movie)\nDescription: Three teens spend their Halloween trying to stop a magical book, which brings characters from the "Goosebumps" novels to life.\nCategories: Children & Family Movies, Comedies'}]

In [12]:
collection.update(
    ids=[doc['id'] for doc in new_data],
    documents=[doc['document'] for doc in new_data]
)

In [13]:
collection.upsert(
    ids=[doc['id'] for doc in new_data],
    documents=[doc['document'] for doc in new_data]
)

In [14]:
result = collection.query(
    query_texts = ["movies showing ocean sailors"],
    n_results=3
)

print(result['ids'])
print(result['distances'])
print(result['documents'][0]) #for i in range(3)
print(result['embeddings'])

[['s333', 's941', 's634']]
[[0.8409919738769531, 0.8583800792694092, 0.8980426788330078]]
["Title: Deep Blue Sea (Movie)\nDescription: Scientists conduct research on sharks in search of an Alzheimer's cure. But a dangerous shortcut leads to huge sharks with near-human intelligence.\nCategories: Action & Adventure, Horror Movies, Sci-Fi & Fantasy", 'Title: Motu Patlu: Deep Sea Adventure (Movie)\nDescription: Friends Motu and Patlu get more maritime excitement than anticipated when the sinking of their ship sends them on a journey through the ocean floor!\nCategories: Children & Family Movies, Comedies', 'Title: Into the Wind (Movie)\nDescription: Two siblings set sail as a crew on a yacht on the Aegean Sea. Circumstances soon change when a young documentary filmmaker comes aboard.\nCategories: Dramas, Independent Movies, International Movies']
None


In [15]:
reference_ids = result['ids'][0]
reference_texts = collection.get(ids=reference_ids)['documents']
print(reference_texts)

result = collection.query(
    query_texts=reference_texts,
    n_results=3
)

print(result['ids'])
print(result['distances'])
print(result)

["Title: Deep Blue Sea (Movie)\nDescription: Scientists conduct research on sharks in search of an Alzheimer's cure. But a dangerous shortcut leads to huge sharks with near-human intelligence.\nCategories: Action & Adventure, Horror Movies, Sci-Fi & Fantasy", 'Title: Into the Wind (Movie)\nDescription: Two siblings set sail as a crew on a yacht on the Aegean Sea. Circumstances soon change when a young documentary filmmaker comes aboard.\nCategories: Dramas, Independent Movies, International Movies', 'Title: Motu Patlu: Deep Sea Adventure (Movie)\nDescription: Friends Motu and Patlu get more maritime excitement than anticipated when the sinking of their ship sends them on a journey through the ocean floor!\nCategories: Children & Family Movies, Comedies']
[['s333', 's44', 's835'], ['s634', 's192', 's823'], ['s941', 's940', 's938']]
[[0.0, 0.8070592284202576, 0.8218811750411987], [0.0, 0.8925493955612183, 0.9623372554779053], [0.0, 0.5948403477668762, 0.6144980192184448]]
{'ids': [['s333

In [16]:
def print_s(documents):
  for entry in documents:  # assuming your list is named `documents`
    lines = entry.split("\n")
    for line in lines:
        if line.startswith("Title:"):
            print(f"📌 {line}")
        elif line.startswith("Description:"):
            print(f"📝 {line}")
        elif line.startswith("Categories:"):
            print(f"🏷️ {line}")
    print()  # blank line between entries

print_s(result['documents'][0])
print_s(result['documents'][1])
print_s(result['documents'][2])

📌 Title: Deep Blue Sea (Movie)
📝 Description: Scientists conduct research on sharks in search of an Alzheimer's cure. But a dangerous shortcut leads to huge sharks with near-human intelligence.
🏷️ Categories: Action & Adventure, Horror Movies, Sci-Fi & Fantasy

📌 Title: Jaws 3 (Movie)
📝 Description: After the staff of a marine theme park try to capture a young great white shark, they discover its mother has invaded the enclosure and is out for blood.
🏷️ Categories: Action & Adventure, Horror Movies, Thrillers

📌 Title: Blue Miracle (Movie)
📝 Description: To save their cash-strapped orphanage, a guardian and his kids partner with a washed-up boat captain for a chance to win a lucrative fishing competition.
🏷️ Categories: Children & Family Movies, Dramas, Faith & Spirituality

📌 Title: Into the Wind (Movie)
📝 Description: Two siblings set sail as a crew on a yacht on the Aegean Sea. Circumstances soon change when a young documentary filmmaker comes aboard.
🏷️ Categories: Dramas, Independ

## Metadata adding

In [17]:
import csv
ids = []
metadatas = []
with open('netflix_titles_1000.csv') as csvfile:
  reader = csv.DictReader(csvfile)
  for i, row in enumerate(reader):
    ids.append(row['show_id'])
    metadatas.append({
        'type': row['type'],
        'release_year': int(row['release_year']),
    })

## Adding metadata

In [18]:
#collection.delete(ids=['id-1', 'id-2'])
ids = []
metadatas = []
with open('netflix_titles_1000.csv') as csvfile:
  reader = csv.DictReader(csvfile)
  for i, row in enumerate(reader):
    ids.append(row['show_id'])
    metadatas.append({
        'type': row['type'],
        'release_year': int(row['release_year']),
    })


Adding and quering metadatas

In [23]:
collection.update(ids=ids, metadatas=metadatas)

result=collection.query(
    query_texts=reference_texts,
    n_results=3,
    where={
        #"type":"Movie"
        "type": {"$eq": "Movie"}
    }
)

In [24]:
result=collection.query(
    query_texts=reference_texts,
    n_results=3,
    where={
        "$and": [
          {"type": {"$eq": "Movie"}},
          {"release_year": {"$gt": 2020}}
        ]
    }
)

In [35]:
client.delete_collection("netflix_titles")

In [36]:
collection = client.create_collection(
  name="netflix_titles",
  embedding_function=OpenAIEmbeddingFunction(model_name="text-embedding-3-small", api_key=OPENAI_API_TOKEN)
)

In [39]:
ids = []
metadatas = []
with open('netflix_titles_1000.csv') as csvfile:
  reader = csv.DictReader(csvfile)
  for i, row in enumerate(reader):
    ids.append(row['show_id'])
    metadatas.append({
        "rating":row['rating'],
        "release_year": int(row['release_year'])
    })


collection.add(
    ids=ids,
    documents=documents,
    metadatas=metadatas
)

In [56]:
reference_texts = ["children's story about a car", "Sailor's story"]

# Query two results using reference_texts
result = collection.query(
  query_texts=reference_texts,
  n_results=2,
  # Filter for titles with a G rating released before 2019
  where={
    "$and": [
        {"rating":
        	{"$eq":"G"}
        },
        {"release_year":
         	{"$lt":2019}
        }
    ]
  }
)


print_s(result['documents'][0])
print_s(result['documents'][1])

📌 Title: A Champion Heart (Movie)
📝 Description: When a grieving teen must work off her debt to a ranch, she cares for a wounded horse that teaches her more about healing than she expected.
🏷️ Categories: Children & Family Movies, Dramas

📌 Title: Hachi: A Dog's Tale (Movie)
📝 Description: When his master dies, a loyal pooch named Hachiko keeps a vigil for more than a decade at the train station where he once greeted his owner every day.
🏷️ Categories: Children & Family Movies, Dramas

📌 Title: Hachi: A Dog's Tale (Movie)
📝 Description: When his master dies, a loyal pooch named Hachiko keeps a vigil for more than a decade at the train station where he once greeted his owner every day.
🏷️ Categories: Children & Family Movies, Dramas

📌 Title: A Champion Heart (Movie)
📝 Description: When a grieving teen must work off her debt to a ranch, she cares for a wounded horse that teaches her more about healing than she expected.
🏷️ Categories: Children & Family Movies, Dramas



In [21]:
import tiktoken

# Load the encoder for the OpenAI text-embedding-3-small model
enc = tiktoken.encoding_for_model("text-embedding-3-small")

# Encode each text in documents and calculate the total tokens
total_tokens = sum(len(enc.encode(text)) for text in documents)

cost_per_1k_tokens = 0.00002

# Display number of tokens and cost
print('Total tokens:', total_tokens)
print('Cost:', total_tokens*cost_per_1k_tokens/1000)

Total tokens: 51226
Cost: 0.0010245200000000001
