In [1]:
from openai import OpenAI
import os
import csv

import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

import tiktoken

from dotenv import load_dotenv, find_dotenv
# Load variables from a .env file (located by find_dotenv) into the environment.
_ = load_dotenv(find_dotenv())

# Read the key once; later cells pass `api_key` to the embedding function.
api_key = os.getenv('OPENAI_API_KEY')

# Named `openai_client` rather than `client` so this object is not silently
# replaced when `client` is bound to the ChromaDB client in the next cell.
openai_client = OpenAI(api_key=api_key)

# Vector databases for embedding systems

## Getting started with ChromaDB

In [2]:
# Client backed by ChromaDB's persistent storage; no path is given, so the
# library's default location is used.
client = chromadb.PersistentClient()

# Drop any leftover collection from a previous run so create_collection()
# below does not collide with it. Only delete when it actually exists: on a
# fresh store an unconditional delete_collection() raises, which breaks
# Restart & Run All.
# list_collections() returns Collection objects or plain names depending on
# the chromadb version, so normalise both to a name before comparing.
existing_names = [getattr(item, "name", item) for item in client.list_collections()]
if "netflix_titles" in existing_names:
  client.delete_collection(name="netflix_titles")

# Documents added to this collection are embedded with the OpenAI embedding
# function, authenticated with the key read in the first cell.
collection = client.create_collection(
  name="netflix_titles",
  embedding_function=OpenAIEmbeddingFunction(api_key=api_key)
)

print(client.list_collections())

[Collection(name=netflix_titles)]


## Estimating embedding costs with tiktoken

In [3]:
# Parallel lists: ids[k] is the show_id of the title described by documents[k].
ids = []
documents = []

# newline='' is what the csv module asks for on files it parses itself, and an
# explicit encoding keeps the read independent of the platform default.
# NOTE(review): assumes the CSV is UTF-8 encoded - confirm for this data file.
with open('data/netflix_titles_1000.csv', newline='', encoding='utf-8') as csvfile:
  for row in csv.DictReader(csvfile):
    ids.append(row['show_id'])
    # One text blob per title: this is the string that gets tokenised below
    # and later embedded.
    documents.append(
      f"Title: {row['title']} ({row['type']})\n"
      f"Description: {row['description']}\n"
      f"Categories: {row['listed_in']}"
    )

# Tokeniser matching the embedding model, used to count tokens locally.
enc = tiktoken.encoding_for_model("text-embedding-ada-002")

total_tokens = sum(len(enc.encode(doc)) for doc in documents)

# Price per 1,000 tokens.
# NOTE(review): presumably USD for text-embedding-ada-002 - confirm against current pricing.
cost_per_1k_tokens = 0.0001

print('Total tokens:', total_tokens)
print('Cost:', cost_per_1k_tokens * total_tokens/1000)

Total tokens: 51226
Cost: 0.005122600000000001


## Adding data to the collection

In [4]:
# Drop and recreate the collection so the documents below are added to an
# empty collection.
embedder = OpenAIEmbeddingFunction(api_key=api_key)
client.delete_collection(name="netflix_titles")
collection = client.create_collection(name="netflix_titles", embedding_function=embedder)

# Add every document built in the previous cell under its show_id.
collection.add(documents=documents, ids=ids)

# Report the collection size, then a sample of what was stored.
doc_count = collection.count()
print(f"No. of documents: {doc_count}")

first_items = collection.peek()
print(f"First ten documents: {first_items}")

No. of documents: 1000
First ten documents: {'ids': ['s1', 's10', 's100', 's1000', 's101', 's102', 's103', 's104', 's105', 's106'], 'embeddings': [[-0.005237183533608913, 0.007339796517044306, -0.017865760251879692, -0.007978320121765137, -0.005917630158364773, 0.02492176927626133, -0.0047760275192558765, -0.019194405525922775, -0.009906790219247341, -0.01013898104429245, 0.00680446857586503, 0.025463547557592392, -0.000901350227650255, 0.01315100584179163, -0.014060418121516705, 0.0014560272684320807, 0.026443906128406525, 0.007552637718617916, 0.02956557646393776, -0.03549933061003685, 0.0002452914195600897, 0.01968458667397499, -0.008029918186366558, 0.021838797256350517, 0.022083885967731476, 0.013647635467350483, 0.0019655562937259674, -0.015505158342421055, 0.006417484488338232, 0.008494298905134201, 0.0030861974228173494, -0.00911992322653532, 0.006798018701374531, -0.05041111633181572, -0.008642642758786678, 0.0001702124864095822, -0.02812083624303341, -0.02419939823448658, 0.0

# Querying and updating the database

## Querying the collection

In [5]:
# Re-open the existing collection, passing the same embedding function that
# was used when it was created.
embedder = OpenAIEmbeddingFunction(api_key=api_key)
collection = client.get_collection(name="netflix_titles", embedding_function=embedder)

# Ask for the three stored documents closest to a free-text query.
search_texts = ["films about dogs"]
result = collection.query(query_texts=search_texts, n_results=3)

print(result)

{'ids': [['s95', 's830', 's500']], 'distances': [[0.2509266138076782, 0.2638828456401825, 0.285917192697525]], 'metadatas': [[None, None, None]], 'embeddings': None, 'documents': [['Title: Show Dogs (Movie)\nDescription: A rough and tough police dog must go undercover with an FBI agent as a prim and proper pet at a dog show to save a baby panda from an illegal sale.\nCategories: Children & Family Movies, Comedies', 'Title: Dog Gone Trouble (Movie)\nDescription: The privileged life of a pampered dog named Trouble is turned upside-down when he gets lost and must learn to survive on the big-city streets.\nCategories: Children & Family Movies, Comedies', 'Title: Dogs (TV Show)\nDescription: These six intimate stories explore the abiding emotional bonds that form between dogs and their caregivers, no matter the circumstances.\nCategories: Docuseries']], 'uris': None, 'data': None}


## Updating and deleting items from a collection

In [6]:
# Re-open the existing collection with the OpenAI embedding function.
embedder = OpenAIEmbeddingFunction(api_key=api_key)
collection = client.get_collection(name="netflix_titles", embedding_function=embedder)

# Titles to upsert, keyed by id (dict order is the order passed to upsert).
titles_to_upsert = {
  "s1001": "Title: Cats & Dogs (Movie)\nDescription: A look at the top-secret, high-tech espionage war going on between cats and dogs, of which their human owners are blissfully unaware.",
  "s6884": 'Title: Goosebumps 2: Haunted Halloween (Movie)\nDescription: Three teens spend their Halloween trying to stop a magical book, which brings characters from the "Goosebumps" novels to life.\nCategories: Children & Family Movies, Comedies',
}
collection.upsert(
  ids=list(titles_to_upsert.keys()),
  documents=list(titles_to_upsert.values())
)

# Remove one title by id before querying again.
ids_to_remove = ["s95"]
collection.delete(ids=ids_to_remove)

# Same query as the previous cell, so the effect of the changes is visible.
result = collection.query(query_texts=["films about dogs"], n_results=3)

print(result)

{'ids': [['s830', 's1001', 's500']], 'distances': [[0.2637295126914978, 0.2761459603132039, 0.28586024045944214]], 'metadatas': [[None, None, None]], 'embeddings': None, 'documents': [['Title: Dog Gone Trouble (Movie)\nDescription: The privileged life of a pampered dog named Trouble is turned upside-down when he gets lost and must learn to survive on the big-city streets.\nCategories: Children & Family Movies, Comedies', 'Title: Cats & Dogs (Movie)\nDescription: A look at the top-secret, high-tech espionage war going on between cats and dogs, of which their human owners are blissfully unaware.', 'Title: Dogs (TV Show)\nDescription: These six intimate stories explore the abiding emotional bonds that form between dogs and their caregivers, no matter the circumstances.\nCategories: Docuseries']], 'uris': None, 'data': None}
