In [12]:
from openai import OpenAI
import os
import csv

import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

import tiktoken

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

api_key=os.getenv('OPENAI_API_KEY')
client = OpenAI(api_key=api_key)

# Vector databases for embedding systems

## Getting started with ChromaDB

In [13]:
client = chromadb.PersistentClient()

client.delete_collection(name="netflix_titles")
collection = client.create_collection(
  name="netflix_titles",
  embedding_function=OpenAIEmbeddingFunction(api_key=api_key)
)

print(client.list_collections())

[Collection(name=netflix_titles)]


## Estimating embedding costs with tiktoken

In [14]:
ids = []
documents = []

with open('data/netflix_titles_1000.csv') as csvfile:
  reader = csv.DictReader(csvfile)
  for i, row in enumerate(reader):
    ids.append(row['show_id'])
    text = f"Title: {row['title']} ({row['type']})\nDescription: {row['description']}\nCategories: {row['listed_in']}"
    documents.append(text)

enc = tiktoken.encoding_for_model("text-embedding-ada-002")

total_tokens = sum(len(enc.encode(text)) for text in documents)

cost_per_1k_tokens = 0.0001

print('Total tokens:', total_tokens)
print('Cost:', cost_per_1k_tokens * total_tokens/1000)

Total tokens: 51226
Cost: 0.005122600000000001


## Adding data to the collection

In [16]:
client.delete_collection(name="netflix_titles")
collection = client.create_collection(
  name="netflix_titles",
  embedding_function=OpenAIEmbeddingFunction(api_key=api_key)
)

collection.add(
ids=ids,
documents=documents
)

print(f"No. of documents: {collection.count()}")
print(f"First ten documents: {collection.peek()}")

No. of documents: 1000
First ten documents: {'ids': ['s1', 's10', 's100', 's1000', 's101', 's102', 's103', 's104', 's105', 's106'], 'embeddings': [[-0.005237183533608913, 0.007339796517044306, -0.017865760251879692, -0.007978320121765137, -0.005917630158364773, 0.02492176927626133, -0.0047760275192558765, -0.019194405525922775, -0.009906790219247341, -0.01013898104429245, 0.00680446857586503, 0.025463547557592392, -0.000901350227650255, 0.01315100584179163, -0.014060418121516705, 0.0014560272684320807, 0.026443906128406525, 0.007552637718617916, 0.02956557646393776, -0.03549933061003685, 0.0002452914195600897, 0.01968458667397499, -0.008029918186366558, 0.021838797256350517, 0.022083885967731476, 0.013647635467350483, 0.0019655562937259674, -0.015505158342421055, 0.006417484488338232, 0.008494298905134201, 0.0030861974228173494, -0.00911992322653532, 0.006798018701374531, -0.05041111633181572, -0.008642642758786678, 0.0001702124864095822, -0.02812083624303341, -0.02419939823448658, 0.0