In [None]:
from openai import OpenAI
import os
import csv

import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

import tiktoken

from dotenv import load_dotenv, find_dotenv
# Load variables from the nearest .env file into the process environment.
_ = load_dotenv(find_dotenv())

# API key used for the OpenAI client here and for the embedding functions
# passed to ChromaDB in later cells.
api_key = os.environ.get('OPENAI_API_KEY')

# NOTE(review): the name `client` is rebound to a ChromaDB client in the next
# code cell, so this OpenAI client is not reachable after that point.
client = OpenAI(api_key=api_key)

# Vector databases for embedding systems

## Getting started with ChromaDB

In [None]:
# Open the persistent (on-disk) ChromaDB store with its default settings.
client = chromadb.PersistentClient()

# The store survives kernel restarts, so a plain `create_collection` raises on
# every run after the first because "netflix_titles" already exists.
# Fetching the existing collection, or creating it when absent, keeps this
# cell re-runnable under Restart & Run All.
collection = client.get_or_create_collection(
  name="netflix_titles",
  embedding_function=OpenAIEmbeddingFunction(api_key=api_key)
)

# Show which collections the store currently holds.
print(client.list_collections())

## Estimating embedding costs with tiktoken

In [None]:
# Parallel lists: ids[k] is the show_id of the text in documents[k].
ids = []
documents = []

# newline='' is the form the csv module documents for file objects, so quoted
# fields containing line breaks are parsed correctly. The explicit encoding
# stops the result depending on the platform default.
# NOTE(review): assumes the data file is UTF-8 encoded — confirm.
with open('data/netflix_titles_1000.csv', newline='', encoding='utf-8') as csvfile:
  reader = csv.DictReader(csvfile)
  for row in reader:
    ids.append(row['show_id'])
    # One text blob per title: this is what gets embedded.
    text = f"Title: {row['title']} ({row['type']})\nDescription: {row['description']}\nCategories: {row['listed_in']}"
    documents.append(text)

# Tokenizer matching the embedding model, used to count tokens locally
# before paying for any API call.
enc = tiktoken.encoding_for_model("text-embedding-ada-002")

total_tokens = sum(len(enc.encode(text)) for text in documents)

# Price per 1,000 tokens.
# NOTE(review): presumably USD for text-embedding-ada-002 — confirm against
# current OpenAI pricing.
cost_per_1k_tokens = 0.0001

print('Total tokens:', total_tokens)
print('Cost:', cost_per_1k_tokens * total_tokens/1000)

## Adding data to the collection

In [None]:
# Recreate the collection from scratch so this cell starts from an empty
# collection each time it runs.
client.delete_collection(name="netflix_titles")
collection = client.create_collection(
  name="netflix_titles",
  embedding_function=OpenAIEmbeddingFunction(api_key=api_key)
)

# The metadata-filtered query at the end of the notebook matches on `rating`
# and `release_year`; without metadatas stored here that filter can never
# match anything. Build one metadata dict per show_id from the same CSV.
# NOTE(review): assumes the CSV has `rating` and `release_year` columns, that
# every `release_year` parses as an integer, and that the file is UTF-8 —
# confirm against the data file.
with open('data/netflix_titles_1000.csv', newline='', encoding='utf-8') as csvfile:
  metadata_by_id = {
    row['show_id']: {
      "rating": row['rating'],
      # Stored as int so the numeric `$gt` comparison in the filter applies.
      "release_year": int(row['release_year'])
    }
    for row in csv.DictReader(csvfile)
  }

# Documents are embedded by the collection's embedding function on insert.
# Metadatas are looked up by id so they stay aligned with `ids`/`documents`.
collection.add(
  ids=ids,
  documents=documents,
  metadatas=[metadata_by_id[show_id] for show_id in ids]
)

print(f"No. of documents: {collection.count()}")
print(f"First ten documents: {collection.peek()}")

# Querying and updating the database

## Querying the collection

In [None]:
# Re-open the stored collection, supplying the embedding function so that
# query text is embedded on the way in.
collection = client.get_collection(
    name="netflix_titles",
    embedding_function=OpenAIEmbeddingFunction(api_key=api_key),
)

# Ask for the three stored items closest to a single free-text query.
result = collection.query(query_texts=["films about dogs"], n_results=3)

print(result)

## Updating and deleting items from a collection

In [None]:
# Re-open the stored collection with its embedding function.
collection = client.get_collection(
    name="netflix_titles",
    embedding_function=OpenAIEmbeddingFunction(api_key=api_key),
)

# Upsert two items by id; each id is paired with the document at the same
# position in the list.
collection.upsert(
    ids=["s1001", "s6884"],
    documents=[
        "Title: Cats & Dogs (Movie)\nDescription: A look at the top-secret, high-tech espionage war going on between cats and dogs, of which their human owners are blissfully unaware.",
        'Title: Goosebumps 2: Haunted Halloween (Movie)\nDescription: Three teens spend their Halloween trying to stop a magical book, which brings characters from the "Goosebumps" novels to life.\nCategories: Children & Family Movies, Comedies',
    ],
)

# Remove a single item by id.
collection.delete(ids=["s95"])

# Repeat the earlier query so the effect of the upsert/delete is visible.
result = collection.query(query_texts=["films about dogs"], n_results=3)

print(result)

# Multiple queries and filtering

## Querying with multiple texts

In [None]:
# Re-open the stored collection with its embedding function.
collection = client.get_collection(
    name="netflix_titles",
    embedding_function=OpenAIEmbeddingFunction(api_key=api_key),
)

# Ids of the stored titles whose text will serve as the queries.
reference_ids = ['s999', 's1000']

# Look up the stored document text for those ids.
reference_texts = collection.get(ids=reference_ids)["documents"]

# One query per reference text, three results requested for each.
result = collection.query(query_texts=reference_texts, n_results=3)

print(result['documents'])

## Filtering using metadata

In [None]:
# Re-open the stored collection with its embedding function.
collection = client.get_collection(
    name="netflix_titles",
    embedding_function=OpenAIEmbeddingFunction(api_key=api_key),
)

# Two independent free-text queries.
reference_texts = ["children's story about a car", "lions"]

# Restrict candidates to items whose metadata has rating == "G" AND
# release_year > 2000, then return the two closest matches per query.
# NOTE(review): this filter can only match items that were stored with
# `rating` and `release_year` metadata — confirm the collection was
# populated with them.
result = collection.query(
    query_texts=reference_texts,
    n_results=2,
    where={
        "$and": [
            {"rating": {"$eq": "G"}},
            {"release_year": {"$gt": 2000}},
        ]
    },
)

print(result['documents'])