In [1]:
import pathlib
import polars as pl
import chromadb
import os
import json
from openai import OpenAI
from chromadb.utils import embedding_functions
from more_itertools import batched

In [2]:
def prepare_car_reviews_data(
    data_path: pathlib.Path, vehicle_years: "list[int] | None" = None
):
    """Prepare the car reviews dataset for ChromaDB.

    The CSV file(s) at ``data_path`` are scanned lazily, the model year and
    the second token of ``Vehicle_Title`` are split out into their own
    columns, and only reviews whose year is in ``vehicle_years`` are kept.
    The surviving rows are sorted by ``Vehicle_Model`` and then ``Rating``.

    Args:
        data_path: Location handed straight to ``pl.scan_csv``.
        vehicle_years: Model years to keep. ``None`` (the default) selects
            ``[2017]``.

    Returns:
        A dict with three parallel lists:
        ``"ids"`` (``"review0"``, ``"review1"``, ...),
        ``"documents"`` (the ``Review`` text), and
        ``"metadatas"`` (one dict per review holding ``Review_Title``,
        ``Rating``, ``Vehicle_Year`` and ``Vehicle_Model``).
    """

    # Resolve the default per call instead of using a mutable list literal in
    # the signature, which would be a single object shared by every call.
    selected_years = [2017] if vehicle_years is None else list(vehicle_years)

    # Define the schema to ensure proper data types are enforced.
    # The "" key is presumably the unnamed index column of the CSV export.
    dtypes = {
        "": pl.Int64,
        "Review_Date": pl.Utf8,
        "Author_Name": pl.Utf8,
        "Vehicle_Title": pl.Utf8,
        "Review_Title": pl.Utf8,
        "Review": pl.Utf8,
        "Rating": pl.Float64,
    }

    # Scan the car reviews dataset(s) lazily; nothing is read until .collect()
    car_reviews = pl.scan_csv(data_path, dtypes=dtypes)

    # Vehicle_Title is split on spaces: token 0 is cast to an integer year,
    # token 1 is kept as the model label.
    title_tokens = pl.col("Vehicle_Title").str.split(by=" ")

    car_review_db_data = (
        car_reviews.with_columns(
            [
                title_tokens.list.get(0).cast(pl.Int64).alias("Vehicle_Year"),
                title_tokens.list.get(1).alias("Vehicle_Model"),
            ]
        )
        .filter(pl.col("Vehicle_Year").is_in(selected_years))
        .select(["Review_Title", "Review", "Rating", "Vehicle_Year", "Vehicle_Model"])
        .sort(["Vehicle_Model", "Rating"])
        .collect()
    )

    # Create ids, documents, and metadatas data in the format chromadb expects
    ids = [f"review{i}" for i in range(car_review_db_data.shape[0])]
    documents = car_review_db_data["Review"].to_list()
    metadatas = car_review_db_data.drop("Review").to_dicts()

    return {"ids": ids, "documents": documents, "metadatas": metadatas}

In [3]:
def build_chroma_collection(
    chroma_path: pathlib.Path,
    collection_name: str,
    embedding_func_name: str,
    ids: list[str],
    documents: list[str],
    metadatas: list[dict],
    distance_func_name: str = "cosine",
    batch_size: int = 166,
):
    """Create a ChromaDB collection and load every document into it.

    Args:
        chroma_path: Location passed to ``chromadb.PersistentClient``.
        collection_name: Name of the collection to create.
        embedding_func_name: Model name given to
            ``SentenceTransformerEmbeddingFunction``.
        ids: One id per document.
        documents: Texts to store.
        metadatas: One metadata dict per document.
        distance_func_name: Value stored under the ``"hnsw:space"``
            collection metadata key.
        batch_size: Maximum number of records sent per ``collection.add``
            call.

    Raises:
        ValueError: If ``batch_size`` is less than 1, or if ``ids``,
            ``documents`` and ``metadatas`` differ in length.
    """

    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    if not (len(ids) == len(documents) == len(metadatas)):
        raise ValueError(
            "ids, documents and metadatas must all have the same length"
        )

    chroma_client = chromadb.PersistentClient(chroma_path)

    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=embedding_func_name
    )

    collection = chroma_client.create_collection(
        name=collection_name,
        embedding_function=embedding_func,
        metadata={"hnsw:space": distance_func_name},
    )

    # Walk the inputs in fixed-size windows. The slice end is exclusive, so it
    # must be start + batch_size; using the index of the batch's last element
    # as the end would drop that element from every batch.
    for start_idx in range(0, len(documents), batch_size):
        end_idx = start_idx + batch_size

        collection.add(
            ids=ids[start_idx:end_idx],
            documents=documents[start_idx:end_idx],
            metadatas=metadatas[start_idx:end_idx],
        )

In [9]:
def print_review_summaries(review_summaries):
    """Print the first choice's message text, then each instance attribute.

    The text comes from ``review_summaries.choices[0].message.content``.
    After it, every entry of the object's ``__dict__`` is printed on its own
    line as ``name: value``.
    """
    first_choice = review_summaries.choices[0]
    print(first_choice.message.content)

    # vars(obj) is the object's __dict__, so this lists its instance attributes
    for name, value in vars(review_summaries).items():
        print(f"{name}: {value}")

In [5]:
# Data locations default to the original author's machine. Set the
# CAR_REVIEWS_DATA_PATH / CAR_REVIEWS_CHROMA_PATH environment variables to
# point the notebook elsewhere without editing this cell.
DATA_PATH = os.environ.get(
    "CAR_REVIEWS_DATA_PATH",
    "/Users/davidkolb/Documents/Code/kolbeuk-data/vectordb/archive/*",
)
CHROMA_PATH = os.environ.get(
    "CAR_REVIEWS_CHROMA_PATH",
    "/Users/davidkolb/Documents/Code/kolbeuk-data/vectordb/car_review_embeddings",
)
EMBEDDING_FUNC_NAME = "multi-qa-MiniLM-L6-cos-v1"
COLLECTION_NAME = "car_reviews"

# Load and filter the reviews into the ids/documents/metadatas lists
chroma_car_reviews_dict = prepare_car_reviews_data(DATA_PATH)

# NOTE(review): this calls create_collection, which presumably fails if the
# collection already exists under CHROMA_PATH - confirm before re-running.
build_chroma_collection(
    CHROMA_PATH,
    COLLECTION_NAME,
    EMBEDDING_FUNC_NAME,
    chroma_car_reviews_dict["ids"],
    chroma_car_reviews_dict["documents"],
    chroma_car_reviews_dict["metadatas"]
)

In [6]:
# Re-open the persisted collection with the same embedding model name that
# was used to build it.
client = chromadb.PersistentClient(CHROMA_PATH)
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBEDDING_FUNC_NAME,
)
collection = client.get_collection(
    embedding_function=embedding_func,
    name=COLLECTION_NAME,
)

In [7]:
# Semantic search: fetch the five closest reviews to a natural-language query,
# along with their distances and metadata.
great_reviews = collection.query(
    include=["documents", "distances", "metadatas"],
    n_results=5,
    query_texts=["Find me some positive reviews that discuss the car's performance"],
)

# First returned document for the first (and only) query text
great_reviews["documents"][0][0]

' Great all around car with great balance of performance and comfort. Terrific technology too.'

In [8]:
# Sanity check: preview a few stored records (ids, embeddings, ...) from the collection
collection.peek()

{'ids': ['review0',
  'review1',
  'review10',
  'review100',
  'review1000',
  'review1001',
  'review1002',
  'review1003',
  'review1004',
  'review1005'],
 'embeddings': [[-0.007107960991561413,
   -0.04246281832456589,
   0.05003391206264496,
   0.03374449908733368,
   -0.03472192585468292,
   -0.06995291262865067,
   0.09642210602760315,
   -0.010946568101644516,
   0.004524622578173876,
   -0.0807584747672081,
   0.1146438792347908,
   -0.028937509283423424,
   0.0036295833997428417,
   -0.00577996950596571,
   -0.08995964378118515,
   -0.002704076236113906,
   0.051943592727184296,
   -0.10538087040185928,
   -0.010452251881361008,
   -0.002081404672935605,
   -0.08264221996068954,
   0.004843683913350105,
   -0.07408193498849869,
   -0.046264152973890305,
   -0.017171526327729225,
   0.08780661970376968,
   0.001152361393906176,
   -0.039216428995132446,
   -0.017377350479364395,
   -0.08566072583198547,
   0.0011874429183080792,
   0.005545011721551418,
   -0.0194988604635000

In [10]:
# Baseline request: the model answers with no car reviews in the prompt.
context = "You are a customer success employee at a large car dealership."
question = "What's the key to great customer satisfaction?"

client = OpenAI()

completion = client.chat.completions.create(
    messages=[
        {"role": "system", "content": context},
        {"role": "user", "content": question},
    ],
    model="gpt-3.5-turbo",
    temperature=0,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [11]:
# Show the answer text and the raw attributes of the baseline completion
# (the request that was sent without any car reviews in the prompt).
print_review_summaries(completion)

The key to great customer satisfaction is providing exceptional customer service. This involves understanding and anticipating the needs of customers, being responsive and attentive to their inquiries and concerns, and going above and beyond to exceed their expectations. Building strong relationships with customers, being knowledgeable about the products and services offered, and consistently delivering a positive and personalized experience are also crucial. Additionally, actively seeking feedback and continuously improving based on customer input is essential for maintaining high levels of customer satisfaction.
id: chatcmpl-8ZdKM9Ldu3WOqdkA9dz47TYrEwrNT
choices: [Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='The key to great customer satisfaction is providing exceptional customer service. This involves understanding and anticipating the needs of customers, being responsive and attentive to their inquiries and concerns, and going above and beyond to exc

In [12]:
# Handles used by the following cells: the persisted Chroma collection for
# retrieval and an OpenAI client for chat completions.
chroma_client = chromadb.PersistentClient(CHROMA_PATH)
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBEDDING_FUNC_NAME,
)
collection = chroma_client.get_collection(
    embedding_function=embedding_func,
    name=COLLECTION_NAME,
)

client = OpenAI()

In [13]:
# System-prompt template; the "{}" placeholder is filled with review text
# via str.format when the chat request is built.
context = (
    " You are a customer success employee at a large car dealership."
    " Use the following car reviews to answer questions: {} "
)

question = "What's the key to great customer satisfaction based on detailed positive reviews?"

# Retrieve the ten reviews closest to the question, restricted to Rating >= 3
good_reviews = collection.query(
    where={"Rating": {"$gte": 3}},
    include=["documents"],
    n_results=10,
    query_texts=[question],
)

# Flatten the retrieved documents into one comma-separated string
reviews_str = ",".join(good_reviews["documents"][0])

In [14]:
# Ask the same question again, this time with the retrieved reviews injected
# into the system prompt.
good_review_summaries = client.chat.completions.create(
    messages=[
        {"role": "system", "content": context.format(reviews_str)},
        {"role": "user", "content": question},
    ],
    model="gpt-3.5-turbo",
)

In [15]:
# Show the answer text and the raw attributes of the completion that was
# given the retrieved reviews (Rating >= 3) as context.
print_review_summaries(good_review_summaries)

Based on the detailed positive reviews, the key to great customer satisfaction appears to be a combination of several factors:

1. Value for Money: Customers appreciate getting more for their money, whether it's in terms of features, warranty coverage, or fuel efficiency. Providing a competitive price and offering more value than the competitors can greatly enhance customer satisfaction.

2. Reliability: Customers value reliability and dependability in their vehicles. If a car has a reputation for being reliable, customers are more likely to feel satisfied with their purchase and have faith in the longevity of the vehicle.

3. Performance and Comfort: The performance and comfort of the car are mentioned as important factors in customer satisfaction. Customers appreciate smooth acceleration, quiet ride, comfortable seating, and good ergonomics.

4. Unique Features: Having unique features or characteristics can draw customers in. In one review, the customer was attracted to the Nissan Ti

In [16]:
question = "Which of these poor reviews has the worst implications about our dealership? Explain why."

# Retrieve the five reviews closest to the question, restricted to Rating <= 3
poor_reviews = collection.query(
    where={"Rating": {"$lte": 3}},
    include=["documents"],
    n_results=5,
    query_texts=[question],
)

# Flatten the retrieved documents into one comma-separated string
reviews_str = ",".join(poor_reviews["documents"][0])

# Reuse the existing system-prompt template with the poor reviews as context
poor_review_analysis = client.chat.completions.create(
    messages=[
        {"role": "system", "content": context.format(reviews_str)},
        {"role": "user", "content": question},
    ],
    model="gpt-3.5-turbo",
)

In [17]:
# Show the answer text and the raw attributes of the completion that was
# given the retrieved reviews (Rating <= 3) as context.
print_review_summaries(poor_review_analysis)

The first review has the worst implications about the dealership. This is because the customer states that they have been to the dealership four times and still have unresolved electrical issues with the car. The customer specifically mentions numerous problems with various features of the car, including Bluetooth, backup camera, trunk, black screen, clock, and seatbelts. The fact that the customer has visited the dealership multiple times and still has unresolved issues suggests that the dealership has been unable to effectively address and fix the problems with the car. Additionally, the customer mentions their frustration with the vehicle and their intention to drop off the car at the dealership and purchase something else. This indicates a high level of dissatisfaction with the dealership and their ability to provide a reliable and satisfactory vehicle.
id: chatcmpl-8ZdKVw2OHpw2dc1HQ6kc5WgbL8u4Z
choices: [Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='