In [None]:
import pathlib
import polars as pl
import chromadb
import os
import json
from openai import OpenAI
from chromadb.utils import embedding_functions
from more_itertools import batched

In [None]:
def prepare_car_reviews_data(data_path: pathlib.Path, vehicle_years: list[int] = [2017]):
    """Load car reviews from CSV and shape them into ChromaDB inputs.

    Parameters
    ----------
    data_path:
        Location handed straight to ``pl.scan_csv``.
    vehicle_years:
        Only reviews whose ``Vehicle_Year`` is in this list are kept.
        The default list is only read here, never mutated.

    Returns
    -------
    dict
        ``{"ids": [...], "documents": [...], "metadatas": [...]}`` where
        ``ids`` are ``"review0"``, ``"review1"``, ... in sorted row order,
        ``documents`` holds the ``Review`` text and ``metadatas`` holds the
        remaining selected columns as one dict per row.
    """

    # Explicit column types so the CSV scan does not have to infer them.
    # The "" key is presumably the CSV's unnamed index column - TODO confirm.
    column_types = {
        "": pl.Int64,
        "Review_Date": pl.Utf8,
        "Author_Name": pl.Utf8,
        "Vehicle_Title": pl.Utf8,
        "Review_Title": pl.Utf8,
        "Review": pl.Utf8,
        "Rating": pl.Float64,
    }

    # Both derived columns come from the space-separated Vehicle_Title tokens:
    # the first token is cast to an integer year, the second is kept as text.
    title_tokens = pl.col("Vehicle_Title").str.split(by=" ")
    year_expr = title_tokens.list.get(0).cast(pl.Int64).alias("Vehicle_Year")
    model_expr = title_tokens.list.get(1).alias("Vehicle_Model")

    # Build the lazy query one step at a time, then materialise it once.
    lazy_reviews = pl.scan_csv(data_path, dtypes=column_types)
    lazy_reviews = lazy_reviews.with_columns([year_expr, model_expr])
    lazy_reviews = lazy_reviews.filter(pl.col("Vehicle_Year").is_in(vehicle_years))
    lazy_reviews = lazy_reviews.select(
        ["Review_Title", "Review", "Rating", "Vehicle_Year", "Vehicle_Model"]
    )
    lazy_reviews = lazy_reviews.sort(["Vehicle_Model", "Rating"])
    reviews = lazy_reviews.collect()

    # ids / documents / metadatas are parallel lists in the same row order
    review_count = reviews.shape[0]
    return {
        "ids": [f"review{i}" for i in range(review_count)],
        "documents": reviews["Review"].to_list(),
        "metadatas": reviews.drop("Review").to_dicts(),
    }

In [None]:
def build_chroma_collection(
    chroma_path: pathlib.Path,
    collection_name: str,
    embedding_func_name: str,
    ids: list[str],
    documents: list[str],
    metadatas: list[dict],
    distance_func_name: str = "cosine",
    batch_size: int = 166,
):
    """Create a ChromaDB collection and add every document to it in batches.

    Parameters
    ----------
    chroma_path:
        Directory passed to ``chromadb.PersistentClient``.
    collection_name:
        Name of the collection to create.
    embedding_func_name:
        Model name passed to ``SentenceTransformerEmbeddingFunction``.
    ids, documents, metadatas:
        Parallel lists; element ``i`` of each describes the same record.
    distance_func_name:
        Stored as the collection's ``"hnsw:space"`` metadata value.
    batch_size:
        Number of records sent per ``collection.add`` call. 166 is the value
        that was previously hard-coded (presumably chosen to stay below a
        ChromaDB per-call limit - TODO confirm).

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1 or the three input lists do not
        have the same length.
    """

    # Validate before touching the database so a bad call leaves no
    # half-created collection behind.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if not (len(ids) == len(documents) == len(metadatas)):
        raise ValueError(
            "ids, documents and metadatas must have the same length, got "
            f"{len(ids)}, {len(documents)} and {len(metadatas)}"
        )

    chroma_client = chromadb.PersistentClient(chroma_path)

    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=embedding_func_name
    )

    collection = chroma_client.create_collection(
        name=collection_name,
        embedding_function=embedding_func,
        metadata={"hnsw:space": distance_func_name},
    )

    # Step through the records in fixed-size windows. The slice end is
    # exclusive, so it must be start + batch_size; using the index of the
    # batch's last element as the end bound (as this loop used to) silently
    # drops one record from every batch.
    for start_idx in range(0, len(documents), batch_size):
        end_idx = start_idx + batch_size

        collection.add(
            ids=ids[start_idx:end_idx],
            documents=documents[start_idx:end_idx],
            metadatas=metadatas[start_idx:end_idx],
        )

In [None]:
# # from car_data_etl import prepare_car_reviews_data
# DATA_PATH = "/Users/davidkolb/Documents/Code/kolbeuk-data/vectordb/archive/*"
# chroma_car_reviews_dict = prepare_car_reviews_data(DATA_PATH)
# chroma_car_reviews_dict.keys()
# chroma_car_reviews_dict["ids"][-10]
# print(chroma_car_reviews_dict["documents"][-10])
# chroma_car_reviews_dict["metadatas"][-10]

In [None]:
# Data and database locations. The defaults are the original author's local
# paths; set CAR_REVIEWS_DATA_PATH / CAR_REVIEWS_CHROMA_PATH in the environment
# to run this notebook on another machine without editing the cell.
DATA_PATH = os.environ.get(
    "CAR_REVIEWS_DATA_PATH",
    "/Users/davidkolb/Documents/Code/kolbeuk-data/vectordb/archive/*",
)
CHROMA_PATH = os.environ.get(
    "CAR_REVIEWS_CHROMA_PATH",
    "/Users/davidkolb/Documents/Code/kolbeuk-data/vectordb/car_review_embeddings",
)
EMBEDDING_FUNC_NAME = "multi-qa-MiniLM-L6-cos-v1"
COLLECTION_NAME = "car_reviews"

# Load and filter the reviews (default vehicle_years), producing the
# ids / documents / metadatas lists.
chroma_car_reviews_dict = prepare_car_reviews_data(DATA_PATH)

# Create the persistent collection and embed every review into it.
# NOTE: build_chroma_collection calls create_collection, so re-running this
# cell against an existing database may fail - verify before re-executing.
build_chroma_collection(
    chroma_path=CHROMA_PATH,
    collection_name=COLLECTION_NAME,
    embedding_func_name=EMBEDDING_FUNC_NAME,
    ids=chroma_car_reviews_dict["ids"],
    documents=chroma_car_reviews_dict["documents"],
    metadatas=chroma_car_reviews_dict["metadatas"],
)

In [None]:
# Re-open the persisted collection for querying, using the same embedding
# model name that was used when the collection was built.
# NOTE(review): `client` is rebound to an OpenAI client in a later cell, so
# after that point this name no longer refers to the Chroma client.
client = chromadb.PersistentClient(CHROMA_PATH)
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=EMBEDDING_FUNC_NAME
    )
collection = client.get_collection(name=COLLECTION_NAME, embedding_function=embedding_func)

In [None]:
# Query the collection with a single free-text prompt and ask for five
# results, returning the review text, the distances and the metadata.
great_reviews = collection.query(
    query_texts=["Find me some positive reviews that discuss the car's performance"],
    n_results=5,
    include=["documents", "distances", "metadatas"]
)

# Display the first returned document for the first (and only) query text
# (presumably the closest match - confirm result ordering in the Chroma docs).
great_reviews["documents"][0][0]

In [None]:
# Quick look at the collection's stored records (peek() with its defaults).
collection.peek()

In [62]:
def print_review_summaries(review_summaries):
    """Print a chat completion's first answer followed by its raw attributes.

    Parameters
    ----------
    review_summaries:
        Object exposing ``choices[0].message.content`` and an instance
        ``__dict__`` (the notebook passes chat completion responses).

    The message content is printed first; then every instance attribute is
    printed on its own line as ``name: value``. Returns ``None``.
    """
    # The answer text lives on the first choice's message
    first_choice = review_summaries.choices[0]
    print(first_choice.message.content)

    # Dump every instance attribute for inspection
    for field_name, field_value in vars(review_summaries).items():
        print(f"{field_name}: {field_value}")

In [56]:
# Baseline: ask the chat model the question with no car reviews supplied.
# No API key is passed here, so the client has to find its credentials on its
# own (e.g. from the environment - confirm for your setup).
# NOTE(review): this rebinds `client`, which an earlier cell used for the
# Chroma client.
client = OpenAI()

# System prompt (the persona) and the user question
context = "You are a customer success employee at a large car dealership."
question = "What's the key to great customer satisfaction?"

# temperature=0 is presumably meant to keep the answer as repeatable as
# possible - see the OpenAI API docs for exact semantics.
completion = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": context},
        {"role": "user", "content": question},
    ],
    temperature=0
)

ChatCompletionMessage(content='The key to great customer satisfaction is providing exceptional customer service. This involves understanding and anticipating the needs of customers, being responsive and attentive to their inquiries and concerns, and going above and beyond to exceed their expectations. Building strong relationships with customers, being knowledgeable about the products and services offered, and consistently delivering a positive and personalized experience are also crucial. Additionally, actively seeking feedback and continuously improving based on customer input is essential for maintaining high levels of customer satisfaction.', role='assistant', function_call=None, tool_calls=None)


In [63]:
# Print the baseline answer, then the attributes of the raw response object
print_review_summaries(completion)

The key to great customer satisfaction is providing exceptional customer service. This involves understanding and anticipating the needs of customers, being responsive and attentive to their inquiries and concerns, and going above and beyond to exceed their expectations. Building strong relationships with customers, being knowledgeable about the products and services offered, and consistently delivering a positive and personalized experience are also crucial. Additionally, actively seeking feedback and continuously improving based on customer input is essential for maintaining high levels of customer satisfaction.
id: chatcmpl-8YWlqKkfFQqTiouspnnaxmVLEyBKV
choices: [Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='The key to great customer satisfaction is providing exceptional customer service. This involves understanding and anticipating the needs of customers, being responsive and attentive to their inquiries and concerns, and going above an

In [None]:
# Set up both clients used by the cells below: `client` for OpenAI chat
# completions and `chroma_client` / `collection` for review retrieval.
# This repeats setup done in earlier cells (presumably so this part of the
# notebook can be run on its own once the constants are defined).
client = OpenAI()

chroma_client = chromadb.PersistentClient(CHROMA_PATH)
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBEDDING_FUNC_NAME
)

collection = chroma_client.get_collection(
    name=COLLECTION_NAME, embedding_function=embedding_func
)

In [58]:
# System-prompt template: the `{}` placeholder is filled with the retrieved
# reviews via context.format(...) when the chat request is built.
context = """ You are a customer success employee at a large car dealership. Use the following car reviews to answer questions: {} """

question = """What's the key to great customer satisfaction based on detailed positive reviews?"""

# Retrieve ten reviews for the question, restricted to records whose
# "Rating" metadata is >= 3. Only the review text is requested.
good_reviews = collection.query(
    query_texts=[question],
    n_results=10,
    include=["documents"],
    where={"Rating": {"$gte": 3}},
)
# Results for the first (only) query text, joined into one comma-separated string
reviews_str = ",".join(good_reviews["documents"][0])

In [60]:
# Ask the chat model the same question, this time with the retrieved reviews
# injected into the system prompt. No temperature is set here, unlike the
# baseline request.
good_review_summaries = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": context.format(reviews_str)},
        {"role": "user", "content": question},
    ]
)

ChatCompletionMessage(content='Based on the detailed positive reviews, the key to great customer satisfaction is a combination of factors, including:\n\n1. Value for money: Customers mention that the cars offer great value and are a good deal. This suggests that customers feel they are getting their money\'s worth and are happy with the overall pricing.\n\n2. Reliability: The cars are praised for their reliability, indicating that customers value a vehicle that they can depend on and trust.\n\n3. Quality and comfort: Customers appreciate the excellent quality, technology, and comfort provided by the cars. This suggests that the vehicles are built to a high standard and offer a comfortable driving experience.\n\n4. Performance: Customers mention that they love the performance of the cars, highlighting smooth acceleration and excellent features.\n\n5. Low maintenance costs: The mention of low maintenance costs is seen as a positive factor. Customers appreciate a car that doesn\'t require

In [61]:
# Print the review-grounded answer, then the attributes of the raw response
print_review_summaries(good_review_summaries)

Based on the detailed positive reviews, the key to great customer satisfaction is a combination of factors, including:

1. Value for money: Customers mention that the cars offer great value and are a good deal. This suggests that customers feel they are getting their money's worth and are happy with the overall pricing.

2. Reliability: The cars are praised for their reliability, indicating that customers value a vehicle that they can depend on and trust.

3. Quality and comfort: Customers appreciate the excellent quality, technology, and comfort provided by the cars. This suggests that the vehicles are built to a high standard and offer a comfortable driving experience.

4. Performance: Customers mention that they love the performance of the cars, highlighting smooth acceleration and excellent features.

5. Low maintenance costs: The mention of low maintenance costs is seen as a positive factor. Customers appreciate a car that doesn't require frequent and expensive repairs.

6. Positi

In [64]:
# Rebinds `question`; `context` is the template defined in an earlier cell.
question = """Which of these poor reviews has the worst implications about our dealership? Explain why."""

# Retrieve five reviews whose "Rating" metadata is <= 3.
# NOTE(review): the positive-review query uses >= 3, so a rating of exactly 3
# matches both filters - confirm that overlap is intended.
poor_reviews = collection.query(
    query_texts=[question],
    n_results=5,
    include=["documents"],
    where={"Rating": {"$lte": 3}},
)

# Overwrites the reviews_str built for the positive reviews
reviews_str = ",".join(poor_reviews["documents"][0])

# Send the poor reviews (in the system prompt) together with the question
poor_review_analysis = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": context.format(reviews_str)},
        {"role": "user", "content": question},
    ]
)

In [65]:
# Print the poor-review analysis, then the attributes of the raw response
print_review_summaries(poor_review_analysis)

The first review has the worst implications about the dealership. This is because the customer mentions several unresolved electrical issues with the car, including problems with Bluetooth, backup camera, trunk, black screen, and seatbelts. These issues directly affect the functionality and safety of the vehicle. Additionally, the customer expresses frustration at the dealership's inability to resolve these issues despite multiple visits. This suggests a lack of expertise or responsiveness on the dealership's part, which reflects poorly on their customer service and ability to address customer concerns.
id: chatcmpl-8YWsLHf5gzMekeVz8sqF8ePDYkUJR
choices: [Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="The first review has the worst implications about the dealership. This is because the customer mentions several unresolved electrical issues with the car, including problems with Bluetooth, backup camera, trunk, black screen, and seatbelts. The