In [None]:
# Relative path of the comments CSV that is loaded with pd.read_csv further down.
DATA_PATH = "../../data/sample/comments.csv"
# Name of the Chroma collection that this notebook creates, fills and queries.
COLLECTION_NAME = "comments_openai"

In [40]:
import os
from dotenv import load_dotenv
import chromadb
from chromadb.config import Settings

# Read the local .env file so the Chroma token is available as an environment variable.
load_dotenv(".env")

CHROMA_SERVER_AUTH_CREDENTIALS = os.environ.get("CHROMA_SERVER_AUTH_CREDENTIALS")

# Token-based client authentication settings for the remote Chroma server.
chroma_settings = Settings(
    chroma_client_auth_provider="chromadb.auth.token_authn.TokenAuthClientProvider",
    chroma_client_auth_credentials=CHROMA_SERVER_AUTH_CREDENTIALS,
)

client = chromadb.HttpClient(host="https://chroma.liara.run", settings=chroma_settings)

In [41]:
import pandas as pd

sample_comments = pd.read_csv(DATA_PATH)

# Group the comment rows by product: product_id -> list of full rows (pandas Series).
rows_by_product = {}
for _, comment_row in sample_comments.iterrows():
    rows_by_product.setdefault(comment_row["product_id"], []).append(comment_row)

# Order the products so that the ones with the most comments come first.
product_comments_map = dict(
    sorted(rows_by_product.items(), key=lambda entry: len(entry[1]), reverse=True)
)

In [42]:
# Drop every product that has 5 comments or fewer; iteration order is preserved.
well_commented_products = {}
for pid, pid_comments in product_comments_map.items():
    if len(pid_comments) > 5:
        well_commented_products[pid] = pid_comments
product_comments_map = well_commented_products

print(f"Number of products with more than 5 comments: {len(product_comments_map)}")

Number of products with more than 5 comments: 131


In [58]:
import os
from dotenv import load_dotenv
from openai import OpenAI

# Read the local .env file so the OpenAI settings are available as environment variables.
load_dotenv(".env")

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL")

# One shared client; the notebook uses it for chat completions and for embeddings.
openai_client = OpenAI(base_url=OPENAI_BASE_URL, api_key=OPENAI_API_KEY)

def get_summary(comments, model="gpt-3.5-turbo"):
    """Summarize a batch of comments in Persian with one chat-completion request.

    Args:
        comments: The comments to summarize. The object is interpolated into the
            prompt through its string form, so a list is sent as its Python repr.
        model: Name of the chat model passed to the completions endpoint.

    Returns:
        The text content of the first returned choice. An empty string is
        returned when there is nothing to summarize or when the response
        carries no text content.
    """
    # Nothing to summarize: skip the request instead of asking the model to
    # summarize an empty list.
    if comments is None or (hasattr(comments, "__len__") and len(comments) == 0):
        return ""

    system_prompt = "You are a helpful assistant. You should not engage in a conversation with the user. Your response should be in persian language."

    prompt = f"""I have a list of comments that need to be summarized. Each comment contains various points and details that are crucial for understanding the overall feedback. The summary should be detailed enough to enable effective semantic search for the most relevant results later on.

                1. The main idea or topic of the comments.
                2. Specific points and details mentioned.
                3. Any notable examples or anecdotes provided.

                Adhere to these guidelines:
                1. Craft a summary that is detailed, thorough, in-depth, and complex, while maintaining clarity and conciseness.
                2. Incorporate main ideas and essential information, eliminating extraneous language and focusing on critical aspects.
                3. Rely strictly on the provided text, without including external information.
                4. Your response should be in persian language. (زبان فارسی)
                5. Your response should be in a single paragraph and contains only the summary of the comments.


                COMMENTS: {comments}"""

    response = openai_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": prompt,
            },
        ],
    )

    # message.content may be None; normalize to "" so callers can always treat
    # the result as a string (e.g. when joining several summaries).
    return response.choices[0].message.content or ""

In [None]:
product_comments_summary_map = {}

import tiktoken

# Largest token count (of the stringified chunk) allowed in one summary request.
# NOTE(review): presumably chosen to leave headroom below the model's context
# window - confirm against the model actually used.
MAX_CHUNK_TOKENS = 15000

encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")


def _split_evenly(items, num_chunks):
    """Split `items` into `num_chunks` contiguous slices of near-equal length."""
    total = len(items)
    return [
        items[i * total // num_chunks : (i + 1) * total // num_chunks]
        for i in range(num_chunks)
    ]


def _chunk_by_token_budget(items, count_tokens, max_tokens):
    """Split `items` into contiguous chunks that each fit a token budget.

    The number of chunks is doubled (capped at one item per chunk) and the
    split is re-checked from scratch until every chunk satisfies
    `count_tokens(chunk) <= max_tokens`.

    Args:
        items: List of comment bodies.
        count_tokens: Callable returning the token count of one chunk.
        max_tokens: Maximum token count allowed per chunk.

    Returns:
        A list of non-empty chunks. If a single item alone exceeds the budget it
        is still returned as its own chunk, because it cannot be split further.
    """
    if not items:
        return []

    num_chunks = 1
    while True:
        chunks = _split_evenly(items, num_chunks)
        if all(count_tokens(chunk) <= max_tokens for chunk in chunks):
            return chunks
        if num_chunks >= len(items):
            # Already one item per chunk; more chunks would only add empty slices.
            return chunks
        num_chunks = min(num_chunks * 2, len(items))


for product_id, product_comments in product_comments_map.items():
    print(f"Remaining products: {len(product_comments_map) - len(product_comments_summary_map)}")
    product_comments_body = [comment["body"] for comment in product_comments]

    # Token counts are measured on str(chunk), the same form that ends up in the prompt.
    chunks = _chunk_by_token_budget(
        product_comments_body,
        count_tokens=lambda chunk: len(encoding.encode(str(chunk))),
        max_tokens=MAX_CHUNK_TOKENS,
    )

    # One summary per chunk, appended immediately so that partial progress is
    # kept if a later request fails.
    for chunk in chunks:
        summary = get_summary(chunk)
        product_comments_summary_map.setdefault(product_id, []).append(summary)

In [None]:
from pprint import pprint
# Show how many products have an entry in the summary map.
pprint(len(product_comments_summary_map))

In [None]:
# get_or_create_collection keeps this cell re-runnable: create_collection raises
# when a collection with this name already exists on the server.
client.get_or_create_collection(name=COLLECTION_NAME)

In [None]:
# Parallel lists that are later upserted into the collection.
documents = []
ids = []
metadatas = []
embeddings = []

# `product_id` is used instead of `id` so the builtin is not shadowed for the
# rest of the kernel session.
for position, (product_id, summaries) in enumerate(product_comments_summary_map.items()):
    print(f"Remaining products: {len(product_comments_summary_map) - position}")

    # Join the per-chunk summaries into one document, ignoring missing or blank
    # summaries so that str.join cannot fail on a None entry.
    aggregated_comments = " ".join(
        summary for summary in summaries if summary and summary.strip()
    )
    if not aggregated_comments:
        # Nothing to embed for this product; an empty string is not a valid
        # input for the embeddings request.
        continue

    embedding_response = openai_client.embeddings.create(
        input=aggregated_comments, model="text-embedding-3-small"
    )
    embedding = embedding_response.data[0].embedding

    ids.append(product_id)
    documents.append(aggregated_comments)
    embeddings.append(embedding)

In [None]:
comments_collection = client.get_collection(name=COLLECTION_NAME)

# Ids are stringified first; the metadata then carries the same string value.
ids = list(map(str, ids))
metadatas = [dict(product_id=doc_id) for doc_id in ids]

# Upsert so that re-running the cell overwrites existing entries with the same id.
comments_collection.upsert(
    ids=ids,
    documents=documents,
    embeddings=embeddings,
    metadatas=metadatas,
)

# Print a small sample of what is now stored in the collection.
stored_sample = comments_collection.peek(limit=5)
print(stored_sample)

In [None]:
query = "محصولات پنبه‌ای و معطر"

# Embed the query with the same model that produced the stored document embeddings.
query_response = openai_client.embeddings.create(
    input=query, model="text-embedding-3-small"
)
query_embedding = query_response.data[0].embedding

# Fetch the 5 nearest stored documents for the query embedding.
result = comments_collection.query(n_results=5, query_embeddings=[query_embedding])

pprint(result)