In [17]:
# Raw product CSV; read by the normalization cell.
DATA_PATH = "../../data/books-sample/products.csv"
# CSV holding the products plus the normalized title column; read by the embedding cell.
NORMALIZED_DATA_PATH = "../../data/books-sample/normalized_products.csv"
# Name of the Chroma collection that is created and later upserted into.
COLLECTION_NAME = "products_books_openai"

In [5]:
import os
from dotenv import load_dotenv
import chromadb
from chromadb.config import Settings

# Pull the variables defined in the local .env file into the process environment.
load_dotenv(".env")

# Token presented to the Chroma server; None when the variable is not set.
CHROMA_SERVER_AUTH_CREDENTIALS = os.environ.get("CHROMA_SERVER_AUTH_CREDENTIALS")

# Client-side settings selecting the token auth provider and its credentials.
chroma_settings = Settings(
    chroma_client_auth_provider="chromadb.auth.token_authn.TokenAuthClientProvider",
    chroma_client_auth_credentials=CHROMA_SERVER_AUTH_CREDENTIALS,
)

# HTTP client for the remote Chroma instance, used by the collection cells below.
client = chromadb.HttpClient(host="https://chroma.liara.run", settings=chroma_settings)

In [1]:
# NOTE(review): wildcard import hides where names come from. `Normalizer` below is
# presumably provided by hazm; prefer `from hazm import Normalizer` once it is
# confirmed that no other cell relies on names pulled in by the star import.
from hazm import *

# Shared normalizer instance; its `normalize` method is applied to the `title_fa` column.
normalizer = Normalizer()

In [14]:
import pandas as pd

# Load the raw catalogue, keep one row per product id, and add a
# `normalized_title_fa` column produced by the hazm `normalizer`.
# Built as a single chain so re-running the cell always starts from the raw file.
sample_products = (
    pd.read_csv(DATA_PATH)
    .drop_duplicates(subset=["id"])
    .assign(
        normalized_title_fa=lambda frame: frame["title_fa"].apply(normalizer.normalize)
    )
)

# Write to the shared constant instead of a second hard-coded copy of the path,
# so this cell and the cell that reads NORMALIZED_DATA_PATH cannot drift apart.
sample_products.to_csv(NORMALIZED_DATA_PATH, index=False)

In [10]:
import os
from dotenv import load_dotenv
from openai import OpenAI

# Make the variables from the local .env file visible in the environment.
load_dotenv(".env")

# API key and base URL handed to the OpenAI client; either is None when unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL")

# Client used by the embedding cell to request title embeddings.
openai_client = OpenAI(
    base_url=OPENAI_BASE_URL,
    api_key=OPENAI_API_KEY,
)

In [7]:
# get_or_create_collection keeps this cell re-runnable: create_collection raises
# when a collection with this name already exists on the server.
client.get_or_create_collection(name=COLLECTION_NAME)

Collection(name=products_books_openai)

In [None]:
import pandas as pd

# Embedding model and number of titles sent per embeddings request.
# Batching replaces one HTTP round-trip per row with one per EMBEDDING_BATCH_SIZE rows.
EMBEDDING_MODEL = "text-embedding-3-small"
EMBEDDING_BATCH_SIZE = 100

sample_products = pd.read_csv(NORMALIZED_DATA_PATH)

# Column-wise extraction instead of iterrows(): ids become strings, the normalized
# title is the stored document, and the full row is kept as metadata.
ids = sample_products["id"].astype(str).tolist()
documents = sample_products["normalized_title_fa"].tolist()
metadatas = sample_products.to_dict(orient="records")
embeddings = []

for start in range(0, len(documents), EMBEDDING_BATCH_SIZE):
    batch = documents[start : start + EMBEDDING_BATCH_SIZE]
    response = openai_client.embeddings.create(input=batch, model=EMBEDDING_MODEL)
    # Each returned item carries the position of its input in the request; sort on
    # it so embeddings stay aligned with `documents` regardless of response order.
    ordered_items = sorted(response.data, key=lambda item: item.index)
    embeddings.extend(item.embedding for item in ordered_items)
    # One progress line per batch rather than one line per product.
    print(f"embedded {len(embeddings)}/{len(documents)} titles")

# Guard against a partially completed run leaving the four lists misaligned.
assert len(ids) == len(documents) == len(metadatas) == len(embeddings)

In [None]:
# Number of records sent per upsert request. Chunking keeps each request payload
# bounded and under the server's per-request batch limit for larger catalogues.
UPSERT_BATCH_SIZE = 500

product_collection = client.get_collection(name=COLLECTION_NAME)

# The four lists are sliced in lockstep, so they must be the same length.
assert len(ids) == len(documents) == len(metadatas) == len(embeddings)

# With empty lists the loop body never runs, so no empty upsert is sent.
for start in range(0, len(ids), UPSERT_BATCH_SIZE):
    stop = start + UPSERT_BATCH_SIZE
    product_collection.upsert(
        documents=documents[start:stop],
        metadatas=metadatas[start:stop],
        ids=ids[start:stop],
        embeddings=embeddings[start:stop],
    )

# Show a few stored records as a quick sanity check.
print(product_collection.peek(limit=5))