In [None]:
from typing import List, Dict, Union
from embeddings_util import EmbeddingsUtil

import pandas as pd
import tiktoken

embedding_encoding = "cl100k_base"
max_tokens = 8000

embedding_client = EmbeddingsUtil(
    api_key = "",
    verbose = True,  
    embedding_model = "text-embedding-3-small",
    chunk_max_tokens = 8000, 
    )


In [None]:
# load & inspect dataset
input_datapath = "data_test1.csv"  # to save space, we provide a pre-filtered dataset
df = pd.read_csv(input_datapath, index_col=0)
df = df[["Time", "ProductId", "UserId", "Score", "Summary", "Text"]]
df = df.dropna()
df["combined"] = (
    "Title: " + df.Summary.str.strip() + "; Content: " + df.Text.str.strip()
)
df.head(2)

In [None]:
# subsample to 1k most recent reviews and remove samples that are too long

top_n = 1000
df = df.sort_values("Time").tail(top_n * 2)  # first cut to first 2k entries, assuming less than half will be filtered out
df.drop("Time", axis=1, inplace=True)

encoding = tiktoken.get_encoding(embedding_encoding)

# omit reviews that are too long to embed
df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
df = df[df.n_tokens <= max_tokens].tail(top_n)
len(df)

In [None]:
df.to_csv("data_test1.csv")

In [None]:
def generate_embedding_for_text(self,text: str) -> List[float]:
    """
    Generates an embedding for a given text using the OpenAI API.

    Args:
        text (str): The text to generate an embedding for.

    Returns:
        List[float]: The generated embedding as a list of floats.
    """
    try:
        embedding = self.openai_call(text, "/v1/embeddings", self.embedding_model)
        return embedding
    except Exception as e:
        print(f"Error generating embedding: {e}")
        return []

In [None]:

text_to_embed = "This is an example text for embedding."

embedding_result = generate_embedding_for_text(text_to_embed)

In [34]:
from typing import List

class EmbeddingGenerator:
    def __init__(self, embedding_model: str, api_key: str):
        self.embedding_model = embedding_model
        self.api_key = api_key
        # Initialize any other necessary properties or dependencies

    def openai_call(self, text: str, endpoint: str, model: str) -> List[float]:
        # Implementation for making the OpenAI API call goes here
        # This function should return the embedding as a list of floats
        pass

    def generate_embedding_for_text(self, text: str) -> List[float]:
        try:
            embedding = self.openai_call(text, "/v1/embeddings", self.embedding_model)
            return embedding
        except Exception as e:
            print(f"Error generating embedding: {e}")
            return []

# Create an instance of the EmbeddingGenerator class
embedding_generator = EmbeddingGenerator(embedding_model="text-embedding-3-small", api_key="")

# Call the generate_embedding_for_text method
text = "This is an example text for generating embeddings."
embedding = embedding_generator.generate_embedding_for_text(text)

if embedding:
    print(f"Embedding for '{text}': {embedding}")
else:
    print("Failed to generate embedding.")

Failed to generate embedding.
