### Embedding 기초

In [None]:
!pip install openai
!pip install pandas

In [None]:
from openai import OpenAI

# Build the client without an explicit key so the library reads it from the
# OPENAI_API_KEY environment variable. Passing api_key="" would override that
# lookup with an empty key, and keeping the key out of the notebook avoids
# committing a secret by accident.
# If the key is stored under a different environment variable, pass it
# explicitly instead: OpenAI(api_key=os.environ["MY_KEY_NAME"]).
client = OpenAI()

# Smoke test: embed one string and show the resulting vector.
response = client.embeddings.create(
    input="Your text string goes here",
    model="text-embedding-3-small",
)

print(response.data[0].embedding)

In [None]:
import pandas as pd

# Load the review sample, keep only the columns used below, and drop rows
# with any missing value in those columns.
input_datapath = "./data/fine_food_reviews_1k.csv"
kept_columns = ["Time", "ProductId", "UserId", "Score", "Summary", "Text"]
df = pd.read_csv(input_datapath, index_col=0)[kept_columns].dropna()

# Build one text field per review from the stripped summary and body; this is
# the text that gets embedded later.
title_part = "Title: " + df["Summary"].str.strip()
content_part = "; Content: " + df["Text"].str.strip()
df["combined"] = title_part + content_part
df.head(2)

In [None]:
import numpy as np


def get_embedding(text: str, model="text-embedding-3-small", **kwargs) -> list[float]:
    """Return the embedding vector for a single piece of text.

    Args:
        text: The string to embed.
        model: Name of the embedding model passed to the API.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``client.embeddings.create``.

    Returns:
        The embedding of the first (and only) input item.
    """
    # Newlines are flattened to spaces before the request; per the original
    # note, they can negatively affect performance.
    flattened = text.replace("\n", " ")
    return client.embeddings.create(input=[flattened], model=model, **kwargs).data[0].embedding


# Embed only the first 300 reviews (positional slice); rows beyond that get
# NaN through index alignment. The column must be named "embedding" because
# the cells that save, reload and search the data all read `df.embedding`.
df["embedding"] = df.combined[:300].apply(get_embedding)
#df.to_csv("fine_food_reviews_with_embeddings_1k.csv")

In [None]:
# Preview: embed the first 10 combined texts and display the resulting Series.
df.combined[:10].apply(get_embedding)

In [None]:
df.loc[:, ["combined", "embedding"]]

# Export exactly the first 300 rows by position. `.loc[:300]` is a label
# slice with an inclusive end, so it can pick up a row that has no embedding
# (or a different set of rows altogether when the index labels are not
# 0..N-1).
df.iloc[:300].to_csv("./data/fine_food_reviews_with_embeddings_300.csv")

### Embedding Search

In [None]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Computed as the dot product divided by the product of the two Euclidean
    norms. No special handling is done for zero-length vectors.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product

In [None]:
from ast import literal_eval

# The file was written with the DataFrame index as its first column, so read
# that column back as the index (as the original load does) instead of
# getting a stray "Unnamed: 0" column.
df = pd.read_csv("./data/fine_food_reviews_with_embeddings_300.csv", index_col=0)
df = df.drop_duplicates("combined")
# Each embedding comes back from the CSV as text; parse it into a Python
# literal and then convert it to a NumPy array for the similarity math.
df["embedding"] = df.embedding.apply(literal_eval).apply(np.array)
df.loc[:, ["combined", "embedding"]]

In [None]:
# Embed the query once, then score every stored review embedding against it.
# NOTE(review): the query text "delicous" is misspelled; it is kept verbatim
# here because changing it would change the search results.
user_input_embedding = get_embedding("delicous food", model="text-embedding-3-small")
df["similarity"] = df.embedding.apply(
    lambda review_vec: cosine_similarity(review_vec, user_input_embedding)
)

# Take the five highest-scoring reviews and tidy the combined text for display.
top_five = df.sort_values("similarity", ascending=False).head(5)
res = top_five.combined.str.replace("Title: ", "").str.replace("; Content:", ": ")
"\n".join(res.to_list())

In [None]:
def search_reviews(df, user_input, n=3, pprint=True):
    """Return the ``n`` reviews whose embeddings are most similar to a query.

    Args:
        df: DataFrame with a ``combined`` text column and an ``embedding``
            column of vectors.
        user_input: Query text; it is embedded with ``get_embedding``.
        n: Number of top-scoring reviews to return.
        pprint: When true, print each result (first 200 characters) with its
            rank.

    Returns:
        A Series holding the ``combined`` text of the top ``n`` rows, with the
        "Title: " prefix removed and "; Content:" replaced by ": ".

    Side effects:
        Writes (or overwrites) a ``similarity`` column on ``df``.
    """
    user_input_embedding = get_embedding(
        user_input,
        model="text-embedding-3-small",
    )
    df["similarity"] = df.embedding.apply(lambda x: cosine_similarity(x, user_input_embedding))

    results = (
        df.sort_values("similarity", ascending=False)
        .head(n)
        # regex=False: both patterns are literal text, so match them literally
        # regardless of the pandas version's default.
        .combined.str.replace("Title: ", "", regex=False)
        .str.replace("; Content:", ": ", regex=False)
    )
    if pprint:
        for idx, r in enumerate(results):
            print(f"{idx})", r[:200])
    # Return every selected row. The previous `results[:200]` sliced rows, not
    # characters, so it silently capped the output at 200 rows and truncated
    # no text.
    return results

In [None]:
# Example search: print the five reviews closest to an English query.
results = search_reviews(df, user_input="delicious beans", n=5)

In [None]:
# Same search with a Korean query (the text means "delicious beans").
results = search_reviews(df, user_input="맛있는 콩", n=5)

In [None]:
# Example search with a negative-sentiment query.
results = search_reviews(df, user_input="bad taste", n=5)

In [None]:
def get_gpt_response(user_mesage, search_result, model="gpt-3.5-turbo"):
    """Ask the chat model to answer a question using retrieved review text.

    Args:
        user_mesage: The user's question. The misspelled parameter name is
            kept so that existing keyword callers keep working.
        search_result: Review text to give the model as context.
        model: Chat model name; defaults to the model used originally.

    Returns:
        The message content of the first completion choice.
    """
    # Use the argument itself. The body previously read `user_message`, which
    # is not a parameter, so it only worked when a global of that name
    # happened to exist.
    user_message = user_mesage
    full_message = f"[Related User Review Data]\n{search_result}\n\n{user_message}"

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "유저의 질문과 관련 있는 음식점 리뷰 데이터가 제공될 예정이야. 리뷰 데이터를 활용해서 데이터 분석을 도와줘."},
            {"role": "user", "content": full_message},
        ],
        # temperature=0 is kept from the original call.
        temperature=0,
    )

    return completion.choices[0].message.content


# Interactive loop: read a question, retrieve the five most similar reviews,
# show them, then print the chat model's answer. Typing "break" ends the loop.
while (user_message := input(">>> ")) != "break":
    search_result_df = search_reviews(df, user_message, n=5, pprint=False)

    # Cut each review to its first 200 characters and mark the cut with "...".
    snippets = search_result_df.str[:200] + "..."
    search_result = "\n".join(snippets.to_list())
    print(f"[Related Review Data]\n{search_result}")
    print("--------\n")

    output = get_gpt_response(user_message, search_result)
    print(output)
    print("========\n")