### Embeddings (임베딩)

In [8]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# Presumably this is where the OpenAI API key used by the client below comes from; confirm.
load_dotenv()

True

In [9]:
from openai import OpenAI
import pandas as pd

# Shared API client, reused by the later cells of this notebook.
# Presumably it picks up its API key from the environment; confirm.
client = OpenAI()

# Embed a single sample sentence to see what the API returns.
text = "내가 오늘 점심을..."
response = client.embeddings.create(input=[text], model="text-embedding-3-small")

# One input text was sent, so the first result is the only one.
sample_embedding = response.data[0].embedding

# Length of the returned vector.
print(len(sample_embedding))

# Peek at the first few components of the vector.
pd.Series(sample_embedding).head()

1536


0    0.034861
1    0.013542
2   -0.054741
3   -0.014818
4    0.007424
dtype: float64

In [10]:
# Load the food review dataset; the relative path is resolved against the kernel's working directory.
# NOTE(review): the displayed frame contains "Unnamed: 0.1" and "Unnamed: 0" columns, which look like
# index columns written by earlier saves. Confirm whether they should be dropped.
df = pd.read_csv("fine_food_reviews_1k.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Time,ProductId,UserId,Score,Summary,Text
0,0,1351123200,B003XPF9BO,A3R7JR3FMEBXQB,5,where does one start...and stop... with a tre...,Wanted to save some to bring to my Chicago fam...
1,1,1351123200,B003JK537S,A3JBPC3WFUT5ZP,1,Arrived in pieces,"Not pleased at all. When I opened the box, mos..."
2,2,1351123200,B000JMBE7M,AQX1N6A51QOKG,4,"It isn't blanc mange, but isn't bad . . .",I'm not sure that custard is really custard wi...
3,3,1351123200,B004AHGBX4,A2UY46X0OSNVUQ,3,These also have SALT and it's not sea salt.,I like the fact that you can see what you're g...
4,4,1351123200,B001BORBHO,A1AFOYZ9HSM2CZ,5,Happy with the product,My dog was suffering with itchy skin. He had ...


In [11]:
import tiktoken

# Look up the tokenizer that tiktoken associates with the "gpt-5-nano" model.
# NOTE(review): the embeddings in this notebook are requested from "text-embedding-3-small";
# if these counts are meant to check that model's input limit, confirm both use the same tokenizer.
gpt5nano_encoding = tiktoken.encoding_for_model("gpt-5-nano")

# Count the tokens in each review text and keep the result as a new column.
review_token_counts = df['Text'].map(
    lambda review: len(gpt5nano_encoding.encode(review))
)
df['n_tokens'] = review_token_counts

df['n_tokens'].describe()

count    1000.000000
mean       83.818000
std        71.905308
min        22.000000
25%        38.000000
50%        59.000000
75%       104.000000
max       614.000000
Name: n_tokens, dtype: float64

In [12]:
# Embed the full dataset
def texts_to_embedding(texts, model='text-embedding-3-small', batch_size=2048):
    """Convert texts into embedding vectors using the OpenAI embeddings API.

    Args:
        texts: Iterable of strings to embed.
        model: Name of the embedding model passed to the API.
        batch_size: Maximum number of texts sent in one request. The
            embeddings endpoint limits the length of the input array
            (2048 items at the time of writing), so longer inputs are
            split into several requests.

    Returns:
        A list with one embedding vector per input text, in input order.
        An empty input yields an empty list without calling the API.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Replacing newlines with spaces gives slightly better embedding quality.
    cleaned = [text.replace('\n', ' ') for text in texts]

    embeddings = []
    # With at most `batch_size` texts this is a single request, exactly as before.
    for start in range(0, len(cleaned), batch_size):
        response = client.embeddings.create(
            model=model,
            input=cleaned[start:start + batch_size]
        )
        # Keep only the vectors from the response.
        embeddings.extend(data.embedding for data in response.data)

    return embeddings

# Embed every review text with one call to texts_to_embedding and store the vectors in a new column.
df['embedding'] = texts_to_embedding(df['Text'].tolist())
df['embedding'].head()

0    [0.01677853614091873, -0.008555943146348, -0.0...
1    [-0.005216312129050493, 0.040469057857990265, ...
2    [0.005564768798649311, -0.012970144860446453, ...
3    [-0.016292475163936615, 0.008886804804205894, ...
4    [-0.004322985652834177, -0.06378211826086044, ...
Name: embedding, dtype: object

In [3]:
%pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.8.0-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.10.0 (from scikit-learn)
  Using cached scipy-1.16.3-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.3.0 (from scikit-learn)
  Using cached joblib-1.5.3-py3-none-any.whl.metadata (5.5 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.8.0-cp312-cp312-win_amd64.whl (8.0 MB)
Using cached joblib-1.5.3-py3-none-any.whl (309 kB)
Using cached scipy-1.16.3-cp312-cp312-win_amd64.whl (38.6 MB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn

   ---------- ----------------------------- 1/4 [scipy]
   ---------- ----------------------------- 1/4 [scipy]
   ---------- ----------------------------- 1/4 [scipy]
   ---------- ----------------------------- 1/4 [scipy]
   ---------- --

In [13]:
# 의미 기반 검색 구현
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def get_similar_texts(query_text, df, top_k=7):
    """Return the rows of ``df`` whose embeddings are most similar to a query.

    Args:
        query_text: Search query; it is embedded with ``texts_to_embedding``.
        df: DataFrame with a 'Text' column and an 'embedding' column that
            holds one vector per row.
        top_k: Number of best-matching rows to return.

    Returns:
        A new DataFrame with the columns 'Text' and 'cos_sim', sorted by
        cosine similarity in descending order and limited to ``top_k`` rows.
        The input ``df`` is left unmodified.
    """
    # Nothing to rank: return an empty result with the expected columns
    # instead of spending an API call and failing inside cosine_similarity.
    if df.empty:
        return df[['Text']].assign(cos_sim=np.empty(0))

    # The query has to be turned into a vector as well.
    query_vector = texts_to_embedding([query_text])[0]

    # Stack the per-row vectors into one numpy array for the similarity computation.
    embeddings = np.array(df['embedding'].tolist())

    # cosine_similarity expects 2-D inputs, so the query vector is wrapped in a list;
    # row 0 of the result holds the similarity of the query to every row of df.
    cos_sim = cosine_similarity([query_vector], embeddings)[0]

    # Attach the scores to a new frame so repeated searches never alter the caller's df.
    scored = df[['Text']].assign(cos_sim=cos_sim)

    return scored.sort_values(by='cos_sim', ascending=False).head(top_k)


In [17]:
# Search test: look up the reviews most similar to the query "milk" (default top_k is used).
search_result = get_similar_texts("milk", df)
search_result


Unnamed: 0,Text,cos_sim
442,Nifty hot chocolate discs added to your warm m...,0.404077
463,I purchased this in a local health foods store...,0.344565
252,This is the best tea i have had yet. It remind...,0.34273
602,Hands down one of the best milk chocolate bars...,0.336708
327,"This is a high quality Ghee , superior tasting...",0.329324
320,This matcha is smooth and creamy-and smells di...,0.321098
405,This matcha is smooth and creamy-and smells di...,0.321098
