In [15]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
from openai import OpenAI
openai_client = OpenAI()

In [15]:
import numpy as np

def cosine_similarity(vec1, vec2):
    """
    두 벡터(vec1, vec2) 사이의 코사인 유사도를 계산하는 함수입니다.
    코사인 유사도는 두 벡터가 이루는 각도의 코사인 값을 의미하며,
    값의 범위는 -1(완전히 반대) ~ 1(완전히 동일) 사이입니다.
    1에 가까울수록 두 벡터의 방향이 비슷하다는 뜻입니다.
    주로 텍스트 임베딩 등에서 두 문장의 유사도를 비교할 때 사용합니다.
    """

    # 두 벡터의 내적(dot product)을 계산합니다.
    # 내적은 두 벡터의 각 성분을 곱한 뒤 모두 더한 값입니다.
    dot_product = np.dot(vec1, vec2)

    # 각 벡터의 크기(노름, norm)를 계산합니다.
    # 벡터의 크기는 각 성분의 제곱을 모두 더한 뒤, 제곱근을 취한 값입니다.
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)

    # 만약 두 벡터 중 하나라도 크기가 0이면(즉, 영벡터라면)
    # 코사인 유사도를 정의할 수 없으므로 0.0을 반환합니다.
    if norm_vec1 == 0 or norm_vec2 == 0:
        return 0.0

    # 코사인 유사도는 내적을 두 벡터의 크기의 곱으로 나눈 값입니다.
    # 즉, 두 벡터가 이루는 각도의 코사인 값을 의미합니다.
    return dot_product / (norm_vec1 * norm_vec2)


In [None]:
king_enbedding_response = openai_client.embeddings.create(
    input="king",
    model="text-embedding-3-large"
)

In [8]:
king_vector = king_enbedding_response.data[0].embedding
print(king_vector)
print(np.array(king_vector))

[0.01040416955947876, 0.024995191022753716, -0.0014775966992601752, 0.0033329545985907316, 0.0006571469129994512, 0.02008429728448391, 0.013937810435891151, 0.010349079966545105, -0.013662359677255154, 0.04429248720407486, 0.002467252081260085, -0.004214397165924311, -0.059245530515909195, -0.056569721549749374, 0.0314643494784832, 0.006961035076528788, -0.06522674858570099, -0.004119956865906715, -0.05521607771515846, -0.00014768588880542666, 0.01193095464259386, -0.05143846943974495, -0.0010417941957712173, 0.004533133003860712, 0.029449624940752983, -0.019438955932855606, -0.007405691314488649, 0.0033604996278882027, -0.015244233421981335, 0.008609804324805737, 0.006846919655799866, 0.05952885001897812, 0.013882719911634922, 0.038311269134283066, -0.011852254159748554, 0.0010801606113091111, 0.01693628914654255, -0.00979817844927311, -0.011364312842488289, -0.02227216400206089, -0.015771524980664253, -0.04136483743786812, -0.06862659752368927, 0.010553699918091297, 0.017487190663814

In [9]:
queen_enbedding_response = openai_client.embeddings.create(
    input="queen",
    model="text-embedding-3-large"
)

In [10]:
queen_vector = queen_enbedding_response.data[0].embedding
print(queen_vector)
print(np.array(queen_vector))

[-0.013857284560799599, 0.0008810127037577331, -0.01679670810699463, 0.00881827250123024, 0.00604174705222249, -0.002595525933429599, 0.001059749280102551, -0.0058281682431697845, 0.000932144932448864, 0.010830981656908989, -0.017404865473508835, 0.012655450962483883, -0.032087504863739014, -0.0026570656336843967, 0.006859862711280584, 0.009679827839136124, -0.004629954695701599, -0.011402938514947891, -0.02965487912297249, 0.019345175474882126, -0.00236565712839365, -0.02977071888744831, 0.028076566755771637, -0.016680870205163956, 0.017115266993641853, -0.010570342652499676, 0.0031349030323326588, -0.024731704965233803, -0.025658421218395233, 0.02720777317881584, 0.014726080000400543, 0.051201000809669495, -0.015956873074173927, 0.05024532601237297, -0.04436647891998291, 0.008883431553840637, 0.035041410475969315, -0.00455031543970108, 0.03266670182347298, -0.019634773954749107, 0.002611815929412842, -0.027251211926341057, -0.052388355135917664, 0.009839106351137161, 0.00609604688361

In [11]:
king_queen_similarity = cosine_similarity(king_vector, queen_vector)

In [None]:
print(king_queen_similarity) #king && queen 유사도 using OPENAI EMBEDDING-3-LARGE

0.5551190172693515


In [None]:
korean_king_embedding_response = openai_client.embeddings.create(
    input="왕",
    model="text-embedding-3-large"
)

korean_king_vector = korean_king_embedding_response.data[0].embedding

king_korean_king_similarity = cosine_similarity(king_vector, korean_king_vector)

print(king_korean_king_similarity) #king && 왕 유사도 using OPENAI EMBEDDING-3-LARGE

0.547487391214023


In [21]:
import os

upstage_client = OpenAI(
    api_key=os.getenv("UPSTAGE_API_KEY"),
    base_url="https://api.upstage.ai/v1"
)

In [31]:
king_enbedding_response = upstage_client.embeddings.create(
    input="king",
    model="solar-embedding-1-large-query"
)

upstage_king_vector = np.array(king_enbedding_response.data[0].embedding)

In [32]:
print(upstage_king_vector)

[-0.01187134 -0.02062988 -0.00674057 ... -0.01081848  0.00247955
  0.01520538]


In [30]:
korean_king_enbedding_response = upstage_client.embeddings.create(
    input="왕",
    model="solar-embedding-1-large-query"
)

upstage_korean_king_vector = np.array(korean_king_enbedding_response.data[0].embedding)

In [33]:
print(upstage_korean_king_vector) 

[-0.01213837 -0.0224762  -0.01316071 ... -0.00024557  0.00360489
  0.01416779]


In [35]:
upstage_king_korean_king_similarity = cosine_similarity(upstage_king_vector, upstage_korean_king_vector)

In [None]:
print(upstage_king_korean_king_similarity) #king && 왕 유사도 using UPSTAGE EMBEDDING-1-LARGE-QUERY

0.8523202418742798


-번외(로컬 임베딩모델 with huggingface)

In [28]:
from sentence_transformers import SentenceTransformer
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# model_name = 'jhgan/ko-sroberta-multitask'
model_name = 'BAAI/bge-m3'
model = SentenceTransformer(model_name, device=device)

text1 = "king"
text2 = "queen"

print("Generating embeddings...")
embeddings_king = model.encode(text1)
embeddings_queen = model.encode(text2)

print(f"\nGenerated embeddings shape: {embeddings_king.shape}")
print(f"\nGenerated embeddings shape: {embeddings_queen.shape}")

Generating embeddings...

Generated embeddings shape: (1024,)

Generated embeddings shape: (1024,)


In [29]:
import torch

if torch.cuda.is_available():
    print("CUDA 사용 가능")
    print("사용 가능한 GPU 개수:", torch.cuda.device_count())
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i} 이름:", torch.cuda.get_device_name(i))
        print(f"GPU {i} capability:", torch.cuda.get_device_capability(i))
    print("현재 디바이스:", torch.cuda.current_device())
    print(f"PyTorch에서 인식한 CUDA 버전: {torch.version.cuda}")
    print("설치된 PyTorch 버전:", torch.__version__)
    print("CUDA 드라이버 버전은 'nvidia-smi' 명령어로 확인하세요.")

CUDA 사용 가능
사용 가능한 GPU 개수: 1
GPU 0 이름: NVIDIA GeForce RTX 4070 SUPER
GPU 0 capability: (8, 9)
현재 디바이스: 0
PyTorch에서 인식한 CUDA 버전: 12.8
설치된 PyTorch 버전: 2.7.1+cu128
CUDA 드라이버 버전은 'nvidia-smi' 명령어로 확인하세요.


In [30]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_king_queen = cosine_similarity([embeddings_king], [embeddings_queen])[0][0]
print("king && queen 유사도 with huggingface model: ", similarity_king_queen)

king && queen 유사도 with huggingface model:  0.7121061


In [35]:
korean_text1 = "왕"
embeddings_korean_text1 = model.encode(korean_text1, show_progress_bar=True)

similarity_korean_text1_king = cosine_similarity([embeddings_korean_text1], [embeddings_king])[0][0]

  return forward_call(*args, **kwargs)
Batches: 100%|██████████| 1/1 [00:00<00:00, 90.86it/s]


In [36]:
print(korean_text1,"&&", text1)
print(similarity_korean_text1_king)

왕 && king
0.64954984
