In [17]:
import numpy as np

def cosine_similarity(vec_a: np.ndarray, vec_b: np.ndarray) -> float:
    dot = np.dot(vec_a, vec_b)
    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)
    return dot / (norm_a*norm_b)

# openai embedding

In [1]:
from dotenv import load_dotenv
load_dotenv

<function dotenv.main.load_dotenv(dotenv_path: Union[str, ForwardRef('os.PathLike[str]'), NoneType] = None, stream: Optional[IO[str]] = None, verbose: bool = False, override: bool = False, interpolate: bool = True, encoding: Optional[str] = 'utf-8') -> bool>

In [5]:
from openai import OpenAI

openai_client = OpenAI()

In [13]:
def get_openai_embedding(text:str, model='text-embedding-3-small'):
    response = openai_client.embeddings.create(
        input=text,
        model=model
    )
    return response.data[0].embedding

In [11]:
text_str = 'Hi'
emb_vector = get_openai_embedding(text_str)
len(emb_vector.embedding)

1536

In [None]:
def get_upstage_embedding(text:str, model='text-embedding-3-small'):
    response = openai_client.embeddings.create(
        input=text,
        model=model
    )
    return response.data[0]

In [21]:
# 임베딩 모델 생성

import os
from langchain_upstage import UpstageEmbeddings

embeddings = UpstageEmbeddings(
    api_key=os.getenv('UPSTAGE_API_KEY'),
    model='solar-embedding-1-large'
)

In [22]:
# 업세트이지 모델로 임베딩 함수
def get_upstage_embedding(text: str, is_query: bool = False) -> np.ndarray:
    """
    Upstage 임베딩 벡터를 반환한다.
    
    Args:
        text (str): 임베딩할 문장
        is_query (bool): True이면 검색 쿼리용 임베딩(embed_query), 
                         False이면 문서 임베딩(embed_documents)

    Returns:
        np.ndarray: 임베딩 벡터 (float 배열)
    """
    if is_query:
        vec = embeddings.embed_query(text)
    else:
        vec = embeddings.embed_documents([text])[0]  # 리스트에서 첫 번째 요소만 추출
    return np.array(vec)

In [10]:
# pip install -qU langchain-core langchain-upstage
import os
 
from langchain_upstage import UpstageEmbeddings
 
embeddings = UpstageEmbeddings(
    api_key=os.getenv('UPSTAGE_API_KEY'),
    model="embedding-query"
)
 
doc_result = embeddings.embed_documents(
    ["Sam is a teacher.", "This is another document"]
)
print(doc_result)
 
query_result = embeddings.embed_query("What does Sam do?")
print(query_result)

[[0.0163726806640625, 0.0171356201171875, -0.007755279541015625, 0.0241546630859375, 0.0034046173095703125, -0.006816864013671875, -0.014617919921875, -0.01065826416015625, -0.015594482421875, 0.006771087646484375, 0.0184173583984375, 0.00499725341796875, 0.007293701171875, 0.01177978515625, 0.02777099609375, 0.0218963623046875, -0.0213623046875, -0.0012426376342773438, -0.0030002593994140625, -0.016754150390625, -0.0240936279296875, -0.009124755859375, -0.01122283935546875, -0.005931854248046875, -0.0103912353515625, 0.0162811279296875, 0.0027751922607421875, -0.01192474365234375, 0.00513458251953125, 0.021484375, 0.00560760498046875, 0.01409149169921875, 0.0020751953125, -0.0093994140625, 0.0106964111328125, -0.0122833251953125, -0.003612518310546875, 0.0237274169921875, -0.01245880126953125, 0.0128631591796875, -0.01125335693359375, -0.0201263427734375, -0.0140838623046875, 9.810924530029297e-05, -0.0164642333984375, 0.012847900390625, 0.006908416748046875, -0.0008788108825683594, 0

In [12]:
len(query_result)

4096

In [14]:
texts = ['king', 'queen', 'slave', '왕']

In [None]:
openai_embeddings = {txt: get_openai_embedding(txt) for txt in texts}

{'king': [0.037221893668174744,
  -0.022094957530498505,
  0.05194341763854027,
  0.00014054813073016703,
  -0.013733332976698875,
  -0.023032471537590027,
  -0.02377995103597641,
  0.005580750294029713,
  -0.07190996408462524,
  -0.002796709770336747,
  0.043784502893686295,
  0.002969326451420784,
  -0.016748584806919098,
  -0.012821156531572342,
  0.04624231159687042,
  0.002609840128570795,
  -0.06618351489305496,
  -0.004269495606422424,
  0.01686260849237442,
  0.03144477307796478,
  -0.007702828850597143,
  0.015076261013746262,
  0.09593062847852707,
  0.03739926218986511,
  -0.006853997707366943,
  0.037525951862335205,
  -0.004757257178425789,
  -0.014265436679124832,
  0.07718031853437424,
  -0.041047971695661545,
  0.05315965414047241,
  -0.021284133195877075,
  0.03744993731379509,
  -0.01978917606174946,
  -0.02123345620930195,
  -0.05655498057603836,
  -0.004750922322273254,
  -0.013682656921446323,
  -0.038159407675266266,
  0.04109864681959152,
  -0.01785079948604107,


# 업스테이지 임베딩 모델 뭐시기

In [18]:
cosine_similarity(openai_embeddings['queen'], openai_embeddings['king'])

np.float64(0.590601530239691)

In [23]:
upstage_embeddings = {txt: get_upstage_embedding(txt) for txt in texts}

In [24]:
cosine_similarity(upstage_embeddings['queen'], upstage_embeddings['king'])

np.float64(0.6445591627644324)

# 왕의 비교

In [26]:
cosine_similarity(openai_embeddings['왕'], openai_embeddings['king'])

np.float64(0.5040406331683572)

In [27]:
cosine_similarity(upstage_embeddings['왕'], upstage_embeddings['king'])

np.float64(0.6962887728292608)

젬마2 모델로 임베딩 비교

# 올라마 임베딩 > huggingface의 임베딩 모델

In [None]:
# ! pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence_transformers)
  Downloading transformers-4.56.1-py3-none-any.whl.metadata (42 kB)
Collecting torch>=1.11.0 (from sentence_transformers)
  Downloading torch-2.8.0-cp312-cp312-win_amd64.whl.metadata (30 kB)
Collecting scikit-learn (from sentence_transformers)
  Downloading scikit_learn-1.7.1-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting scipy (from sentence_transformers)
  Downloading scipy-1.16.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers<5.0.0,>=4.41.0->sentence_transformers)
  Using cached tokenizers-0.22.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.41.0->sentence_transformers)
  Using cached safetensors-0.6.2-cp38-abi3-win_amd64.whl.metadata (4.1 kB)
Collecting sympy>=1.13.3 (from torch>=1.11.0->senten

  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-upstage 0.7.3 requires tokenizers<0.21.0,>=0.20.0, but you have tokenizers 0.22.0 which is incompatible.


In [29]:
from langchain_community.chat_models import ChatOllama
llm_ollama = ChatOllama(model='gemma2')
response = llm_ollama.invoke('Hi. Introduce yourself in 2 sentences')
response.content

  llm_ollama = ChatOllama(model='gemma2')


'Hello! I am Gemma, an open-weights AI assistant developed by the Gemma team at Google DeepMind.\n\nI can process and generate text, helping you with tasks like writing, summarizing, and answering questions.'

# OS 계열의 임베딩 모델 사용
허깅페이스의 임베딩 모델 사용 - transformers 라이브러리, GPU 기반 pytorch\
토치쿠다 기반의 가상환경에서 실행

In [30]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('BAAI/bge-multilingual-gemma2')
gemma_embedding = model.encode(texts)
gemma_embedding

ImportError: cannot import name 'DecodeStream' from 'tokenizers.decoders' (c:\Users\Admin\miniconda3\envs\prompting_env\Lib\site-packages\tokenizers\decoders\__init__.py)

In [1]:
from sentence_transformers import SentenceTransformer
import torch
#MODEL = 'BAAI/bge-multilingual-gemma2'
MODEL = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

ModuleNotFoundError: No module named 'sentence_transformers'

In [None]:
model = SentenceTransformer(MODEL, device=device)

In [None]:
texts = ['king', 'queen', 'slave', '왕']

In [None]:
model.max_seq_length=256
gemma_embedding = model.encode(texts, batch_size=64, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True)
gemma_embedding