# 选择合适的 embedding 模型

## HuggingFaceEmbeddings使用本地模型路径

In [1]:
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer

In [None]:
# Build a SentenceTransformer from a model stored on local disk.
#
# SentenceTransformer(modules=[...]) expects sentence-transformers building
# blocks (torch modules that pass a feature dict along the pipeline), not a
# raw transformers AutoModel plus a tokenizer. The tokenizer is not a torch
# module at all, so the original call could not be constructed.
from sentence_transformers import models

model_path = 'path/to/local/model'

# Plain Hugging Face handles, kept so code that uses `tokenizer` / `model`
# directly still finds them. They are not part of the pipeline below.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path)

# models.Transformer loads its own tokenizer and weights from the same path.
word_embedding_model = models.Transformer(model_path)
# Pool token embeddings into one fixed-size vector per sentence.
# NOTE(review): Pooling is left at the library default mode; switch to CLS
# pooling if the checkpoint was trained that way (e.g. BGE) — confirm.
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

sentence_transformer_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

# LangChain wrapper around a sentence-transformers model.
# NOTE(review): this passes a model *name* ('bert-base-uncased') plus a cache
# folder, whereas the section heading talks about a local model path —
# presumably the cache folder is where the weights are stored/looked up; confirm.
embedding = HuggingFaceEmbeddings(
    cache_folder='/path/to/cache/folder',
    model_name='bert-base-uncased',
)

In [None]:
# Point the LangChain embedding wrapper at a model directory on local disk.
#
# `transformers` provides neither `LocalModel` nor `HuggingFaceEmbeddings`, so
# the original import could never succeed. HuggingFaceEmbeddings lives in
# LangChain and takes the model location through `model_name`, which may be a
# local directory instead of a hub id.
from langchain.embeddings import HuggingFaceEmbeddings

model_path = '/path/to/your/local/model'
embeddings = HuggingFaceEmbeddings(model_name=model_path)


## BAAI/bge-large-zh-v1.5

In [2]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

In [None]:
# Configure a BGE embedding model through LangChain.
# NOTE(review): the section heading names 'BAAI/bge-large-zh-v1.5' but the
# checkpoint loaded here is the English one — confirm which is intended.
model_name = 'BAAI/bge-large-en-v1.5'
model_kwargs = dict(device='cuda')  # requests the CUDA device
encode_kwargs = dict(normalize_embeddings=True)

model = HuggingFaceBgeEmbeddings(
    query_instruction="Represent this sentence for retrieval: ",
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)
# Assign the same instruction string again after construction; presumably a
# guard in case the constructor substitutes its own default — TODO confirm.
model.query_instruction = "Represent this sentence for retrieval: "

In [3]:
from transformers import AutoTokenizer, AutoModel
import torch

In [4]:
# Sample input texts to embed.
sentences = [
    '样例数据1',
    '样例数据2',
]

In [None]:
# Load the tokenizer and encoder for the same checkpoint.
# NOTE(review): this is the small English BGE checkpoint, while the section
# heading names a Chinese one — confirm.
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-small-en-v1.5')
# eval() switches the module to inference mode and returns the module itself,
# so it can be chained onto the load.
model = AutoModel.from_pretrained('BAAI/bge-small-en-v1.5').eval()

In [None]:
# Tokenize the whole batch: pad, truncate to at most 128 tokens, and return
# PyTorch tensors.
encoded_input = tokenizer(
    sentences,
    return_tensors='pt',
    max_length=128,
    truncation=True,
    padding=True,
)

In [None]:
# Forward pass with gradient tracking disabled.
with torch.no_grad():
    model_output = model(**encoded_input)

# Take position 0 along the second axis of the first model output
# (presumably the [CLS] token of the last hidden state — confirm), then
# L2-normalize each row.
sentence_embeddings = torch.nn.functional.normalize(
    model_output[0][:, 0],
    p=2,
    dim=1,
)