In [None]:
from FlagEmbedding import BGEM3FlagModel

# use_fp16=True speeds up computation with a slight performance degradation.
model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)

# Two short queries and two longer passages to compare against each other.
sentences_1 = ["What is BGE M3?", "Defination of BM25"]
sentences_2 = [
    "BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.",
    "BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document",
]

# encode() returns a mapping; only its 'dense_vecs' entry is used in this cell.
# If you don't need inputs as long as 8192, choose a smaller max_length to
# speed up the encoding process.
output_1 = model.encode(sentences_1, batch_size=12, max_length=8192)
output_2 = model.encode(sentences_2)
embeddings_1 = output_1['dense_vecs']
embeddings_2 = output_2['dense_vecs']

# Matrix product of the first set of vectors with the transposed second set
# (presumably one row per sentence, giving a query-by-passage score grid — confirm).
similarity = embeddings_1 @ embeddings_2.T
print(similarity)

In [2]:
# Encode the first sentence list again, this time keeping the whole return
# value of encode() instead of indexing into it right away.
embeddings = model.encode(
    sentences_1,
    batch_size=12,
    max_length=8192,  # A smaller value speeds up encoding if such long inputs are not needed.
)

In [4]:
# Show which entries the encode() result contains.
embeddings.keys()

dict_keys(['dense_vecs', 'lexical_weights', 'colbert_vecs'])

# reranker

In [1]:
from FlagEmbedding import FlagReranker

# use_fp16=True speeds up computation with a slight performance degradation.
reranker = FlagReranker('BAAI/bge-reranker-large', use_fp16=True)

# Score one (query, passage) pair.
single_pair = ['query', 'passage']
score = reranker.compute_score(single_pair)
print(score)

# Score several pairs in one call: the same question against two different passages.
question_passage_pairs = [
    ['what is panda?', 'hi'],
    ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.'],
]
scores = reranker.compute_score(question_passage_pairs)
print(scores)

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/431 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

----------using 2*GPUs----------
-1.5234375
[-5.60546875, 5.76171875]


# llm_embedder



In [2]:
import os

# Uncomment to turn on NCCL debug logging.
# os.environ["NCCL_DEBUG"] = "INFO"

from FlagEmbedding import LLMEmbedder

# Inputs: queries and the keys they are compared against.
queries = ["test query 1", "test query 2"]
keys = ["test key 1", "test key 2"]

# Load model (automatically use GPUs).
model = LLMEmbedder("BAAI/llm-embedder", use_fp16=False)

# Encoding is task-specific; available tasks: qa, icl, chat, lrlm, tool, convsearch.
task = "qa"
query_embeddings = model.encode_queries(queries, task=task)
key_embeddings = model.encode_keys(keys, task=task)

# Query-by-key score matrix.
similarity = query_embeddings @ key_embeddings.T
print(similarity)
# Recorded result:
# [[0.8971, 0.8534]
#  [0.8462, 0.9091]]

----------using 2*GPUs----------
[[0.89705944 0.853418  ]
 [0.8462473  0.9091402 ]]


In [5]:
# Inspect the container type returned by encode_queries().
type(query_embeddings)

numpy.ndarray

# Visual

## bge-base-en-v1.5

In [5]:
####### Use Visualized BGE for composed image retrieval (bge-base-en-v1.5)
import sys

# Machine-specific locations, gathered in one place so the cell can be moved
# to another machine by editing only these three lines.
visual_dir = "D:/code/nlp/embeddings/FlagEmbedding/FlagEmbedding/visual"
img_dir = "d:/code/nlp/embeddings/FlagEmbedding/FlagEmbedding/visual/imgs"
model_weight = "D:/dataset/nlp/models/bge/Visualized_base_en_v1.5.pth"

sys.path.append(visual_dir)
import torch
from FlagEmbedding.visual.modeling import Visualized_BGE

model = Visualized_BGE(model_name_bge="BAAI/bge-base-en-v1.5", model_weight=model_weight)
model.eval()

# Gradient tracking is not needed for encoding.
with torch.no_grad():
    # Composed query: a reference image plus a text instruction.
    query_emb = model.encode(
        image=f"{img_dir}/cir_query.png",
        text="Make the background dark, as if the camera has taken the photo at night",
    )
    # Image-only candidates.
    candi_emb_1 = model.encode(image=f"{img_dir}/cir_candi_1.png")
    candi_emb_2 = model.encode(image=f"{img_dir}/cir_candi_2.png")

# Score the query against each candidate.
sim_1 = query_emb @ candi_emb_1.T
sim_2 = query_emb @ candi_emb_2.T
print(sim_1, sim_2)  # recorded run: tensor([[0.8750]]) tensor([[0.7816]])

tensor([[0.8750]]) tensor([[0.7816]])


In [12]:
# Text-only query against three candidate images.
# Relies on `model` and `torch` already being defined by an earlier cell.
img_dir = "d:/code/nlp/embeddings/FlagEmbedding/FlagEmbedding/visual/imgs"

with torch.no_grad():
    # Alternative composed (image + text) query, kept for experimentation:
    # query_emb = model.encode(image=f"{img_dir}/SFT-CIRR.png", text="horse")
    query_emb = model.encode(text="horse")
    candi_emb_1 = model.encode(image=f"{img_dir}/cir_candi_1.png")
    candi_emb_2 = model.encode(image=f"{img_dir}/cir_candi_2.png")
    candi_emb_3 = model.encode(image=f"{img_dir}/wiki_candi_1.jpg")

# Score the query against each candidate.
sim_1 = query_emb @ candi_emb_1.T
sim_2 = query_emb @ candi_emb_2.T
sim_3 = query_emb @ candi_emb_3.T
# Recorded run: tensor([[0.4483]]) tensor([[0.4368]]) tensor([[0.4372]])
print(sim_1, sim_2, sim_3)

tensor([[0.4483]]) tensor([[0.4368]]) tensor([[0.4372]])


## BAAI/bge-m3

In [14]:
####### Use Visualized BGE doing composed image retrieval
import sys

sys.path.append("D:/code/nlp/embeddings/FlagEmbedding/FlagEmbedding/visual")
import torch
from FlagEmbedding.visual.modeling import Visualized_BGE

model_weight = "D:/dataset/nlp/models/bge/Visualized_m3.pth"
model = Visualized_BGE(model_name_bge = "BAAI/bge-m3", model_weight=model_weight)
model.eval()
with torch.no_grad():
    query_emb = model.encode(image="d:/code/nlp/embeddings/FlagEmbedding/FlagEmbedding/visual/imgs/cir_query.png", text="Make the background dark, as if the camera has taken the photo at night")
    candi_emb_1 = model.encode(image="d:/code/nlp/embeddings/FlagEmbedding/FlagEmbedding/visual/imgs/cir_candi_1.png")
    candi_emb_2 = model.encode(image="d:/code/nlp/embeddings/FlagEmbedding/FlagEmbedding/visual/imgs/cir_candi_2.png")

sim_1 = query_emb @ candi_emb_1.T
sim_2 = query_emb @ candi_emb_2.T
print(sim_1, sim_2) # tensor([[0.8750]]) tensor([[0.7816]])

tensor([[0.6983]]) tensor([[0.6557]])


In [15]:
# Image-only query against three candidate images.
# Relies on `model` and `torch` already being defined by an earlier cell.
img_dir = "d:/code/nlp/embeddings/FlagEmbedding/FlagEmbedding/visual/imgs"

with torch.no_grad():
    # Alternative queries, kept for experimentation:
    # query_emb = model.encode(image=f"{img_dir}/SFT-CIRR.png", text="horse")
    # query_emb = model.encode(text="马")  # "马" is Chinese for "horse"
    query_emb = model.encode(image=f"{img_dir}/cir_candi_1.png")
    candi_emb_1 = model.encode(image=f"{img_dir}/cir_candi_1.png")
    candi_emb_2 = model.encode(image=f"{img_dir}/cir_candi_2.png")
    candi_emb_3 = model.encode(image=f"{img_dir}/wiki_candi_1.jpg")

# Score the query against each candidate.
sim_1 = query_emb @ candi_emb_1.T
sim_2 = query_emb @ candi_emb_2.T
sim_3 = query_emb @ candi_emb_3.T
# Recorded run: tensor([[0.2184]]) tensor([[0.2960]]) tensor([[0.2056]])
# NOTE(review): the query here is the same file as candidate 1, so sim_1 would
# be expected to be the highest score; the recorded values may come from a run
# with one of the alternative queries above — re-run to confirm.
print(sim_1, sim_2, sim_3)

tensor([[0.2184]]) tensor([[0.2960]]) tensor([[0.2056]])


# Troubleshooting

## module 'lib' has no attribute 'X509_V_FLAG_NOTIFY_POLICY'

* Fix: pin pyOpenSSL with `pip install pyopenssl==24.0.0`

## ValueError: Attempting to unscale FP16 gradients

* See the PEFT troubleshooting guide: https://huggingface.co/docs/peft/v0.8.0/en/developer_guides/troubleshooting