In [1]:
import torch
from modelscope import AutoModel

In [2]:
from pymilvus import MilvusClient

client = MilvusClient(
    uri="http://localhost:19530"
)

client.list_collections()



['Qwen3_Embedding_8B', 'BGE_VL_v1_5_zs', 'so400m_long_ctx309']

## BGE-VL-v1.5-zs

In [3]:
client.load_collection("BGE_VL_v1_5_zs")

In [4]:
MODEL_NAME = "/home/public/dkx/model/BAAI/BGE-VL-v1.5-zs"

model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True)
model.eval()
model.cuda()

with torch.no_grad():
    model.set_processor(MODEL_NAME)

    # box 34
    box34_inputs = model.data_process(
        text="The crop production data for barley and rye in the specified region reveals notable differences. Barley shows a wider range of production with high variability, indicated by the presence of multiple outliers around 10,000 kilograms per hectare. The median production of barley is higher than that of rye, with rye displaying a more concentrated range of values and fewer outliers. Overall, barley exhibits greater potential for high yield but with more variability, while rye production appears more consistent.",
        q_or_c="q",
        task_instruction="Recommend the most suitable chart's abstract for visualizing the information given by the provided text: "
    )

    box34_embs = model(**box34_inputs, output_hidden_states=True)[:, -1, :]

    box34_embs = torch.nn.functional.normalize(box34_embs, dim=-1)

    print(len(box34_embs.cpu().detach().tolist()[0]))

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

4096


In [5]:
box34_text_search_results = client.search(
    collection_name="BGE_VL_v1_5_zs",
    anns_field="text_dense",
    data=box34_embs.cpu().detach().tolist(),
    limit=5,
    search_params={"metric_type": "IP"},
    output_fields=["type", "image_url"],  # specifies fields to be returned
)

for box34_text_search_result in box34_text_search_results[0]:
    print(box34_text_search_result["type"])
    print(box34_text_search_result["image_url"])
    print(box34_text_search_result["distance"])


box
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/box/png/386.png
0.6119102239608765
violin
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/violin/png/19.png
0.5873621702194214
violin
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/violin/png/59.png
0.5858768224716187
violin
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/violin/png/296.png
0.559999406337738
ridgeline
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/ridgeline/png/3034.png
0.5387448072433472


In [6]:
box34_img_search_results = client.search(
    collection_name="BGE_VL_v1_5_zs",
    anns_field="image_dense",
    data=box34_embs.cpu().detach().tolist(),
    limit=5,
    search_params={"metric_type": "IP"},
    output_fields=["type", "image_url"],  # specifies fields to be returned
)

for box34_img_search_result in box34_img_search_results[0]:
    print(box34_img_search_result["type"])
    print(box34_img_search_result["image_url"])
    print(box34_img_search_result["distance"])

ridgeline
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/ridgeline/png/3034.png
0.4342377781867981
box
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/box/png/386.png
0.4314354658126831
violin
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/violin/png/296.png
0.42759770154953003
violin
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/violin/png/542.png
0.4245045483112335
box
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/box/png/725.png
0.42248523235321045


In [7]:
box34_hybrid_search_results = client.search(
    collection_name="BGE_VL_v1_5_zs",
    anns_field="hybrid_dense",
    data=box34_embs.cpu().detach().tolist(),
    limit=5,
    search_params={"metric_type": "IP"},
    output_fields=["type", "image_url"],  # specifies fields to be returned
)

for box34_hybrid_search_result in box34_hybrid_search_results[0]:
    print(box34_hybrid_search_result["type"])
    print(box34_hybrid_search_result["image_url"])
    print(box34_hybrid_search_result["distance"])

box
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/box/png/386.png
0.5334362387657166
violin
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/violin/png/19.png
0.5189244747161865
box
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/box/png/725.png
0.49726372957229614
violin
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/violin/png/59.png
0.4721393883228302
violin
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/violin/png/296.png
0.4707433581352234


In [15]:
from pymilvus import AnnSearchRequest

# text semantic search (dense)
request_1 = AnnSearchRequest(
    data=box34_embs.cpu().detach().tolist(),
    anns_field="text_dense",
    param={
        "metric_type": "IP"
    },
    limit=10
)

# text-to-image search (multimodal)
request_2 = AnnSearchRequest(
    data=box34_embs.cpu().detach().tolist(),
    anns_field="image_dense",
    param={
        "metric_type": "IP"
    },
    limit=10
)

reqs = [request_1, request_2]

In [16]:
from pymilvus import RRFRanker, WeightedRanker

rrf_ranker = RRFRanker()
weighed_ranker = WeightedRanker(0, 1)

In [17]:
hybrid_search_results = client.hybrid_search(
    collection_name="test",  # target collection
    reqs=reqs,
    ranker=rrf_ranker,
    limit=10,  # number of returned entities
    output_fields=["type", "image_url"],  # specifies fields to be returned
)

In [18]:
for hybrid_search_result in hybrid_search_results[0]:
    print(hybrid_search_result["type"])
    print(hybrid_search_result["image_url"])

box
/home/dukaixing/RAG4Ghart/Dataset-ZXQ/sample100/box/png/386.png
box
/home/dukaixing/RAG4Ghart/Dataset-ZXQ/sample100/box/png/34.png
violin
/home/dukaixing/RAG4Ghart/Dataset-ZXQ/sample100/violin/png/59.png
ridgeline
/home/dukaixing/RAG4Ghart/Dataset-ZXQ/sample100/ridgeline/png/3034.png
ridgeline
/home/dukaixing/RAG4Ghart/Dataset-ZXQ/sample100/ridgeline/png/3688.png
violin
/home/dukaixing/RAG4Ghart/Dataset-ZXQ/sample100/violin/png/296.png
box
/home/dukaixing/RAG4Ghart/Dataset-ZXQ/sample100/box/png/725.png
stream
/home/dukaixing/RAG4Ghart/Dataset-ZXQ/sample100/stream/png/501.png
violin
/home/dukaixing/RAG4Ghart/Dataset-ZXQ/sample100/violin/png/19.png
box
/home/dukaixing/RAG4Ghart/Dataset-ZXQ/sample100/box/png/595.png


## so400m-long-ctx309

In [3]:
client.load_collection("so400m_long_ctx309")

In [4]:
from transformers import SiglipModel, SiglipProcessor
from PIL import Image
model = SiglipModel.from_pretrained("/home/public/dkx/model/fancyfeast/so400m-long-ctx309", torch_dtype=torch.float16, device_map="auto",
                                        attn_implementation="sdpa")
processor = SiglipProcessor.from_pretrained("/home/public/dkx/model/fancyfeast/so400m-long-ctx309")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


### box34

In [5]:
box34_image = Image.open("/home/dkx/RAG4Ghart/Dataset-ZXQ/test20/box/png/34.png").convert("RGB")

with torch.no_grad():
    box34_inputs = processor(text=["The crop production data for barley and rye in the specified region reveals notable differences. Barley shows a wider range of production with high variability, indicated by the presence of multiple outliers around 10,000 kilograms per hectare. The median production of barley is higher than that of rye, with rye displaying a more concentrated range of values and fewer outliers. Overall, barley exhibits greater potential for high yield but with more variability, while rye production appears more consistent."], images=box34_image,padding="max_length", max_length=309, return_tensors="pt").to("cuda")
    box34_outputs = model(**box34_inputs)

In [6]:
box34_query_embs = box34_outputs.text_embeds
box34_text_results = client.search(
    collection_name="so400m_long_ctx309",
    anns_field="text_dense",
    data=box34_query_embs.detach().cpu().tolist(),
    limit=5,
    search_params={"metric_type": "IP"},
            output_fields=["type", "image_url"],  # specifies fields to be returned
    )

for result in box34_text_results[0]:
    print(result["type"])
    print(result["image_url"])
    print(result["distance"])

violin
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/violin/png/19.png
0.8739900588989258
box
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/box/png/386.png
0.8667271137237549
violin
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/violin/png/59.png
0.84194415807724
pie
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/pie/png/49.png
0.83468097448349
chord
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/chord/png/677.png
0.8220779299736023


In [9]:
box34_image_results = client.search(
    collection_name="so400m_long_ctx309",
    anns_field="image_dense",
    data=box34_query_embs.detach().cpu().tolist(),
    limit=5,
    search_params={"metric_type": "IP"},
            output_fields=["type", "image_url"],  # specifies fields to be returned
    )

for result in box34_image_results[0]:
    print(result["type"])
    print(result["image_url"])
    print(result["distance"])

box
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/box/png/725.png
0.194061279296875
box
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/box/png/386.png
0.19086508452892303
bar
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/bar/png/911.png
0.1841658651828766
box
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/box/png/430.png
0.182756245136261
violin
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/violin/png/542.png
0.18026849627494812


### bar21

In [7]:
bar21_image = Image.open("/home/dkx/RAG4Ghart/Dataset-ZXQ/test20/bar/png/21.png").convert("RGB")

with torch.no_grad():
    bar21_inputs = processor(text=["The chart illustrates the production of different crops in an area, measured in metric tons. Among the crops, rapeseed has the highest production at 20,000 metric tons, indicating its dominance in the dataset. The lowest production is apple, with 2,488 metric tons. The total production across all crops amounts to 32,755 metric tons, with an average production of approximately 8,189 metric tons per crop. The data does not indicate a consistent trend such as an increase or decrease since each crop's production level stands alone. No specific turning points or growth rates can be discerned from this static dataset; however, the significant observation is the substantial disparity in production levels, notably with rapeseed leading substantially."], images=bar21_image,padding="max_length", max_length=309, return_tensors="pt").to("cuda")
    bar21_outputs = model(**bar21_inputs)

In [8]:
bar21_query_embs = bar21_outputs.text_embeds
bar21_text_results = client.search(
    collection_name="so400m_long_ctx309",
    anns_field="text_dense",
    data=bar21_query_embs.detach().cpu().tolist(),
    limit=5,
    search_params={"metric_type": "IP"},
            output_fields=["type", "image_url"],  # specifies fields to be returned
    )

for result in bar21_text_results[0]:
    print(result["type"])
    print(result["image_url"])
    print(result["distance"])

bar
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/bar/png/273.png
0.9274871349334717
bar
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/bar/png/540.png
0.9231504797935486
bar
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/bar/png/911.png
0.9037222862243652
bar
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/bar/png/157.png
0.8723613023757935
bar
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/bar/png/742.png
0.853693425655365


In [10]:
bar21_query_embs = bar21_outputs.text_embeds
bar21_image_results = client.search(
    collection_name="so400m_long_ctx309",
    anns_field="image_dense",
    data=bar21_query_embs.detach().cpu().tolist(),
    limit=5,
    search_params={"metric_type": "IP"},
            output_fields=["type", "image_url"],  # specifies fields to be returned
    )

for result in bar21_image_results[0]:
    print(result["type"])
    print(result["image_url"])
    print(result["distance"])

bar
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/bar/png/911.png
0.237827867269516
bar
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/bar/png/273.png
0.2367294728755951
bar
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/bar/png/84.png
0.23169222474098206
bar
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/bar/png/141.png
0.23132580518722534
bar
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/bar/png/742.png
0.23068910837173462


## Qwen3-Embedding-8B

In [3]:
client.load_collection("Qwen3_Embedding_8B")

In [4]:
## Qwen3-Embedding
import torch.nn.functional as F
from torch import Tensor
from modelscope import AutoTokenizer
from transformers import Qwen3Model

In [5]:
def last_token_pool(last_hidden_states: Tensor,
                        attention_mask: Tensor) -> Tensor:
        left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
        if left_padding:
            return last_hidden_states[:, -1]
        else:
            sequence_lengths = attention_mask.sum(dim=1) - 1
            batch_size = last_hidden_states.shape[0]
            return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]

def get_detailed_instruct(task_description: str, query: str) -> str:
    return f'Instruct: {task_description}\nQuery:{query}'

# Each query must come with a one-sentence instruction that describes the task
task = 'Given a piece of text, recommend the most suitable type of chart to visualize it.'

tokenizer = AutoTokenizer.from_pretrained('/home/public/dkx/model/Qwen/Qwen3-Embedding-8B', padding_side='left')
model = Qwen3Model.from_pretrained('/home/public/dkx/model/Qwen/Qwen3-Embedding-8B')

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

### box34

In [6]:
box34_queries = [
    get_detailed_instruct(task, 'The crop production data for barley and rye in the specified region reveals notable differences. Barley shows a wider range of production with high variability, indicated by the presence of multiple outliers around 10,000 kilograms per hectare. The median production of barley is higher than that of rye, with rye displaying a more concentrated range of values and fewer outliers. Overall, barley exhibits greater potential for high yield but with more variability, while rye production appears more consistent.')
]

# Tokenize the input texts
with torch.no_grad():
    batch_dict = tokenizer(
    box34_queries,
    padding=True,
    truncation=True,
    max_length=8192,
    return_tensors="pt",
)
    batch_dict.to(model.device)
    outputs = model(**batch_dict)
    embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
    # normalize embeddings
    box34_query_embs = F.normalize(embeddings, p=2, dim=1)

In [7]:
box34_text_results = client.search(
    collection_name="Qwen3_Embedding_8B",
    anns_field="text_dense",
    data=box34_query_embs.detach().cpu().tolist(),
    limit=5,
    search_params={"metric_type": "IP"},
            output_fields=["type", "image_url"],  # specifies fields to be returned
    )

for result in box34_text_results[0]:
    print(result["type"])
    print(result["image_url"])
    print(result["distance"])

box
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/box/png/386.png
0.47660595178604126
box
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/box/png/10692.png
0.4488487243652344
violin
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/violin/png/19.png
0.4422077536582947
box
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/box/png/430.png
0.4297323524951935
box
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/box/png/4445.png
0.4281846880912781


### bar21

In [8]:
bar21_queries = [
    get_detailed_instruct(task, "The chart illustrates the production of different crops in an area, measured in metric tons. Among the crops, rapeseed has the highest production at 20,000 metric tons, indicating its dominance in the dataset. The lowest production is apple, with 2,488 metric tons. The total production across all crops amounts to 32,755 metric tons, with an average production of approximately 8,189 metric tons per crop. The data does not indicate a consistent trend such as an increase or decrease since each crop's production level stands alone. No specific turning points or growth rates can be discerned from this static dataset; however, the significant observation is the substantial disparity in production levels, notably with rapeseed leading substantially.")
]

# Tokenize the input texts
with torch.no_grad():
    batch_dict = tokenizer(
        bar21_queries,
        padding=True,
        truncation=True,
        max_length=8192,
        return_tensors="pt",
    )
    batch_dict.to(model.device)
    outputs = model(**batch_dict)
    embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
    # normalize embeddings
    bar21_query_embs = F.normalize(embeddings, p=2, dim=1)

In [9]:
bar21_text_results = client.search(
    collection_name="Qwen3_Embedding_8B",
    anns_field="text_dense",
    data=bar21_query_embs.detach().cpu().tolist(),
    limit=5,
    search_params={"metric_type": "IP"},
            output_fields=["type", "image_url"],  # specifies fields to be returned
    )

for result in bar21_text_results[0]:
    print(result["type"])
    print(result["image_url"])
    print(result["distance"])

bar
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/bar/png/273.png
0.4573424160480499
bar
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/bar/png/742.png
0.44618138670921326
sunburst
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/sunburst/png/22.png
0.44571155309677124
sunburst
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/sunburst/png/191.png
0.44317981600761414
bar
/home/dkx/RAG4Ghart/Dataset-ZXQ/train80/bar/png/395.png
0.44175153970718384
