In [1]:
import os
import sys

# Put the checked-out Vary sources on the import path so the `vary.*` imports
# in the next cell can resolve.
sys.path.append("Vary-toy/Vary-master/")

# Presumably opts out of albumentations' new-version check.
# NOTE(review): the recorded output of the import cell still shows the
# "A new version of Albumentations is available" message -- confirm this
# variable is honoured by the pinned 1.4.8 release.
os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'

# Pinned install command for this notebook:
#   pip install albumentations==1.4.8 albucore==0.0.16

In [2]:
# NOTE(review): `os` is imported twice in this cell (and once more in the first
# cell); the repeats are harmless but redundant.
# NOTE(review): `argparse`, `AutoModelForCausalLM`, `CLIPVisionModel`,
# `StoppingCriteria`, `train_transform` and `test_transform` are not referenced
# in the visible cells.
import argparse
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os
from vary.utils.conversation import conv_templates, SeparatorStyle
from vary.utils.utils import disable_torch_init
from transformers import CLIPVisionModel, CLIPImageProcessor, StoppingCriteria
# Wildcard import: `varyQwenForCausalLM`, used in the model-loading cell, is not
# imported by name anywhere in the visible cells, so it presumably comes from here.
from vary.model import *
from vary.utils.utils import KeywordsStoppingCriteria

import os
import requests
from PIL import Image
from io import BytesIO
from vary.model.plug.blip_process import BlipImageEvalProcessor
from transformers import TextStreamer
from vary.model.plug.transforms import train_transform, test_transform


# Placeholder strings spliced into the text prompt to mark where the image goes.
# Single placeholder, used by run_vary when start/end wrapping is switched off.
DEFAULT_IMAGE_TOKEN = '<image>'
# Repeated `image_token_len` times between the start and end markers.
DEFAULT_IMAGE_PATCH_TOKEN = '<imgpad>'
# Markers placed immediately before and after the run of patch placeholders.
DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN = '<img>', '</img>'

INFO:albumentations.check_version:A new version of Albumentations is available: 2.0.5 (you have 1.4.8). Upgrade using: pip install --upgrade albumentations


In [3]:
def load_image(image_file, timeout=30):
    """Load an image from a local path or an HTTP(S) URL as an RGB PIL image.

    Args:
        image_file: A filesystem path, or a URL beginning with ``http://`` or
            ``https://``. Anything else is treated as a local path.
        timeout: Seconds to wait on the HTTP request before giving up. Only
            used for URLs.

    Returns:
        The image converted to ``'RGB'`` mode.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an error status.
    """
    # Match the full scheme prefix: a bare 'http' check would also send a local
    # file such as "http_capture.png" down the network path.
    if image_file.startswith(('http://', 'https://')):
        # Without a timeout, requests can block indefinitely on a stalled server.
        response = requests.get(image_file, timeout=timeout)
        # Surface 4xx/5xx here instead of handing an error page to PIL.
        response.raise_for_status()
        source = BytesIO(response.content)
    else:
        source = image_file

    # convert() returns a fully loaded copy, so the underlying file can be
    # closed as soon as the with-block exits.
    with Image.open(source) as opened:
        return opened.convert('RGB')

In [4]:
# Model
# Called before the model is built; presumably skips default weight
# initialisation to speed up loading -- confirm in vary.utils.utils.
disable_torch_init()

# Tokenizer and weights are read from the same local checkpoint directory.
vary_checkpoint = "/root/autodl-tmp/model/Vary-toy"
tokenizer = AutoTokenizer.from_pretrained(vary_checkpoint, trust_remote_code=True)
model = varyQwenForCausalLM.from_pretrained(
    vary_checkpoint,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)
# Move to the GPU in bfloat16. Left as the cell's final expression, so the
# notebook also displays the module structure returned by .to().
model.to(device='cuda', dtype=torch.bfloat16)

You are using a model of type mmgpt to instantiate a model of type vary. This is not supported for all configurations of models and can yield errors.
QWenLMHeadModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly defined. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
varyQwenForCausalLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly defined. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will 

varyQwenForCausalLM(
  (transformer): varyQwenModel(
    (wte): Embedding(151860, 2048)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-23): 24 x QWenBlock(
        (ln_1): RMSNorm()
        (attn): QWenAttention(
          (c_attn): Linear(in_features=2048, out_features=6144, bias=True)
          (c_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): RotaryEmbedding()
          (attn_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): RMSNorm()
        (mlp): QWenMLP(
          (w1): Linear(in_features=2048, out_features=5504, bias=False)
          (w2): Linear(in_features=2048, out_features=5504, bias=False)
          (c_proj): Linear(in_features=5504, out_features=2048, bias=False)
        )
      )
    )
    (ln_f): RMSNorm()
    (vision_tower): CLIPVisionModel(
      (vision_model): CLIPVisionTransformer(
        (embeddings): CLIPVisionEmbeddings(
          (patch_embedding): Conv2d(3, 1024, kernel_size=(14,

In [5]:
# Prompt layout switches read by run_vary.
use_im_start_end = True   # wrap the image placeholders in <img> ... </img>
image_token_len = 256     # number of <imgpad> placeholders inserted per image

# Two preprocessors; run_vary feeds the model one tensor from each of them.
# NOTE(review): `torch_dtype` is passed to an image processor here -- confirm
# it has any effect.
image_processor = CLIPImageProcessor.from_pretrained(
    '/root/autodl-tmp/model/clip-vit-large-patch14/',
    torch_dtype=torch.float16,
)
image_processor_high = BlipImageEvalProcessor(image_size=1024)

In [6]:
def run_vary(prompt: str, image_path: str, do_sample: bool = True, max_new_tokens: int = 2048):
    """Run one single-image query through the Vary model and stream the reply.

    Relies on notebook globals set up in earlier cells: ``tokenizer``,
    ``model``, ``image_processor``, ``image_processor_high``,
    ``use_im_start_end`` and ``image_token_len``.

    Args:
        prompt: Instruction text; image placeholder tokens are prepended to it.
        image_path: Anything ``load_image`` accepts (local path or URL).
        do_sample: Forwarded to ``model.generate``. Defaults to ``True``, the
            previously hard-coded value, so repeated runs may differ; pass
            ``False`` for greedy decoding.
        max_new_tokens: Forwarded to ``model.generate``; defaults to the
            previously hard-coded 2048.

    Returns:
        None. The reply is emitted through a ``TextStreamer`` while it is being
        generated; the generated ids are not returned.
    """
    # Build the image placeholder that precedes the user text.
    if use_im_start_end:
        image_slot = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN*image_token_len + DEFAULT_IM_END_TOKEN
    else:
        image_slot = DEFAULT_IMAGE_TOKEN
    query = image_slot + '\n' + prompt

    # Render the query with the "mpt" conversation template; the second role is
    # appended with no content so the model supplies that turn.
    conv = conv_templates["mpt"].copy()
    conv.append_message(conv.roles[0], query)
    conv.append_message(conv.roles[1], None)
    full_prompt = conv.get_prompt()

    inputs = tokenizer([full_prompt])

    # The same picture goes through both preprocessors; the model takes the pair.
    image = load_image(image_path)
    image_1 = image.copy()
    image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
    image_tensor_1 = image_processor_high(image_1)

    input_ids = torch.as_tensor(inputs.input_ids).cuda()

    # Stop on the template's separator (sep2 when the template uses the TWO style).
    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    stopping_criteria = KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)
    # skip_prompt / skip_special_tokens keep the streamed text to the reply only.
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    with torch.autocast("cuda", dtype=torch.bfloat16):
        model.generate(
            input_ids,
            images=[(image_tensor.unsqueeze(0).half().cuda(), image_tensor_1.unsqueeze(0).half().cuda())],
            do_sample=do_sample,
            num_beams=1,
            streamer=streamer,
            max_new_tokens=max_new_tokens,
            stopping_criteria=[stopping_criteria],
        )

In [14]:
# Prompt typo fixed ("Cnvert" -> "Convert") so the model receives the intended
# instruction. The recorded output below was produced with the misspelled
# prompt; re-run the cell to refresh it.
run_vary("Convert the image to latex format:", "data/iShot_2025-04-23_16.31.41.png")

# 
小结

## 向量检索的挑战与实践

- 数据规模大, 信息密度高, 处理成本高
- 向量检索、RAG 需求快速增长
向量检索的新CAP问题
\(\cdot\) 在成本、精度、性能之间取舍
向量数据库实践
- 存储工程与向量索引算法深度优化
・通过混合磁盘索引降低成本
- 让应用精通向量提升召回率
最新学术研究与应用
\(\cdot\) RabinQ 超高量化比
HGraph层次化索引框架
- 提供灵活组合能力，快速构建新索引
磁盘索引上的改进
- PAGE 进一步降低内存和 \(I O\) 需求
基于公开 Benchmark 工具的性能调优
- GIST-960数据集业界 SOTA


In [15]:
# Prompt typo fixed ("Cnvert" -> "Convert") so the model receives the intended
# instruction. The recorded output below was produced with the misspelled
# prompt; re-run the cell to refresh it.
run_vary("Convert the image to markdown format:", "data/xh44z6ajf6noc_22c15d2af82e42a290dac1dde66bc685.png")

建设地点

建设规模 \(1-7 \mathrm{~h}\)
土方概规模 893m³、合占\(62\%\)

土层
活动区积沙大放:
质来源 电
话 (883)一817715
控制投资：拟投,
建筑面积
正常设计依据:（请填报明批文的票要内容及批文号） 


In [16]:
# Detection-style prompt; the recorded reply is a list of four numbers
# (presumably a bounding box -- confirm the coordinate convention).
run_vary(
    prompt="Detect the person in the image",
    image_path="data/pingpong.jpeg",
)

[720, 126, 938, 635]


In [17]:
# Free-form captioning prompt on the same picture used for the detection query.
run_vary(
    prompt="Describe the image",
    image_path="data/pingpong.jpeg",
)

Two people playing ping pong in an indoor court.
