In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint directory used for both the tokenizer and the model weights.
model_name = "/data/hf_models/Qwen2.5-7B-Instruct"

# Load the tokenizer and the causal-LM from the same checkpoint; the model is
# requested with torch_dtype="auto" and mapped onto the "cuda:6" device.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="cuda:6")

# User prompt sent to the chat model: a one-line request followed by a pasted
# ColQwen2 + PEFT scoring script. Everything between the triple quotes is a
# plain string literal -- none of the code inside it is executed by this cell,
# and the comments inside it are part of the prompt text, so they are left
# exactly as written.
prompt = '''Give me a short introduction to 回收显存.
import torch
from peft import PeftModel, PeftConfig
from colpali_engine.models import ColQwen2, ColQwen2Processor

# 加载基础模型
base_model = ColQwen2.from_pretrained(
    "/data/hf_models/colqwen2-base",  
    torch_dtype=torch.bfloat16,
    device_map="cuda:7"
)
# 加载 LoRA 配置
peft_model_id = "/data/hf_models/colqwen2-v0.1"
config = PeftConfig.from_pretrained(peft_model_id)

# 创建 PeftModel
model = PeftModel.from_pretrained(base_model, peft_model_id)
model.eval()

# 加载处理器
processor = ColQwen2Processor.from_pretrained(peft_model_id)

# 遍历前5个样本
for i in range(5):
    sample = ds[i]
    
    # 直接使用数据集中的图像对象
    image = sample['image']
    # query = sample['query']
    query = ds[0]['query']
    
    # 处理输入
    batch_images = processor.process_images([image]).to(model.device)
    batch_queries = processor.process_queries([query]).to(model.device)
    
    # 推理
    with torch.no_grad():
        image_embeddings = model(**batch_images)
        query_embeddings = model(**batch_queries)
    
    scores = processor.score_multi_vector(query_embeddings, image_embeddings)
    
    print(f"Sample {i}:")
    print(f"Query: {query}")
    print(f"Score: {scores.item()}")
'''
# Conversation handed to the chat template: a fixed system turn, then the user prompt.
messages = [
    dict(role="system", content="You are Qwen, created by Alibaba Cloud. You are a helpful assistant."),
    dict(role="user", content=prompt),
]

# Render the conversation to a single string (not token ids) with the
# generation prompt appended, then tokenize it as a batch of one on the
# model's device.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# Generate at most 512 new tokens.
generated_ids = model.generate(**model_inputs, max_new_tokens=512)

# Each output row begins with the prompt tokens; drop them so only the newly
# generated tokens are decoded. All rows of input_ids share one length because
# they come from a single rectangular tensor.
prompt_len = model_inputs.input_ids.shape[-1]
generated_ids = [sequence[prompt_len:] for sequence in generated_ids]

# Decode the batch and keep the first (only) reply.
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.25it/s]
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


In [2]:
# Bare last expression: the notebook renders the decoded reply string as this cell's output.
response

'你提供的代码片段主要用于加载一个基于ColQwen2的模型，并对其进行微调以适应特定任务，例如图像检索。在这个过程中，代码还展示了如何处理图像和查询文本，以及如何计算它们之间的相似度得分。\n\n关于“回收显存”（回收显存资源），在深度学习框架中，特别是在大规模模型推理时，有效管理显存是非常重要的。回收显存通常涉及以下几个方面：\n\n1. **释放不必要的张量**：在完成计算后，可以使用`del`关键字删除不再需要的张量，这样可以释放显存。\n2. **清空缓存**：可以通过调用`torch.cuda.empty_cache()`来清空GPU缓存，这有助于释放未使用的显存。\n3. **调整批处理大小**：减少批处理的大小可以降低每次迭代所需的显存量。\n4. **使用更高效的数据结构**：选择更节省内存的数据结构或方法可以减少显存使用。\n\n下面是如何在你的代码中添加显存回收的部分示例：\n\n```python\nimport torch\nfrom peft import PeftModel, PeftConfig\nfrom colpali_engine.models import ColQwen2, ColQwen2Processor\n\n# 加载基础模型\nbase_model = ColQwen2.from_pretrained(\n    "/data/hf_models/colqwen2-base",\n    torch_dtype=torch.bfloat16,\n    device_map="cuda:7"\n)\n# 加载 LoRA 配置\npeft_model_id = "/data/hf_models/colqwen2-v0.1"\nconfig = PeftConfig.from_pretrained(peft_model_id)\n\n# 创建 PeftModel\nmodel = PeftModel.from_pretrained(base_model, peft_model_id)\nmodel.eval()\n\n# 加载处理器\nprocessor = ColQwen2Processor.from_pretrained(peft_model_id)\n\n# 遍历前5个样本\nfor i in range(5):\