# 微调Qwen2.5-7B

In [1]:
import torch
import glob
import os
os.environ["MODELSCOPE_CACHE"] = "/data2/dzr/.cache" 
from collections import OrderedDict, defaultdict
import math
import random
from tqdm import tqdm  # 引入 tqdm 库
import time  # 引入 time 模块
import argparse  # 引入 argparse 模块
import sys
import numpy as np
import torch.optim as optim
import torch.nn as nn
from io import BytesIO
from torch.utils.data import DataLoader, Subset, random_split
from typing import Dict, List
from modelscope import AutoTokenizer, AutoProcessor,Qwen2_5_VLForConditionalGeneration
from qwen_vl_utils import process_vision_info
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
import pandas as pd
from peft import LoraConfig, get_peft_model
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
)

model_ckpt = "Qwen/Qwen2.5-VL-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_ckpt, trust_remote_code=True)
print(torch.cuda.memory_summary())



Downloading Model from https://www.modelscope.cn to directory: /data2/dzr/.cache/models/Qwen/Qwen2.5-VL-7B-Instruct


2025-06-01 20:37:11,860 - modelscope - INFO - Target directory already exists, skipping creation.


Downloading Model from https://www.modelscope.cn to directory: /data2/dzr/.cache/models/Qwen/Qwen2.5-VL-7B-Instruct


2025-06-01 20:37:13,329 - modelscope - INFO - Target directory already exists, skipping creation.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------

## 定义度量指标

In [2]:
import torch.nn.functional as F
#---------新增topkloss---------
class NMSELoss(nn.Module):
    def __init__(self, reduction='mean'):
        super(NMSELoss, self).__init__()
        self.mse = nn.MSELoss(reduction=reduction)

    def forward(self, output, target):
        mse = self.mse(output, target)
        var = torch.var(target, unbiased=False)
        # 防止除以零的情况
        if var.item() == 0:
            return torch.tensor(float('inf')).to(output.device)
        return mse / var

class TopkLoss(nn.Module):
    def __init__(self, k=1, reduction='mean'):
        super().__init__()
        self.k = k
        self.reduction = reduction

    def forward(self, output, target):
        """
        Args:
            output : [B, T, C] 模型输出的logits（未归一化）
            target : [B, T, C] one-hot编码 或 [B, T] 类别索引
            B = Batch Size        批量大小（数据加载时设置的batch_size）
            T = Sequence Length   输出序列的时间步数（output_length=3）
            C = Num Classes       类别数量（64个离散目标类别）
        """
        # 转换target为类别索引
        if target.dim() == 3:
            target = torch.argmax(target, dim=-1)  # [B, T]
        
        B, T, C = output.shape
        output_flat = output.view(B*T, C)  # [B*T, C]
        target_flat = target.contiguous().view(-1)  # [B*T]
        
        # 计算Top-k正确性
        _, topk_indices = torch.topk(output_flat, self.k, dim=1)  # [B*T, k]
        correct = topk_indices.eq(target_flat.unsqueeze(1)).any(dim=1)  # [B*T]
        
        # 计算损失（仅惩罚Top-k错误的样本）
        loss = F.cross_entropy(output_flat, target_flat, reduction='none')  # [B*T]，表示每个样本的预测是否在 Top-K 中命中真实标签
        masked_loss = loss * ~correct  # 仅保留错误样本的损失值，正确样本的损失被置零
        
        if self.reduction == 'mean':
            return masked_loss.mean()
        elif self.reduction == 'sum':
            return masked_loss.sum()
        return masked_loss

#------------新增HybridLoss--------------
class HybridLoss(nn.Module):
    def __init__(self, alpha=0.7, k=3):
        super().__init__()
        self.alpha = alpha  # 混合权重
        self.k = k
        self.ce = nn.CrossEntropyLoss(reduction='none')
        
    def forward(self, output, target):
        """
        output : [B, T, C]
        target : [B, T]
        """
        # 转换target为类别索引
        if target.dim() == 3:
            target = torch.argmax(target, dim=-1)  # [B, T]
        
        B, T, C = output.shape
        
        # 常规交叉熵损失（保持生成特性）
        ce_loss = self.ce(output.view(-1, C), target.view(-1))  # [B*T]
        
        # Top-K增强损失
        _, topk = output.topk(self.k, dim=-1)  # [B, T, k]
        correct = topk.eq(target.unsqueeze(-1)).any(-1)  # [B, T]
        topk_loss = (1 - correct.float()).mean()  # 错误率
        
        # 时间依赖惩罚项
        seq_penalty = self._sequence_consistency(output, target)  # [1]
        
        return self.alpha*ce_loss.mean() + (1-self.alpha)*topk_loss + seq_penalty
        
    def _sequence_consistency(self, output, target):
        """
        惩罚相邻时间步预测不一致的情况
        """
        preds = output.argmax(-1)  # [B, T]
        diff = (preds[:, 1:] != preds[:, :-1]).float().mean()
        return diff * 0.2  # 可调节系数
    
#-------------新增CrossEntropyloss-----------------
class CrossEntropyLoss(nn.Module):
    def __init__(self, reduction='mean'):
        super().__init__()
        self.reduction = reduction
        self.ce = nn.CrossEntropyLoss(reduction='none')  # 始终返回非归约结果

    def forward(self, output, target):
        # 处理one-hot编码目标
        if target.dim() == 3:
            target = torch.argmax(target, dim=-1)  # [B, T]

        # 重塑维度
        output = output.view(-1, output.size(-1))  # [B*T, C]
        target = target.view(-1)                   # [B*T]

        # 计算基础损失
        ce_loss = self.ce(output, target)
        
        # 自定义归约方式
        if self.reduction == 'mean':
            return ce_loss.mean()
        elif self.reduction == 'sum':
            return ce_loss.sum()
        return ce_loss  # 'none'模式返回原始形状

def calculate_accuracy(output, target, k=3):
        if target.dim() == 3:
            target = torch.argmax(target, dim=-1)  # [B, T]
        with torch.no_grad():
            _, pred = output.topk(k, dim=-1)  # [B, T, k]
            correct = pred.eq(target.unsqueeze(-1)).any(dim=-1)
            return correct.float().mean()

## Processer
### 构建多模态提示词并提取视觉输入


In [3]:
def build_prompt_and_inputs(sample: Dict, hist_steps: int = 5) -> Dict:
    """构建多模态提示词并提取视觉输入
    Args:
        sample: 包含多模态数据的样本
        hist_steps: 使用历史时间步数（默认为5）
    Returns:
        包含处理后的提示词和视觉输入的字典
    """
    # 提取并规范化路径
    def normalize_paths(path_list: List[str]) -> List[str]:
        return [os.path.normpath(p) for p in path_list]
    # 处理所有路径
    video_paths = normalize_paths(sample['video_paths'][:hist_steps])
    heatmap_paths = normalize_paths(sample['heatmap_paths'][:hist_steps])
    gps_data = sample['gps'][:hist_steps].tolist()
    
    # 构建时间序列提示词
    prompt_parts = []
    for step in range(hist_steps):
        time_label = f"t-{hist_steps-1-step}" if step < hist_steps-1 else "Current time (t)"
        
        # GPS数据格式化（假设张量存储的是经度、纬度）
        lon, lat = gps_data[step]
        gps_str = f"longitude:{lon:.6f},dimension:{lat:.6f}"
        
        # 添加多模态信息块
        prompt_part = (
            f"time:{time_label}"
            f"gps:{gps_str}"
        )
        prompt_parts.append(prompt_part)
    
    # 组合完整提示词
    full_prompt = ("".join(prompt_parts) )
    
    # 提取所有视觉路径（RGB + 热力图）
    all_image_paths = [p for pair in zip(video_paths, heatmap_paths) for p in pair]
    
    return {
        "prompt": full_prompt,
        "image_paths": all_image_paths,
        "target_mmwave": sample['target_mmwave']
    }

# 示例使用 ---------------------------------------------------
def process_sample(sample, processor):  # 添加processor参数
    # Step 1: 构建提示词和获取图像路径
    processed = build_prompt_and_inputs(sample)
    
    # Step 2: 构建messages结构
    messages = [{
        "role": "user",
        "content": [{"type": "image", "image": path} for path in processed["image_paths"]] + 
                  [{"type": "text", "text": processed["prompt"]}]
    }]
    
    # Step 3: 使用传入的processor处理输入
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to("cuda")
    
    return inputs, processed["target_mmwave"]





## Dataset

In [4]:
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from PIL import Image

class QwenVisionDataset(Dataset):
    def __init__(self, data_csv_paths, modal='mmwave_gps', input_length=8, output_length=3):
        self.data_csv_paths = data_csv_paths
        self.modal = modal
        self.input_length = input_length
        self.output_length = output_length
        
        # 特征列映射
        self.features_column = {
            # 'rgbs': 'unit1_rgb',
            'rgbs': 'unit1_camera_resized',
            'u1_loc': 'unit1_loc',
            'u2_loc': 'unit2_loc',
            'mmwave': 'unit1_pwr_60ghz',
            'heatmap': 'unit1_mmwave_heatmap'  # 新增热力图列
        }
        
        # 初始化滑动窗口
        self.window_samples = []
        for seq_idx, data_csv_path in enumerate(self.data_csv_paths):
            data_csv = pd.read_csv(data_csv_path)
            for seq_id in data_csv['seq_index'].unique():
                seq_data = data_csv[data_csv['seq_index'] == seq_id]
                if len(seq_data) >= self.input_length:
                    for start_idx in range(len(seq_data) - self.input_length + 1):
                        self.window_samples.append((seq_idx, seq_id, start_idx))

    def __len__(self):
        return len(self.window_samples)
    
    def __getitem__(self, idx):
        seq_idx, seq_id, start_idx = self.window_samples[idx]
        base_path = os.path.dirname(self.data_csv_paths[seq_idx])
        data_csv = pd.read_csv(self.data_csv_paths[seq_idx])
        seq_data = data_csv[data_csv['seq_index'] == seq_id]
        
        # 获取原始路径数据
        window_data = {
            'video_paths': 
            seq_data[self.features_column['rgbs']]
            .iloc[start_idx:start_idx+self.input_length] 
            .tolist(),
            'heatmap_paths': 
            seq_data[self.features_column['heatmap']]
            .iloc[start_idx:start_idx+self.input_length] 
            .tolist()
        }

        # 处理GPS数据
        gps = []
        for i in range(self.input_length):
            u1_loc = os.path.join(base_path, seq_data[self.features_column['u1_loc']].iloc[start_idx+i])
            u2_loc = os.path.join(base_path, seq_data[self.features_column['u2_loc']].iloc[start_idx+i])
            
            with open(u1_loc, 'r') as f:
                lat1, lon1 = map(float, f.read().strip().split())
            with open(u2_loc, 'r') as f:
                lat2, lon2 = map(float, f.read().strip().split())
                
            gps.append(torch.tensor([lat2-lat1, lon2-lon1], dtype=torch.float32))
        gps = torch.stack(gps)

        # 处理mmWave数据
        mmwave = []
        for i in range(self.input_length):
            mmwave_path = os.path.join(base_path, 
                seq_data[self.features_column['mmwave']].iloc[start_idx+i])
            with open(mmwave_path, 'r') as f:
                mmwave.append(torch.tensor(
                    list(map(float, f.read().strip().split())), 
                    dtype=torch.float32))
        mmwave = torch.stack(mmwave)

        # 目标数据（最后output_length个时间步）
        target = []
        for i in range(self.input_length-self.output_length, self.input_length):
            mmwave_path = os.path.join(base_path,
                seq_data[self.features_column['mmwave']].iloc[start_idx+i])
            with open(mmwave_path, 'r') as f:
                target.append(torch.tensor(
                    list(map(float, f.read().strip().split())),
                    dtype=torch.float32))
        target = torch.stack(target)

        return {
            'video_paths': [os.path.join(base_path, p) for p in window_data['video_paths']],
            'heatmap_paths': [os.path.join(base_path, p) for p in window_data['heatmap_paths']],
            'gps': gps,
            'mmwave': mmwave,
            'target_mmwave': target
        }

def qwen_collate_fn(batch):
    collated = {
        'video_paths': [item['video_paths'] for item in batch],
        'heatmap_paths': [item['heatmap_paths'] for item in batch],
        'gps': pad_sequence([item['gps'] for item in batch], batch_first=True),
        'mmwave': pad_sequence([item['mmwave'] for item in batch], batch_first=True),
        'target_mmwave': pad_sequence([item['target_mmwave'] for item in batch], batch_first=True)
    }
    return collated

In [5]:
dataset_start_idx = 1
dataset_end_idx = 9
# 定义数据集路径
dataset_path = [f'/data2/wzj/Datasets/DeepSense/scenario{i}/' for i in range(dataset_start_idx, dataset_end_idx)]  # scenario1 ~ scenario8

data_csv_paths = []
for path in dataset_path:
    data_csv_paths.extend(glob.glob(os.path.join(path, '*.csv')))

print(f"Found {len(data_csv_paths)} CSV files for training.")

Found 8 CSV files for training.


### 加载数据集

In [6]:
dataset = QwenVisionDataset(
    data_csv_paths,
    input_length=8,
    output_length=3
)
dataset[98]

{'video_paths': ['/data2/wzj/Datasets/DeepSense/scenario1/./unit1/camera_resized/image_BS1_554_00_42_26.jpg',
  '/data2/wzj/Datasets/DeepSense/scenario1/./unit1/camera_resized/image_BS1_555_00_42_26.jpg',
  '/data2/wzj/Datasets/DeepSense/scenario1/./unit1/camera_resized/image_BS1_556_00_42_26.jpg',
  '/data2/wzj/Datasets/DeepSense/scenario1/./unit1/camera_resized/image_BS1_557_00_42_26.jpg',
  '/data2/wzj/Datasets/DeepSense/scenario1/./unit1/camera_resized/image_BS1_558_00_42_26.jpg',
  '/data2/wzj/Datasets/DeepSense/scenario1/./unit1/camera_resized/image_BS1_559_00_42_26.jpg',
  '/data2/wzj/Datasets/DeepSense/scenario1/./unit1/camera_resized/image_BS1_560_00_42_26.jpg',
  '/data2/wzj/Datasets/DeepSense/scenario1/./unit1/camera_resized/image_BS1_561_00_42_26.jpg'],
 'heatmap_paths': ['/data2/wzj/Datasets/DeepSense/scenario1/./unit1/mmWave_heatmap/mmWave_power_98.png',
  '/data2/wzj/Datasets/DeepSense/scenario1/./unit1/mmWave_heatmap/mmWave_power_99.png',
  '/data2/wzj/Datasets/DeepSens

### 划分数据集（抽出1600个样本微调）

In [7]:
from torch.utils.data import Subset
import random

# 固定随机种子确保每次结果一致（可选）
random.seed(42)

# 原始数据集有约 14400 个样本
total_samples = len(dataset)

# 随机选出 12000 个样本的索引
subset_indices = random.sample(range(total_samples), 12000)

# 创建新的 dataset
small_dataset = Subset(dataset, subset_indices)


In [8]:
train_size = int(0.8 * len(small_dataset))
val_size = int(0.1 * len(small_dataset))
test_size = len(small_dataset) - train_size - val_size
train_dataset, val_dataset, test_dataset = random_split(small_dataset, [train_size, val_size, test_size])

print(f"Total Training samples: {len(train_dataset)}")
print(f"Total Validation samples: {len(val_dataset)}")
print(f"Total Testing samples: {len(test_dataset)}")


Total Training samples: 9600
Total Validation samples: 1200
Total Testing samples: 1200


In [None]:
def custom_collate(batch):
    # 直接返回样本列表，不进行合并
    return batch

batch_size = 128
train_loader = DataLoader(
    train_dataset, 
    batch_size=batch_size,
    shuffle=True,  
    collate_fn=custom_collate,
    pin_memory=True if torch.cuda.is_available() else False,
)
val_loader = DataLoader(
    val_dataset, 
    batch_size=batch_size,
    shuffle=False,  
    collate_fn=custom_collate,
    pin_memory=True if torch.cuda.is_available() else False,
)
test_loader = DataLoader(
    test_dataset, 
    batch_size=batch_size,
    shuffle=False,  
    collate_fn=custom_collate,
    pin_memory=True if torch.cuda.is_available() else False,
)

print(torch.cuda.memory_summary())

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------

## Model
### 用Qwen构造带有64类分类头的模型

In [10]:
import torch
import torch.nn as nn
from peft import LoraConfig, get_peft_model, TaskType

class QwenTimeLLMDirectHead(nn.Module):
    def __init__(
        self,
        qwen_model: nn.Module,
        pred_len: int = 3,        # 未来要预测的时刻数 P（这里 3）
        num_beams: int = 64,      # 每个时刻要分类的波束数 C（这里 64）
        hidden_dim: int = 3584,   # Qwen 最后一层隐藏维度 D（Qwen2.5-VL 为 3584）
        proj_hidden: int = 2048,  # MLP 第一层隐藏维度
        dropout: float = 0.1,
        lora_r: int = 8,
        lora_alpha: int = 16,
        lora_dropout: float = 0.05
    ):
        """
        - qwen_model: 预加载好的 Qwen2.5-VLForConditionalGeneration（多模态）  
        - pred_len:   要预测的未来时刻数 P=3  
        - num_beams:  每个时刻输出的波束索引类别数 C=64  
        - hidden_dim: Qwen 骨干输出维度 D=3584  
        - proj_hidden: MLP 第一层隐藏维度  
        - dropout:    MLP 中用到的 Dropout 比例  
        - lora_*:     LoRA 超参数
        """
        super().__init__()

        # ————————————————————————————————————————
        # 1) 将 Qwen 骨干切换到 bfloat16，然后用 LoRA 微调
        # ————————————————————————————————————————
        # 原模型默认是 float32，我们这里转成 bf16
        self.qwen = qwen_model.to(torch.bfloat16)

        # LoRA 配置：只对 Qwen 内部的投影／线性层插入 LoRA adapter
        # target_modules 里的名字要与你加载的 Qwen2.5-VLForConditionalGeneration 内部层名称一致。
        # 下面示例里放了一些常见的线性层名称，比如 "qkv_proj", "out_proj", "fc1", "fc2" 等。
        # 请根据实际用到的 Qwen 版本检查一下内部模块名字是否需要改动。
        lora_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            inference_mode=False,  # 训练模式：只训练 LoRA adapter，冻结原始权重
            r=lora_r,
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            target_modules=[
                # 视觉分支
                "qkv",         # Qwen2_5_VLVisionSdpaAttention.qkv
                "proj",        # Qwen2_5_VLVisionSdpaAttention.proj
                # 文本分支注意力
                "q_proj",      # Qwen2_5_VLSdpaAttention.q_proj
                "k_proj",      # Qwen2_5_VLSdpaAttention.k_proj
                "v_proj",      # Qwen2_5_VLSdpaAttention.v_proj
                "o_proj",      # Qwen2_5_VLSdpaAttention.o_proj
                # 文本分支 MLP
                "gate_proj",   # Qwen2_5_VLMLP.gate_proj
                "up_proj",     # Qwen2_5_VLMLP.up_proj
                "down_proj"    # Qwen2_5_VLMLP.down_proj
            ]
        )
        # 用 PEFT 把 LoRA 加到基础模型上
        self.qwen = get_peft_model(self.qwen, lora_config)

        # 保证只有 LoRA adapter 参数可训练，其余骨干权重被冻结
        # get_peft_model 默认会将原始权重 requires_grad=False

        # 最后启用 gradient checkpointing 省显存
        self.qwen.gradient_checkpointing_enable()

        # ————————————————————————————————————————
        # 2) 定义输出头：把 Qwen 最后一层“文本隐藏态”里最后一个 token 的向量当作上下文特征
        #    然后用一个两层 MLP 预测 pred_len * num_beams 个 logits
        # ————————————————————————————————————————
        self.pred_len = pred_len
        self.num_beams = num_beams
        self.hidden_dim = hidden_dim  # 3584

        # MLP：D -> proj_hidden -> ReLU -> Dropout -> (proj_hidden -> P*C)
        self.classifier = nn.Sequential(
            nn.Linear(self.hidden_dim, proj_hidden),   # 3584 -> 2048
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(proj_hidden, self.pred_len * self.num_beams)  # 2048 -> 3*64=192
        )
        self.dropout = nn.Dropout(dropout)

    def forward(
        self,
        input_ids: torch.LongTensor,         # (B, L_text)
        attention_mask: torch.LongTensor,    # (B, L_text)
        pixel_values: torch.FloatTensor,     # (B, 3, H, W) or 按 Qwen 要求的多模态格式
        image_grid_thw: torch.LongTensor      # (B, N_patches, 3)
    ):
        """
        Args:
          - input_ids:      文本 token 序列 (B, L_text)，它已经把 5 个时间步的 GPS 串在一起共 482 个 token
          - attention_mask: 对应的 mask (B, L_text)，这里应全为 1
          - pixel_values:   10 张图合并后的一批图像 (B, 3, H, W)，供视觉编码器使用
          - image_grid_thw: (B, N_patches, 3)：告诉视觉编码器如何切 patch，例如 shape=(1, 10, 18) 等
        Returns:
          - logits: (B, P, C) → (1, 3, 64)，预测未来 3 步每步 64 类的 logits
        """
        B = input_ids.size(0)
        device = input_ids.device

        # —————————————————————————————————————————————
        # A. 将文本和图像输入到 Qwen，多模态骨干输出
        #    我们让 Qwen 以 bfloat16 运行，输入前先把 input_ids 转 long、转到 cuda/bf16
        # —————————————————————————————————————————————
        # 1) 文本 Embedding + Transformer：得到最后一层文本隐藏态 (B, L_text, D)
        #    这里直接用 input_ids/attention_mask，Qwen 会内部调用 get_input_embeddings()
        outputs = self.qwen(
            input_ids=input_ids.to(torch.long),
            attention_mask=attention_mask,
            pixel_values=pixel_values,
            image_grid_thw=image_grid_thw,
            output_hidden_states=True,
            return_dict=True
        )
        # hidden_states[-1] 的形状 = (B, L_text, D)
        last_hidden_text = outputs.hidden_states[-1]  # bf16, (B, L_text, 3584)

        # —————————————————————————————————————————————
        # B. 我们只取“文本隐藏态序列”里最后一个 token 对应的向量，作为整条5步+10图的上下文特征
        #    这个最后 token 通常对应拼接后 GPS 序列的末尾。
        #    你也可以换成 mean pooling 或者 first token（[CLS]）向量，视任务需求而定。
        # —————————————————————————————————————————————
        # last_hidden_text[:, -1, :] 的形状 = (B, 3584)
        context_vec = last_hidden_text[:, -1, :].to(torch.float32)  # 转回 float32 做后续 MLP 更稳定

        # —————————————————————————————————————————————
        # C. 分类头：context_vec -> MLP -> (B, P * C) -> reshape -> (B, P, C)
        # —————————————————————————————————————————————
        x = self.dropout(context_vec)                           # (B, 3584)
        logits_flat = self.classifier(x)                         # (B, P * C) = (B, 3*64=192)
        logits = logits_flat.view(B, self.pred_len, self.num_beams)  # (B, 3, 64)

        return logits



### 加载Qwen

In [11]:
# !pip install qwen-vl-utils[decord]==0.0.8
device = torch.device('cuda:2')

In [12]:
# 配置 bfloat16 精度
finetune_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_ckpt,
    torch_dtype=torch.bfloat16,    # 设置模型权重为 bfloat16
    device_map="cuda",              # 自动分配设备
    trust_remote_code=True,         # 必须开启
    return_dict=True
).to(device)
print(f"Memory usage: {torch.cuda.memory_allocated(device=device)/1024**3:.2f} GB")



Downloading Model from https://www.modelscope.cn to directory: /data2/dzr/.cache/models/Qwen/Qwen2.5-VL-7B-Instruct


2025-06-01 20:37:17,342 - modelscope - INFO - Target directory already exists, skipping creation.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Memory usage: 15.49 GB


In [13]:
print(finetune_model)

Qwen2_5_VLForConditionalGeneration(
  (model): Qwen2_5_VLModel(
    (visual): Qwen2_5_VisionTransformerPretrainedModel(
      (patch_embed): Qwen2_5_VisionPatchEmbed(
        (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
      )
      (rotary_pos_emb): Qwen2_5_VisionRotaryEmbedding()
      (blocks): ModuleList(
        (0-31): 32 x Qwen2_5_VLVisionBlock(
          (norm1): Qwen2RMSNorm((1280,), eps=1e-06)
          (norm2): Qwen2RMSNorm((1280,), eps=1e-06)
          (attn): Qwen2_5_VLVisionSdpaAttention(
            (qkv): Linear(in_features=1280, out_features=3840, bias=True)
            (proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (mlp): Qwen2_5_VLMLP(
            (gate_proj): Linear(in_features=1280, out_features=3420, bias=True)
            (up_proj): Linear(in_features=1280, out_features=3420, bias=True)
            (down_proj): Linear(in_features=3420, out_features=1280, bias=True)
            (act_fn): Si

In [14]:
finetuner = QwenTimeLLMDirectHead(
    qwen_model=finetune_model,  # 你加载好的 Qwen2.5-VLForConditionalGeneration
    pred_len=3,
    num_beams=64,
    hidden_dim=3584,
    proj_hidden=2048,
    dropout=0.1,
    lora_r=8,
    lora_alpha=16,
    lora_dropout=0.05
).to(device)
print(f"Memory usage: {torch.cuda.memory_allocated(device=device)/1024**3:.2f} GB")
print(finetuner)

Could not load bitsandbytes native library: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.34' not found (required by /home/dzr/anaconda3/envs/mllm/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda126.so)
Traceback (most recent call last):
  File "/home/dzr/anaconda3/envs/mllm/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 85, in <module>
    lib = get_native_library()
  File "/home/dzr/anaconda3/envs/mllm/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 72, in get_native_library
    dll = ct.cdll.LoadLibrary(str(binary_path))
  File "/home/dzr/anaconda3/envs/mllm/lib/python3.10/ctypes/__init__.py", line 452, in LoadLibrary
    return self._dlltype(name)
  File "/home/dzr/anaconda3/envs/mllm/lib/python3.10/ctypes/__init__.py", line 374, in __init__
    self._handle = _dlopen(self._name, mode)
OSError: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.34' not found (required by /home/dzr/anaconda3/envs/mllm/lib/python3.10/site-packages/bitsa

I have left this message as the final dev message to help you transition.

Important Notice:
- AutoAWQ is officially deprecated and will no longer be maintained.
- The last tested configuration used Torch 2.6.0 and Transformers 4.51.3.
- If future versions of Transformers break AutoAWQ compatibility, please report the issue to the Transformers project.

Alternative:
- AutoAWQ has been adopted by the vLLM Project: https://github.com/vllm-project/llm-compressor

For further inquiries, feel free to reach out:
- X: https://x.com/casper_hansen_
- LinkedIn: https://www.linkedin.com/in/casper-hansen-804005170/



Memory usage: 15.62 GB
QwenTimeLLMDirectHead(
  (qwen): PeftModelForCausalLM(
    (base_model): LoraModel(
      (model): Qwen2_5_VLForConditionalGeneration(
        (model): Qwen2_5_VLModel(
          (visual): Qwen2_5_VisionTransformerPretrainedModel(
            (patch_embed): Qwen2_5_VisionPatchEmbed(
              (proj): lora.Conv3d(
                (base_layer): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Conv3d(3, 8, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Conv3d(8, 1280, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lor

### 训练参数

In [15]:
# 忽略可能的警告
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
total_trainable = sum(p.numel() for p in finetuner.parameters() if p.requires_grad)
print("Trainable params:", total_trainable)

Trainable params: 33515904


### 检查输入样本

In [16]:
sample = dataset[2324]
print(build_prompt_and_inputs(sample))
print(process_sample(sample,processor=processor))

{'prompt': 'time:t-4gps:longitude:0.000087,dimension:0.000158time:t-3gps:longitude:0.000084,dimension:0.000159time:t-2gps:longitude:0.000081,dimension:0.000159time:t-1gps:longitude:0.000078,dimension:0.000159time:Current time (t)gps:longitude:0.000075,dimension:0.000159', 'image_paths': ['/data2/wzj/Datasets/DeepSense/scenario2/unit1/camera_resized/image_BS1_976_02_12_21.jpg', '/data2/wzj/Datasets/DeepSense/scenario2/unit1/mmWave_heatmap/mmWave_power_123.png', '/data2/wzj/Datasets/DeepSense/scenario2/unit1/camera_resized/image_BS1_977_02_12_21.jpg', '/data2/wzj/Datasets/DeepSense/scenario2/unit1/mmWave_heatmap/mmWave_power_124.png', '/data2/wzj/Datasets/DeepSense/scenario2/unit1/camera_resized/image_BS1_978_02_12_21.jpg', '/data2/wzj/Datasets/DeepSense/scenario2/unit1/mmWave_heatmap/mmWave_power_125.png', '/data2/wzj/Datasets/DeepSense/scenario2/unit1/camera_resized/image_BS1_979_02_12_21.jpg', '/data2/wzj/Datasets/DeepSense/scenario2/unit1/mmWave_heatmap/mmWave_power_126.png', '/data2

({'input_ids': tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198, 151652, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         151653, 151652, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         151653, 151652, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151653, 151652, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         1516

In [17]:
# 假设你已经有 process_sample 函数和 processor
sample = train_dataset[0]     # 或者任何一个样本

# 1) 只做一次前处理（在 CPU 上）
inputs, _ = process_sample(sample, processor)

# 2) 文本 token 数
#    inputs["input_ids"] 的形状是 (1, seq_len)
text_token_count = inputs["input_ids"].shape[1]
print(f"文本 token 数: {text_token_count}")

# 3) 图像 token 数
#    inputs["image_grid_thw"] 的形状是 (n_images, 3)
#      每行 = [T, H, W]，对于静态图片 T=1，token = H*W
grid = inputs["image_grid_thw"].cpu().long()  # (n_images, 3)
T, H, W = grid.unbind(dim=1)                 # 拆成三个向量
tokens_per_image = (H * W).tolist()           # list 长度 = n_images
total_image_tokens = sum(tokens_per_image)

print(f"图像张数: {grid.shape[0]}")
for i, nt in enumerate(tokens_per_image):
    print(f"  第 {i+1} 张图 → {nt} 个 patch token")
print(f"图像总 patch token 数: {total_image_tokens}")


文本 token 数: 407
图像张数: 10
  第 1 张图 → 120 个 patch token
  第 2 张图 → 64 个 patch token
  第 3 张图 → 120 个 patch token
  第 4 张图 → 64 个 patch token
  第 5 张图 → 120 个 patch token
  第 6 张图 → 64 个 patch token
  第 7 张图 → 120 个 patch token
  第 8 张图 → 64 个 patch token
  第 9 张图 → 120 个 patch token
  第 10 张图 → 64 个 patch token
图像总 patch token 数: 920


In [18]:
from torch.cuda.amp import autocast, GradScaler

def train_epoch(model, processor, train_loader, criterion, optimizer, scaler, device):
    model.train()
    total_loss = 0.0
    total_correct_1 = 0
    for batch_idx, batch in enumerate(tqdm(train_loader, desc="Training")):
        batch_inputs = {"input_ids": [], "attention_mask": [], "pixel_values": [], "image_grid_thw": []}
        batch_labels = []

        for sample in batch:
            inputs, target = process_sample(sample, processor)  # 变量名改为target
            batch_inputs["input_ids"].append(inputs["input_ids"])
            batch_inputs["attention_mask"].append(inputs["attention_mask"])
            batch_inputs["pixel_values"].append(inputs["pixel_values"])
            batch_inputs["image_grid_thw"].append(inputs["image_grid_thw"])
            batch_labels.append(target)  # 接收target_mmwave数据

        # 修改维度处理
        batch_inputs = {
            k: torch.cat(v, dim=0).to(device)
            for k, v in batch_inputs.items()
        }
        # 改为stack处理三维目标数据 (batch_size, seq_len, num_classes)
        batch_labels = torch.stack(batch_labels).to(device)  # [B, T, C]

        optimizer.zero_grad()
        
        logits = model(**batch_inputs)  # [B, T, C]
        # 展平时间步维度计算损失
        loss = criterion(logits.view(-1, logits.size(-1)), 
                        batch_labels.view(-1, batch_labels.size(-1)))

        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item()
        total_correct_1 += calculate_accuracy(logits, batch_labels, k=1)
    train_acc1 = total_correct_1 / len(train_loader)
    return total_loss / len(train_loader) , train_acc1

def evaluate(model, processor, val_loader, criterion, device):
    model.eval()
    total_loss = 0.0
    total_samples = 0
    total_correct = defaultdict(int)  # 存储不同k值的正确数

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="evaluating"):
            batch_inputs = {"input_ids": [], "attention_mask": [], "pixel_values": [], "image_grid_thw": []}
            batch_labels = []

            for sample in batch:
                inputs, target = process_sample(sample, processor)  # 变量名改为target
                batch_inputs["input_ids"].append(inputs["input_ids"])
                batch_inputs["attention_mask"].append(inputs["attention_mask"])
                batch_inputs["pixel_values"].append(inputs["pixel_values"])
                batch_inputs["image_grid_thw"].append(inputs["image_grid_thw"])
                batch_labels.append(target)

            batch_inputs = {k: torch.cat(v, dim=0).to(device) for k, v in batch_inputs.items()}
            batch_labels = torch.stack(batch_labels).to(device)  # [B, T, C]

            with autocast():
                logits = model(**batch_inputs)  # [B, T, C]
                loss = criterion(logits.view(-1, logits.size(-1)),
                                batch_labels.view(-1, batch_labels.size(-1)))

            total_loss += loss.item()
            total_samples += batch_labels.size(0)

            # 修改准确率计算逻辑
            for k in [1, 3, 5]:
                _, preds = logits.topk(k, dim=-1)  # [B, T, k]
                # 将target转换为类别索引（假设target是one-hot编码）
                targets = torch.argmax(batch_labels, dim=-1)  # [B, T]
                correct = preds.eq(targets.unsqueeze(-1)).any(-1)  # [B, T]
                total_correct[k] += correct.sum().item()

    avg_loss = total_loss / len(val_loader)
    accuracies = {k: total_correct[k]/(total_samples * batch_labels.size(1)) for k in [1,3,5]}
    
    return avg_loss, accuracies[1], accuracies[3], accuracies[5]

    


## 超参数

In [19]:
epochs = 50
learning_rate = 1e-4
patience  = 5
checkpoint_dir = "/data2/dzr/finetune/finetunning_1_checkpoints"

In [None]:
model = finetuner.to(device)
# 模型里用了 model.qwen.gradient_checkpointing_enable()
scaler    = GradScaler()
criterion = HybridLoss()
optimizer = optim.AdamW(
    model.parameters(),
    lr=learning_rate
)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, 
        mode='min', 
        factor=0.5, 
        patience=5,
    )


  scaler    = GradScaler()


### 测试

In [21]:
# train_test ,train_test_acc = train_epoch(model,processor,train_loader,criterion,optimizer,scaler,device)

In [22]:
from matplotlib_inline import backend_inline
from IPython import display
# 定义 use_svg_display 函数
def use_svg_display():
    """Use the svg format to display a plot in Jupyter."""
    backend_inline.set_matplotlib_formats('svg')

# 定义 set_axes 函数
def set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend):
    """设置 Matplotlib 的轴"""
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    axes.set_xscale(xscale)
    axes.set_yscale(yscale)
    axes.set_xlim(xlim)
    axes.set_ylim(ylim)
    if legend:
        axes.legend(legend)
    axes.grid()
 

class Animator:  #@save
    """在动画中绘制数据"""
    def __init__(self, xlabel=None, ylabel=None, legend=None, xlim=None,
                 ylim=None, xscale='linear', yscale='linear',
                 fmts=('-', 'm--', 'g-.', 'r:'), nrows=1, ncols=1,
                 figsize=(3.5, 3.5)):
        # 增量地绘制多条线
        if legend is None:
            legend = []
        use_svg_display()   
        self.fig, self.axes = plt.subplots(nrows, ncols, figsize=figsize)
        if nrows * ncols == 1:
            self.axes = [self.axes, ]
        # 使用lambda函数捕获参数
        self.config_axes = lambda: set_axes(
            self.axes[0], xlabel, ylabel, xlim, ylim, xscale, yscale, legend)
        self.X, self.Y, self.fmts = None, None, fmts

    def add(self, x, y):
        # 向图表中添加多个数据点
        if not hasattr(y, "__len__"):
            y = [y]
        n = len(y)
        if not hasattr(x, "__len__"):
            x = [x] * n
        if not self.X:
            self.X = [[] for _ in range(n)]
        if not self.Y:
            self.Y = [[] for _ in range(n)]
        for i, (a, b) in enumerate(zip(x, y)):
            if a is not None and b is not None:
                self.X[i].append(a)
                self.Y[i].append(b)
        self.axes[0].cla()
        for x, y, fmt in zip(self.X, self.Y, self.fmts):
            self.axes[0].plot(x, y, fmt)
        self.config_axes()

    def show(self):
        display.display(self.fig)# 输出图像
        display.clear_output(wait=True)# 不输出新图像，而是覆盖之前的图像

In [None]:
# 初始化画图
animator_loss = Animator(xlabel='epoch', xlim=[1, epochs], ylim=[0, 10],
                            legend=['train_loss','val_loss'])
animator_acc = Animator(xlabel='epoch', xlim=[1, epochs], ylim=[0, 1],
                            legend=['train_acc1', 'val_acc1'])

def format_time(seconds):
    mins, sec = divmod(seconds, 60)
    hrs, mins = divmod(mins, 60)
    return f"{int(hrs)}h {int(mins)}m {int(sec)}s"

num_epochs = epochs
best_val_loss = float('inf') # 初始化为“正无限大”（infinity）

# 确保保存模型的目录存在
os.makedirs(checkpoint_dir, exist_ok=True)

# 记录训练开始时间
training_start_time = time.time()

for epoch in range(num_epochs):
    epoch_start_time = time.time()
    current_lr = optimizer.param_groups[0]['lr']
    print(f"Current LR: {current_lr:.2e}")
    # 训练
    train_loss, train_acc1 = train_epoch(model,processor,train_loader,criterion,optimizer,scaler,device)

    # 验证
    val_loss ,acc_1 ,acc_3 ,acc_5 = evaluate(model,processor,val_loader,criterion,device)
    # 绘图
    animator_loss.add(epoch + 1, [
    train_loss.item() if isinstance(train_loss, torch.Tensor) else train_loss,
    val_loss.item() if isinstance(val_loss, torch.Tensor) else val_loss
    ])
    animator_acc.add(epoch + 1, [
    train_acc1.item() if isinstance(train_acc1, torch.Tensor) else train_acc1,
    acc_1.item() if isinstance(acc_1, torch.Tensor) else acc_1
    ])

    epoch_end_time = time.time()
    epoch_duration = epoch_end_time - epoch_start_time

    # 计算剩余时间
    elapsed_time = epoch_end_time - training_start_time
    avg_epoch_time = elapsed_time / (epoch + 1)
    remaining_epochs = num_epochs - (epoch + 1)
    remaining_time = avg_epoch_time * remaining_epochs

    # 转换为更易读的格式
    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f},Train Accuracy@1:{train_acc1:.4f}")
    print(f"Val Loss: {val_loss:.4f},Val Accuracy@1: {acc_1:.4f},Val Accuracy@3: {acc_3:.4f},Val Accuracy@5: {acc_5:.4f}")
    print(f"Epoch Duration: {format_time(epoch_duration)}, Estimated Remaining Time: {format_time(remaining_time)}")

    # 更新学习率调度器
    scheduler.step(val_loss)

    # 保存最佳模型
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model_path = os.path.join(checkpoint_dir, 'multimodal_encoder_decoder_best.pth')
        torch.save(model.state_dict(), best_model_path)
        print(f"Saved best model at epoch {epoch+1} to {best_model_path}")
        early_stop_counter = 0  # 重置计数器
    else:
        early_stop_counter += 1  # 增加计数器

    # 如果验证损失连续多个 epoch 没有改善，则停止训练
    if early_stop_counter >= patience:
        print(f"Early stopping at epoch {epoch+1}")
        break  # 提前停止训练

    # 每隔若干个 epoch 保存模型
    if (epoch + 1) % 5 == 0:
        checkpoint_path = os.path.join(checkpoint_dir, f'multimodal_encoder_decoder_epoch_{epoch+1}.pth')
        torch.save(model.state_dict(), checkpoint_path)
        print(f"Saved model at epoch {epoch+1} to {checkpoint_path}")
animator_loss.show()
animator_acc.show()
# 7. 测试评估

# 加载最佳模型
best_model_path = os.path.join(checkpoint_dir, 'multimodal_encoder_decoder_best.pth')
if os.path.exists(best_model_path):
    model.load_state_dict(torch.load(best_model_path, map_location=device))
    model.eval()
    print("Loaded best model for testing.")
else:
    print(f"Best model not found at {best_model_path}. Skipping test evaluation.")

# 定义测试评估函数（可以与验证相同）


test_loss ,test_acc1 ,test_acc3 ,test_acc5 = evaluate(model,processor,test_loader,criterion,device)
print(f"Test Loss : {test_loss:.4f};Test Accuracy@3 : {test_acc3:.4f}")
print(f"Test Accuracy@1 : {test_acc1:.4f};Test Accuracy@5 : {test_acc5:.4f}")


Current LR: 1.00e-04


Training:   0%|          | 0/150 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
