# 微调Qwen2.5-7B

In [1]:
import torch
import glob
import os
os.environ["MODELSCOPE_CACHE"] = "/data2/dzr/.cache" 
from collections import OrderedDict, defaultdict
import math
import random
from tqdm import tqdm  # 引入 tqdm 库
import time  # 引入 time 模块
import argparse  # 引入 argparse 模块
import sys
import numpy as np
import torch.optim as optim
import torch.nn as nn
from io import BytesIO
from torch.utils.data import DataLoader, Subset, random_split
from typing import Dict, List
from modelscope import AutoTokenizer, AutoProcessor,Qwen2_5_VLForConditionalGeneration
from qwen_vl_utils import process_vision_info
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
import pandas as pd
from peft import LoraConfig, get_peft_model
from modelscope import AutoModel

model_ckpt = "Qwen/Qwen2.5-VL-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
processor = AutoProcessor.from_pretrained(model_ckpt, trust_remote_code=True)
print(torch.cuda.memory_summary())



Downloading Model from https://www.modelscope.cn to directory: /data2/dzr/.cache/models/Qwen/Qwen2.5-VL-7B-Instruct


2025-05-23 23:07:46,218 - modelscope - INFO - Target directory already exists, skipping creation.


Downloading Model from https://www.modelscope.cn to directory: /data2/dzr/.cache/models/Qwen/Qwen2.5-VL-7B-Instruct


2025-05-23 23:07:47,836 - modelscope - INFO - Target directory already exists, skipping creation.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------

## 处理数据

In [2]:
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from PIL import Image

class QwenVisionDataset(Dataset):
    def __init__(self, data_csv_paths, modal='mmwave_gps', input_length=8, output_length=3):
        self.data_csv_paths = data_csv_paths
        self.modal = modal
        self.input_length = input_length
        self.output_length = output_length

        # 特征列映射
        self.features_column = {
            'rgbs': 'unit1_rgb',
            'u1_loc': 'unit1_loc',
            'u2_loc': 'unit2_loc',
            'mmwave': 'unit1_pwr_60ghz',
            'heatmap': 'unit1_mmwave_heatmap'  # 新增热力图列
        }
        
        # 初始化滑动窗口
        self.window_samples = []
        for seq_idx, data_csv_path in enumerate(self.data_csv_paths):
            data_csv = pd.read_csv(data_csv_path)
            for seq_id in data_csv['seq_index'].unique():
                seq_data = data_csv[data_csv['seq_index'] == seq_id]
                if len(seq_data) >= self.input_length:
                    for start_idx in range(len(seq_data) - self.input_length + 1):
                        self.window_samples.append((seq_idx, seq_id, start_idx))

    def __len__(self):
        return len(self.window_samples)
    
    def __getitem__(self, idx):
        seq_idx, seq_id, start_idx = self.window_samples[idx]
        base_path = os.path.dirname(self.data_csv_paths[seq_idx])
        data_csv = pd.read_csv(self.data_csv_paths[seq_idx])
        seq_data = data_csv[data_csv['seq_index'] == seq_id]

        # 获取原始路径数据
        window_data = {
            'video_paths': seq_data[self.features_column['rgbs']]
            .iloc[start_idx:start_idx+self.input_length] 
            .tolist(),
            'heatmap_paths': seq_data[self.features_column['heatmap']]
            .iloc[start_idx:start_idx+self.input_length] 
            .tolist()
        }

        # 处理GPS数据
        gps = []
        for i in range(self.input_length):
            u1_loc = os.path.join(base_path, seq_data[self.features_column['u1_loc']].iloc[start_idx+i])
            u2_loc = os.path.join(base_path, seq_data[self.features_column['u2_loc']].iloc[start_idx+i])
            
            with open(u1_loc, 'r') as f:
                lat1, lon1 = map(float, f.read().strip().split())
            with open(u2_loc, 'r') as f:
                lat2, lon2 = map(float, f.read().strip().split())
                
            gps.append(torch.tensor([lat2-lat1, lon2-lon1], dtype=torch.float32))
        gps = torch.stack(gps)

        # 处理mmWave数据
        mmwave = []
        for i in range(self.input_length):
            mmwave_path = os.path.join(base_path, 
                seq_data[self.features_column['mmwave']].iloc[start_idx+i])
            with open(mmwave_path, 'r') as f:
                mmwave.append(torch.tensor(
                    list(map(float, f.read().strip().split())), 
                    dtype=torch.float32))
        mmwave = torch.stack(mmwave)

        # 目标数据（最后output_length个时间步）
        target = []
        for i in range(self.input_length-self.output_length, self.input_length):
            mmwave_path = os.path.join(base_path,
                seq_data[self.features_column['mmwave']].iloc[start_idx+i])
            with open(mmwave_path, 'r') as f:
                target.append(torch.tensor(
                    list(map(float, f.read().strip().split())),
                    dtype=torch.float32))
        target = torch.stack(target)

        return {
            'video_paths': [os.path.join(base_path, p) for p in window_data['video_paths']],
            'heatmap_paths': [os.path.join(base_path, p) for p in window_data['heatmap_paths']],
            'gps': gps,
            'mmwave': mmwave,
            'target_mmwave': target
        }

def qwen_collate_fn(batch):
    collated = {
        'video_paths': [item['video_paths'] for item in batch],
        'heatmap_paths': [item['heatmap_paths'] for item in batch],
        'gps': pad_sequence([item['gps'] for item in batch], batch_first=True),
        'mmwave': pad_sequence([item['mmwave'] for item in batch], batch_first=True),
        'target_mmwave': pad_sequence([item['target_mmwave'] for item in batch], batch_first=True)
    }
    return collated

In [3]:
dataset_start_idx = 1
dataset_end_idx = 9
# 定义数据集路径
dataset_path = [f'/data2/wzj/Datasets/DeepSense/scenario{i}/' for i in range(dataset_start_idx, dataset_end_idx)]  # scenario1 ~ scenario8

data_csv_paths = []
for path in dataset_path:
    data_csv_paths.extend(glob.glob(os.path.join(path, '*.csv')))

print(f"Found {len(data_csv_paths)} CSV files for training.")

Found 8 CSV files for training.


### 加载数据集

In [4]:
dataset = QwenVisionDataset(
    data_csv_paths,
    input_length=8,
    output_length=3
)
print(torch.cuda.memory_summary())

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------

In [5]:
dataset[998]

{'video_paths': ['/data2/wzj/Datasets/DeepSense/scenario1/./unit1/camera_data/image_BS1_5376_00_52_36.jpg',
  '/data2/wzj/Datasets/DeepSense/scenario1/./unit1/camera_data/image_BS1_5377_00_52_36.jpg',
  '/data2/wzj/Datasets/DeepSense/scenario1/./unit1/camera_data/image_BS1_5378_00_52_36.jpg',
  '/data2/wzj/Datasets/DeepSense/scenario1/./unit1/camera_data/image_BS1_5379_00_52_36.jpg',
  '/data2/wzj/Datasets/DeepSense/scenario1/./unit1/camera_data/image_BS1_5380_00_52_36.jpg',
  '/data2/wzj/Datasets/DeepSense/scenario1/./unit1/camera_data/image_BS1_5381_00_52_36.jpg',
  '/data2/wzj/Datasets/DeepSense/scenario1/./unit1/camera_data/image_BS1_5382_00_52_36.jpg',
  '/data2/wzj/Datasets/DeepSense/scenario1/./unit1/camera_data/image_BS1_5383_00_52_36.jpg'],
 'heatmap_paths': ['/data2/wzj/Datasets/DeepSense/scenario1/./unit1/mmWave_heatmap/mmWave_power_1075.png',
  '/data2/wzj/Datasets/DeepSense/scenario1/./unit1/mmWave_heatmap/mmWave_power_1076.png',
  '/data2/wzj/Datasets/DeepSense/scenario1/

### 划分数据集

In [6]:
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

print(f"Total Training samples: {len(train_dataset)}")
print(f"Total Validation samples: {len(val_dataset)}")
print(f"Total Testing samples: {len(test_dataset)}")


Total Training samples: 11400
Total Validation samples: 1425
Total Testing samples: 1425


In [7]:
def custom_collate(batch):
    # 直接返回样本列表，不进行合并
    return batch

# 创建 DataLoader 时指定 collate_fn
train_loader = DataLoader(dataset, batch_size=1, collate_fn=custom_collate)
print(torch.cuda.memory_summary())



# 创建 DataLoader
#batch_size = 64


#train_loader = DataLoader(
#   train_dataset,
 #   batch_size=batch_size,
 #   shuffle=True,  # 打乱训练集
 #   num_workers=4,
 #   pin_memory=True if torch.cuda.is_available() else False,
#)



|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------

## 定义度量指标

In [8]:
import torch.nn.functional as F
#---------新增topkloss---------
class TopkLoss(nn.Module):
    def __init__(self, k=1, reduction='mean'):
        super().__init__()
        self.k = k
        self.reduction = reduction

    def forward(self, output, target):
        """
        Args:
            output : [B, T, C] 模型输出的logits（未归一化）
            target : [B, T, C] one-hot编码 或 [B, T] 类别索引
            B = Batch Size        批量大小（数据加载时设置的batch_size）
            T = Sequence Length   输出序列的时间步数（output_length=3）
            C = Num Classes       类别数量（64个离散目标类别）
        """
        # 转换target为类别索引
        if target.dim() == 3:
            target = torch.argmax(target, dim=-1)  # [B, T]
        
        B, T, C = output.shape
        output_flat = output.view(B*T, C)  # [B*T, C]
        target_flat = target.contiguous().view(-1)  # [B*T]
        
        # 计算Top-k正确性
        _, topk_indices = torch.topk(output_flat, self.k, dim=1)  # [B*T, k]
        correct = topk_indices.eq(target_flat.unsqueeze(1)).any(dim=1)  # [B*T]
        
        # 计算损失（仅惩罚Top-k错误的样本）
        loss = F.cross_entropy(output_flat, target_flat, reduction='none')  # [B*T]，表示每个样本的预测是否在 Top-K 中命中真实标签
        masked_loss = loss * ~correct  # 仅保留错误样本的损失值，正确样本的损失被置零
        
        if self.reduction == 'mean':
            return masked_loss.mean()
        elif self.reduction == 'sum':
            return masked_loss.sum()
        return masked_loss

#------------新增HybridLoss--------------
class HybridLoss(nn.Module):
    def __init__(self, alpha=0.7, k=3):
        super().__init__()
        self.alpha = alpha  # 混合权重
        self.k = k
        self.ce = nn.CrossEntropyLoss(reduction='none')
        
    def forward(self, output, target):
        """
        output : [B, T, C]
        target : [B, T]
        """
        # 转换target为类别索引
        if target.dim() == 3:
            target = torch.argmax(target, dim=-1)  # [B, T]
        
        B, T, C = output.shape
        
        # 常规交叉熵损失（保持生成特性）
        ce_loss = self.ce(output.view(-1, C), target.view(-1))  # [B*T]
        
        # Top-K增强损失
        _, topk = output.topk(self.k, dim=-1)  # [B, T, k]
        correct = topk.eq(target.unsqueeze(-1)).any(-1)  # [B, T]
        topk_loss = (1 - correct.float()).mean()  # 错误率
        
        # 时间依赖惩罚项
        seq_penalty = self._sequence_consistency(output, target)  # [1]
        
        return self.alpha*ce_loss.mean() + (1-self.alpha)*topk_loss + seq_penalty
        
    def _sequence_consistency(self, output, target):
        """
        惩罚相邻时间步预测不一致的情况
        """
        preds = output.argmax(-1)  # [B, T]
        diff = (preds[:, 1:] != preds[:, :-1]).float().mean()
        return diff * 0.2  # 可调节系数
    
#-------------新增CrossEntropyloss-----------------
class CrossEntropyLoss(nn.Module):
    def __init__(self, reduction='mean'):
        super().__init__()
        self.reduction = reduction
        self.ce = nn.CrossEntropyLoss(reduction='none')  # 始终返回非归约结果

    def forward(self, output, target):
        # 处理one-hot编码目标
        if target.dim() == 3:
            target = torch.argmax(target, dim=-1)  # [B, T]

        # 重塑维度
        output = output.view(-1, output.size(-1))  # [B*T, C]
        target = target.view(-1)                   # [B*T]

        # 计算基础损失
        ce_loss = self.ce(output, target)
        
        # 自定义归约方式
        if self.reduction == 'mean':
            return ce_loss.mean()
        elif self.reduction == 'sum':
            return ce_loss.sum()
        return ce_loss  # 'none'模式返回原始形状

def calculate_accuracy(output, target, k=3):
        if target.dim() == 3:
            target = torch.argmax(target, dim=-1)  # [B, T]
        with torch.no_grad():
            _, pred = output.topk(k, dim=-1)  # [B, T, k]
            correct = pred.eq(target.unsqueeze(-1)).any(dim=-1)
            return correct.float().mean()

## 加载Qwen

In [9]:
# It's highly recommanded to use `[decord]` feature for faster video loading.
#!pip install qwen-vl-utils[decord]==0.0.8
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print(f"Using device: {device} ({torch.cuda.get_device_name(device)})")
else:
    print(f"Using device: {device}")


Using device: cuda (NVIDIA A100-SXM4-80GB)


In [10]:
# 正确加载模型的方式（使用AutoModelForCausalLM）
finetune_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_ckpt,
    trust_remote_code=True,  # 必须开启此选项
    device_map="cuda",
    return_dict=True,
).to(device)
print(torch.cuda.memory_summary())



Downloading Model from https://www.modelscope.cn to directory: /data2/dzr/.cache/models/Qwen/Qwen2.5-VL-7B-Instruct


2025-05-23 23:07:50,539 - modelscope - INFO - Target directory already exists, skipping creation.
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  31774 MiB |  31774 MiB |  31774 MiB |      0 B   |
|       from large pool |  31771 MiB |  31771 MiB |  31771 MiB |      0 B   |
|       from small pool |      3 MiB |      3 MiB |      3 MiB |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |  31774 MiB |  31774 MiB |  31774 MiB |      0 B   |
|       from large pool |  31771 MiB |  31771 MiB |  31771 MiB |      0 B   |
|       from small pool |      3 MiB |      3 MiB |      3 MiB |      0 B   |
|---------------------------------------------------------------

In [11]:
print(finetune_model)

Qwen2_5_VLForConditionalGeneration(
  (visual): Qwen2_5_VisionTransformerPretrainedModel(
    (patch_embed): Qwen2_5_VisionPatchEmbed(
      (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
    )
    (rotary_pos_emb): Qwen2_5_VisionRotaryEmbedding()
    (blocks): ModuleList(
      (0-31): 32 x Qwen2_5_VLVisionBlock(
        (norm1): Qwen2RMSNorm((1280,), eps=1e-06)
        (norm2): Qwen2RMSNorm((1280,), eps=1e-06)
        (attn): Qwen2_5_VLVisionSdpaAttention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (mlp): Qwen2_5_VLMLP(
          (gate_proj): Linear(in_features=1280, out_features=3420, bias=True)
          (up_proj): Linear(in_features=1280, out_features=3420, bias=True)
          (down_proj): Linear(in_features=3420, out_features=1280, bias=True)
          (act_fn): SiLU()
        )
      )
    )
    (merger): Qwen2_5_VLPatchMerger

In [12]:
# 3) 冻结原模型所有参数
for param in finetune_model.parameters():
    param.requires_grad = False


In [14]:
# 4) 配置 LoRA Adapter
lora_config = LoraConfig(
    r=8,                         # LoRA rank
    lora_alpha=32,               # LoRA scaling
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)
# 5) 注入 LoRA
qwen_lora = get_peft_model(finetune_model, lora_config)

## 用Qwen构造带有64类分类头的模型

In [15]:
class Qwen_and_Head(nn.Module):
    def __init__(self, pretrained_model):
        super().__init__()
        self.qwen = pretrained_model
        joint_hidden_size = 3584

        # final head now produces 3×64 dims
        self.classifier = nn.Sequential(
            nn.Linear(joint_hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, 3 * 64),    # 3 timesteps × 64 classes
        )

    def forward(self, input_ids, attention_mask, pixel_values, image_grid_thw):
        outputs = self.qwen(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values,
            image_grid_thw=image_grid_thw,
            output_hidden_states=True,
            return_dict=True,
        )

        # grab the [CLS] token
        last_hidden = outputs.hidden_states[-1]   # (B, L, D)
        cls_token   = last_hidden[:, 0, :]        # (B, D)

        # project to (B, 3*64) and reshape
        logits_flat = self.classifier(cls_token)            # (B, 192)
        logits      = logits_flat.view(-1, 3, 64)           # (B, 3, 64)
        return logits


In [16]:
finetuner = Qwen_and_Head(pretrained_model=qwen_lora).to(device)
print(torch.cuda.memory_summary())
print(finetuner)

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  31791 MiB |  31791 MiB |  31791 MiB |      0 B   |
|       from large pool |  31778 MiB |  31778 MiB |  31778 MiB |      0 B   |
|       from small pool |     13 MiB |     13 MiB |     13 MiB |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |  31791 MiB |  31791 MiB |  31791 MiB |      0 B   |
|       from large pool |  31778 MiB |  31778 MiB |  31778 MiB |      0 B   |
|       from small pool |     13 MiB |     13 MiB |     13 MiB |      0 B   |
|---------------------------------------------------------------

## 构建多模态提示词并提取视觉输入

In [17]:
def build_prompt_and_inputs(sample: Dict, hist_steps: int = 5) -> Dict:
    """构建多模态提示词并提取视觉输入
    Args:
        sample: 包含多模态数据的样本
        hist_steps: 使用历史时间步数（默认为5）
    Returns:
        包含处理后的提示词和视觉输入的字典
    """
    # 提取并规范化路径
    def normalize_paths(path_list: List[str]) -> List[str]:
        return [os.path.normpath(p) for p in path_list]
    # 处理所有路径
    video_paths = normalize_paths(sample['video_paths'][:hist_steps])
    heatmap_paths = normalize_paths(sample['heatmap_paths'][:hist_steps])
    gps_data = sample['gps'][:hist_steps].tolist()
    
    # 构建时间序列提示词
    prompt_parts = []
    for step in range(hist_steps):
        time_label = f"t-{hist_steps-1-step}" if step < hist_steps-1 else "Current time (t)"
        
        # GPS数据格式化（假设张量存储的是经度、纬度）
        lon, lat = gps_data[step]
        gps_str = f"longitude:{lon:.6f}, dimension:{lat:.6f}"
        
        # 添加多模态信息块
        prompt_part = (
            f"This is the observation data for {time_label}:\n"
            f"- Monitoring photos:[IMG:{video_paths[step]}]\n"
            f"- mmWave heatmap:[IMG:{heatmap_paths[step]}]\n"
            f"- The relative position between the car and the base station:{gps_str}\n"
        )
        prompt_parts.append(prompt_part)
    
    # 组合完整提示词
    system_prompt = "You are a wireless communication expert. Please analyze spatiotemporal multimodal data comprehensively and predict future mmWave beam indices."
    full_prompt = (
        f"{system_prompt}" 
        + "".join(prompt_parts) +
        "Please predict the beam indices for the next three time steps (t+1, t+2, t+3) based on the above time-series observation data, and provide numerical results directly."
    )
    
    # 提取所有视觉路径（RGB + 热力图）
    all_image_paths = [p for pair in zip(video_paths, heatmap_paths) for p in pair]
    
    return {
        "prompt": full_prompt,
        "image_paths": all_image_paths,
        "labels": sample['target_mmwave'].argmax(dim=-1).tolist()  # 假设索引是最大值位置
    }

# 示例使用 ---------------------------------------------------
def process_sample(sample, processor):  # 添加processor参数
    # Step 1: 构建提示词和获取图像路径
    processed = build_prompt_and_inputs(sample)
    
    # Step 2: 构建messages结构
    messages = [{
        "role": "user",
        "content": [{"type": "image", "image": path} for path in processed["image_paths"]] + 
                  [{"type": "text", "text": processed["prompt"]}]
    }]
    
    # Step 3: 使用传入的processor处理输入
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to("cuda")

    return inputs, processed["labels"]

sample = dataset[2324]
print(build_prompt_and_inputs(sample))
print(process_sample(sample,processor=processor))
print(torch.cuda.memory_summary())



{'prompt': 'You are a wireless communication expert. Please analyze spatiotemporal multimodal data comprehensively and predict future mmWave beam indices.This is the observation data for t-4:\n- Monitoring photos:[IMG:/data2/wzj/Datasets/DeepSense/scenario2/unit1/camera_data/image_BS1_976_02_12_21.jpg]\n- mmWave heatmap:[IMG:/data2/wzj/Datasets/DeepSense/scenario2/unit1/mmWave_heatmap/mmWave_power_123.png]\n- The relative position between the car and the base station:longitude:0.000087, dimension:0.000158\nThis is the observation data for t-3:\n- Monitoring photos:[IMG:/data2/wzj/Datasets/DeepSense/scenario2/unit1/camera_data/image_BS1_977_02_12_21.jpg]\n- mmWave heatmap:[IMG:/data2/wzj/Datasets/DeepSense/scenario2/unit1/mmWave_heatmap/mmWave_power_124.png]\n- The relative position between the car and the base station:longitude:0.000084, dimension:0.000159\nThis is the observation data for t-2:\n- Monitoring photos:[IMG:/data2/wzj/Datasets/DeepSense/scenario2/unit1/camera_data/image_BS

In [18]:
from torch.cuda.amp import autocast, GradScaler

def train_epoch(model, processor, train_loader, criterion, optimizer, scaler, device):
    model.train()
    total_loss = 0.0

    for batch in tqdm(train_loader, desc="Training"):
        # 1) run process_sample on every raw sample
        batch_inputs = {"input_ids": [], "attention_mask": [], "pixel_values": [],"image_grid_thw": []}
        batch_labels = []

        for sample in batch:
            inputs, label = process_sample(sample, processor)
            batch_inputs["input_ids"].append(inputs["input_ids"])
            batch_inputs["attention_mask"].append(inputs["attention_mask"])
            batch_inputs["pixel_values"].append(inputs["pixel_values"])
            batch_inputs["image_grid_thw"].append(inputs["image_grid_thw"])
            batch_labels.append(label)

        # 2) stack/cat into real batched tensors
        batch_inputs = {
            k: torch.cat(v, dim=0).to(device)
            for k, v in batch_inputs.items()
        }
        batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
        print(batch_inputs)
        # 3) forward + loss + backward
        optimizer.zero_grad()
        # 1) forward in mixed precision
        with autocast():
            logits = model(**batch_inputs)
            loss   = criterion(logits, batch_labels)

        # 2) scale, backward, unscale, step, update
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()


        total_loss += loss.item()

    return total_loss / len(train_loader)

def evaluate(model,  processor,evaluate_loader, criterion , optimizer, device):
    model.eval()
    total_loss = 0.0
    criterion = criterion
    model.train()
    total_loss = 0.0
    
    for batch in tqdm(train_loader, desc="Training"):
        # 多模态数据处理
        inputs_list = []
        labels_list = []
        
        # 处理每个样本
        for sample in batch:
            # 使用之前定义的process_sample函数
            inputs, labels = process_sample(sample)
            inputs_list.append(inputs)
            labels_list.extend(labels)
        
        # 自定义批处理函数
        batch_inputs = (inputs_list)
        batch_labels = torch.tensor(labels_list, dtype=torch.long, device=device)
        
        # 梯度清零
        optimizer.zero_grad()
        
        # 模型前向传播
        outputs = model(
            input_ids=batch_inputs["input_ids"].to(device),
            attention_mask=batch_inputs["attention_mask"].to(device),
            pixel_values=batch_inputs["pixel_values"].to(device),
            image_grid_thw=batch_inputs["image_grid_thw"].to(device)
        )
        
        # 输出层适配
        # 输出logits的形状为 (batch_size, seq_len, vocab_size)
    
        # 计算损失
        loss = criterion(outputs, batch_labels)
        total_loss += loss.item()
        #计算topk准确率
        topk_acc = calculate_accuracy(outputs, batch_labels, k=3)

        avg_loss = total_loss / len(evaluate_loader)
    return avg_loss ,topk_acc
    


In [19]:
code here

  scaler = GradScaler()
Training:   0%|          | 0/14250 [00:00<?, ?it/s]

{'input_ids': tensor([[151644,   8948,    198,  ..., 151644,  77091,    198]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'), 'pixel_values': tensor([[ 0.2661,  0.1785,  0.1493,  ..., -0.8545, -0.9683, -0.8972],
        [ 0.0325, -0.3178, -0.1280,  ..., -0.5701, -0.3142, -0.3142],
        [-0.9602, -0.9748, -0.7120,  ...,  0.3968,  0.5390,  0.4253],
        ...,
        [-0.7996, -0.7996, -0.7996,  ..., -0.2857, -0.2857, -0.2857],
        [-0.7996, -0.7996, -0.7996,  ..., -0.2857, -0.2857, -0.2857],
        [-0.7996, -0.7996, -0.7996,  ..., -0.2857, -0.2857, -0.2857]],
       device='cuda:0'), 'image_grid_thw': tensor([[ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16]], device='cuda:0')}


  with autocast():
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
Training:   0%|          | 1/14250 [00:08<32:11:53,  8.13s/it]

{'input_ids': tensor([[151644,   8948,    198,  ..., 151644,  77091,    198]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'), 'pixel_values': tensor([[ 0.2953,  0.2661,  0.2661,  ..., -0.8403, -1.0110, -0.8688],
        [ 0.0033, -0.2010, -0.0259,  ..., -0.5559, -0.2573, -0.3711],
        [-0.9310, -0.9456, -0.7558,  ...,  0.4253,  0.5675,  0.3115],
        ...,
        [-0.7996, -0.7996, -0.7996,  ..., -0.2857, -0.2857, -0.2857],
        [-0.7996, -0.7996, -0.7996,  ..., -0.2857, -0.2857, -0.2857],
        [-0.7996, -0.7996, -0.7996,  ..., -0.2857, -0.2857, -0.2857]],
       device='cuda:0'), 'image_grid_thw': tensor([[ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16]], device='cuda:0')}


  with autocast():
Training:   0%|          | 2/14250 [00:14<28:59:15,  7.32s/it]

{'input_ids': tensor([[151644,   8948,    198,  ..., 151644,  77091,    198]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'), 'pixel_values': tensor([[ 0.2661,  0.1931,  0.1639,  ..., -0.8545, -0.8972, -0.8830],
        [-0.0405, -0.2594, -0.1718,  ..., -0.5275, -0.2857, -0.2431],
        [-0.9164, -0.9602, -0.8142,  ...,  0.3257,  0.4110,  0.3257],
        ...,
        [-0.7996, -0.7996, -0.7996,  ..., -0.2857, -0.2857, -0.2857],
        [-0.7996, -0.7996, -0.7996,  ..., -0.2857, -0.2857, -0.2857],
        [-0.7996, -0.7996, -0.7996,  ..., -0.2857, -0.2857, -0.2857]],
       device='cuda:0'), 'image_grid_thw': tensor([[ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16]], device='cuda:0')}


  with autocast():
Training:   0%|          | 3/14250 [00:22<28:53:24,  7.30s/it]

{'input_ids': tensor([[151644,   8948,    198,  ..., 151644,  77091,    198]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'), 'pixel_values': tensor([[ 0.2953,  0.2369,  0.1639,  ..., -0.8261, -0.9967, -0.9683],
        [ 0.0179, -0.3616, -0.2010,  ..., -0.6412, -0.3711, -0.3568],
        [-0.9602, -0.9748, -0.7120,  ...,  0.3684,  0.4253,  0.3399],
        ...,
        [-0.7996, -0.7996, -0.7996,  ..., -0.2857, -0.2857, -0.2857],
        [-0.7996, -0.7996, -0.7996,  ..., -0.2857, -0.2857, -0.2857],
        [-0.7996, -0.7996, -0.7996,  ..., -0.2857, -0.2857, -0.2857]],
       device='cuda:0'), 'image_grid_thw': tensor([[ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16]], device='cuda:0')}


  with autocast():
Training:   0%|          | 4/14250 [00:28<28:02:27,  7.09s/it]

{'input_ids': tensor([[151644,   8948,    198,  ..., 151644,  77091,    198]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'), 'pixel_values': tensor([[ 0.2807,  0.2223,  0.2369,  ..., -0.7977, -0.9399, -0.9114],
        [ 0.0763, -0.3178, -0.2740,  ..., -0.5417, -0.2857, -0.3000],
        [-0.9310, -0.9602, -0.7412,  ...,  0.3968,  0.4964,  0.3115],
        ...,
        [-0.7996, -0.7996, -0.7996,  ..., -0.2857, -0.2857, -0.2857],
        [-0.7996, -0.7996, -0.7996,  ..., -0.2857, -0.2857, -0.2857],
        [-0.7996, -0.7996, -0.7996,  ..., -0.2857, -0.2857, -0.2857]],
       device='cuda:0'), 'image_grid_thw': tensor([[ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16]], device='cuda:0')}


  with autocast():
Training:   0%|          | 5/14250 [00:35<27:35:41,  6.97s/it]

{'input_ids': tensor([[151644,   8948,    198,  ..., 151644,  77091,    198]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'), 'pixel_values': tensor([[ 0.2953,  0.2369,  0.2077,  ..., -0.8830, -1.0110, -0.8261],
        [ 0.0471, -0.2886, -0.1426,  ..., -0.5701, -0.2573, -0.2857],
        [-0.8726, -1.0039, -0.7558,  ...,  0.3826,  0.6386,  0.3684],
        ...,
        [-0.7996, -0.7996, -0.7996,  ..., -0.2857, -0.2857, -0.2857],
        [-0.7996, -0.7996, -0.7996,  ..., -0.2857, -0.2857, -0.2857],
        [-0.7996, -0.7996, -0.7996,  ..., -0.2857, -0.2857, -0.2857]],
       device='cuda:0'), 'image_grid_thw': tensor([[ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16]], device='cuda:0')}


  with autocast():
Training:   0%|          | 6/14250 [00:42<27:21:55,  6.92s/it]

{'input_ids': tensor([[151644,   8948,    198,  ..., 151644,  77091,    198]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'), 'pixel_values': tensor([[ 0.3099,  0.2223,  0.1785,  ..., -0.8545, -1.0110, -0.9114],
        [-0.0113, -0.2594, -0.1134,  ..., -0.5701, -0.2431, -0.3568],
        [-0.9164, -0.9310, -0.7558,  ...,  0.4395,  0.6528,  0.4110],
        ...,
        [-0.7996, -0.7996, -0.7996,  ..., -0.2857, -0.2857, -0.2857],
        [-0.7996, -0.7996, -0.7996,  ..., -0.2857, -0.2857, -0.2857],
        [-0.7996, -0.7996, -0.7996,  ..., -0.2857, -0.2857, -0.2857]],
       device='cuda:0'), 'image_grid_thw': tensor([[ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16]], device='cuda:0')}


  with autocast():
Training:   0%|          | 7/14250 [00:49<27:15:11,  6.89s/it]

{'input_ids': tensor([[151644,   8948,    198,  ..., 151644,  77091,    198]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'), 'pixel_values': tensor([[ 0.3099,  0.2223,  0.1785,  ..., -0.9541, -0.9541, -0.9114],
        [ 0.0179, -0.2740, -0.0550,  ..., -0.5417, -0.3711, -0.3000],
        [-0.8872, -1.0039, -0.7704,  ...,  0.3542,  0.5390,  0.3826],
        ...,
        [-0.7996, -0.7996, -0.7996,  ..., -0.2857, -0.2857, -0.2857],
        [-0.7996, -0.7996, -0.7996,  ..., -0.2857, -0.2857, -0.2857],
        [-0.7996, -0.7996, -0.7996,  ..., -0.2857, -0.2857, -0.2857]],
       device='cuda:0'), 'image_grid_thw': tensor([[ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16]], device='cuda:0')}


  with autocast():
Training:   0%|          | 8/14250 [00:56<27:14:22,  6.89s/it]

{'input_ids': tensor([[151644,   8948,    198,  ..., 151644,  77091,    198]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'), 'pixel_values': tensor([[ 0.3391,  0.2515,  0.2515,  ..., -0.8403, -0.9683, -0.9256],
        [ 0.0763, -0.2886, -0.0696,  ..., -0.5701, -0.3000, -0.3000],
        [-0.9018, -1.0039, -0.6828,  ...,  0.3968,  0.4964,  0.2973],
        ...,
        [-0.7996, -0.7996, -0.7996,  ..., -0.2857, -0.2857, -0.2857],
        [-0.7996, -0.7996, -0.7996,  ..., -0.2857, -0.2857, -0.2857],
        [-0.7996, -0.7996, -0.7996,  ..., -0.2857, -0.2857, -0.2857]],
       device='cuda:0'), 'image_grid_thw': tensor([[ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16]], device='cuda:0')}


  with autocast():
Training:   0%|          | 9/14250 [01:03<27:20:24,  6.91s/it]

{'input_ids': tensor([[151644,   8948,    198,  ..., 151644,  77091,    198]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'), 'pixel_values': tensor([[ 0.2515,  0.1785,  0.1347,  ..., -0.7977, -0.9256, -0.8545],
        [-0.0113, -0.4054, -0.1134,  ..., -0.5844, -0.3142, -0.3142],
        [-0.9310, -0.9456, -0.7412,  ...,  0.2973,  0.4110,  0.2688],
        ...,
        [-0.7996, -0.7996, -0.7996,  ..., -0.2857, -0.2857, -0.2857],
        [-0.7996, -0.7996, -0.7996,  ..., -0.2857, -0.2857, -0.2857],
        [-0.7996, -0.7996, -0.7996,  ..., -0.2857, -0.2857, -0.2857]],
       device='cuda:0'), 'image_grid_thw': tensor([[ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16],
        [ 1, 38, 68],
        [ 1, 16, 16]], device='cuda:0')}


  with autocast():
Training:   0%|          | 9/14250 [01:04<28:15:41,  7.14s/it]


KeyboardInterrupt: 