In [1]:
import torch
from datasets import load_from_disk
from transformers import AutoModelForCausalLM
from drivevlms.models.phi4_bjxx import Phi4MMProcessor
from drivevlms.build import build_collate_fn
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from functools import partial
from tqdm import tqdm
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
SAVE_DIR = f"../data/models/{MODEL_ID.split('/')[-1]}-W4A16-G128"


In [None]:
# Constants
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 8192
MODEL_ID = 'cutebananas/phi-4-multimodal-finetuned'


# Load model and processor manually
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    attn_implementation='sdpa',
)

# ✅ 手动加载 processor（并绕过 llmcompressor 的自动 processor 初始化）
processor = Phi4MMProcessor.from_pretrained("microsoft/Phi-4-multimodal-instruct", trust_remote_code=True)




Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.84it/s]
Some weights of Phi4MMForCausalLM were not initialized from the model checkpoint at cutebananas/phi-4-multimodal-finetuned and are newly initialized: ['model.embed_tokens_extend.audio_embed.audio_projection.speech.0.bias', 'model.embed_tokens_extend.audio_embed.audio_projection.speech.0.weight', 'model.embed_tokens_extend.audio_embed.audio_projection.speech.2.bias', 'model.embed_tokens_extend.audio_embed.audio_projection.speech.2.weight', 'model.embed_tokens_extend.audio_embed.audio_projection.vision.0.bias', 'model.embed_tokens_extend.audio_embed.audio_projection.vision.0.weight', 'model.embed_tokens_extend.audio_embed.audio_projection.vision.2.bias', 'model.embed_tokens_extend.audio_embed.audio_projection.vision.2.weight', 'model.embed_tokens_extend.audio_embed.encoder.embed.conv.0.bias', 'model.embed_tokens_extend.audio_embed.encoder.embed.conv.0.weight', 'model.embed_tokens_extend.audio_embed.encoder.embed.con

In [8]:
calib_dataset = load_from_disk("../data/DriveLM_nuScenes/split/train")
calib_dataset = calib_dataset.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
print("Dataset type:", type(calib_dataset))
print("Sample:", calib_dataset[0])

Dataset type: <class 'datasets.arrow_dataset.Dataset'>
Sample: {'id': '21464a6831c1443db294d1b56a7a33a5_b8b9a4513c06454b9f66b09f50a539ae_6', 'image_paths': ['data/DriveLM_nuScenes/nuscenes/samples/CAM_FRONT/n015-2018-09-26-11-17-24+0800__CAM_FRONT__1537932304912461.jpg', 'data/DriveLM_nuScenes/nuscenes/samples/CAM_FRONT_LEFT/n015-2018-09-26-11-17-24+0800__CAM_FRONT_LEFT__1537932304904844.jpg', 'data/DriveLM_nuScenes/nuscenes/samples/CAM_FRONT_RIGHT/n015-2018-09-26-11-17-24+0800__CAM_FRONT_RIGHT__1537932304920339.jpg', 'data/DriveLM_nuScenes/nuscenes/samples/CAM_BACK/n015-2018-09-26-11-17-24+0800__CAM_BACK__1537932304937525.jpg', 'data/DriveLM_nuScenes/nuscenes/samples/CAM_BACK_LEFT/n015-2018-09-26-11-17-24+0800__CAM_BACK_LEFT__1537932304947423.jpg', 'data/DriveLM_nuScenes/nuscenes/samples/CAM_BACK_RIGHT/n015-2018-09-26-11-17-24+0800__CAM_BACK_RIGHT__1537932304927893.jpg'], 'conversations': [{'from': 'human', 'value': 'In this scenario, what are safe actions to take for the ego vehicle?

In [10]:
collate_fn = build_collate_fn("drivelm_nus_phi4_collate_fn_val")
val_collate_fn = partial(collate_fn, processor=processor, device='cuda')

dataloader = DataLoader(
    calib_dataset,
    batch_size=1,
    shuffle=False,
    collate_fn=val_collate_fn,
    num_workers=0,
)


In [31]:
def preprocess(example):
    question = [x["value"] for x in example["conversations"] if x["from"] == "human"]
    answer = [x["value"] for x in example["conversations"] if x["from"] == "gpt"]
    q = question[0] if question else ""
    a = answer[0] if answer else ""
    return {
        "text": f"<image>\n{q}\n{a}"
    }

dataset = calib_dataset.map(preprocess)
print("Preprocessed dataset:", dataset[0])
tokenizer = processor.tokenizer

# Step 2: Tokenize text（图像不用于 calibration）
def tokenize(example):
    return tokenizer(
        example["text"],
        padding=False,
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
        return_tensors=None,
        add_special_tokens=False,
    )

dataset = dataset.map(tokenize, remove_columns=dataset.column_names)
print("Tokenized dataset:", dataset[0])

Preprocessed dataset: {'id': '21464a6831c1443db294d1b56a7a33a5_b8b9a4513c06454b9f66b09f50a539ae_6', 'image_paths': ['data/DriveLM_nuScenes/nuscenes/samples/CAM_FRONT/n015-2018-09-26-11-17-24+0800__CAM_FRONT__1537932304912461.jpg', 'data/DriveLM_nuScenes/nuscenes/samples/CAM_FRONT_LEFT/n015-2018-09-26-11-17-24+0800__CAM_FRONT_LEFT__1537932304904844.jpg', 'data/DriveLM_nuScenes/nuscenes/samples/CAM_FRONT_RIGHT/n015-2018-09-26-11-17-24+0800__CAM_FRONT_RIGHT__1537932304920339.jpg', 'data/DriveLM_nuScenes/nuscenes/samples/CAM_BACK/n015-2018-09-26-11-17-24+0800__CAM_BACK__1537932304937525.jpg', 'data/DriveLM_nuScenes/nuscenes/samples/CAM_BACK_LEFT/n015-2018-09-26-11-17-24+0800__CAM_BACK_LEFT__1537932304947423.jpg', 'data/DriveLM_nuScenes/nuscenes/samples/CAM_BACK_RIGHT/n015-2018-09-26-11-17-24+0800__CAM_BACK_RIGHT__1537932304927893.jpg'], 'conversations': [{'from': 'human', 'value': 'In this scenario, what are safe actions to take for the ego vehicle?'}, {'from': 'gpt', 'value': 'Keep going 

In [17]:
print("Number of rows in calib_dataset:", len(calib_dataset))
print("Example entry:", calib_dataset[0] if len(calib_dataset) > 0 else "EMPTY")


Number of rows in calib_dataset: 512
Example entry: {'id': '21464a6831c1443db294d1b56a7a33a5_b8b9a4513c06454b9f66b09f50a539ae_6', 'image_paths': ['data/DriveLM_nuScenes/nuscenes/samples/CAM_FRONT/n015-2018-09-26-11-17-24+0800__CAM_FRONT__1537932304912461.jpg', 'data/DriveLM_nuScenes/nuscenes/samples/CAM_FRONT_LEFT/n015-2018-09-26-11-17-24+0800__CAM_FRONT_LEFT__1537932304904844.jpg', 'data/DriveLM_nuScenes/nuscenes/samples/CAM_FRONT_RIGHT/n015-2018-09-26-11-17-24+0800__CAM_FRONT_RIGHT__1537932304920339.jpg', 'data/DriveLM_nuScenes/nuscenes/samples/CAM_BACK/n015-2018-09-26-11-17-24+0800__CAM_BACK__1537932304937525.jpg', 'data/DriveLM_nuScenes/nuscenes/samples/CAM_BACK_LEFT/n015-2018-09-26-11-17-24+0800__CAM_BACK_LEFT__1537932304947423.jpg', 'data/DriveLM_nuScenes/nuscenes/samples/CAM_BACK_RIGHT/n015-2018-09-26-11-17-24+0800__CAM_BACK_RIGHT__1537932304927893.jpg'], 'conversations': [{'from': 'human', 'value': 'In this scenario, what are safe actions to take for the ego vehicle?'}, {'from'

In [None]:
recipe = GPTQModifier(
    targets="Linear",
    scheme="W4A16",
    sequential_targets=["Phi4MMDecoderLayer"],
    ignore=["lm_head", "re:model.vision_embed_tokens.*"],
)


oneshot(
    model=model,
    dataset=dataset,  
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    trust_remote_code_model=True,
    processor=processor,  
    
)
# Save model and processor
model.save_pretrained(SAVE_DIR, save_compressed=True)
processor.save_pretrained(SAVE_DIR)
model.to("cuda")


2025-05-09T10:49:05.370771+0800 | reset | INFO - Compression lifecycle reset
2025-05-09T10:49:05.371799+0800 | from_modifiers | INFO - Creating recipe from modifiers


Preparing intermediates cache:   0%|          | 0/512 [00:00<?, ?it/s]
Calibrating:   0%|          | 0/512 [00:00<?, ?it/s]


ValueError: None is not a valid InputMode