diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json
index efee3a4085f..0d75e7b3902 100644
--- a/examples/.config/model_params_pytorch_3x.json
+++ b/examples/.config/model_params_pytorch_3x.json
@@ -143,6 +143,14 @@
       "main_script": "run_clm_no_trainer.py",
       "batch_size": 1
     },
+    "phi3_vlm_128k_autoround_int4":{
+      "model_src_dir": "multimodal-modeling/quantization/auto_round",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "mllm.py",
+      "batch_size": 8,
+      "iters": 50
+    },
     "gpt_j_ipex":{
       "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
       "dataset_location": "",
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/mllm.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/mllm.py
index 5881b45c7b0..84f4976bb31 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/mllm.py
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/mllm.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024 Intel Corporation
+# Copyright (c) 2025 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -22,35 +22,7 @@
 os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
 torch.use_deterministic_algorithms(True, warn_only=True)

-from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, AutoProcessor
-
-from neural_compressor.torch.utils.utility import (get_multimodal_block_names,
-                                                   get_layer_names_in_block,
-                                                   detect_device,
-                                                   find_matching_blocks,
-                                                   to_device,
-                                                   to_dtype
-                                                   )
-from neural_compressor.torch.quantization import (AutoRoundConfig,
-                                                  prepare,
-                                                  convert,
-                                                  load)
-
-def set_nontext_module_config(model, to_quant_block_names, quant_config):
-    all_block_list = get_multimodal_block_names(model, quant_vision=True)
-    all_block_set = set(tuple(block) for block in all_block_list)
-    quant_block_set = set(tuple(block) for block in to_quant_block_names)
-    set_to_full_prec = list(all_block_set - quant_block_set)
-    set_to_full_prec = get_layer_names_in_block(model, to_quant_block_names=set_to_full_prec)
-    for name in set_to_full_prec:
-        quant_config.set_local(name, AutoRoundConfig(dtype="fp32"))
-
-    # skip layers not in blocks
-    quant_config.set_local("model.vision_embed_tokens.img_projection*", AutoRoundConfig(dtype="fp32"))
-    quant_config.set_local("transformer.visual.attn_pool.*_proj", AutoRoundConfig(dtype="fp32"))
-    quant_config.set_local("model.mm_projector*", AutoRoundConfig(dtype="fp32"))
-    quant_config.set_local("multi_modal_projector", AutoRoundConfig(dtype="fp32"))
-    quant_config.set_local("visual.merger", AutoRoundConfig(dtype="fp32"))
+from neural_compressor.transformers import AutoModelForCausalLM, AutoRoundConfig


 @torch.no_grad()
@@ -116,7 +88,7 @@ def __init__(self, *args, **kwargs):
         self.add_argument("--low_gpu_mem_usage", action='store_true',
                           help="offload intermediate features to cpu")

-        self.add_argument("--export_format", default="auto_round:gptq", type=str,
+        self.add_argument("--export_format", default="itrex", type=str,
                           help="the format to save the model"
                           )

@@ -250,320 +222,88 @@ def tune(args):
         devices = args.device.replace(" ", "").split(',')
         use_auto_mapping = True

-    device_str = detect_device(devices[0])
-
-    torch_dtype = "auto"
-    if "hpu" in device_str:
-        torch_dtype = torch.bfloat16
-
-    # load_model
-    processor, image_processor = None, None
-    if "llava" in model_name:
-        from llava.model.builder import load_pretrained_model  # pylint: disable=E0401
-        tokenizer, model, image_processor, _ = load_pretrained_model(
-            model_name, model_base=None, model_name=model_name,
-            torch_dtype=torch_dtype)
-        model_type = "llava"
-    else:
-        config = AutoConfig.from_pretrained(model_name, trust_remote_code=not args.disable_trust_remote_code)
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=not args.disable_trust_remote_code)
-        model_type = config.model_type
-        if "qwen2_vl" in model_type:
-            from transformers import Qwen2VLForConditionalGeneration
-            cls = Qwen2VLForConditionalGeneration
-        elif "mllama" in model_type:
-            from transformers import MllamaForConditionalGeneration
-            cls = MllamaForConditionalGeneration
-        else:
-            cls = AutoModelForCausalLM
-
-        kargs = {}
-        if "phi3_v" in model_type:
-            kargs['attn_implementation'] = 'eager'
-        model = cls.from_pretrained(
-            model_name, trust_remote_code=not args.disable_trust_remote_code, torch_dtype=torch_dtype,
-            device_map="auto" if use_auto_mapping else None, **kargs)
-
-    if "cogvlm2" in model_name:
-        model.config.model_type = "cogvlm2"
-
-    from neural_compressor.torch.algorithms.weight_only.autoround import get_mllm_dataloader
-
-    model = model.eval()
-
-    if args.model_dtype != None:
-        try:
-            if args.model_dtype == "float16" or args.model_dtype == "fp16":
-                model = model.to(torch.float16)
-            elif args.model_dtype == "bfloat16" or args.model_dtype == "bfp16" or args.model_dtype == "bf16":
-                model = model.to(torch.bfloat16)
-            elif args.model_dtype == "float32" or args.model_dtype == "fp32":
-                model = model.to(torch.float32)
-        except:
-            raise ("please use more device to fit the device or just use one device")
-            exit()
-
-    all_blocks = get_multimodal_block_names(model, args.quant_nontext_module)
-    to_quant_block_names = find_matching_blocks(model, all_blocks, args.to_quant_block_names)
-
-    # TODO check dataset?
-    dataloader, template, truncation, batch_size, gradient_accumulate_steps, seqlen, nsamples = get_mllm_dataloader(
-        model=model,
-        tokenizer=tokenizer,
-        template=None,
-        dataset=args.dataset,
-        extra_data_dir=args.extra_data_dir,
-        seqlen=args.seqlen,
-        batch_size=args.batch_size,
-        split=None,
-        apply_template=None,
-        truncation=args.truncation,
-        seed=args.seed,
-        nsamples=args.nsamples,
-        gradient_accumulate_steps=args.gradient_accumulate_steps,
-        quant_nontext_module=args.quant_nontext_module,
-        processor=processor,
-        image_processor=image_processor,
-    )
-    quant_config = AutoRoundConfig(
-        is_mllm=True,
+    woq_config = AutoRoundConfig(
+        is_vlm=True,
         bits=args.bits,
-        use_sym=not args.asym,
+        sym=not args.asym,
         group_size=args.group_size,
-        nsamples=nsamples,
-        batch_size=batch_size,
+        nsamples=args.nsamples,
+        batch_size=args.batch_size,
         iters=args.iters,
-        seqlen=seqlen,
+        seqlen=args.seqlen,
         quant_nontext_module=args.quant_nontext_module,
-        truncation=truncation,
-        gradient_accumulate_steps=gradient_accumulate_steps,
+        truncation=args.truncation,
+        gradient_accumulate_steps=args.gradient_accumulate_steps,
         nblocks=args.nblocks,
         lr=args.lr,
         minmax_lr=args.minmax_lr,
         enable_quanted_input=not args.disable_quanted_input,
-        seed=args.seed,
         scale_dtype=args.scale_dtype,
         enable_minmax_tuning=not args.disable_minmax_tuning,
         act_bits=args.act_bits,
-        to_quant_block_names=to_quant_block_names,
         export_format=args.export_format
     )
-
-    # set_nontext_module_config(model, to_quant_block_names, quant_config)
-
-    format = args.export_format
-    if args.fp_layers != "":
-        fp_layers = args.fp_layers.replace(" ", "").split(",")
-        for n, m in model.named_modules():
-            if not isinstance(m, (torch.nn.Linear, transformers.modeling_utils.Conv1D)):
-                continue
-            for fp_layer in fp_layers:
-                if fp_layer in n:
-                    quant_config.set_local(n, AutoRoundConfig(dtype="fp32"))
-                    print(
-                        f"{n} will not be quantized.")
-
-    for n, m in model.named_modules():
-        if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D):
-            if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0:
-                quant_config.set_local(n, AutoRoundConfig(dtype="fp32"))
-                print(
-                    f"{n} will not be quantized due to its shape not being divisible by 32,"
-                    " resulting in an exporting issue to autogptq")
-
-    lm_head_layer_name = "lm_head"
-    for n, _ in model.named_modules():
-        lm_head_layer_name = n
-
-    if args.quant_lm_head:
-        config = AutoConfig.from_pretrained(model_name, trust_remote_code=not args.disable_trust_remote_code)
-        if config.tie_word_embeddings and hasattr(model, "_tied_weights_keys"):
-            tied_keys = model._tied_weights_keys
-            for item in tied_keys:
-                if lm_head_layer_name in item:  ##TODO extend to encoder-decoder layer, seq classification model
-                    args.quant_lm_head = False
-                    print(
-                        f"warning, disable quant_lm_head as quantizing lm_head with tied weights has not been "
-                        f"supported currently")
-                    break
-
-    if not args.quant_lm_head:
-        quant_config.set_local(lm_head_layer_name, AutoRoundConfig(dtype="fp32"))
-    else:
-        if "auto_round" not in format:
-            raise ValueError(
-                f"{format} is not supported for lm-head quantization, please change to {auto_round_formats}")
-
-    if args.quant_lm_head and args.low_gpu_mem_usage:
-        print(f"warning, low_gpu_mem_usage=False is strongly recommended if the whole model could be loaded to "
-              f"gpu")
-
-    if "--truncation" not in sys.argv:
-        args.truncation = None
-
-    user_model = prepare(model=model, quant_config=quant_config)
-    run_fn(user_model, dataloader)
-    user_model = convert(user_model)
+
+    model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True, attn_implementation='eager')
     model.eval()
-    if args.device != "cpu":
-        torch.cuda.empty_cache()
-
-    from neural_compressor.torch.utils import (SaveLoadFormat,)
-    kargs = {}
-    if "phi3_v" in model_type:
-        kargs['safe_serialization'] = 'False'
-    user_model.save(args.output_dir, format=SaveLoadFormat.HUGGINGFACE, **kargs)
-    if tokenizer is not None:
-        tokenizer.save_pretrained(args.output_dir)
-    if processor is not None and hasattr(processor, 'chat_template'):  # Avoiding phi-3.5-vision save errors
-        processor.save_pretrained(args.output_dir)
-
-
-
-def setup_mllm_eval_parser():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", "--model_name", "--model_name_or_path",
-                        help="model name or path")
-    parser.add_argument("--tasks", type=str,
-                        default="MMBench_DEV_EN_V11,ScienceQA_VAL,TextVQA_VAL,POPE",
-                        help="eval tasks for VLMEvalKit.")
-    # Args that only apply to Video Dataset
-    parser.add_argument("--nframe", type=int, default=8,
-                        help="the number of frames to sample from a video,"
-                             " only applicable to the evaluation of video benchmarks.")
-    parser.add_argument("--pack", action='store_true',
-                        help="a video may associate with multiple questions, if pack==True,"
-                             " will ask all questions for a video in a single")
-    parser.add_argument("--fps", type=float, default=-1,
-                        help="set the fps for a video.")
-    # Work Dir
-    # Infer + Eval or Infer Only
-    parser.add_argument("--mode", type=str, default='all', choices=['all', 'infer'],
-                        help="when mode set to 'all', will perform both inference and evaluation;"
-                             " when set to 'infer' will only perform the inference.")
-    parser.add_argument('--eval_data_dir', type=str, default=None,
-                        help='path for VLMEvalKit to store the eval data. Default will store in ~/LMUData')
-    # API Kwargs, Apply to API VLMs and Judge API LLMs
-    parser.add_argument('--retry', type=int, default=None, help='retry numbers for API VLMs')
-    # Explicitly Set the Judge Model
-    parser.add_argument('--judge', type=str, default=None,
-                        help="whether is a judge model.")
-    # Logging Utils
-    parser.add_argument('--verbose', action='store_true',
-                        help="whether to display verbose information.")
-    # Configuration for Resume
-    # Ignore: will not rerun failed VLM inference
-    parser.add_argument('--ignore', action='store_true',
-                        help='ignore failed indices. ')
-    # Rerun: will remove all evaluation temp files
-    parser.add_argument('--rerun', action='store_true',
-                        help="if true, will remove all evaluation temp files and rerun.")
-    parser.add_argument("--output_dir", default="./eval_result", type=str,
-                        help="the directory to save quantized model")
-    args = parser.parse_args()
-    return args
-
-
-def setup_lmms_parser():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", "--model_name", "--model_name_or_path",
-                        help="model name or path")
-    parser.add_argument(
-        "--tasks",
-        default="pope,textvqa_val,scienceqa,mmbench_en",
-        help="To get full list of tasks, use the command lmms-eval --tasks list",
-    )
-    parser.add_argument("--output_dir", default="./eval_result", type=str,
-                        help="the directory to save quantized model")
-    parser.add_argument(
-        "--num_fewshot",
-        type=int,
-        default=None,
-        help="Number of examples in few-shot context",
-    )
-    parser.add_argument(
-        "--batch_size",
-        "-b",
-        type=str,
-        default=1,
-        metavar="auto|auto:N|N",
-        help="Acceptable values are 'auto', 'auto:N' or N, where N is an integer. Default 1.",
-    )
-    parser.add_argument(
-        "--max_batch_size",
-        type=int,
-        default=None,
-        metavar="N",
-        help="Maximal batch size to try with --batch_size auto.",
-    )
-    parser.add_argument(
-        "--device",
-        type=str,
-        default=None,
-        help="Device to use (e.g. cuda, cuda:0, cpu)",
-    )
-    parser.add_argument(
-        "--limit",
-        type=float,
-        default=None,
-        help="Limit the number of examples per task. "
-             "If <1, limit is a percentage of the total"
-             " number of examples.",
-    )
-    args = parser.parse_args()
-    return args
-
-
-def mllm_eval(args):
-    if isinstance(args.tasks, str):
-        args.tasks = args.tasks.replace(' ', '').split(',')
-    from neural_compressor.torch.algorithms.weight_only.autoround import mllm_eval
-    mllm_eval(
-        args.model,
-        work_dir=args.output_dir,
-        data_store_dir=args.eval_data_dir,
-        dataset=args.tasks,
-        pack=args.pack,
-        fps=args.fps,
-        nframe=args.nframe,
-        rerun=args.rerun,
-        judge=args.judge,
-        verbose=args.verbose,
-        mode=args.mode,
-        ignore=args.ignore
-    )
-
-def lmms_eval(args):
-    from neural_compressor.torch.algorithms.weight_only.autoround import lmms_eval
-    results = lmms_eval(
-        model=args.model,
-        tasks=args.tasks,
-        output_dir=args.output_dir,
-        num_fewshot=args.num_fewshot,
-        limit=args.limit,
-        batch_size=args.batch_size,
-        max_batch_size=args.max_batch_size,
-        device=args.device,
-        use_cache=None,
-        apply_chat_template=False,
-    )
-    return results
-
+    # disable safetensors serialization to avoid phi3-v save errors
+    model.save_pretrained(args.output_dir, safe_serialization=False)

 if __name__ == '__main__':
     if "--quantize" in sys.argv:
         args = setup_parser()
         tune(args)
-    elif "--accuracy" in sys.argv:
-        sys.argv.remove("--accuracy")
-        from neural_compressor.torch.quantization import load
-        if "--lmms" in sys.argv:
-            sys.argv.remove("--lmms")
-            args = setup_lmms_parser()
-            lmms_eval(args)
-        else:
-            if "--mllm_eval" in sys.argv:
-                sys.argv.remove("--mllm_eval")
-                args = setup_mllm_eval_parser()
-                mllm_eval(args)
+    elif "--inference" in sys.argv:
+        sys.argv.remove("--inference")
+        from transformers import AutoProcessor
+        import requests
+        from PIL import Image
+
+        args = setup_parser()
+        model_name = args.model
+        if model_name[-1] == "/":
+            model_name = model_name[:-1]
+
+        # Preparation for inference
+        model = AutoModelForCausalLM.from_pretrained(
+            args.output_dir,
+            trust_remote_code=True,
+            attn_implementation='eager',
+            torch_dtype=torch.float16,
+        )
+        processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+        image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
+
+        content = "Describe this image."
+
+        messages = [
+            {"role": "user",
+             "content": "<|image_1|>\n"+content},
+        ]
+
+        prompt = processor.tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+        image_inputs = Image.open(requests.get(image_url, stream=True).raw)
+        inputs = processor(prompt, image_inputs, return_tensors="pt").to(model.device)
+        generation_args = {
+            "max_new_tokens": 50,
+            "temperature": 0.0,
+            "do_sample": False,
+        }
+
+        generate_ids = model.generate(**inputs,
+                                      eos_token_id=processor.tokenizer.eos_token_id,
+                                      **generation_args
+                                      )
+
+        # remove input tokens
+        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+        response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+
+        print(response)
+
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/requirements.txt b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/requirements.txt
new file mode 100644
index 00000000000..733f9388bfc
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/requirements.txt
@@ -0,0 +1,18 @@
+transformers==4.47.0
+torch
+tiktoken
+transformers_stream_generator
+peft
+sentencepiece
+einops
+accelerate
+datasets
+protobuf
+auto-gptq
+openpyxl
+wandb
+py-cpuinfo
+Pillow
+torchvision
+setuptools
+auto-round
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/run_benchmark.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/run_benchmark.sh
new file mode 100644
index 00000000000..24e52d179ca
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/run_benchmark.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+set -x
+
+function main {
+
+  init_params "$@"
+  run_benchmark
+
+}
+
+# init params
+function init_params {
+  iters=50
+  batch_size=8
+  tuned_checkpoint=saved_results
+  echo ${max_eval_samples}
+  for var in "$@"
+  do
+    case $var in
+      --topology=*)
+          topology=$(echo $var |cut -f2 -d=)
+      ;;
+      --dataset_location=*)
+          dataset_location=$(echo $var |cut -f2 -d=)
+      ;;
+      --input_model=*)
+          input_model=$(echo $var |cut -f2 -d=)
+      ;;
+      --mode=*)
+          mode=$(echo $var |cut -f2 -d=)
+      ;;
+      --batch_size=*)
+          batch_size=$(echo $var |cut -f2 -d=)
+      ;;
+      --iters=*)
+          iters=$(echo ${var} |cut -f2 -d=)
+      ;;
+      --int8=*)
+          int8=$(echo ${var} |cut -f2 -d=)
+      ;;
+      --config=*)
+          tuned_checkpoint=$(echo $var |cut -f2 -d=)
+      ;;
+      *)
+          echo "Error: No such parameter: ${var}"
+          exit 1
+      ;;
+    esac
+  done
+
+}
+
+
+# run_benchmark
+function run_benchmark {
+    extra_cmd=''
+
+    if [[ ${mode} == "accuracy" ]]; then
+        mode_cmd=" --accuracy "
+    elif [[ ${mode} == "performance" ]]; then
+        mode_cmd=" --inference --iters "${iters}
+    else
+        echo "Error: No such mode: ${mode}"
+        exit 1
+    fi
+
+    if [[ ${int8} == "true" ]]; then
+        extra_cmd=$extra_cmd" --load"
+    fi
+    echo $extra_cmd
+
+    if [ "${topology}" = "phi3_vlm_128k_autoround_int4" ]; then
+        model_name_or_path="microsoft/Phi-3-vision-128k-instruct"
+    fi
+
+    if [[ ${mode} == "performance" ]]; then
+        python -u mllm.py \
+            --model ${model_name_or_path} \
+            --output_dir ${tuned_checkpoint} \
+            --batch_size ${batch_size} \
+            ${extra_cmd} ${mode_cmd}
+    fi
+
+}
+
+main "$@"
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/run_quant.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/run_quant.sh
new file mode 100644
index 00000000000..e24e544ed75
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/run_quant.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+set -x
+
+function main {
+
+  init_params "$@"
+  run_tuning
+
+}
+
+# init params
+function init_params {
+  for var in "$@"
+  do
+    case $var in
+      --topology=*)
+          topology=$(echo $var |cut -f2 -d=)
+      ;;
+      --iters=*)
+          iters=$(echo $var |cut -f2 -d=)
+      ;;
+      --dataset_location=*)
+          dataset_location=$(echo $var |cut -f2 -d=)
+      ;;
+      --input_model=*)
+          input_model=$(echo $var |cut -f2 -d=)
+      ;;
+      --output_model=*)
+          tuned_checkpoint=$(echo $var |cut -f2 -d=)
+      ;;
+      *)
+          echo "Error: No such parameter: ${var}"
+          exit 1
+      ;;
+    esac
+  done
+
+}
+
+# run_tuning
+function run_tuning {
+    extra_cmd=''
+    batch_size=8
+    DATASET_NAME="NeelNanda/pile-10k"
+    tuned_checkpoint=${tuned_checkpoint:-"saved_results"}  # keep --output_model if it was provided
+
+    if [ "${topology}" = "phi3_vlm_128k_autoround_int4" ]; then
+        model_name_or_path="microsoft/Phi-3-vision-128k-instruct"
+    fi
+
+    python -u mllm.py \
+        --model ${model_name_or_path} \
+        --dataset ${DATASET_NAME} \
+        --quantize \
+        --iters ${iters} \
+        --output_dir ${tuned_checkpoint} \
+        --batch_size ${batch_size} \
+        ${extra_cmd}
+}
+
+main "$@"
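
For reference, an illustrative invocation of the two new scripts, assuming they are run from examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round; the topology name, iteration count, and batch size mirror the config entry added above, and saved_results is the scripts' default output directory:

    # quantize Phi-3-vision with AutoRound INT4 (mllm.py --quantize), then run the generation check (mllm.py --inference)
    bash run_quant.sh --topology=phi3_vlm_128k_autoround_int4 --iters=50
    bash run_benchmark.sh --topology=phi3_vlm_128k_autoround_int4 --mode=performance --batch_size=8 --config=saved_results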