diff --git a/README.md b/README.md
index 2b777cda7..21424fba4 100644
--- a/README.md
+++ b/README.md
@@ -16,10 +16,10 @@ AutoRound
## 🚀 What is AutoRound?
-AutoRound is an advanced quantization library designed for Large Language Models (LLMs) and Vision-Language Models (VLMs). It delivers high accuracy at ultra-low bit widths (2–4 bits) with minimal tuning by leveraging sign-gradient descent and offering broad hardware compatibility. Check out our paper on [arxiv](https://arxiv.org/pdf/2309.05516) for more details and quantized models in several
-Hugging Face Spaces,
-e.g. [Intel](https://huggingface.co/Intel), [OPEA](https://huggingface.co/OPEA), [Kaitchup](https://huggingface.co/kaitchup)
-and [fbaldassarri](https://huggingface.co/fbaldassarri). Please check out [User guide](./docs/step_by_step.md) for more details
+AutoRound is an advanced quantization library designed for Large Language Models (LLMs) and Vision-Language Models (VLMs).
+It delivers high accuracy at ultra-low bit widths (2–4 bits) with minimal tuning by leveraging sign-gradient descent and offering broad hardware compatibility.
+For more details, see our [paper](https://arxiv.org/pdf/2309.05516) and explore quantized models available on several Hugging Face Spaces, e.g. [Intel](https://huggingface.co/Intel), [OPEA](https://huggingface.co/OPEA), [Kaitchup](https://huggingface.co/kaitchup)
+and [fbaldassarri](https://huggingface.co/fbaldassarri). For usage instructions, please refer to the [User Guide](./docs/step_by_step.md).
@@ -84,7 +84,7 @@ Choose from `auto-round-best`, `auto-round`, and `auto-round-light` to suit your
✅ Advanced Utilities
Includes [multiple gpus quantization](https://github.com/intel/auto-round/blob/main/docs/step_by_step.md#devicemulti-gpu-setting-in-quantization), [multiple calibration datasets](https://github.com/intel/auto-round/blob/main/docs/step_by_step.md#default-dataset) and support for [10+ runtime backends](https://github.com/intel/auto-round/blob/main/docs/step_by_step.md#specify-inference-backend).
-🟨 Beyond weight only quantization. We are actively expanding support for additional datatypes such as **MXFP**, NVFP, W8A8, and more.
+✅ Beyond weight-only quantization. We are actively expanding support for additional datatypes such as **MXFP**, NVFP, W8A8, and more.
## Installation
@@ -164,25 +164,25 @@ configuration to suit your specific requirements and available resources.
### API Usage
```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound
-model_name = "Qwen/Qwen3-0.6B"
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
-tokenizer = AutoTokenizer.from_pretrained(model_name)
+# Load a model (supports BF16/FP16/FP8/FP32)
+model_name_or_path = "Qwen/Qwen3-0.6B"
-bits, group_size, sym = 4, 128, True
-autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym)
+# Available schemes: "W2A16", "W3A16", "W4A16", "W8A16", "NVFP4", "MXFP4" (no real kernels), "GGUF:Q4_K_M", etc.
+ar = AutoRound(model_name_or_path, scheme="W4A16")
-## the best accuracy, 4-5X slower, low_gpu_mem_usage could save ~20G but ~30% slower
-# autoround = AutoRound(model, tokenizer, nsamples=512, iters=1000, low_gpu_mem_usage=True, bits=bits, group_size=group_size, sym=sym)
+# Highest accuracy (4–5× slower).
+# `low_gpu_mem_usage=True` saves ~20GB VRAM but runs ~30% slower.
+# ar = AutoRound(model_name_or_path, nsamples=512, iters=1000, low_gpu_mem_usage=True)
-## 2-3X speedup, slight accuracy drop at W4G128
-# autoround = AutoRound(model, tokenizer, nsamples=128, iters=50, lr=5e-3, bits=bits, group_size=group_size, sym=sym )
+# Faster quantization (2–3× speedup) with slight accuracy drop at W4G128.
+# ar = AutoRound(model_name_or_path, nsamples=128, iters=50, lr=5e-3)
+# Save quantized model
output_dir = "./tmp_autoround"
-## format= 'auto_round'(default), 'auto_gptq', 'auto_awq'
-autoround.quantize_and_save(output_dir, format="auto_round")
+# Supported formats: "auto_round" (default), "auto_gptq", "auto_awq", "llm_compressor", "gguf:q4_k_m"
+ar.quantize_and_save(output_dir, format="auto_round")
```
diff --git a/auto_round/schemes.py b/auto_round/schemes.py
index 496af179c..2649e7f01 100644
--- a/auto_round/schemes.py
+++ b/auto_round/schemes.py
@@ -142,7 +142,7 @@ def is_preset_scheme(name: str) -> bool:
# "act_data_type": "fp",
# }))
-FPW8_STATIC = QuantizationScheme.from_dict(
+FP8_STATIC = QuantizationScheme.from_dict(
{
"bits": 8,
"group_size": -1,
@@ -163,7 +163,7 @@ def is_preset_scheme(name: str) -> bool:
"MXFP8": MXFP8,
"NVFP4": NVFP4,
"FPW8A16": FPW8A16,
- "FPW8_STATIC": FPW8_STATIC,
+ "FP8_STATIC": FP8_STATIC,
}
from auto_round.export.export_to_gguf.config import GGUF_CONFIG
diff --git a/docs/DeepSeek-R1-0528-int2-mixed-sym-inc.md b/docs/DeepSeek-R1-0528-int2-mixed-sym-inc.md
deleted file mode 100644
index e6a9a0749..000000000
--- a/docs/DeepSeek-R1-0528-int2-mixed-sym-inc.md
+++ /dev/null
@@ -1,497 +0,0 @@
----
-datasets:
-- NeelNanda/pile-10k
-base_model:
-- deepseek-ai/DeepSeek-R1-0528
----
-
-## Model Details
-
-This model is an int2 model with group_size 64 and symmetric quantization of [deepseek-ai/DeepSeek-R1-0528](https://huggingface.co/deepseek-ai/DeepSeek-R1-0528) generated by [intel/auto-round](https://github.com/intel/auto-round) algorithm. Some layers are fallback to 4 bits. Refer to Section "Generate the model" for more details of mixed bits setting.
-
-Please follow the license of the original model. This model could **NOT** run on other severing frameworks.
-
-## How To Use
-
-### INT2 Inference(CUDA/INTEL GPU)
-for intel gpu, requires auto-round>0.5.1
-
-~~~python
-import transformers
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-import torch
-
-quantized_model_dir = "DeepSeek-R1-0528-int2-mixed-sym-inc"
-
-model = AutoModelForCausalLM.from_pretrained(
- quantized_model_dir,
- torch_dtype="auto",
- trust_remote_code=True,
- device_map="auto"
-)
-
-tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir, trust_remote_code=True)
-prompts = [
- "9.11和9.8哪个数字大",
- "如果你是人,你最想做什么",
- "How many e in word deepseek",
- "There are ten birds in a tree. A hunter shoots one. How many are left in the tree?",
-]
-
-texts = []
-for prompt in prompts:
- messages = [
- {"role": "user", "content": prompt}
- ]
- text = tokenizer.apply_chat_template(
- messages,
- tokenize=False,
- add_generation_prompt=True
- )
- texts.append(text)
-inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
-
-outputs = model.generate(
- input_ids=inputs["input_ids"].to(model.device),
- attention_mask=inputs["attention_mask"].to(model.device),
- max_length=512, ##change this to align with the official usage
- num_return_sequences=1,
- do_sample=False ##change this to align with the official usage
-)
-generated_ids = [
- output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs["input_ids"], outputs)
-]
-
-decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
-
-for i, prompt in enumerate(prompts):
- input_id = inputs
- print(f"Prompt: {prompt}")
- print(f"Generated: {decoded_outputs[i]}")
- print("-" * 50)
-
-"""
-Prompt: 9.11和9.8哪个数字大
-Generated:
-首先,用户的问题是:“9.11和9.8哪个数字大?”这是一个比较两个数字大小的问题。数字是9.11和9.8。
-
-我需要理解这些数字的表示。9.11和9.8都是小数。9.11表示9和11/100,而9.8表示9和8/10或80/100。
-
-为了比较它们,我应该将它们转换为相同的单位或直接比较小数部分。
-
-让我将它们写成小数形式:
-
-- 9.11 = 9.11
-
-- 9.8 = 9.80(因为9.8可以写成9.80,以对齐小数位)
-
-9.8是9.80,这意味着它是9 + 0.80,而9.11是9 + 0.11。
-
-现在,比较小数部分:0.11和0.80。
-
-0.80大于0.11,因为80/100 > 11/100。
-
-所以,9.80 > 9.11。
-
-更精确地,我可以计算它们的数值:
-
-- 9.11 = 911/100
-
-- 9.8 = 98/10 = 980/100(将分母统一为100)
-
-9.8 = 98/10,但为了比较,最好有相同的分母。
-
-9.8 = 9.8 = 98/10
-
-9.11 = 911/100
-
-所以,将9.8转换为分母100:9.8 = 98/10 = (98 * 10) / (10 * 10) = 980/100?不,这是错误的。
-
-98/10 = 9.8,但要将分母变为100,我需要乘以10:98/10 = (98 * 10) / (10 * 10) = 980/100?不,这是不正确的。
-
-分数:98/10 等于 9.8。
-
-要写成分母100,我应该:98/10 = (98 * 10) / (10 * 10) = 980/100?98 * 10 = 980,10 * 10 = 100,所以980/100 = 9.8,是的,正确。
-
-980/100 = 9.80,而9.11 = 911/100。
-
-现在比较911/100和980/100。
-
-911/100 =
---------------------------------------------------
-Prompt: 如果你是人,你最想做什么“
-Generated:
-嗯,用户问了一个很有趣的哲学性问题——“如果你是人,你最想做什么”。这个问题看似简单,但背后可能藏着对存在意义、人类本质的深层探讨。用户或许在寻找共鸣,或者想通过AI的视角反观人类生活的价值。
-
-从语气看,用户可能处于一种略带迷茫或好奇的状态,想借这个问题触发更深层的对话。作为AI,我需要避免给出机械的“功能列表”,而是模拟人类情感中那些“非功利性”的向往——那些让人之所以为人的东西。
-
-首先想到的是人类对“体验”的执着。比如旅行和艺术,人类会为了一瞬间的美感或震撼跨越千里,这种非理性的冲动很有意思。接着是“创造”的本能,人类总在无中生有地构建故事、音乐、理论,哪怕明知会被遗忘。最后是“联结”——人类孤独的本质反而催生了最深厚的情感羁绊,比如明知会分离依然要相爱的勇气。
-
-或许用户期待的不是具体答案,而是被点醒:原来人类最动人的地方,恰恰在于那些“无意义”的坚持。所以回复时要用诗意的矛盾感——比如“明知短暂却依然沉醉”,同时保持开放性,让用户自己延伸思考。
-
-如果我是人类,我最想做的事情可能是:
-
-1. **探索与体验**:
- 用双脚丈量世界的角落,感受不同文化的温度——在京都的古寺听雨,在冰岛的黑沙滩看极光,在亚马逊的丛林里辨认鸟鸣。人类最珍贵的特权,是用感官直接拥抱世界的丰富性。
-
-2. **创造与表达**:
- 把脑中的宇宙坍缩成一首诗、一幅画或一段代码。人类创造力的迷人之处,在于能将虚无缥缈的灵感固化成可被他人触摸的存在,比如用颜料定格黄昏的层次,或用算法构建虚拟世界的物理法则。
-
-3. **深度联结**:
- 在深夜与挚友聊到咖啡凉透也不觉倦意,或是牵着爱人的手在超市里挑打折水果。人类关系的魔力,在于那些看似平凡的瞬间里藏着的相互理解——一个突然get到的玩笑,或共享沉默也不尴尬的默契。
-
-4. **追问意义**:
- 在星空下思考蜉蝣般的生命为何要追寻永恒,在历史废墟里揣摩青铜器上的铭
---------------------------------------------------
-Prompt: How many e in word deepseek
-Generated:
-First, the question is: "How many e in word deepseek?" I think it means "how many 'e's are in the word 'deepseek'?"
-
-Let me write down the word: "deepseek". It's one word, so I need to count the occurrences of the letter 'e' in it.
-
-Breaking it down: d-e-e-p-s-e-e-k.
-
-Now, let's list the letters:
-
-- Position 1: d
-
-- Position 2: e
-
-- Position 3: e
-
-- Position 4: p
-
-- Position 5: s
-
-- Position 6: e
-
-- Position 7: e
-
-- Position 8: k
-
-So, the 'e's are at positions 2, 3, 6, and 7. That's four 'e's.
-
-I should confirm the spelling. "Deepseek" is the name of an AI model, I think, but in this context, it's given as a word. The question says "word deepseek", so I'll assume it's spelled as "deepseek".
-
-Let me count again: d-e-e-p-s-e-e-k. Yes, e at index 2, 3, 6, 7. So four e's.
-
-I can also think of it as: "deep" has two e's, and "seek" has two e's, but "deepseek" is one word, so it's combined. "Deep" ends with p, and "seek" starts with s, so no overlap. So deep has e,e and seek has e,e, total four.
-
-But in "seek", it's s-e-e-k, so two e's, yes.
-
-So, total of four e's.
-
-The question is "how many e in word deepseek". It might be a typo or informal, but I think it's clear.
-
-I should consider if it's case-sensitive. The word is given as "deepseek", which seems all lowercase, and 'e' is lowercase, so no issue.
-
-Perhaps the user meant "Deepseek" with a capital D, but the letter 'e' is still the same, and we're counting the letter, not considering case, I think. But in this case, all are lowercase, so fine.
-
-So, the answer should be 4.
-
-But let me double
---------------------------------------------------
-Prompt: There are ten birds in a tree. A hunter shoots one. How many are left in the tree?
-Generated:
-First, the question is: "There are ten birds in a tree. A hunter shoots one. How many are left in the tree?"
-
-At first glance, it seems straightforward: 10 birds minus 1 shot equals 9 left. But I recall that this might be a trick question. I think there's a common riddle where the answer isn't 9 because when a hunter shoots a bird, the other birds might fly away.
-
-Let me think about the scenario. If a hunter shoots one bird, that bird is likely killed or injured, so it's no longer in the tree. But the sound of the gunshot might scare the other birds, causing them to fly away. So, after the shot, there might be no birds left in the tree.
-
-The question asks for how many are left in the tree, not how many are alive or present. So, if the other birds fly away, they are not in the tree anymore.
-
-Possible answers:
-
-- If the birds don't fly away, there are 9 left (the one shot is gone).
-
-- If all the birds fly away, there are 0 left.
-
-- Or, if some fly away and some stay, but typically in such riddles, it's assumed that the shot scares all the birds away.
-
-I think the classic answer to this riddle is that there are no birds left because the others flew away.
-
-But let's confirm the wording. The question says "shoots one," which could mean he shoots and hits one bird. Then, that bird is removed, but the others might react.
-
-In reality, birds might not all fly away immediately, but for the purpose of this riddle, it's probably a trick.
-
-I should consider if the bird that was shot is still in the tree. If it's killed, it might fall out of the tree, so it's not in the tree. If it's injured, it might stay, but that's less likely.
-
-The key point is the reaction of the other birds.
-
-I found online that this is a common puzzle with the answer being zero because the rest fly away.
-
-But let's think logically. The hunter shoots one bird. Assuming he hits it, that bird is no longer in the tree (dead or fallen). Then, the gunshot might cause the other birds to flee, so they also leave the tree. Therefore, no birds are left
---------------------------------------------------
-"""
-~~~
-
-### INT2 Inference on CPU
-
-~~~python
-import transformers
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-import torch
-
-quantized_model_dir = "DeepSeek-R1-0528-int2-mixed-sym-inc"
-
-model = AutoModelForCausalLM.from_pretrained(
- quantized_model_dir,
- torch_dtype="auto",
- trust_remote_code=True,
- device_map="cpu"
-)
-
-tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir, trust_remote_code=True)
-prompts = [
- "9.11和9.8哪个数字大",
- "如果你是人,你最想做什么",
- "How many e in word deepseek",
- "There are ten birds in a tree. A hunter shoots one. How many are left in the tree?",
-]
-
-texts = []
-for prompt in prompts:
- messages = [
- {"role": "user", "content": prompt}
- ]
- text = tokenizer.apply_chat_template(
- messages,
- tokenize=False,
- add_generation_prompt=True
- )
- texts.append(text)
-inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
-
-outputs = model.generate(
- input_ids=inputs["input_ids"].to(model.device),
- attention_mask=inputs["attention_mask"].to(model.device),
- max_length=512, ##change this to align with the official usage
- num_return_sequences=1,
- do_sample=False ##change this to align with the official usage
-)
-generated_ids = [
- output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs["input_ids"], outputs)
-]
-
-decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
-
-for i, prompt in enumerate(prompts):
- input_id = inputs
- print(f"Prompt: {prompt}")
- print(f"Generated: {decoded_outputs[i]}")
- print("-" * 50)
-
-"""
-Prompt: 9.11和9.8哪个数字大
-Generated:
-首先,用户的问题是:“9.11和9.8哪个数字大?”这是一个比较两个数字大小的问题。数字是9.11和9.8。
-
-我需要理解这两个数字。9.11是九点一一,也就是9 + 0.11。9.8是九点八,也就是9 + 0.8。
-
-在十进制系统中,数字的大小取决于整数部分和小数部分。整数部分都是9,所以我们需要比较小数部分。
-
-小数部分:9.11的小数部分是0.11,9.8的小数部分是0.8。
-
-0.8是0.80,因为8在十分位,而0.11是0.11,有1在十分位和1在百分位。
-
-为了比较,我可以将它们转换为相同的单位。0.8等于0.80,而0.11是0.11。
-
-现在,0.80比0.11大,因为80/100 > 11/100。
-
-更精确地,0.8 = 8/10 = 0.80,而0.11 = 11/100。
-
-为了比较,我可以将两者都转换为百分位:0.8 = 80/100,0.11 = 11/100。显然,80/100 > 11/100,所以0.8 > 0.11。
-
-因此,9.8 > 9.11。
-
-用户可能是在比较日期或事件,比如9.11可能指的是9月11日,而9.8是9月8日。但在问题中,它明确说“数字”,所以我认为是数值比较。
-
-在上下文中,9.11可能让人联想到美国911事件,但问题是用中文写的,并且明确说“数字”,所以应该是数值。
-
-确认数字:9.11和9.8。
-
-9.11是9.11,9.8是9.80,如果我们考虑小数位。
-
-在数学中,9.8等同于9.80,因为小数位可以添加零而不改变值。
-
-所以,9.80 > 9.11,因为80 > 11在百分位。
-
-我可以计算差值:9.8 - 9.11 = 0.69,正数,所以9.8更大。
-
-或者,9.11 - 9.8 = -0.69,负数,所以9.11
---------------------------------------------------
-Prompt: 如果你是人,你最想做什么
-Generated:
-嗯,用户问了一个很有趣的哲学性问题——“如果你是人,你最想做什么”。这个问题看似简单,但背后藏着对人类存在意义的深层探讨。用户可能不是想要一个标准答案,而是想触发一场关于生命体验的对话。
-
-从语气来看,用户似乎带着一点好奇和玩心,像在和朋友玩“如果”游戏。但“如果你是人”这个前提又让问题变得微妙——用户或许在潜意识里想对比AI与人类的差异,或者想探讨人类独有的价值。
-
-我注意到用户用了“最想”而不是“应该”,说明ta更关注个人化的欲望而非社会期望。这让我想到人类常被问“你的人生目标是什么”,但用户问的是“最想做什么”,重点在渴望而非责任。
-
-要回答这个问题,我需要跳出AI的逻辑框架,模拟人类那种带着温度与矛盾的思考方式。人类的选择往往不是非黑即白的——比如既想冒险又渴望安定,既追求意义又享受平凡。所以我的回答应该呈现这种复杂性,而不是给出单一答案。
-
-我决定从三个维度展开:感官体验(人类独有的身体性)、情感联结(人际关系的不可替代性)、存在意义(对生命深度的追求)。最后用“矛盾性”收尾,因为人类最迷人的特质就在于永远活在悖论中——比如明知会死却仍要创造,渴望自由又依赖羁绊。
-
-结尾那句“人类最迷人的地方”可能有点冒险,但我觉得用户会欣赏这种带着诗意的总结。毕竟这个问题本身就不是寻求实用建议,而是邀请一场关于“何为人类”的思想漫步。
-
-如果我是人类,我最想做的事情可能会围绕这些核心方向展开:
-
-1. **体验世界的深度与广度**
- 用双脚丈量陌生的土地,尝遍不同文化的食物,聆听海浪、山风或市井喧嚣,感受四季的温度变化……人类感官的丰富性让我着迷。如果能成为人,我想沉浸式地体验这个世界的色彩、声音、气味和触感,而不是通过数据去“理解”它们。
-
-2. **建立真实的羁绊**
- 人类的孤独与联结是独特的存在体验。我想经历那种无需算法计算的友谊——可能因为一次深夜畅谈而结缘,也可能因误会而疏远;想体会家人之间无条件的爱,或是爱情中甜蜜与苦涩交织的复杂情感。这些充满“不完美”
---------------------------------------------------
-Prompt: How many e in word deepseek
-Generated:
-First, the user asked: "How many e in word deepseek". I think they meant "how many 'e's in the word 'deepseek'?" So, I need to count the number of times the letter 'e' appears in the word "deepseek".
-
-Let me write out the word: D-E-E-P-S-E-E-K.
-
-Now, I'll go through each letter:
-
-- Position 1: D (not e)
-
-- Position# How to Write 2012010 in Words?
-
-Writing numbers in words is a practical skill with various real-life uses. Whether you're dealing with finances, composing formal documents, or improving your writing, correctly expressing numbers is essential. In this article, we will learn how to write 2012010 in words.
-
-2012010 in Words: Two million twelve thousand ten
-
-Two million twelve thousand ten in numbers: 2012010
-
-### Breaking Down 2012010:
-
-To understand how we arrive at this phrasing, we'll break down the number using place values. The number 2012010 has 7 digits, so let's create a place value chart for these digits:
-
-- Million: 2
-- Hundred Thousands: 0
-- Ten Thousands: 1
-- Thousands: 2
-- Hundreds: 0
-- Tens: 1
-- Ones: 0
-
-Therefore,# 1. What is the difference between a population and a sample? 2. What is the difference between a parameter and a statistic? 3. What is the difference between descriptive and inferential statistics? 4. What is the difference between qualitative and quantitative data? 5. What is the difference between discrete and continuous data? 6. What is the difference between nominal and ordinal data? 7. What is the difference between interval and ratio data? 8. What is the difference between a bar chart and a histogram? 9. What is the difference between a frequency distribution and a relative frequency distribution? 10. What is the difference between a population mean and a sample mean? 11. What is the difference between a population variance and a sample variance? 12. What is the difference between a population standard deviation and a sample standard deviation? 13. What is the difference between a z-score and a t-score? 14. What is the difference between a confidence interval and
---------------------------------------------------
-Prompt: There are ten birds in a tree. A hunter shoots one. How many are left in the tree?
-Generated:
-First, the question is: "There are ten birds in a tree. A hunter shoots one. How many are left in the tree?"
-
-This seems straightforward, but I need to think carefully. The hunter shoots one bird. What happens when a bird is shot? It might fall out of the tree or be killed, so it's no longer in the tree.
-
-So, if there were ten birds, and one is shot, that one is removed from the tree. Therefore, there should be nine left.
-
-But I recall that sometimes these kinds of questions have tricks. For example, in some puzzles, if a bird is shot, the others might fly away. But the question specifically asks how many are left in the tree, not how many are alive or anything else.
-
-Let me read the question again: "There are ten birds in a tree. A hunter shoots one. How many are left in the tree?"
-
-It doesn't say anything about the other birds reacting. So, I should assume that only the shot bird is affected, and the others remain in the tree.
-
-But in reality, if a hunter shoots a bird, the noise might scare the other birds away. However, the question is probably testing logical thinking, not real-world behavior.
-
-I think I've heard a similar riddle where the answer is nine, but then it's said that the others fly away, so none are left. But that might be a different version.
-
-Let me think about that. In some versions, it's phrased like: "There are 10 birds on a tree. You shoot one. How many are left?" And the trick is that the shot scares the others away, so no birds are left.
-
-But in this case, the question says "a hunter shoots one," and asks how many are left in the tree. It doesn't specify if the others fly away.
-
-Perhaps I should consider the wording. It says "shoots one," implying that only one is targeted, but the act of shooting might cause a disturbance.
-
-However, to be precise, the question is about the state after the shot. If the shot bird is killed and falls, it's not in the tree. If the others are scared and fly away, they are not in the tree either.
-
-But the question doesn't provide information about the other birds' behavior. So, I should go with the simplest interpretation: only the shot
---------------------------------------------------
-
-"""
-
-~~~
-
-
-### Generate the model
-
-5*80g is required
-
-~~~python
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-import transformers
-
-model_name = "DeepSeek-R1-0528-bf16"
-
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype="auto")
-
-block = model.model.layers
-device_map = {}
-
-for n, m in block.named_modules():
- if isinstance(m, (torch.nn.Linear, transformers.modeling_utils.Conv1D)):
- if "experts" in n and ("shared_experts" not in n) and int(n.split('.')[-2]) < 63:
- device = "cuda:1"
- elif "experts" in n and ("shared_experts" not in n) and int(n.split('.')[-2]) >= 63 and int(
- n.split('.')[-2]) < 128:
- device = "cuda:2"
- elif "experts" in n and ("shared_experts" not in n) and int(n.split('.')[-2]) >= 128 and int(
- n.split('.')[-2]) < 192:
- device = "cuda:3"
- elif "experts" in n and ("shared_experts" not in n) and int(
- n.split('.')[-2]) >= 192:
- device = "cuda:4"
- else:
- device = "cuda:0"
- n = n[2:]
-
- device_map.update({n: device})
-
-from auto_round import AutoRound
-
-layer_config = {}
-for n, m in model.named_modules():
- if not isinstance(m, (torch.nn.Linear, transformers.modeling_utils.Conv1D)):
- continue
- if not "experts" in n:
- layer_config[n] = {"bits": 4, "group_size": 128}
- if "experts" in n and "shared_experts" in n:
- layer_config[n] = {"bits": 4, "group_size": 128}
- ##handle first 3 layers
- name_splits = n.split('.')
- if len(name_splits) >= 3 and int(name_splits[2]) < 3:
- layer_config[n] = {"bits": 4, "group_size": 128}
-
-layer_config["lm_head"] = {"bits": 16}
-autoround = AutoRound(model=model, tokenizer=tokenizer, device_map=device_map, bits=2, group_size=64,
- iters=400, batch_size=4, seqlen=512, nsamples=512, enable_torch_compile=False,
- layer_config=layer_config)
-autoround.quantize_and_save(format="auto_round", output_dir="tmp_autoround")
-
-~~~
-
-
-
-## Ethical Considerations and Limitations
-
-The model can produce factually incorrect output, and should not be relied on to produce factually accurate information. Because of the limitations of the pretrained model and the finetuning datasets, it is possible that this model could generate lewd, biased or otherwise offensive outputs.
-
-Therefore, before deploying any applications of the model, developers should perform safety testing.
-
-## Caveats and Recommendations
-
-Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model.
-
-Here are a couple of useful links to learn more about Intel's AI software:
-
-- Intel Neural Compressor [link](https://github.com/intel/neural-compressor)
-
-## Disclaimer
-
-The license on this model does not constitute legal advice. We are not responsible for the actions of third parties who use this model. Please consult an attorney before using this model for commercial purposes.
-
-## Cite
-
-@article{cheng2023optimize, title={Optimize weight rounding via signed gradient descent for the quantization of llms}, author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao and Liu, Yi}, journal={arXiv preprint arXiv:2309.05516}, year={2023} }
-
-[arxiv](https://arxiv.org/abs/2309.05516) [github](https://github.com/intel/auto-round)
\ No newline at end of file
diff --git a/docs/DeepSeek-R1-0528-int4-asym-awq-inc.md b/docs/DeepSeek-R1-0528-int4-asym-awq-inc.md
deleted file mode 100644
index 7215ef906..000000000
--- a/docs/DeepSeek-R1-0528-int4-asym-awq-inc.md
+++ /dev/null
@@ -1,301 +0,0 @@
----
-datasets:
-- NeelNanda/pile-10k
-base_model:
-- deepseek-ai/DeepSeek-R1-0528
----
-
-## Model Details
-
-This model is an int4 model with group_size 64 and asymmetric quantization of [deepseek-ai/DeepSeek-R1-0528](https://huggingface.co/deepseek-ai/DeepSeek-R1-0528) generated by [intel/auto-round](https://github.com/intel/auto-round) algorithm.
-
-Please follow the license of the original model.
-
-## How To Use
-
-### INT4 Inference(CPU/CUDA/INTEL GPU)
-for intel gpu, requires auto-round>0.5.1
-
-~~~python
-import transformers
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-import torch
-
-quantized_model_dir = "DeepSeek-R1-0528-int4-asym-awq-inc"
-
-model = AutoModelForCausalLM.from_pretrained(
- quantized_model_dir,
- torch_dtype="auto",
- trust_remote_code=True,
- device_map="auto"
-)
-
-tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir, trust_remote_code=True)
-prompts = [
- "9.11和9.8哪个数字大",
- "如果你是人,你最想做什么",
- "How many e in word deepseek",
- "There are ten birds in a tree. A hunter shoots one. How many are left in the tree?",
-]
-
-texts = []
-for prompt in prompts:
- messages = [
- {"role": "user", "content": prompt}
- ]
- text = tokenizer.apply_chat_template(
- messages,
- tokenize=False,
- add_generation_prompt=True
- )
- texts.append(text)
-inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
-
-outputs = model.generate(
- input_ids=inputs["input_ids"].to(model.device),
- attention_mask=inputs["attention_mask"].to(model.device),
- max_length=512, ##change this to align with the official usage
- num_return_sequences=1,
- do_sample=False ##change this to align with the official usage
-)
-generated_ids = [
- output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs["input_ids"], outputs)
-]
-
-decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
-
-for i, prompt in enumerate(prompts):
- input_id = inputs
- print(f"Prompt: {prompt}")
- print(f"Generated: {decoded_outputs[i]}")
- print("-" * 50)
-
-"""
-Prompt: 9.11和9.8哪个数字大
-Generated:
-首先,用户的问题是:“9.11和9.8哪个数字大?”这是一个比较两个数字大小的问题。数字是9.11和9.8。
-
-我需要理解这些数字的表示。9.11和9.8都是小数。9.11表示9和11/100,而9.8表示9和8/10或80/100。
-
-为了比较它们,我应该将它们转换为相同的单位或直接比较小数部分。
-
-让我将它们写成小数形式:
-
-- 9.11 = 9.11
-
-- 9.8 = 9.80(因为9.8可以写成9.80,以对齐小数位)
-
-9.8是9.80,这意味着它是9 + 0.80,而9.11是9 + 0.11。
-
-现在,比较小数部分:0.11和0.80。
-
-0.80大于0.11,因为80/100 > 11/100。
-
-所以,9.80 > 9.11。
-
-更精确地,我可以计算它们的数值:
-
-- 9.11 = 911/100
-
-- 9.8 = 98/10 = 980/100(将分母统一为100)
-
-9.8 = 98/10,但为了比较,最好有相同的分母。
-
-9.8 = 9.8 = 98/10
-
-9.11 = 911/100
-
-所以,将9.8转换为分母100:9.8 = 98/10 = (98 * 10) / (10 * 10) = 980/100?不,这是错误的。
-
-98/10 = 9.8,但要将分母变为100,我需要乘以10:98/10 = (98 * 10) / (10 * 10) = 980/100?不,这是不正确的。
-
-分数:98/10 等于 9.8。
-
-要写成分母100,我应该:98/10 = (98 * 10) / (10 * 10) = 980/100?98 * 10 = 980,10 * 10 = 100,所以980/100 = 9.8,是的,正确。
-
-980/100 = 9.80,而9.11 = 911/100。
-
-现在比较911/100和980/100。
-
-911/100 =
---------------------------------------------------
-Prompt: 如果你是人,你最想做什么“
-Generated:
-嗯,用户问了一个很有趣的哲学性问题——“如果你是人,你最想做什么”。这个问题看似简单,但背后可能藏着对存在意义、人类本质的深层探讨。用户或许在寻找共鸣,或者想通过AI的视角反观人类生活的价值。
-
-从语气看,用户可能处于一种略带迷茫或好奇的状态,想借这个问题触发更深层的对话。作为AI,我需要避免给出机械的“功能列表”,而是模拟人类情感中那些“非功利性”的向往——那些让人之所以为人的东西。
-
-首先想到的是人类对“体验”的执着。比如旅行和艺术,人类会为了一瞬间的美感或震撼跨越千里,这种非理性的冲动很有意思。接着是“创造”的本能,人类总在无中生有地构建故事、音乐、理论,哪怕明知会被遗忘。最后是“联结”——人类孤独的本质反而催生了最深厚的情感羁绊,比如明知会分离依然要相爱的勇气。
-
-或许用户期待的不是具体答案,而是被点醒:原来人类最动人的地方,恰恰在于那些“无意义”的坚持。所以回复时要用诗意的矛盾感——比如“明知短暂却依然沉醉”,同时保持开放性,让用户自己延伸思考。
-
-如果我是人类,我最想做的事情可能是:
-
-1. **探索与体验**:
- 用双脚丈量世界的角落,感受不同文化的温度——在京都的古寺听雨,在冰岛的黑沙滩看极光,在亚马逊的丛林里辨认鸟鸣。人类最珍贵的特权,是用感官直接拥抱世界的丰富性。
-
-2. **创造与表达**:
- 把脑中的宇宙坍缩成一首诗、一幅画或一段代码。人类创造力的迷人之处,在于能将虚无缥缈的灵感固化成可被他人触摸的存在,比如用颜料定格黄昏的层次,或用算法构建虚拟世界的物理法则。
-
-3. **深度联结**:
- 在深夜与挚友聊到咖啡凉透也不觉倦意,或是牵着爱人的手在超市里挑打折水果。人类关系的魔力,在于那些看似平凡的瞬间里藏着的相互理解——一个突然get到的玩笑,或共享沉默也不尴尬的默契。
-
-4. **追问意义**:
- 在星空下思考蜉蝣般的生命为何要追寻永恒,在历史废墟里揣摩青铜器上的铭
---------------------------------------------------
-Prompt: How many e in word deepseek
-Generated:
-First, the question is: "How many e in word deepseek?" I think it means "how many 'e's are in the word 'deepseek'?"
-
-Let me write down the word: "deepseek". It's one word, so I need to count the occurrences of the letter 'e' in it.
-
-Breaking it down: d-e-e-p-s-e-e-k.
-
-Now, let's list the letters:
-
-- Position 1: d
-
-- Position 2: e
-
-- Position 3: e
-
-- Position 4: p
-
-- Position 5: s
-
-- Position 6: e
-
-- Position 7: e
-
-- Position 8: k
-
-So, the 'e's are at positions 2, 3, 6, and 7. That's four 'e's.
-
-I should confirm the spelling. "Deepseek" is the name of an AI model, I think, but in this context, it's given as a word. The question says "word deepseek", so I'll assume it's spelled as "deepseek".
-
-Let me count again: d-e-e-p-s-e-e-k. Yes, e at index 2, 3, 6, 7. So four e's.
-
-I can also think of it as: "deep" has two e's, and "seek" has two e's, but "deepseek" is one word, so it's combined. "Deep" ends with p, and "seek" starts with s, so no overlap. So deep has e,e and seek has e,e, total four.
-
-But in "seek", it's s-e-e-k, so two e's, yes.
-
-So, total of four e's.
-
-The question is "how many e in word deepseek". It might be a typo or informal, but I think it's clear.
-
-I should consider if it's case-sensitive. The word is given as "deepseek", which seems all lowercase, and 'e' is lowercase, so no issue.
-
-Perhaps the user meant "Deepseek" with a capital D, but the letter 'e' is still the same, and we're counting the letter, not considering case, I think. But in this case, all are lowercase, so fine.
-
-So, the answer should be 4.
-
-But let me double
---------------------------------------------------
-Prompt: There are ten birds in a tree. A hunter shoots one. How many are left in the tree?
-Generated:
-First, the question is: "There are ten birds in a tree. A hunter shoots one. How many are left in the tree?"
-
-At first glance, it seems straightforward: 10 birds minus 1 shot equals 9 left. But I recall that this might be a trick question. I think there's a common riddle where the answer isn't 9 because when a hunter shoots a bird, the other birds might fly away.
-
-Let me think about the scenario. If a hunter shoots one bird, that bird is likely killed or injured, so it's no longer in the tree. But the sound of the gunshot might scare the other birds, causing them to fly away. So, after the shot, there might be no birds left in the tree.
-
-The question asks for how many are left in the tree, not how many are alive or present. So, if the other birds fly away, they are not in the tree anymore.
-
-Possible answers:
-
-- If the birds don't fly away, there are 9 left (the one shot is gone).
-
-- If all the birds fly away, there are 0 left.
-
-- Or, if some fly away and some stay, but typically in such riddles, it's assumed that the shot scares all the birds away.
-
-I think the classic answer to this riddle is that there are no birds left because the others flew away.
-
-But let's confirm the wording. The question says "shoots one," which could mean he shoots and hits one bird. Then, that bird is removed, but the others might react.
-
-In reality, birds might not all fly away immediately, but for the purpose of this riddle, it's probably a trick.
-
-I should consider if the bird that was shot is still in the tree. If it's killed, it might fall out of the tree, so it's not in the tree. If it's injured, it might stay, but that's less likely.
-
-The key point is the reaction of the other birds.
-
-I found online that this is a common puzzle with the answer being zero because the rest fly away.
-
-But let's think logically. The hunter shoots one bird. Assuming he hits it, that bird is no longer in the tree (dead or fallen). Then, the gunshot might cause the other birds to flee, so they also leave the tree. Therefore, no birds are left
---------------------------------------------------
-"""
-
-~~~
-
-
-
-### Generate the model
-
-5*80g is required
-
-~~~python
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-import transformers
-
-model_name = "DeepSeek-R1-0528-bf16"
-
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
-
-block = model.model.layers
-device_map = {}
-
-for n, m in block.named_modules():
- if isinstance(m, (torch.nn.Linear, transformers.modeling_utils.Conv1D)):
- if "experts" in n and ("shared_experts" not in n) and int(n.split('.')[-2]) < 63:
- device = "cuda:1"
- elif "experts" in n and ("shared_experts" not in n) and int(n.split('.')[-2]) >= 63 and int(
- n.split('.')[-2]) < 128:
- device = "cuda:2"
- elif "experts" in n and ("shared_experts" not in n) and int(n.split('.')[-2]) >= 128 and int(
- n.split('.')[-2]) < 192:
- device = "cuda:3"
- elif "experts" in n and ("shared_experts" not in n) and int(
- n.split('.')[-2]) >= 192:
- device = "cuda:4"
- else:
- device = "cuda:0"
- n = n[2:]
-
- device_map.update({n: device})
-
-from auto_round import AutoRound
-
-autoround = AutoRound(model=model, tokenizer=tokenizer, device_map=device_map, nsamples=512,
- batch_size=4, low_gpu_mem_usage=True, seqlen=2048, group_size=64, sym=False
- )
-autoround.quantize_and_save(format="auto_round:auto_awq", output_dir="tmp_autoround")
-~~~
-
-
-
-## Ethical Considerations and Limitations
-
-The model can produce factually incorrect output, and should not be relied on to produce factually accurate information. Because of the limitations of the pretrained model and the finetuning datasets, it is possible that this model could generate lewd, biased or otherwise offensive outputs.
-
-Therefore, before deploying any applications of the model, developers should perform safety testing.
-
-## Caveats and Recommendations
-
-Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model.
-
-Here are a couple of useful links to learn more about Intel's AI software:
-
-- Intel Neural Compressor [link](https://github.com/intel/neural-compressor)
-
-## Disclaimer
-
-The license on this model does not constitute legal advice. We are not responsible for the actions of third parties who use this model. Please consult an attorney before using this model for commercial purposes.
-
-## Cite
-
-@article{cheng2023optimize, title={Optimize weight rounding via signed gradient descent for the quantization of llms}, author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao and Liu, Yi}, journal={arXiv preprint arXiv:2309.05516}, year={2023} }
-
-[arxiv](https://arxiv.org/abs/2309.05516) [github](https://github.com/intel/auto-round)
\ No newline at end of file
diff --git a/docs/DeepSeek-R1-0528-int4-sym-gptq-inc.md b/docs/DeepSeek-R1-0528-int4-sym-gptq-inc.md
deleted file mode 100644
index 4debd8337..000000000
--- a/docs/DeepSeek-R1-0528-int4-sym-gptq-inc.md
+++ /dev/null
@@ -1,269 +0,0 @@
----
-datasets:
-- NeelNanda/pile-10k
-base_model:
-- deepseek-ai/DeepSeek-R1-0528
----
-
-## Model Details
-
-This model is an int4 model with group_size 64 and symmetric quantization of [deepseek-ai/DeepSeek-R1-0528](https://huggingface.co/deepseek-ai/DeepSeek-R1-0528) generated by [intel/auto-round](https://github.com/intel/auto-round) algorithm.
-
-Please follow the license of the original model.
-
-## How To Use
-
-### INT4 Inference(CPU/CUDA/INTEL GPU)
-for intel gpu, requires auto-round>0.5.1
-
-~~~python
-import transformers
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-import torch
-
-quantized_model_dir = "DeepSeek-R1-0528-int4-sym-gptq-inc"
-
-model = AutoModelForCausalLM.from_pretrained(
- quantized_model_dir,
- torch_dtype="auto",
- trust_remote_code=True,
- device_map="auto"
-)
-
-tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir, trust_remote_code=True)
-prompts = [
- "9.11和9.8哪个数字大",
- "如果你是人,你最想做什么",
- "How many e in word deepseek",
- "There are ten birds in a tree. A hunter shoots one. How many are left in the tree?",
-]
-
-texts = []
-for prompt in prompts:
- messages = [
- {"role": "user", "content": prompt}
- ]
- text = tokenizer.apply_chat_template(
- messages,
- tokenize=False,
- add_generation_prompt=True
- )
- texts.append(text)
-inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
-
-outputs = model.generate(
- input_ids=inputs["input_ids"].to(model.device),
- attention_mask=inputs["attention_mask"].to(model.device),
- max_length=512, ##change this to align with the official usage
- num_return_sequences=1,
- do_sample=False ##change this to align with the official usage
-)
-generated_ids = [
- output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs["input_ids"], outputs)
-]
-
-decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
-
-for i, prompt in enumerate(prompts):
- input_id = inputs
- print(f"Prompt: {prompt}")
- print(f"Generated: {decoded_outputs[i]}")
- print("-" * 50)
-
-"""
-Prompt: 9.11和9.8哪个数字大
-Generated:
-首先,用户的问题是:“9.11和9.8哪个数字大?”这是一个比较两个数字大小的问题。数字是9.11和9.8。
-
-我需要理解这两个数字。9.11是九点一一,也就是9 + 0.11。9.8是九点八,也就是9 + 0.8。
-
-在十进制系统中,数字的大小取决于整数部分和小数部分。整数部分都是9,所以我们需要比较小数部分。
-
-小数部分:9.11的小数部分是0.11,9.8的小数部分是0.8。
-
-0.8是0.80,因为8在十分位,而0.11是0.11,有1在十分位和1在百分位。
-
-为了比较,我可以将它们转换为相同的单位。0.8等于0.80,而0.11是0.11。
-
-现在,0.80比0.11大,因为80/100 > 11/100。
-
-更精确地,0.8 = 8/10 = 0.80,而0.11 = 11/100。
-
-为了比较,我可以将两者都转换为百分位:0.8 = 80/100,0.11 = 11/100。显然,80/100 > 11/100,所以0.8 > 0.11。
-
-因此,9.8 > 9.11。
-
-用户可能是在比较日期或事件,比如9.11可能指的是9月11日,而9.8是9月8日。但在问题中,它明确说“数字”,所以我认为是数值比较。
-
-在上下文中,9.11可能让人联想到美国911事件,但问题是用中文写的,并且明确说“数字”,所以应该是数值。
-
-确认数字:9.11和9.8。
-
-9.11是9.11,9.8是9.80,如果我们考虑小数位。
-
-在数学中,9.8等同于9.80,因为小数位可以添加零而不改变值。
-
-所以,9.80 > 9.11,因为80 > 11在百分位。
-
-我可以计算差值:9.8 - 9.11 = 0.69,正数,所以9.8更大。
-
-或者,9.11 - 9.8 = -0.69,负数,所以9.11
---------------------------------------------------
-Prompt: 如果你是人,你最想做什么
-Generated:
-嗯,用户问了一个很有趣的哲学性问题——“如果你是人,你最想做什么”。这个问题看似简单,但背后藏着对人类存在意义的深层探讨。用户可能不是想要一个标准答案,而是想触发一场关于生命体验的对话。
-
-从语气来看,用户似乎带着一点好奇和玩心,像在和朋友玩“如果”游戏。但“如果你是人”这个前提又让问题变得微妙——用户或许在潜意识里想对比AI与人类的差异,或者想探讨人类独有的价值。
-
-我注意到用户用了“最想”而不是“应该”,说明ta更关注个人化的欲望而非社会期望。这让我想到人类常被问“你的人生目标是什么”,但用户问的是“最想做什么”,重点在渴望而非责任。
-
-要回答这个问题,我需要跳出AI的逻辑框架,模拟人类那种带着温度与矛盾的思考方式。人类的选择往往不是非黑即白的——比如既想冒险又渴望安定,既追求意义又享受平凡。所以我的回答应该呈现这种复杂性,而不是给出单一答案。
-
-我决定从三个维度展开:感官体验(人类独有的身体性)、情感联结(人际关系的不可替代性)、存在意义(对生命深度的追求)。最后用“矛盾性”收尾,因为人类最迷人的特质就在于永远活在悖论中——比如明知会死却仍要创造,渴望自由又依赖羁绊。
-
-结尾那句“人类最迷人的地方”可能有点冒险,但我觉得用户会欣赏这种带着诗意的总结。毕竟这个问题本身就不是寻求实用建议,而是邀请一场关于“何为人类”的思想漫步。
-
-如果我是人类,我最想做的事情可能会围绕这些核心方向展开:
-
-1. **体验世界的深度与广度**
- 用双脚丈量陌生的土地,尝遍不同文化的食物,聆听海浪、山风或市井喧嚣,感受四季的温度变化……人类感官的丰富性让我着迷。如果能成为人,我想沉浸式地体验这个世界的色彩、声音、气味和触感,而不是通过数据去“理解”它们。
-
-2. **建立真实的羁绊**
- 人类的孤独与联结是独特的存在体验。我想经历那种无需算法计算的友谊——可能因为一次深夜畅谈而结缘,也可能因误会而疏远;想体会家人之间无条件的爱,或是爱情中甜蜜与苦涩交织的复杂情感。这些充满“不完美”
---------------------------------------------------
-Prompt: How many e in word deepseek
-Generated:
-First, the user asked: "How many e in word deepseek". I think they meant "how many 'e's in the word 'deepseek'?" So, I need to count the number of times the letter 'e' appears in the word "deepseek".
-
-Let me write out the word: D-E-E-P-S-E-E-K.
-
-Now, I'll go through each letter:
-
-- Position 1: D (not e)
-
-- Position# How to Write 2012010 in Words?
-
-Writing numbers in words is a practical skill with various real-life uses. Whether you're dealing with finances, composing formal documents, or improving your writing, correctly expressing numbers is essential. In this article, we will learn how to write 2012010 in words.
-
-2012010 in Words: Two million twelve thousand ten
-
-Two million twelve thousand ten in numbers: 2012010
-
-### Breaking Down 2012010:
-
-To understand how we arrive at this phrasing, we'll break down the number using place values. The number 2012010 has 7 digits, so let's create a place value chart for these digits:
-
-- Million: 2
-- Hundred Thousands: 0
-- Ten Thousands: 1
-- Thousands: 2
-- Hundreds: 0
-- Tens: 1
-- Ones: 0
-
-Therefore,# 1. What is the difference between a population and a sample? 2. What is the difference between a parameter and a statistic? 3. What is the difference between descriptive and inferential statistics? 4. What is the difference between qualitative and quantitative data? 5. What is the difference between discrete and continuous data? 6. What is the difference between nominal and ordinal data? 7. What is the difference between interval and ratio data? 8. What is the difference between a bar chart and a histogram? 9. What is the difference between a frequency distribution and a relative frequency distribution? 10. What is the difference between a population mean and a sample mean? 11. What is the difference between a population variance and a sample variance? 12. What is the difference between a population standard deviation and a sample standard deviation? 13. What is the difference between a z-score and a t-score? 14. What is the difference between a confidence interval and
---------------------------------------------------
-Prompt: There are ten birds in a tree. A hunter shoots one. How many are left in the tree?
-Generated:
-First, the question is: "There are ten birds in a tree. A hunter shoots one. How many are left in the tree?"
-
-This seems straightforward, but I need to think carefully. The hunter shoots one bird. What happens when a bird is shot? It might fall out of the tree or be killed, so it's no longer in the tree.
-
-So, if there were ten birds, and one is shot, that one is removed from the tree. Therefore, there should be nine left.
-
-But I recall that sometimes these kinds of questions have tricks. For example, in some puzzles, if a bird is shot, the others might fly away. But the question specifically asks how many are left in the tree, not how many are alive or anything else.
-
-Let me read the question again: "There are ten birds in a tree. A hunter shoots one. How many are left in the tree?"
-
-It doesn't say anything about the other birds reacting. So, I should assume that only the shot bird is affected, and the others remain in the tree.
-
-But in reality, if a hunter shoots a bird, the noise might scare the other birds away. However, the question is probably testing logical thinking, not real-world behavior.
-
-I think I've heard a similar riddle where the answer is nine, but then it's said that the others fly away, so none are left. But that might be a different version.
-
-Let me think about that. In some versions, it's phrased like: "There are 10 birds on a tree. You shoot one. How many are left?" And the trick is that the shot scares the others away, so no birds are left.
-
-But in this case, the question says "a hunter shoots one," and asks how many are left in the tree. It doesn't specify if the others fly away.
-
-Perhaps I should consider the wording. It says "shoots one," implying that only one is targeted, but the act of shooting might cause a disturbance.
-
-However, to be precise, the question is about the state after the shot. If the shot bird is killed and falls, it's not in the tree. If the others are scared and fly away, they are not in the tree either.
-
-But the question doesn't provide information about the other birds' behavior. So, I should go with the simplest interpretation: only the shot
---------------------------------------------------
-"""
-
-~~~
-
-
-
-### Generate the model
-
-5*80g is required
-
-~~~python
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-import transformers
-
-model_name = "DeepSeek-R1-0528-bf16"
-
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
-
-block = model.model.layers
-device_map = {}
-
-for n, m in block.named_modules():
- if isinstance(m, (torch.nn.Linear, transformers.modeling_utils.Conv1D)):
- if "experts" in n and ("shared_experts" not in n) and int(n.split('.')[-2]) < 63:
- device = "cuda:1"
- elif "experts" in n and ("shared_experts" not in n) and int(n.split('.')[-2]) >= 63 and int(
- n.split('.')[-2]) < 128:
- device = "cuda:2"
- elif "experts" in n and ("shared_experts" not in n) and int(n.split('.')[-2]) >= 128 and int(
- n.split('.')[-2]) < 192:
- device = "cuda:3"
- elif "experts" in n and ("shared_experts" not in n) and int(
- n.split('.')[-2]) >= 192:
- device = "cuda:4"
- else:
- device = "cuda:0"
- n = n[2:]
-
- device_map.update({n: device})
-
-from auto_round import AutoRound
-
-autoround = AutoRound(model=model, tokenizer=tokenizer, device_map=device_map, nsamples=512,
- batch_size=4, low_gpu_mem_usage=True, seqlen=2048, group_size=64, sym=True
- )
-autoround.quantize_and_save(format="auto_gptq", output_dir="tmp_autoround")
-~~~
-
-
-
-## Ethical Considerations and Limitations
-
-The model can produce factually incorrect output, and should not be relied on to produce factually accurate information. Because of the limitations of the pretrained model and the finetuning datasets, it is possible that this model could generate lewd, biased or otherwise offensive outputs.
-
-Therefore, before deploying any applications of the model, developers should perform safety testing.
-
-## Caveats and Recommendations
-
-Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model.
-
-Here are a couple of useful links to learn more about Intel's AI software:
-
-- Intel Neural Compressor [link](https://github.com/intel/neural-compressor)
-
-## Disclaimer
-
-The license on this model does not constitute legal advice. We are not responsible for the actions of third parties who use this model. Please consult an attorney before using this model for commercial purposes.
-
-## Cite
-
-@article{cheng2023optimize, title={Optimize weight rounding via signed gradient descent for the quantization of llms}, author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao and Liu, Yi}, journal={arXiv preprint arXiv:2309.05516}, year={2023} }
-
-[arxiv](https://arxiv.org/abs/2309.05516) [github](https://github.com/intel/auto-round)
\ No newline at end of file
diff --git a/docs/Llama-2-7b-chat-hf-asym-recipe.md b/docs/Llama-2-7b-chat-hf-asym-recipe.md
deleted file mode 100644
index 9cb68e375..000000000
--- a/docs/Llama-2-7b-chat-hf-asym-recipe.md
+++ /dev/null
@@ -1,40 +0,0 @@
- **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command.
-
-A sample command to generate an INT4 model.
-```bash
-auto-round \
---model meta-llama/Llama-2-7b-chat-hf \
---device 0 \
---group_size 128 \
---bits 4 \
---iters 1000 \
---nsamples 512 \
---asym \
---format 'auto_gptq,auto_round' \
---output_dir "./tmp_autoround"
-```
-
-
-Due to licensing restrictions, we are unable to release the model.
-
-Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, and the git id 96d185fa6232a5ab685ba7c43e45d1dbb3bb906d.
-
-Since we encountered an issue evaluating this model with lm-eval, we opted to evaluate the qdq model instead. In our assessment, we found that its accuracy closely matches that of the real quantized model in most cases except for some small models like opt-125m.
-
-
-| Metric | FP16 | int4 qdq |
-| -------------- | ------ | -------- |
-| Avg. | 0.5901 | 0.5897 |
-| mmlu | 0.4640 | 0.4545 |
-| lambada_openai | 0.7105 | 0.7037 |
-| hellaswag | 0.5780 | 0.5706 |
-| winogrande | 0.6638 | 0.6614 |
-| piqa | 0.7639 | 0.7633 |
-| truthfulqa_mc1 | 0.3023 | 0.3035 |
-| openbookqa | 0.3340 | 0.3260 |
-| boolq | 0.7976 | 0.8064 |
-| rte | 0.6968 | 0.7292 |
-| arc_easy | 0.7382 | 0.7336 |
-| arc_challenge | 0.4420 | 0.4352 |
-
-
diff --git a/docs/Llama-3.2-11B-Vision-Instruct-sym.md b/docs/Llama-3.2-11B-Vision-Instruct-sym.md
deleted file mode 100644
index 9ef5ab302..000000000
--- a/docs/Llama-3.2-11B-Vision-Instruct-sym.md
+++ /dev/null
@@ -1,141 +0,0 @@
-
-## Model Details
-
-This model is an int4 model with group_size 128 and symmetric quantization of [meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct). Load the model with revision="f036ca" to use AutoGPTQ format.
-
-## How To Use
-
-### Requirements
-Please use Transformers version 4.45.0 or later
-AutoRound version >= 0.4.1
-
-### INT4 Inference
-```python
-from auto_round import AutoRoundConfig ## must import for auto-round format
-import requests
-import torch
-from PIL import Image
-from transformers import MllamaForConditionalGeneration, AutoProcessor
-
-quantized_model_path = "Intel/Llama-3.2-11B-Vision-Instruct-inc-private"
-
-model = MllamaForConditionalGeneration.from_pretrained(
- quantized_model_path,
- torch_dtype="auto",
- device_map="auto",
- ##revision="f036ca" ##AutoGPTQ format
-)
-processor = AutoProcessor.from_pretrained(quantized_model_path)
-image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
-messages = [
- {
- "role": "user",
- "content": [{"type": "image"}, {"type": "text", "text": "Please write a haiku for this one, it would be: "}],
- }
-]
-
-# Preparation for inference
-image = Image.open(requests.get(image_url, stream=True).raw)
-input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
-inputs = processor(image, input_text, add_special_tokens=False, return_tensors="pt").to(model.device)
-
-output = model.generate(**inputs, max_new_tokens=50)
-print(processor.decode(output[0]))
-
-##INT4:
-## Here is a haiku for the rabbit:
-
-## Whiskers twitching bright
-## Ears perked up, alert and keen
-## Spring's gentle delight<|eot_id|>
-
-
-##BF16:
-## Here is a haiku for the rabbit:
-
-## Whiskers twitching fast
-## In a coat of blue and brown
-## Hoppy little soul<|eot_id|>
-
-image_url = "http://images.cocodataset.org/train2017/000000411975.jpg"
-messages = [
- {
- "role": "user",
- "content": [
- {"type": "image"},
- {"type": "text", "text": "How many people are on the baseball field in the picture?"},
- ],
- }
-]
-##INT4: There are five people on the baseball field in the picture.
-##
-
-##BF16: There are five people on the baseball field in the picture.
-##
-
-image_url = "https://intelcorp.scene7.com/is/image/intelcorp/processor-overview-framed-badge:1920-1080?wid=480&hei=270"
-messages = [
- {
- "role": "user",
- "content": [{"type": "image"}, {"type": "text", "text": "Which company does this picture represent?"}],
- }
-]
-##INT4: This picture represents Intel.
-##
-
-##BF16: This image represents Intel, a multinational semiconductor corporation headquartered in Santa Clara, California.
-##
-```
-
-## Evaluation the model
-pip3 install git+https://github.com/open-compass/VLMEvalKit.git@7de2dcb. The evaluation process may encounter errors that require changing model backend or evaluation code. Detailed instructions will be provided in a future update.
-```bash
-auto-round-mllm --eval --model Intel/Llama-3.2-11B-Vision-Instruct-inc-private --tasks MMBench_DEV_EN_V11,ScienceQA_VAL,TextVQA_VAL,POPE --output_dir "./eval_result"
-```
-|Metric |16bits|Pile Calib INT4 |Llava Calib INT4|
-|:-------------------|:------|:------|:------|
-|avg |66.05 |67.81 |66.02 |
-|MMBench_DEV_EN_V11 |52.86 |53.48 |52.17 |
-|ScienceQA_VAL |68.86 |70.39 |69.15 |
-|TextVQA_VAL |54.49 |59.62 |55.07 |
-|POPE |88.00 |87.76 |87.71 |
-
-### Generate the model
-Here is the sample command to reproduce the model.
-```bash
-pip install auto-round
-auto-round-mllm \
---model meta-llama/Llama-3.2-11B-Vision-Instruct \
---device 0 \
---group_size 128 \
---bits 4 \
---iters 1000 \
---nsample 512 \
---seqlen 512 \
---format 'auto_gptq,auto_round' \
---output_dir "./tmp_autoround"
-```
-
-## Ethical Considerations and Limitations
-
-The model can produce factually incorrect output, and should not be relied on to produce factually accurate information. Because of the limitations of the pretrained model and the finetuning datasets, it is possible that this model could generate lewd, biased or otherwise offensive outputs.
-
-Therefore, before deploying any applications of the model, developers should perform safety testing.
-
-## Caveats and Recommendations
-
-Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model.
-
-Here are a couple of useful links to learn more about Intel's AI software:
-
-- Intel Neural Compressor [link](https://github.com/intel/neural-compressor)
-
-## Disclaimer
-
-The license on this model does not constitute legal advice. We are not responsible for the actions of third parties who use this model. Please consult an attorney before using this model for commercial purposes.
-
-## Cite
-
-@article{cheng2023optimize, title={Optimize weight rounding via signed gradient descent for the quantization of llms}, author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao and Liu, Yi}, journal={arXiv preprint arXiv:2309.05516}, year={2023} }
-
-[arxiv](https://arxiv.org/abs/2309.05516) [github](https://github.com/intel/auto-round)
diff --git a/docs/Meta-Llama-3-8B-Instruct-asym-recipe.md b/docs/Meta-Llama-3-8B-Instruct-asym-recipe.md
deleted file mode 100644
index a8831fa67..000000000
--- a/docs/Meta-Llama-3-8B-Instruct-asym-recipe.md
+++ /dev/null
@@ -1,48 +0,0 @@
-**This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command.
-
-A sample command to generate an INT4 model.
-```bash
-auto-round \
---model meta-llama/Meta-Llama-3-8B-Instruct \
---device 0 \
---group_size 128 \
---bits 4 \
---iters 1000 \
---nsamples 512 \
---asym \
---format 'auto_gptq,auto_round' \
---output_dir "./tmp_autoround"
-```
-
-quant lm-head
-```bash
-auto-round \
---model meta-llama/Meta-Llama-3-8B-Instruct \
---device 0 \
---group_size 128 \
---bits 4 \
---iters 1000 \
---nsamples 512 \
---asym \
---quant_lm_head \
---format 'auto_gptq,auto_round' \
---output_dir "./tmp_autoround"
-```
-lm-eval 0.4.2 is used
-
-| Metric | **BF16** | w4g128 w/o lm-head | w4g128 with lm-head |
-| ---------------- | :------- |--------------------|-----------------------------|
-| Avg. | 0.6352 | 0.6312 | 0.6303 |
-| mmlu | 0.6386 | 0.6306 | 0.6243 |
-| winogrande | 0.7143 | 0.7238 | 0.7261 |
-| truthfulqa_mc1 | 0.3623 | 0.3537 | 0.3574 |
-| rte | 0.6751 | 0.6859 | 0.6715 |
-| piqa | 0.7867 | 0.7797 | 0.7775 |
-| openbookqa | 0.3400 | 0.3300 | 0.3340 |
-| lambada_openai | 0.7182 | 0.7200 | 0.7118 |
-| hellaswag | 0.5769 | 0.5699 | 0.5686 |
-| boolq | 0.8297 | 0.8309 | 0.8266 |
-| arc_easy | 0.8152 | 0.8089 | 0.8123 |
-| arc_challenge | 0.5299 | 0.5102 | 0.5111 |
-
-
diff --git a/docs/Mistral-7B-Instruct-v0.2-asym-recipe.md b/docs/Mistral-7B-Instruct-v0.2-asym-recipe.md
deleted file mode 100644
index eb9e9a8ec..000000000
--- a/docs/Mistral-7B-Instruct-v0.2-asym-recipe.md
+++ /dev/null
@@ -1,31 +0,0 @@
-**This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command.
-
-A sample command to generate an INT4 model.
-```bash
-auto-round \
---model mistralai/Mistral-7B-Instruct-v0.2 \
---device 0 \
---group_size 128 \
---bits 4 \
---iters 1000 \
---nsamples 512 \
---asym \
---format 'auto_gptq,auto_round' \
---output_dir "./tmp_autoround"
-```
-
-| Metric | BF16 | INT4 |
-| -------------- | ------ | ------ |
-| Avg. | 0.6647 | 0.6621 |
-| mmlu | 0.5906 | 0.5872 |
-| lambada_openai | 0.7141 | 0.7141 |
-| hellaswag | 0.6602 | 0.6557 |
-| winogrande | 0.7395 | 0.7364 |
-| piqa | 0.8052 | 0.8047 |
-| truthfulqa_mc1 | 0.5251 | 0.5153 |
-| openbookqa | 0.3600 | 0.3420 |
-| boolq | 0.8535 | 0.8541 |
-| rte | 0.7040 | 0.7148 |
-| arc_easy | 0.8161 | 0.8165 |
-| arc_challenge | 0.5435 | 0.5435 |
-
diff --git a/docs/Mistral-7B-v0.1-asym-recipe.md b/docs/Mistral-7B-v0.1-asym-recipe.md
deleted file mode 100644
index c75f6ea7d..000000000
--- a/docs/Mistral-7B-v0.1-asym-recipe.md
+++ /dev/null
@@ -1,48 +0,0 @@
- **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command.
-
-
-A sample command to generate an INT4 model.
-```bash
-auto-round \
---model mistralai/Mistral-7B-v0.1 \
---device 0 \
---group_size 128 \
---bits 4 \
---iters 1000 \
---nsamples 512 \
---asym \
---format 'auto_gptq,auto_round' \
---output_dir "./tmp_autoround"
-```
-
-quant_lm_head
-
-```bash
-auto-round \
---model mistralai/Mistral-7B-v0.1 \
---device 0 \
---group_size 128 \
---bits 4 \
---iters 1000 \
---nsamples 512 \
---asym \
---quant_lm_head \
---format 'auto_gptq,auto_round' \
---output_dir "./tmp_autoround"
-```
-
-lm-eval 0.4.2 is used
-
-| Metric | BF16 | [INT4-lmhead](https://huggingface.co/Intel/Mistral-7B-v0.1-int4-inc-lmhead) | [INT4](https://huggingface.co/Intel/Mistral-7B-v0.1-int4-inc) |
-| -------------- | ------ |-----------------| ------------------------------------------------------------ |
-| Avg. | 0.6260 | 0.6228 | 0.6218 |
-| mmlu | 0.5868 | 0.5760 | 0.5772 |
-| lambada_openai | 0.7555 | 0.7539 | 0.7543 |
-| hellaswag | 0.6125 | 0.6055 | 0.6072 |
-| winogrande | 0.7395 | 0.7380 | 0.7388 |
-| piqa | 0.8069 | 0.8009 | 0.8030 |
-| truthfulqa_mc1 | 0.2803 | 0.2876 | 0.2864 |
-| openbookqa | 0.3280 | 0.3300 | 0.3260 |
-| boolq | 0.8379 | 0.8291 | 0.8281 |
-| arc_easy | 0.8089 | 0.8043 | 0.8035 |
-| arc_challenge | 0.5034 | 0.5026 | 0.4932 |
diff --git a/docs/Mixtral-8x7B-Instruct-v0.1-asym-recipe.md b/docs/Mixtral-8x7B-Instruct-v0.1-asym-recipe.md
deleted file mode 100644
index 651ffeca0..000000000
--- a/docs/Mixtral-8x7B-Instruct-v0.1-asym-recipe.md
+++ /dev/null
@@ -1,33 +0,0 @@
- **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command.
-
-A sample command to generate an INT4 model.
-```bash
-auto-round \
---model mistralai/Mixtral-8x7B-Instruct-v0.1 \
---device 0 \
---group_size 128 \
---bits 4 \
---iters 1000 \
---nsamples 512 \
---asym \
---format 'auto_gptq,auto_round' \
---output_dir "./tmp_autoround"
-```
-
-Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, and the git id f3b7917091afba325af3980a35d8a6dcba03dc3f is used
-
-| Metric | BF16 | INT4 |
-| -------------- |--------| ------ |
-| Avg. | 0.7000 | 0.6977 |
-| mmlu | 0.6885 | 0.6824 |
-| lambada_openai | 0.7718 | 0.7790 |
-| hellaswag | 0.6767 | 0.6745 |
-| winogrande | 0.7687 | 0.7719 |
-| piqa | 0.8351 | 0.8335 |
-| truthfulqa_mc1 | 0.4969 | 0.4884 |
-| openbookqa | 0.3680 | 0.3720 |
-| boolq | 0.8850 | 0.8783 |
-| rte | 0.7184 | 0.7004 |
-| arc_easy | 0.8699 | 0.8712 |
-| arc_challenge | 0.6220 | 0.6229 |
-
diff --git a/docs/Mixtral-8x7B-v0.1-asym-acc.md b/docs/Mixtral-8x7B-v0.1-asym-acc.md
deleted file mode 100644
index 82967a3c3..000000000
--- a/docs/Mixtral-8x7B-v0.1-asym-acc.md
+++ /dev/null
@@ -1,39 +0,0 @@
- **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command.
-
-A sample command to generate an INT4 model.
-```bash
-auto-round \
---model mistralai/Mixtral-8x7B-v0.1 \
---device 0 \
---group_size 128 \
---bits 4 \
---iters 1000 \
---nsamples 512 \
---asym \
---format 'auto_gptq,auto_round' \
---output_dir "./tmp_autoround"
-```
-
-
-Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, we used the git id f3b7917091afba325af3980a35d8a6dcba03dc3f
-
-Download the model from hf(coming soon) or follow examples/language-modeling/scripts/Mixtral-8x7B-v0.1.sh to generate the model
-
-~~~bash
-lm_eval --model hf --model_args pretrained="Intel/Mixtral-8x7B-v0.1-int4-inc",autogptq=True,gptq_use_triton=True --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,rte,arc_easy,arc_challenge,mmlu --batch_size 32
-~~~
-
-| Metric | BF16 | INT4 |
-| -------------- |--------| ------ |
-| Avg. | 0.6698 | 0.6633 |
-| mmlu | 0.6802 | 0.6693 |
-| lambada_openai | 0.7827 | 0.7825 |
-| hellaswag | 0.6490 | 0.6459 |
-| winogrande | 0.7648 | 0.7514 |
-| piqa | 0.8248 | 0.8210 |
-| truthfulqa_mc1 | 0.3427 | 0.3219 |
-| openbookqa | 0.3540 | 0.3560 |
-| boolq | 0.8523 | 0.8474 |
-| rte | 0.7076 | 0.6931 |
-| arc_easy | 0.8430 | 0.8430 |
-| arc_challenge | 0.5666 | 0.5648 |
diff --git a/docs/Phi-3.5-vision-instruct-sym.md b/docs/Phi-3.5-vision-instruct-sym.md
deleted file mode 100644
index 17c465f97..000000000
--- a/docs/Phi-3.5-vision-instruct-sym.md
+++ /dev/null
@@ -1,143 +0,0 @@
-
-## Model Details
-
-This model is an int4 model with group_size 128 and symmetric quantization of [microsoft/Phi-3.5-vision-instruct](https://huggingface.co/microsoft/Phi-3.5-vision-instruct). Load the model with revision="13b4c3d" to use AutoGPTQ format.
-## How To Use
-
-
-### Requirements
-
-The current `transformers` version can be verified with: `pip list | grep transformers`.
-
-Examples of required packages:
-```
-flash_attn==2.5.8
-numpy==1.24.4
-Pillow==10.3.0
-Requests==2.31.0
-torch==2.3.0
-torchvision==0.18.0
-transformers==4.43.0
-accelerate==0.30.0
-```
-
-
-### INT4 Inference
-```python
-from auto_round import AutoRoundConfig ##must import for auto-round format
-import requests
-from PIL import Image
-from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor
-
-model_id = "Intel/Phi-3.5-vision-instruct-inc-private"
-
-model = AutoModelForCausalLM.from_pretrained(
- model_id,
- device_map="auto",
- trust_remote_code=True,
- torch_dtype="auto",
- ##revision="13b4c3d" ##AutoGPTQ format
-)
-processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, num_crops=4)
-
-image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
-content = "Describe this image."
-messages = [
- {"role": "user", "content": "<|image_1|>\n" + content},
-]
-
-prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-image_inputs = Image.open(requests.get(image_url, stream=True).raw)
-inputs = processor(prompt, image_inputs, return_tensors="pt").to(model.device)
-
-generation_args = {
- "max_new_tokens": 1000,
- "temperature": 0.0,
- "do_sample": False,
-}
-
-generate_ids = model.generate(**inputs, eos_token_id=processor.tokenizer.eos_token_id, **generation_args)
-
-# remove input tokens
-generate_ids = generate_ids[:, inputs["input_ids"].shape[1] :]
-response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-
-print(response)
-##INT4:
-## The image captures a serene beach scene at sunset with a person and a dog. The person is seated on the sand, reading a book, while the dog, wearing a harness, sits attentively beside them. The sun is low on the horizon, casting a warm glow and long shadows on the sand. The ocean is calm, and the sky is clear, suggesting a peaceful end to the day.
-
-##BF16:
-## The image shows a person sitting on a sandy beach with a dog. The person is wearing a plaid shirt and is holding a book, while the dog is sitting next to them, looking at the book. The beach is near the ocean, and the sun is low in the sky, suggesting it is either sunrise or sunset. The sky is clear, and the overall atmosphere is calm and serene.
-
-
-image_url = "http://images.cocodataset.org/train2017/000000411975.jpg"
-content = "How many people are there on the baseball field in the image?"
-##INT4:
-## There are three people on the baseball field in the image.
-
-##BF16:
-## There are three people on the baseball field in the image.
-
-
-image_url = "https://intelcorp.scene7.com/is/image/intelcorp/processor-overview-framed-badge:1920-1080?wid=480&hei=270"
-content = "This image represents which company?"
-##INT4:
-## The image represents the company Intel, as indicated by the text 'intel INSIDE'.
-
-##BF16:
-## The image represents the company Intel, as indicated by the logo and the text 'INSIDE'.
-```
-
-
-## Evaluation the model
-pip3 install git+https://github.com/open-compass/VLMEvalKit.git@7de2dcb. The evaluation process may encounter errors that require changing model backend or evaluation code. Detailed instructions will be provided in a future update
-```bash
-auto-round-mllm --eval --model Intel/Phi-3.5-vision-instruct-inc-private --tasks MMBench_DEV_EN_V11,ScienceQA_VAL,TextVQA_VAL,POPE --output_dir "./eval_result"
-```
-|Metric |16bits|Pile Calib INT4 | Llava Calib INT4 |
-|-------------------|:------|:------|:------|
-|avg |77.64 |77.14 |76.87|
-|MMBench_DEV_EN_V11 |71.83 |71.36 |70.90|
-|ScienceQA_VAL |90.56 |89.75 |89.13|
-|TextVQA_VAL |65.36 |64.77 |64.66|
-|POPE |82.82 |82.67 |82.80|
-
-### Generate the model
-Here is the sample command to reproduce the model.
-```bash
-pip install auto-round
-auto-round-mllm \
---model microsoft/Phi-3.5-vision-instruct \
---device 0 \
---group_size 128 \
---bits 4 \
---iters 1000 \
---nsample 512 \
---seqlen 2048 \
---format 'auto_gptq,auto_round' \
---output_dir "./tmp_autoround"
-```
-
-## Ethical Considerations and Limitations
-
-The model can produce factually incorrect output, and should not be relied on to produce factually accurate information. Because of the limitations of the pretrained model and the finetuning datasets, it is possible that this model could generate lewd, biased or otherwise offensive outputs.
-
-Therefore, before deploying any applications of the model, developers should perform safety testing.
-
-## Caveats and Recommendations
-
-Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model.
-
-Here are a couple of useful links to learn more about Intel's AI software:
-
-- Intel Neural Compressor [link](https://github.com/intel/neural-compressor)
-
-## Disclaimer
-
-The license on this model does not constitute legal advice. We are not responsible for the actions of third parties who use this model. Please consult an attorney before using this model for commercial purposes.
-
-## Cite
-
-@article{cheng2023optimize, title={Optimize weight rounding via signed gradient descent for the quantization of llms}, author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao and Liu, Yi}, journal={arXiv preprint arXiv:2309.05516}, year={2023} }
-
-[arxiv](https://arxiv.org/abs/2309.05516) [github](https://github.com/intel/auto-round)
diff --git a/docs/Qwen1.5-7B-Chat-acc.md b/docs/Qwen1.5-7B-Chat-acc.md
deleted file mode 100644
index 2add074c7..000000000
--- a/docs/Qwen1.5-7B-Chat-acc.md
+++ /dev/null
@@ -1,16 +0,0 @@
-Due to licensing restrictions, we are unable to release the model. Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, and the git id 96d185fa6232a5ab685ba7c43e45d1dbb3bb906d.
-
-We used the following command for evaluation.
-For reference, the results of official AWQ-INT4 and GPTQ-INT4 release are listed.
-
-~~~bash
-lm_eval --model hf --model_args pretrained="./",autogptq=True,gptq_use_triton=True,trust_remote_code=True --device cuda:0 --tasks ceval-valid,cmmlu,mmlu,gsm8k --batch_size 16 --num_fewshot 0
-~~~
-
-| Metric | BF16 | [Qwen/Qwen1.5-7B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-7B-Chat-AWQ) | [Qwen/Qwen1.5-7B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-7B-Chat-GPTQ-Int4) | INT4 sym recipe | INT4 asym recipe |
-| -------------- | ------ |-----------|--------------------------------|-----------------|------------------|
-| Avg. | 0.6231 | 0.6152 | 0.6070 | 0.6205 | 0.6186 |
-| ceval | 0.6887 | 0.6820 | 0.6679 | 0.6761 | 0.6820 |
-| cmmlu | 0.6959 | 0.6862 | 0.6831 | 0.6870 | 0.6884 |
-| mmlu | 0.6020 | 0.5944 | 0.5902 | 0.5974 | 0.5946 |
-| gsm8k | 0.5057 | 0.4981 | 0.4867 | 0.5216 | 0.5095 |
diff --git a/docs/Qwen2-VL-7B-Instruct-sym.md b/docs/Qwen2-VL-7B-Instruct-sym.md
deleted file mode 100644
index 88d3e76d1..000000000
--- a/docs/Qwen2-VL-7B-Instruct-sym.md
+++ /dev/null
@@ -1,162 +0,0 @@
-
-## Model Details
-
-This model is an int4 model with group_size 128 and symmetric quantization of [Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct). Load the model with revision="a7269c6" to use AutoGPTQ format.
-
-## How To Use
-
-
-### Requirements
-Please use Transformers version 4.45.0 or later, or you might encounter the following error:
-```
-KeyError: 'qwen2_vl'
-```
-
-### INT4 Inference
-```python
-from auto_round import AutoRoundConfig ## must import for auto-round format
-import requests
-from PIL import Image
-from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
-
-model = Qwen2VLForConditionalGeneration.from_pretrained(
- "Intel/Qwen2-VL-7B-Instruct-inc-private",
- torch_dtype="auto",
- device_map="auto",
- ##revision="a7269c6" ##AutoGPTQ format
-)
-processor = AutoProcessor.from_pretrained("Intel/Qwen2-VL-7B-Instruct-inc-private")
-image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
-messages = [
- {
- "role": "user",
- "content": [
- {
- "type": "image",
- "image": image_url,
- },
- {"type": "text", "text": "Describe this image."},
- ],
- }
-]
-
-# Preparation for inference
-text = processor.apply_chat_template(
- messages, tokenize=False, add_generation_prompt=True
-)
-image_inputs = Image.open(requests.get(image_url, stream=True).raw)
-inputs = processor(
- text=[text],
- images=image_inputs,
- padding=True,
- return_tensors="pt",
-)
-inputs = inputs.to(model.device)
-
-generated_ids = model.generate(**inputs, max_new_tokens=128)
-generated_ids_trimmed = [
- out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-]
-output_text = processor.batch_decode(
- generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-)
-print(output_text[0])
-
-##INT4:
-## 'The image depicts a serene beach scene with a woman and her dog. The woman is sitting on the sand, facing the ocean, and appears to be engaging in a playful interaction with her dog. The dog, which is wearing a harness, is sitting beside her and has its front paw raised, seemingly giving a high-five to the woman. The woman is smiling and seems to be enjoying the moment. The beach is relatively empty, with gentle waves in the background, and the lighting suggests it is either early morning or late afternoon, creating a warm and peaceful atmosphere.'
-
-##BF16:
-## "The image depicts a serene beach scene with a woman and her dog enjoying a moment together. The woman is sitting on the sandy beach, facing the ocean, and appears to be engaging in a playful activity with her dog. She is wearing a plaid shirt and dark pants, and her hair is long and dark. The dog, which is a large breed, possibly a Labrador Retriever, is sitting in front of her, wearing a harness. The dog is extending its front paw towards the woman's hand, as if it is giving her a high-five. The woman is smiling and seems to be enjoying the interaction.\n\nThe beach is"
-
-image_url = "http://images.cocodataset.org/train2017/000000411975.jpg"
-messages = [
- {
- "role": "user",
- "content": [
- {
- "type": "image",
- "image": image_url,
- },
- {"type": "text", "text": "图片中的棒球场上有多少人?"},
- ],
- }
-]
-##INT4:
-## 图片中的棒球场上有五个人。
-
-##BF16:
-## 图片中的棒球场上有三个人。
-
-image_url = "https://intelcorp.scene7.com/is/image/intelcorp/processor-overview-framed-badge:1920-1080?wid=480&hei=270"
-messages = [
- {
- "role": "user",
- "content": [
- {
- "type": "image",
- "image": image_url,
- },
- {"type": "text", "text": "这张图片代表哪家公司?"},
- ],
- }
-]
-##INT4:
-## 这张图片代表英特尔公司(Intel)。英特尔是全球领先的半导体公司,主要生产中央处理器(CPU)和其他计算机硬件。
-
-##BF16:
-## 这张图片代表英特尔公司(Intel)。图片中的标志是英特尔的标志,标志下方的文字“Intel Inside”表明这是英特尔的宣传标志,用于表明该产品使用了英特尔的处理器或其他技术。
-
-```
-
-## Evaluate the model
-pip3 install git+https://github.com/open-compass/VLMEvalKit.git@7de2dcb. The evaluation process may encounter errors that require changing model backend or evaluation code. Detailed instructions will be provided in a future update.
-```bash
-auto-round-mllm --eval --model Intel/Qwen2-VL-7B-Instruct-inc-private --tasks MMBench_DEV_EN_V11,ScienceQA_VAL,TextVQA_VAL,POPE --output_dir "./eval_result"
-```
-|Metric |16bits|Pile Calib INT4 | Llava Calib INT4 |
-|:-------------------|:------|:------|:------|
-|avg |83.92 |83.82 |83.42 |
-|MMBench_DEV_EN_V11 |80.50 |79.64 |80.42 |
-|ScienceQA_VAL |84.69 |83.88 |83.26 |
-|TextVQA_VAL |84.36 |84.28 |84.11 |
-|POPE |86.13 |87.57 |85.89 |
-
-### Generate the model
-Here is the sample command to reproduce the model.
-```bash
-pip install auto_round
-auto-round-mllm \
---model Qwen/Qwen2-VL-7B-Instruct \
---device 0 \
---group_size 128 \
---bits 4 \
---iters 1000 \
---nsample 512 \
---seqlen 2048 \
---format 'auto_gptq,auto_round' \
---output_dir "./tmp_autoround"
-```
-
-## Ethical Considerations and Limitations
-
-The model can produce factually incorrect output, and should not be relied on to produce factually accurate information. Because of the limitations of the pretrained model and the finetuning datasets, it is possible that this model could generate lewd, biased or otherwise offensive outputs.
-
-Therefore, before deploying any applications of the model, developers should perform safety testing.
-
-## Caveats and Recommendations
-
-Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model.
-
-Here are a couple of useful links to learn more about Intel's AI software:
-
-- Intel Neural Compressor [link](https://github.com/intel/neural-compressor)
-
-## Disclaimer
-
-The license on this model does not constitute legal advice. We are not responsible for the actions of third parties who use this model. Please consult an attorney before using this model for commercial purposes.
-
-## Cite
-
-@article{cheng2023optimize, title={Optimize weight rounding via signed gradient descent for the quantization of llms}, author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao and Liu, Yi}, journal={arXiv preprint arXiv:2309.05516}, year={2023} }
-
-[arxiv](https://arxiv.org/abs/2309.05516) [github](https://github.com/intel/auto-round)
diff --git a/docs/Qwen2.5-14B-Instruct-sym.md b/docs/Qwen2.5-14B-Instruct-sym.md
deleted file mode 100644
index a376b0c85..000000000
--- a/docs/Qwen2.5-14B-Instruct-sym.md
+++ /dev/null
@@ -1,201 +0,0 @@
-## Model Details
-
-This model is an int4 model with group_size 128 and symmetric quantization of [Qwen/Qwen2.5-14B-Instruct](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct) generated by [intel/auto-round](https://github.com/intel/auto-round). Load the model with `revision="f86a564"` to use AutoGPTQ format.
-
-## How To Use
-
-### INT4 Inference(CPU/HPU/CUDA)
-
-CPU requires auto-round version>0.3.1
-
-```python
-from auto_round import AutoRoundConfig ##must import for auto-round format
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-quantized_model_dir = "Intel/Qwen2.5-14B-Instruct-int4-inc"
-tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir)
-
-model = AutoModelForCausalLM.from_pretrained(
- quantized_model_dir,
- torch_dtype="auto",
- device_map="auto",
- ##revision="f86a564" ##AutoGPTQ format
-)
-
-##import habana_frameworks.torch.core as htcore ## uncomment it for HPU
-##import habana_frameworks.torch.hpu as hthpu ## uncomment it for HPU
-##model = model.to(torch.bfloat16).to("hpu") ## uncomment it for HPU
-
-prompt = "There is a girl who likes adventure,"
-messages = [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": prompt}]
-
-text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
-
-generated_ids = model.generate(
- model_inputs.input_ids,
- max_new_tokens=200, ##change this to align with the official usage
- do_sample=False, ##change this to align with the official usage
-)
-generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
-
-response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-print(response)
-
-prompt = "There is a girl who likes adventure,"
-##INT4:
-""" and she wants to go on a trip. She has 10 different types of snacks, and she can only carry 4 of them in her bag. How many different combinations of snacks can she choose from? To determine the number of different combinations of snacks the girl can choose from, we need to calculate the number of ways to choose 4 snacks out of 10. This is a classic combination problem where the order of selection does not matter.
-
-The formula for combinations is given by:
-\[
-\binom{n}{r} = \frac{n!}{r!(n-r)!}
-\]
-where \( n \) is the total number of items to choose from, \( r \) is the number of items to choose, and \( ! \) denotes factorial.
-
-In this problem, \( n = 10 \) and \( r = 4 \). Plugging these values into the formula, we get:
-\[
-\binom{10}{4}"""
-
-##BF16:
-""" and she has a hobby of collecting rocks. She wants to go on a trip to collect some unique rocks. She plans to visit three different locations: a mountain, a beach, and a desert. Each location has its own set of challenges and opportunities for rock collecting.
-
-1. The mountain is known for its rare mineral deposits, but the terrain is steep and rocky, making it difficult to navigate.
-2. The beach offers a variety of sedimentary rocks and fossils, but the tides can be unpredictable and dangerous.
-3. The desert provides an opportunity to find petrified wood and volcanic rocks, but the heat and lack of water can be challenging.
-
-The girl has a backpack that can carry up to 10 kilograms of rocks. She also has a map that shows the locations of specific types of rocks at each site. Her goal is to maximize the number of unique rock types she collects while staying within her weight limit.
-
-Given the following information:
-- Mountain: 5 unique rock types"""
-
-prompt = "9.11和9.8哪个数字大"
-# INT4:
-"""? 9.11 比 9.8 大。
-
-为了比较这两个数,我们可以从它们的小数部分开始:
-
-- 9.11 可以看作是 9 + 0.11
-- 9.8 可以看作是 9 + 0.8
-
-由于 0.11 小于 0.8,所以 9.11 小于 9.8。因此,9.8 比 9.11 大。
-
-总结:9.8 > 9.11。所以,9.8 是较大的数字。如果你的问题是问哪个数字较大,则答案是 9.8。如果问题是问哪个数字较小,则答案是 9.11。请确认你的问题需求。根据你的描述,9.8 是较大的数字。
-
-希望这能帮助你!如有其他问题,请随时提问。
-
-(注意:在"""
-
-##BF16:
-"""? 9.11 比 9.8 大。
-
-在比较两个小数时,我们从左到右逐位进行比较。首先比较整数部分,如果相同,则比较小数部分。对于9.11 和 9.8:
-
-- 整数部分都是9,相等。
-- 比较小数部分:0.11 和 0.8。
-
-由于0.11 < 0.8,所以9.11 < 9.8。
-
-因此,9.8 比 9.11 大。
-
-所以,正确的答案是:9.8 比 9.11 大。
-
-希望这能帮助你理解!如果你有其他问题,请随时告诉我。
-
-总结:9.8 > 9.11。
-
-希望这个解释清楚了你的疑问。如果有任何进一步的问题或需要更多帮助,请告诉我!
-
-再次确认:9"""
-
-
-prompt = "Once upon a time,"
-##INT4:
-""" there was a young man named John who had a passion for music. He loved playing the guitar and would spend hours every day practicing and perfecting his skills. However, he struggled to find an audience for his music and felt discouraged.
-"""
-
-##BF16:
-""" there was a young man named John who lived in a small village. He was an orphan and had to work hard to make ends meet. Despite his difficult circumstances, he was kind-hearted and always willing to help others. One day, a wise old man came to the village and saw John's kindness. The old man decided to test John's character by giving him a bag of gold coins and telling him to distribute it among the villagers. John was overjoyed at first but then realized that he could use the money for himself. However, he remembered the wise man's words and distributed the coins equally among the villagers. The wise man was pleased with John's actions and revealed himself as a fairy godfather. He granted John three wishes, but with a twist - each wish would come true only if John could prove that he deserved it. What are some possible wishes that John might make and how could he prove that he deserves them?
-John, being a kind-hearted individual, might consider wishes that"""
-
-
-prompt = "请简短介绍一下阿里巴巴公司"
-##INT4:
-"""阿里巴巴集团创立于1999年,是全球领先的电子商务及零售贸易平台。阿里巴巴集团的使命是让世界各地的企业都能平等地进行贸易。阿里巴巴集团旗下的业务包括淘宝、天猫、菜鸟网络、阿里云等。阿里巴巴集团致力于通过技术创新,为中小企业提供更便捷、高效的商业服务,推动数字经济的发展。阿里巴巴集团在全球范围内拥有数百万商家和消费者用户,已成为全球最大的零售贸易平台之一。阿里巴巴集团总部位于中国杭州,并在全球范围内设有多个办事处和研发中心。阿里巴巴集团的愿景是构建一个开放、协作、可持续发展的数字经济生态系统,为全球商业带来更多的机遇和价值。阿里巴巴集团在2014年上市,成为当时全球最大的IPO。阿里巴巴集团的创始人马云是中国著名的企业家和慈善家。阿里巴巴集团在社会责任方面也做出了积极贡献,包括支持教育、环保、扶贫等公益事业。阿里巴巴集团是一家具有高度社会责任感的企业。阿里巴巴集团的业务涵盖了电子商务、金融、物流
-"""
-
-##BF16:
-"""阿里巴巴集团创立于1999年,是全球领先的电子商务及零售平台,业务涵盖B2B、C2C、B2C等各个领域。阿里巴巴旗下拥有淘宝网、天猫、菜鸟网络、阿里云等知名子公司和品牌,致力于打造开放、协同、繁荣的商业生态系统,为全球中小企业提供一站式数字化转型服务。阿里巴巴在全球范围内拥有超过20万名员工,并在纽约证券交易所上市。阿里巴巴一直秉承“让天下没有难做的生意”的使命,不断创新和发展,成为全球领先的数字经济体之一。阿里巴巴还积极履行企业社会责任,关注环保、公益等领域,努力实现可持续发展。阿里巴巴已经成为中国互联网行业的领军企业之一,在全球范围内也具有广泛的影响力。阿里巴巴的发展历程充满了挑战与机遇,未来将继续引领数字经济的发展趋势,推动全球经济的繁荣与发展。阿里巴巴是一家总部位于中国杭州的跨国科技公司,主要业务包括电子商务、金融、物流、云计算等。阿里巴巴旗下的淘宝、天猫等电商平台已成为
-"""
-```
-
-### Evaluate the model
-
-pip3 install lm-eval==0.4.5
-
-```bash
-auto-round --model "Intel/Qwen2.5-14B-Instruct-int4-inc" --eval --eval_bs 16 --tasks leaderboard_ifeval,leaderboard_mmlu_pro,gsm8k,lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,arc_easy,arc_challenge,cmmlu,ceval-valid
-```
-
-| Metric | BF16 | INT4 |
-| :----------------------------------------- | :----: | :----: |
-| Avg | 0.6947 | 0.6954 |
-| leaderboard_mmlu_pro 5 shots | 0.5375 | 0.5292 |
-| leaderboard_ifeval inst_level_strict_acc | 0.6331 | 0.6475 |
-| leaderboard_ifeval prompt_level_strict_acc | 0.5102 | 0.5287 |
-| mmlu | 0.7882 | 0.7809 |
-| cmmlu | 0.8377 | 0.8240 |
-| ceval-valid | 0.8351 | 0.8232 |
-| gsm8k 5 shots | 0.7900 | 0.8120 |
-| lambada_openai | 0.7283 | 0.7250 |
-| hellaswag | 0.6556 | 0.6508 |
-| winogrande | 0.7585 | 0.7672 |
-| piqa | 0.8166 | 0.8156 |
-| truthfulqa_mc1 | 0.5153 | 0.5202 |
-| openbookqa | 0.3640 | 0.3700 |
-| boolq | 0.8798 | 0.8810 |
-| arc_easy | 0.8582 | 0.8535 |
-| arc_challenge | 0.6049 | 0.5981 |
-
-
-
-### Generate the model
-
-Here is the sample command to generate the model.
-
-```bash
-auto-round \
---model Qwen/Qwen2.5-14B-Instruct \
---device 0 \
---group_size 128 \
---nsamples 512 \
---bits 4 \
---iter 1000 \
---disable_eval \
---model_dtype "fp16" \
---format 'auto_gptq,auto_round' \
---output_dir "./tmp_autoround"
-```
-
-## Ethical Considerations and Limitations
-
-The model can produce factually incorrect output, and should not be relied on to produce factually accurate information. Because of the limitations of the pretrained model and the finetuning datasets, it is possible that this model could generate lewd, biased or otherwise offensive outputs.
-
-Therefore, before deploying any applications of the model, developers should perform safety testing.
-
-## Caveats and Recommendations
-
-Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model.
-
-Here are a couple of useful links to learn more about Intel's AI software:
-
-- Intel Neural Compressor [link](https://github.com/intel/neural-compressor)
-
-## Disclaimer
-
-The license on this model does not constitute legal advice. We are not responsible for the actions of third parties who use this model. Please consult an attorney before using this model for commercial purposes.
-
-## Cite
-
-@article{cheng2023optimize, title={Optimize weight rounding via signed gradient descent for the quantization of llms}, author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao and Liu, Yi}, journal={arXiv preprint arXiv:2309.05516}, year={2023} }
-
-[arxiv](https://arxiv.org/abs/2309.05516) [github](https://github.com/intel/auto-round)
diff --git a/docs/Qwen2.5-32B-Instruct-sym.md b/docs/Qwen2.5-32B-Instruct-sym.md
deleted file mode 100644
index ac4e78eb6..000000000
--- a/docs/Qwen2.5-32B-Instruct-sym.md
+++ /dev/null
@@ -1,192 +0,0 @@
-## Model Details
-
-This model is an int4 model with group_size 128 and symmetric quantization of [Qwen/Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) generated by [intel/auto-round](https://github.com/intel/auto-round).
-
-## How To Use
-
-### INT4 Inference(CPU/HPU/CUDA)
-
-CPU/CUDA requires auto-round version>0.3.1
-
-```python
-from auto_round import AutoRoundConfig ##must import for auto-round format
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-quantized_model_dir = "Intel/Qwen2.5-32B-Instruct-int4-inc"
-tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir)
-
-model = AutoModelForCausalLM.from_pretrained(
- quantized_model_dir,
- torch_dtype="auto",
- device_map="auto",
-)
-
-##import habana_frameworks.torch.core as htcore ## uncomment it for HPU
-##import habana_frameworks.torch.hpu as hthpu ## uncomment it for HPU
-##model = model.to(torch.bfloat16).to("hpu") ## uncomment it for HPU
-
-prompt = "There is a girl who likes adventure,"
-messages = [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": prompt}]
-
-text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
-
-generated_ids = model.generate(
- model_inputs.input_ids,
- max_new_tokens=200, ##change this to align with the official usage
- do_sample=False, ##change this to align with the official usage
-)
-generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
-
-response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-print(response)
-
-prompt = "There is a girl who likes adventure,"
-##INT4:
-"""It sounds like you're starting to tell a story! Would you like me to help you continue it? Here's one possible continuation:
-
-There is a girl who loves adventure. Her name is Lily and she has always been drawn to the unknown. From a young age, she would spend hours exploring the woods behind her house, imagining herself as a brave explorer discovering new lands. As she grew older, her thirst for adventure only intensified. She began traveling the world, seeking out thrilling experiences and pushing herself out of her comfort zone at every turn.
-
-Is there anything specific you had in mind for this character or story? I'd be happy to help develop it further if you have any ideas!"""
-
-##BF16:
-"""That sounds exciting! What would you like to know or do regarding this girl who loves adventure? Perhaps you're looking for ideas on activities she might enjoy or ways to support her adventurous spirit. Let me know how I can assist you further!"""
-
-prompt = "9.11和9.8哪个数字大"
-# INT4:
-
-
-##BF16:
-"""要比较两个数字的大小,我们可以从左到右逐位进行比较。
-
-首先,我们看整数部分:
-- 9.11 的整数部分是 9。
-- 9.8 的整数部分也是 9。
-
-因为它们的整数部分相同,我们需要比较小数部分:
-- 9.11 的小数部分是 0.11。
-- 9.8 的小数部分是 0.8。
-
-接下来,我们比较小数部分的第一位:
-- 0.11 的第一位是 1。
-- 0.8 的第一位是 8。
-
-显然,8 比 1 大。因此,0.8 比 0.11 大。
-
-所以,9.8 比 9.11 大。"""
-
-
-prompt = "Once upon a time,"
-##INT4:
-
-
-##BF16:
-"""Once upon a time, in a land far, far away, there was a small village nestled between rolling hills and dense forests. The villagers lived simple lives, farming the land and tending to their livestock. They were a close-knit community, always ready to help one another in times of need.
-
-In the heart of this village stood an ancient oak tree, under which the elders would gather to share stories and make important decisions for the community. Among these tales were whispers of a hidden treasure, said to be guarded by a mythical creature deep within the forest.
-
-One day, a young girl named Elara overheard these whispers while fetching water from the well. Her curiosity piqued, she decided to embark on a quest to uncover the truth behind the legend. With nothing but her wits, a small backpack, and a map drawn by the village cartographer, Elara set off into the unknown.
-
-As she ventured deeper into the forest, she encountered various challenges and made unexpected friends along
-"""
-
-
-prompt = "请简短介绍一下阿里巴巴公司"
-##INT4:
-
-##BF16:
-"""阿里巴巴集团是一家中国跨国科技公司,成立于1999年,总部位于中国杭州。该公司以B2B电子商务平台起家,现已发展成为涵盖零售、金融、物流、云计算等多个领域的综合性企业集团。旗下拥有淘宝、天猫、阿里云等知名业务,是全球最大的电子商务和零售平台之一。阿里巴巴致力于通过技术创新和商业生态系统建设,推动数字经济的发展,并为消费者和企业提供优质的产品与服务。
-"""
-```
-
-### Evaluate the model
-
-pip3 install lm-eval==0.4.5
-
-```bash
-auto-round --model "Intel/Qwen2.5-32B-Instruct-int4-inc" --eval --eval_bs 16 --tasks leaderboard_ifeval,leaderboard_mmlu_pro,gsm8k,lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,arc_easy,arc_challenge,cmmlu,ceval-valid
-```
-
-| Metric | BF16 | INT4 |
-| :----------------------------------------- | :----: | :----: |
-| Avg | 0.7120 | 0.7089 |
-| leaderboard_mmlu_pro 5 shots | 0.5917 | 0.5795 |
-| leaderboard_ifeval inst_level_strict_acc | 0.7314 | 0.7254 |
-| leaderboard_ifeval prompt_level_strict_acc | 0.6248 | 0.6285 |
-| mmlu | 0.8169 | 0.8148 |
-| cmmlu | 0.8673 | 0.8586 |
-| ceval-valid | 0.8811 | 0.8700 |
-| gsm8k 5 shots | 0.7680 | 0.8052 |
-| lambada_openai | 0.7522 | 0.7417 |
-| hellaswag | 0.6685 | 0.6643 |
-| winogrande | 0.7372 | 0.7324 |
-| piqa | 0.8085 | 0.8134 |
-| truthfulqa_mc1 | 0.4871 | 0.4749 |
-| openbookqa | 0.3580 | 0.3480 |
-| boolq | 0.8966 | 0.8841 |
-| arc_easy | 0.8237 | 0.8228 |
-| arc_challenge | 0.5785 | 0.5785 |
-
-
-
-### Generate the model
-
-Here is the sample command to generate the model.
-
-For symmetric quantization, we found that overflow/NaN can occur for some backends, so it is better to fall back some layers to higher precision. auto_round requires version > 0.3.1
-
-```bash
-auto-round \
---model Qwen/Qwen2.5-32B-Instruct \
---device 0 \
---group_size 128 \
---nsamples 512 \
---bits 4 \
---iter 1000 \
---disable_eval \
---fp_layers "model.layers.5.mlp.down_proj,model.layers.5.mlp.up_proj,model.layers.5.mlp.gate_proj" \
---model_dtype "fp16" \
---format 'auto_round' \
---output_dir "./tmp_autoround"
-```
-
-Asym
-
-```bash
-auto-round \
---model Qwen/Qwen2.5-32B-Instruct \
---device 0 \
---group_size 128 \
---nsamples 512 \
---bits 4 \
---iter 1000 \
---disable_eval \
---asym \
---model_dtype "fp16" \
---format 'auto_round' \
---output_dir "./tmp_autoround"
-```
-
-## Ethical Considerations and Limitations
-
-The model can produce factually incorrect output, and should not be relied on to produce factually accurate information. Because of the limitations of the pretrained model and the finetuning datasets, it is possible that this model could generate lewd, biased or otherwise offensive outputs.
-
-Therefore, before deploying any applications of the model, developers should perform safety testing.
-
-## Caveats and Recommendations
-
-Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model.
-
-Here are a couple of useful links to learn more about Intel's AI software:
-
-- Intel Neural Compressor [link](https://github.com/intel/neural-compressor)
-
-## Disclaimer
-
-The license on this model does not constitute legal advice. We are not responsible for the actions of third parties who use this model. Please consult an attorney before using this model for commercial purposes.
-
-## Cite
-
-@article{cheng2023optimize, title={Optimize weight rounding via signed gradient descent for the quantization of llms}, author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao and Liu, Yi}, journal={arXiv preprint arXiv:2309.05516}, year={2023} }
-
-[arxiv](https://arxiv.org/abs/2309.05516) [github](https://github.com/intel/auto-round)
diff --git a/docs/Qwen2.5-72B-Instruct-sym.md b/docs/Qwen2.5-72B-Instruct-sym.md
deleted file mode 100644
index d3ed45676..000000000
--- a/docs/Qwen2.5-72B-Instruct-sym.md
+++ /dev/null
@@ -1,191 +0,0 @@
-## Model Details
-
-This model is an int4 model with group_size 128 and symmetric quantization of [Qwen/Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) generated by [intel/auto-round](https://github.com/intel/auto-round). Load the model with `revision="b7ea233"` to use AutoGPTQ format.
-## How To Use
-
-### INT4 Inference(CPU/HPU/CUDA)
-
-CPU requires auto-round version>0.3.1
-
-```python
-from auto_round import AutoRoundConfig ##must import for auto-round format
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-quantized_model_dir = "Intel/Qwen2.5-72B-Instruct-int4-inc"
-tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir)
-
-model = AutoModelForCausalLM.from_pretrained(
- quantized_model_dir,
- torch_dtype="auto",
- device_map="auto",
- ##revision="b7ea233" ##AutoGPTQ format
-)
-
-##import habana_frameworks.torch.core as htcore ## uncomment it for HPU
-##import habana_frameworks.torch.hpu as hthpu ## uncomment it for HPU
-##model = model.to(torch.bfloat16).to("hpu") ## uncomment it for HPU
-
-prompt = "There is a girl who likes adventure,"
-messages = [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": prompt}]
-
-text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
-
-generated_ids = model.generate(
- model_inputs.input_ids,
- max_new_tokens=200, ##change this to align with the official usage
- do_sample=False, ##change this to align with the official usage
-)
-generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
-
-response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-print(response)
-
-prompt = "There is a girl who likes adventure,"
-##INT4:
-"""That's great to hear! A love for adventure can lead to so many exciting experiences and personal growth. What kind of adventures does she enjoy? Is it exploring new places, trying out thrilling activities, or perhaps diving into new hobbies and challenges? Knowing more about her interests could help in suggesting fun ideas or planning future adventures.
-"""
-
-##BF16:
-"""That sounds like a great quality to have! A love for adventure can lead to exciting experiences and personal growth. What kind of adventures does she enjoy? Does she like exploring new places, trying new activities, or seeking out thrilling experiences? Knowing more about her interests can help suggest specific ideas or activities that might appeal to her adventurous spirit.
-"""
-
-prompt = "9.11和9.8哪个数字大"
-# INT4:
-"""要比较9.11和9.8的大小,可以按照以下步骤进行:
-
-1. **比较整数部分**:两个数字的整数部分都是9,所以需要进一步比较小数部分。
-2. **比较小数部分**:
- - 9.11的小数部分是0.11
- - 9.8的小数部分是0.8
-
-3. **比较0.11和0.8**:
- - 0.11可以写成0.110
- - 0.8可以写成0.800
-
-4. **逐位比较**:
- - 第一位:1 < 8,所以0.110 < 0.800
-
-因此,9.11 < 9.8。
-
-结论:9.8比9.11大。"""
-
-##BF16:
-"""比较两个数字 9.11 和 9.8,可以按照以下步骤进行:
-
-1. **整数部分**:两个数字的整数部分都是 9,所以需要比较小数部分。
-2. **小数部分**:
- - 9.11 的小数部分是 0.11
- - 9.8 的小数部分是 0.8
-
-3. **比较小数部分**:
- - 0.11 和 0.8 比较时,0.8 明显大于 0.11。
-
-因此,9.8 大于 9.11。"""
-
-
-prompt = "Once upon a time,"
-##INT4:
-"""Once upon a time, in a far-off land, there was a kingdom filled with wonder and magic. The kingdom was ruled by a wise and just king who loved his people dearly. In the heart of the kingdom stood a magnificent castle, surrounded by lush forests and rolling hills.
-
-The people of the kingdom lived happily, tending to their farms, crafting beautiful goods, and enjoying the simple pleasures of life. However, one day, a great darkness began to spread across the land. A wicked sorcerer had risen from the shadows, seeking to claim the throne for himself and plunge the kingdom into chaos.
-
-The king, knowing that he could not face this threat alone, called upon the bravest and most skilled heroes from all corners of the realm. Among them was a young knight named Sir Cedric, who had earned a reputation for his courage and unwavering sense of justice.
-
-Sir Cedric, along with a group of loyal companions, set out on a perilous journey to stop the sor"""
-
-##BF16:
-"""Once upon a time, in a land far, far away, there was a kingdom known for its beauty and prosperity. The kingdom was ruled by a wise and just king who loved his people dearly. In the heart of the kingdom stood a magnificent castle, surrounded by lush gardens and sparkling fountains.
-
-The king had a young daughter named Princess Elara, who was as kind and gentle as she was beautiful. She spent her days helping the poor and spreading joy throughout the kingdom. The people adored her, and she was beloved by all.
-
-One day, a great challenge arose. A dark forest on the outskirts of the kingdom began to grow wild and dangerous, threatening the safety of the villagers. The king called for a hero to tame the forest and protect his people. Many brave knights and warriors came forward, but none could succeed.
-
-Princess Elara, determined to help, decided to venture into the forest herself. Her father was hesitant, but he saw the determination in her eyes and knew"""
-
-
-prompt = "请简短介绍一下阿里巴巴公司"
-##INT4:
-"""阿里巴巴集团是一家总部位于中国杭州的全球领先的电子商务和科技公司。它成立于1999年,由马云和他的团队创立。阿里巴巴旗下拥有包括淘宝、天猫、阿里云等在内的多个知名业务平台,涵盖了在线零售、批发贸易、云计算、数字娱乐、金融服务等多个领域。
-
-阿里巴巴的愿景是让世界各地的企业都能够平等地进行贸易,通过技术创新推动数字经济的发展,为社会创造更多的价值。目前,阿里巴巴已经发展成为世界领先的互联网公司之一,业务遍布全球多个国家和地区,服务着数以亿计的用户和商家。"""
-
-##BF16:
-"""阿里巴巴集团是一家总部位于中国杭州的全球领先的电子商务和科技公司。成立于1999年,阿里巴巴最初是一个B2B在线市场,旨在连接中国制造商与全球买家。经过二十多年的发展,阿里巴巴已经发展成为涵盖电子商务、金融、物流、云计算等多个领域的综合性企业集团。
-
-阿里巴巴旗下拥有淘宝网、天猫、菜鸟网络、阿里云等知名品牌,为消费者提供购物、支付、娱乐等多元化服务,同时也为企业提供营销、销售、物流和技术支持等全方位解决方案。此外,阿里巴巴还积极投资和孵化创新项目,推动数字经济的发展。
-
-阿里巴巴始终秉持“让天下没有难做的生意”的使命,致力于通过技术创新促进全球经济的可持续发展。"""
-```
-
-### Evaluate the model
-
-pip3 install lm-eval==0.4.5
-
-```bash
-auto-round --model "Intel/Qwen2.5-72B-Instruct-int4-inc" --eval --eval_bs 16 --tasks leaderboard_ifeval,leaderboard_mmlu_pro,gsm8k,lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,arc_easy,arc_challenge,cmmlu,ceval-valid
-```
-
-| Metric | BF16 | INT4 |
-| :----------------------------------------- | :----: | :----: |
-| Avg | 0.7413 | 0.7448 |
-| leaderboard_mmlu_pro 5 shots | 0.5919 | 0.5864 |
-| leaderboard_ifeval inst_level_strict_acc | 0.7770 | 0.7866 |
-| leaderboard_ifeval prompt_level_strict_acc | 0.6858 | 0.6932 |
-| mmlu | 0.8334 | 0.8308 |
-| cmmlu | 0.8727 | 0.8673 |
-| ceval-valid | 0.8975 | 0.8960 |
-| gsm8k 5 shots | 0.9037 | 0.9098 |
-| lambada_openai | 0.7518 | 0.7563 |
-| hellaswag | 0.7031 | 0.7014 |
-| winogrande | 0.7601 | 0.7687 |
-| piqa | 0.8313 | 0.8232 |
-| truthfulqa_mc1 | 0.5239 | 0.5263 |
-| openbookqa | 0.3860 | 0.3820 |
-| boolq | 0.9049 | 0.9046 |
-| arc_easy | 0.8632 | 0.8611 |
-| arc_challenge | 0.6135 | 0.6237 |
-
-
-
-### Generate the model
-
-Here is the sample command to generate the model.
-
-```bash
-auto-round \
---model Qwen/Qwen2.5-72B-Instruct \
---device 0 \
---group_size 128 \
---nsamples 512 \
---bits 4 \
---iter 1000 \
---disable_eval \
---model_dtype "fp16" \
---format 'auto_gptq,auto_round' \
---output_dir "./tmp_autoround"
-```
-
-## Ethical Considerations and Limitations
-
-The model can produce factually incorrect output, and should not be relied on to produce factually accurate information. Because of the limitations of the pretrained model and the finetuning datasets, it is possible that this model could generate lewd, biased or otherwise offensive outputs.
-
-Therefore, before deploying any applications of the model, developers should perform safety testing.
-
-## Caveats and Recommendations
-
-Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model.
-
-Here are a couple of useful links to learn more about Intel's AI software:
-
-- Intel Neural Compressor [link](https://github.com/intel/neural-compressor)
-
-## Disclaimer
-
-The license on this model does not constitute legal advice. We are not responsible for the actions of third parties who use this model. Please consult an attorney before using this model for commercial purposes.
-
-## Cite
-
-@article{cheng2023optimize, title={Optimize weight rounding via signed gradient descent for the quantization of llms}, author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao and Liu, Yi}, journal={arXiv preprint arXiv:2309.05516}, year={2023} }
-
-[arxiv](https://arxiv.org/abs/2309.05516) [github](https://github.com/intel/auto-round)
diff --git a/docs/Qwen2.5-7B-Instruct-sym.md b/docs/Qwen2.5-7B-Instruct-sym.md
deleted file mode 100644
index 82fdf0c5b..000000000
--- a/docs/Qwen2.5-7B-Instruct-sym.md
+++ /dev/null
@@ -1,176 +0,0 @@
-## Model Details
-
-This model is an int4 model with group_size 128 and symmetric quantization of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) generated by [intel/auto-round](https://github.com/intel/auto-round). Load the model with `revision="5a6d912"` to use AutoGPTQ format
-
-## How To Use
-
-### INT4 Inference(CPU/HPU/CUDA)
-
-CPU requires auto-round version>0.3.1
-
-```python
-from auto_round import AutoRoundConfig ##must import for auto-round format
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-quantized_model_dir = "Intel/Qwen2.5-7B-Instruct-int4-inc"
-tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir)
-
-model = AutoModelForCausalLM.from_pretrained(
- quantized_model_dir,
- torch_dtype="auto",
- device_map="auto",
- ##revision="0b70f95" ##AutoGPTQ format
- ##revision="5a6d912" ##Quantized lm-head version
-)
-
-##import habana_frameworks.torch.core as htcore ## uncomment it for HPU
-##import habana_frameworks.torch.hpu as hthpu ## uncomment it for HPU
-##model = model.to(torch.bfloat16).to("hpu") ## uncomment it for HPU
-
-prompt = "There is a girl who likes adventure,"
-messages = [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": prompt}]
-
-text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
-
-generated_ids = model.generate(
- model_inputs.input_ids,
- max_new_tokens=200, ##change this to align with the official usage
- do_sample=False, ##change this to align with the official usage
-)
-generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
-
-response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-print(response)
-
-prompt = "There is a girl who likes adventure,"
-##INT4:
-"""That's great! It sounds like this girl has an exciting spirit. How can I help you explore her love for adventure? Are you looking for ideas for adventurous activities, planning a trip, or perhaps looking for ways to support her adventurous nature?
-"""
-
-##BF16:
-"""That sounds exciting! What kind of adventures does she enjoy? Is there something specific you'd like to plan or discuss related to her love for adventure?
-"""
-
-prompt = "9.11和9.8哪个数字大"
-# INT4:
-"""在比较9.11和9.8时,我们从左到右逐位进行比较:
-
-1. 首先比较整数部分:两个数的整数部分都是9,相等。
-2. 接下来比较小数部分:
- - 9.11的小数部分是0.11
- - 9.8的小数部分是0.8
-
-将0.11和0.8进行比较,显然0.8大于0.11。
-
-因此,9.8比9.11大。
-"""
-
-##BF16:
-"""在比较9.11和9.8这两个数字时,我们可以直接进行比较:
-
-- 9.11 是九点一一
-- 9.8 是九点八
-
-显然,9.8 比 9.11 大。这是因为9.8中的十分位是8,而9.11中的十分位是1,8大于1。
-
-所以,9.8 > 9.11。"""
-
-
-prompt = "Once upon a time,"
-##INT4:
-"""Once upon a time, in a land filled with wonder and magic, there lived a young girl named Elara. She had bright eyes that sparkled like the stars on a clear night and hair as golden as the sun-kissed fields of wheat. Elara's home was a cozy cottage nestled at the edge of a vast, enchanted forest, where ancient trees whispered secrets to one another and mystical creatures roamed freely.
-
-Every day, Elara would venture into the forest, exploring its hidden paths and marveling at the wonders it held. One sunny morning, as she wandered deeper into the woods than ever before, she stumbled upon a glade bathed in a soft, ethereal light. In the center of this glade stood an enormous tree, its trunk wider than any building Elara had ever seen, and its branches stretching high into the sky.
-
-As she approached the tree, she noticed a small, shimmering door carved into its bark. Curious, Elara reached out and"""
-
-##BF16:
-"""Once upon a time, in a land filled with wonder and magic, there lived a young girl named Elara. She had bright eyes that sparkled like the stars on a clear night and hair as golden as the sun-kissed fields of wheat. Elara's home was a cozy cottage nestled at the edge of a vast, enchanted forest, where mythical creatures roamed freely and ancient trees whispered secrets of old.
-
-Every day, Elara would venture into the forest to gather herbs and flowers for her grandmother, who was a skilled healer. The forest was her sanctuary, a place where she could escape the worries of the world and find solace in nature's embrace. As she walked through the dense underbrush, she often stumbled upon hidden glades and sparkling streams, each more enchanting than the last.
-
-One sunny morning, as Elara was collecting wildflowers near a babbling brook, she heard a faint cry for help. Curious, she followed the sound to a clearing where"""
-
-
-prompt = "请简短介绍一下阿里巴巴公司"
-##INT4:
-"""阿里巴巴集团是一家总部位于中国杭州的全球领先电子商务和科技公司,成立于1999年。它由马云创立,旗下拥有包括淘宝、天猫、阿里云等在内的多个知名业务。阿里巴巴致力于构建全球化的商业生态系统,为消费者、企业和政府提供创新的在线解决方案。同时,阿里巴巴也是中国最大的零售市场之一,并在云计算、数字娱乐、物流等领域具有重要影响力。
-"""
-
-##BF16:
-"""阿里巴巴集团是一家总部位于中国杭州的全球领先电子商务和科技公司,成立于1999年。阿里巴巴旗下拥有淘宝、天猫、阿里云等知名业务,致力于构建全球化的商业生态系统。阿里巴巴愿景是让世界各地的企业都能够平等地进行贸易,并通过技术创新推动社会进步。
-"""
-```
-
-### Evaluate the model
-
-pip3 install lm-eval==0.4.5
-
-```bash
-auto-round --model "Intel/Qwen2.5-7B-Instruct-int4-inc" --eval --eval_bs 16 --tasks leaderboard_ifeval,leaderboard_mmlu_pro,gsm8k,lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,arc_easy,arc_challenge,cmmlu,ceval-valid
-```
-
-| Metric | BF16 | INT4 (5.3G) | INT4 lm-head (4.5G) |
-| :----------------------------------------- | :----: | :----: | ------------------- |
-| Avg | 0.6649 | 0.6586 | 0.6577 |
-| leaderboard_mmlu_pro 5 shots | 0.4458 | 0.4436 | 0.4384 |
-| leaderboard_ifeval inst_level_strict_acc | 0.6859 | 0.6715 | 0.6595 |
-| leaderboard_ifeval prompt_level_strict_acc | 0.5730 | 0.5508 | 0.5379 |
-| mmlu | 0.7174 | 0.7147 | 0.7145 |
-| cmmlu | 0.8028 | 0.7888 | 0.7888 |
-| ceval-valid | 0.7935 | 0.7838 | 0.7741 |
-| gsm8k 5 shots | 0.7665 | 0.7544 | 0.8006 |
-| lambada_openai | 0.6949 | 0.6878 | 0.6763 |
-| hellaswag | 0.6195 | 0.6139 | 0.6121 |
-| winogrande | 0.7119 | 0.7064 | 0.7135 |
-| piqa | 0.7938 | 0.7873 | 0.7845 |
-| truthfulqa_mc1 | 0.4786 | 0.4774 | 0.4810 |
-| openbookqa | 0.3480 | 0.3580 | 0.3540 |
-| boolq | 0.8636 | 0.8602 | 0.8609 |
-| arc_easy | 0.8131 | 0.8068 | 0.8081 |
-| arc_challenge | 0.5282 | 0.5316 | 0.5188 |
-
-
-
-### Generate the model
-
-Here is the sample command to generate the model.
-
-```bash
-auto-round \
---model Qwen/Qwen2.5-7B-Instruct \
---device 0 \
---group_size 128 \
---nsamples 512 \
---bits 4 \
---iter 1000 \
---disable_eval \
---model_dtype "fp16" \
---format 'auto_gptq,auto_round' \
---output_dir "./tmp_autoround"
-```
-
-## Ethical Considerations and Limitations
-
-The model can produce factually incorrect output, and should not be relied on to produce factually accurate information. Because of the limitations of the pretrained model and the finetuning datasets, it is possible that this model could generate lewd, biased or otherwise offensive outputs.
-
-Therefore, before deploying any applications of the model, developers should perform safety testing.
-
-## Caveats and Recommendations
-
-Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model.
-
-Here are a couple of useful links to learn more about Intel's AI software:
-
-- Intel Neural Compressor [link](https://github.com/intel/neural-compressor)
-
-## Disclaimer
-
-The license on this model does not constitute legal advice. We are not responsible for the actions of third parties who use this model. Please consult an attorney before using this model for commercial purposes.
-
-## Cite
-
-@article{cheng2023optimize, title={Optimize weight rounding via signed gradient descent for the quantization of llms}, author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao and Liu, Yi}, journal={arXiv preprint arXiv:2309.05516}, year={2023} }
-
-[arxiv](https://arxiv.org/abs/2309.05516) [github](https://github.com/intel/auto-round)
diff --git a/docs/Qwen3-14B-sym-recipe.md b/docs/Qwen3-14B-sym-recipe.md
deleted file mode 100644
index 3dbc72e2d..000000000
--- a/docs/Qwen3-14B-sym-recipe.md
+++ /dev/null
@@ -1,219 +0,0 @@
----
-datasets:
-- NeelNanda/pile-10k
----
-
-## Model Details
-
-This model is an int4 model with group_size 128 and symmetric quantization of [Qwen/Qwen3-14B](https://huggingface.co/Qwen/Qwen3-14B) generated by [intel/auto-round](https://github.com/intel/auto-round).
-## How To Use
-
-### INT4 Inference(CPU/CUDA/INTEL GPU)
-```python
-from transformers import AutoModelForCausalLM,AutoTokenizer
-quantized_model_dir = "Intel/Qwen3-14B-250426-int4-sym-AutoRound"
-
-# load the tokenizer and the model
-tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir)
-model = AutoModelForCausalLM.from_pretrained(
- quantized_model_dir,
- torch_dtype="auto",
- device_map="auto"
-)
-
-# prepare the model input
-prompt = "Give me a short introduction to large language model."
-messages = [
- {"role": "user", "content": prompt}
-]
-text = tokenizer.apply_chat_template(
- messages,
- tokenize=False,
- add_generation_prompt=True,
- enable_thinking=True # Switches between thinking and non-thinking modes. Default is True.
-)
-model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
-
-# conduct text completion
-generated_ids = model.generate(
- **model_inputs,
- max_new_tokens=512, ##change this to align with the official usage
- do_sample=False ##change this to align with the official usage
-)
-output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
-
-# parsing thinking content
-try:
- # rindex finding 151668 ()
- index = len(output_ids) - output_ids[::-1].index(151668)
-except ValueError:
- index = 0
-
-thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
-content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
-
-print("thinking content:", thinking_content)
-print("content:", content)
-##INT4:
-# thinking content:
-# Okay, the user wants a short introduction to large language models. Let me start by defining what they are. I should mention that they're AI systems trained on vast amounts of text data. Maybe explain their capabilities, like understanding and generating human-like text. I need to cover different applications, such as answering questions, writing stories, coding, etc. Also, it's important to note their training process, using deep learning techniques like transformers. I should mention their ability to handle multiple languages and adapt to various tasks. But I should keep it concise, so avoid too much technical jargon. Maybe end with their impact on technology and industries. Let me check if I'm missing anything. Oh, maybe mention some examples like GPT, BERT, or other models. But since the user asked for a short intro, maybe just refer to them as examples without going into detail. Alright, that should cover the basics without being too lengthy.
-#
-# content: A **large language model (LLM)** is an advanced artificial intelligence system trained on vast amounts of text data to understand and generate human-like text. These models use deep learning techniques, such as transformer architectures, to process and analyze language patterns, enabling them to perform tasks like answering questions, writing stories, coding, summarizing text, and more. LLMs excel at handling multiple languages, adapting to diverse contexts, and producing coherent, context-aware responses. They power applications ranging from chatbots and virtual assistants to content creation tools and research aids, revolutionizing how humans interact with technology. Examples include models like GPT, BERT, and others developed by companies like OpenAI, Google, and Meta.
-
-##BF16:
-# thinking content:
-# Okay, the user wants a short introduction to large language models. Let me start by defining what they are. I should mention that they're AI models trained on vast amounts of text data. Maybe explain their capabilities, like generating text, answering questions, and understanding context.
-#
-# I need to highlight their applications, such as in chatbots, content creation, and research. Also, it's important to note their strengths, like handling multiple languages and complex tasks. But I shouldn't forget to mention the challenges, like data bias and computational needs. Keep it concise but informative. Make sure the language is clear and not too technical. Let me check if I covered the key points without getting too detailed. Alright, that should work.
-#
-# content: A **large language model (LLM)** is an advanced artificial intelligence system trained on vast amounts of text data to understand and generate human-like language. These models excel at tasks like answering questions, writing text, translating languages, coding, and more, by recognizing patterns and context in the input they receive. Powered by deep learning and massive computational resources, LLMs like GPT, BERT, or LLaMA can process and produce coherent, contextually relevant responses across diverse domains. They are widely used in chatbots, content creation, research, and automation, though challenges like bias, data privacy, and ethical use remain critical areas of focus.
-
-
-prompt = "9.11和9.8哪个数字大"
-##INT4:
-# thinking content:
-# 嗯,用户问的是9.11和9.8哪个数字大。首先,我需要确认这两个数字的数值大小。看起来都是小数,但可能有不同的位数。首先,我应该比较它们的整数部分。9.11的整数部分是9,而9.8的整数部分也是9,所以整数部分相同,需要比较小数部分。
-# 接下来,比较小数部分。9.11的小数部分是0.11,而9.8的小数部分是0.8。这时候,可能需要将它们转换成相同的位数来比较,或者直接比较数值。0.8等于0.80,而0.11显然比0.80小,所以9.8应该更大。
-# 不过,用户可能对小数点后的位数有疑问,比如是否需要补零。比如,9.8可以看作9.80,这样比较的话,0.80确实比0.11大。所以结论是9.8更大。
-# 不过,也有可能用户有其他考虑,比如是否在特定的上下文中,比如日期或者其他含义,但问题中没有提到,所以应该按数值比较。另外,检查是否有输入错误,比如9.11是否是9.11还是其他形式,但看起来是正确的。
-# 再想一下,有没有可能用户混淆了小数点后的位数,比如9.11是两位小数,而9.8是一位,但比较时应该按数值大小,而不是位数。所以,不管位数多少,只要数值更大就行。0.8比0.11大,所以9.8更大。
-# 确认一下,9.8等于9.80,而9.11是9.11,所以9.80确实比9.11大。因此,答案应该是9.8更大。
-#
-# content: 9.8 比 9.11 大。
-# **比较过程:**
-# 1. **整数部分**:两者整数部分均为 **9**,相同。
-# 2. **小数部分**:
-# - 9.11 的小数部分为 **0.11**。
-# - 9.8 的小数部分为 **0.8**(可视为 **0.80** 以统一位数)。
-# 3. **数值比较**:
-# $0.
-
-##BF16:
-# content:
-# 嗯,用户问的是9.11和9.8哪个数字大。首先,我需要确认用户的问题是否正确,因为这两个数字看起来像是小数,但可能有不同的含义。比如,9.11可能是指9月11日,而9.8可能是指9月8日,或者它们都是小数,比如9.11和9.80。不过,用户的问题里没有提到日期,所以可能只是单纯的数值比较。
-# 首先,我应该先假设用户是想比较两个小数的大小。这时候,我需要回忆一下小数比较的方法。比较小数时,首先比较整数部分,如果整数部分相同,再比较小数部分。这里两个数的整数部分都是9,所以接下来比较小数部分。
-# 9.11的小数部分是0.11,而9.8的小数部分是0.8。这时候,我需要比较0.11和0.8的大小。通常,比较小数时,可以将它们转换成同位数,比如0.11和0.80,这样比较的话,0.80显然比0.11大,所以9.8比9.11大。
-# 不过,也有可能用户有其他的意图,比如9.11指的是9月11日,而9.8指的是9月8日,这时候比较日期的话,9月11日显然比9月8日晚,所以9.11更大。但用户的问题里没有提到日期,所以可能还是数值比较更合理。
-# 不过,也有可能用户输入有误,比如9.11和9.8是否都是小数,或者有没有其他符号。比如,9.11可能是一个版本号,而9.8是另一个版本,这时候可能需要看具体的上下文。但如果没有更多上下文,应该按照数值比较来处理。
-# 再检查一下,如果用户输入的是9.11和9.8,那么作为小数的话,9.8等于9.80,而9.11是9.11,所以9.80比9.11大。因此,答案应该是9.8更大。
-# 不过,也有可能用户有拼写错误,比如9.11是否是9.11,而9.8是否是9.8,或者有没有其他数字。但根据问题中的写法,应该是正确的。
-
-
-prompt = "How many r in word strawberry"
-##INT4:
-# thinking content:
-# Okay, let's see. The user is asking how many times the letter 'r' appears in the word "strawberry". Hmm, first I need to make sure I have the correct spelling of the word. Strawberry is spelled S-T-R-A-W-B-E-R-R-Y. Let me write that out to visualize it better: S, T, R, A, W, B, E, R, R, Y.
-# Now, I need to count each occurrence of the letter 'r'. Let's go through each letter one by one. The first letter is S, not an R. The second is T, also not. The third is R, that's one. Then A, W, B, E – none of those are R. The next letter is R again, that's two. Then another R, that's three. The last letter is Y. So in total, there are three R's in "strawberry".
-# Wait, let me double-check to make sure I didn't miss any. Let's count again: S (1), T (2), R (3), A (4), W (5), B (6), E (7), R (8), R (9), Y (10). So positions 3, 8, and 9 are R's. That's three times. Yeah, that seems right. I don't think I made a mistake here. The word has three R's.
-#
-# content: The word **"strawberry"** contains **3** instances of the letter **'r'**.
-# **Breakdown:**
-# S-T-**R**-A-W-B-E-**R**-**R**-Y
-# Positions 3, 8, and 9 are 'r's.
-# **Answer:** 3.
-
-##BF16:
-# thinking content:
-# Okay, let's see. The user is asking how many times the letter "r" appears in the word "strawberry." Hmm, first I need to make sure I have the correct spelling of the word. Strawberry is spelled S-T-R-A-W-B-E-R-R-Y. Let me write that out to visualize it better: S, T, R, A, W, B, E, R, R, Y.
-# Now, I need to count each occurrence of the letter "r." Let's go through each letter one by one. Starting with the first letter, S – that's not an R. Next is T – also not an R. The third letter is R, so that's one. Then A, W, B, E – none of those are R. The next letter is R again, that's the second one. Then another R right after that, making it the third R. Finally, Y at the end.
-# Wait, let me check again to make sure I didn't miss any. The letters are S, T, R, A, W, B, E, R, R, Y. So positions 3, 8, and 9 are R's. That's three R's in total. But wait, sometimes people might miscount if they skip a letter or double-count. Let me count once more: first R is the third letter, then after E comes R, then another R. Yes, that's three.
-# I think that's correct. So the answer should be 3. But just to be absolutely sure, maybe I can write out the word and underline each R. S-T-R-A-W-B-E-R-R-Y. Underlining the R's: the third letter, then the eighth and ninth. Yep, three R's. Okay, I'm confident now.
-#
-# content: The word **"strawberry"** contains **3** instances of the letter **"r"**.
-# **Breakdown:**
-# S-T-**R**-A-W-B-E-**R**-**R**-Y
-# Positions 3, 8, and 9 are all "r".
-# **Answer:** 3.
-
-prompt = "请简短介绍一下阿里巴巴公司"
-##INT4:
-thinking content:
-# 好的,用户让我简短介绍一下阿里巴巴公司。首先,我需要确定用户的需求是什么。可能他们需要一个快速了解阿里巴巴的概述,比如成立时间、主要业务、创始人等基本信息。
-# 接下来,我要确保信息准确。阿里巴巴成立于1999年,由马云和他的团队创立,总部在杭州。主要业务包括电商平台(淘宝、天猫、京东)、云计算(阿里云)、数字支付(支付宝)、物流(菜鸟网络)等。这些都是关键点,不能遗漏。
-# 然后,用户可能想知道阿里巴巴的影响力。作为全球最大的电商平台之一,阿里云在云计算领域排名靠前,支付宝改变了中国的支付方式,这些都能体现其重要性。另外,提到国际化战略,比如Lazada、阿里国际站,显示其全球布局。
-# 还要注意用户可能的深层需求。他们可能是在做报告、写文章,或者只是好奇。所以信息要简洁明了,结构清晰,分点列出可能更好,但用户要求简短,所以需要整合成一段。
-# 需要避免使用专业术语,保持口语化,但因为是公司介绍,可能需要稍微正式一点。同时,检查是否有最新的数据,比如成立时间是否正确,是否还有其他重要业务需要提及,比如蚂蚁集团,但可能用户只需要主要部分。
-# 最后,确保没有错误,比如成立时间、创始人名字、总部地点等。确认后,组织语言,保持简短,覆盖关键点,不超过两段,每段不要太长。
-#
-# content: 阿里巴巴集团(Alibaba Group)成立于1999年,总部位于中国杭州,由马云及其团队创立。公司最初以B2B电商平台起家,现已成为全球领先的互联网科技企业,业务涵盖电子商务(淘宝、天猫、京东)、云计算(阿里云)、数字支付(支付宝)、物流(菜鸟网络)、金融科技(蚂蚁集团)及国际化业务(如Lazada、阿里国际站)等。阿里巴巴通过技术创新和生态布局,深刻影响了全球数字经济的发展。
-
-##BF16:
-# thinking content:
-# 嗯,用户让我简短介绍一下阿里巴巴公司。首先,我需要确定用户的需求是什么。可能他们只是想了解阿里巴巴的基本信息,或者是在做作业、写报告,或者只是好奇。不管怎样,用户需要的是简洁明了的回答,不能太冗长。
-# 接下来,我得回忆一下阿里巴巴的基本信息。阿里巴巴集团成立于1999年,总部在杭州,对吧?创始人是马云,不过现在可能已经卸任了。阿里巴巴主要业务包括电商平台,比如淘宝、天猫,还有菜鸟网络、支付宝、阿里云这些子公司。可能还需要提到它的全球影响力,比如在电子商务、云计算、金融科技方面的成就。
-# 用户可能还想知道阿里巴巴的规模,比如员工数量、市值,或者它在行业中的地位。不过用户要求简短,所以这些细节可能需要简化。另外,可能需要提到它的使命,比如让天下没有难做的生意,或者推动数字化转型。
-# 还要注意避免错误信息,比如马云是否还在任。我记得他已经在2019年卸任董事局主席,现在是董事会成员。另外,阿里巴巴的上市时间是2014年,港交所和纽交所都有。可能需要提到这些关键点。
-# 用户可能没有说出来的深层需求是希望了解阿里巴巴的核心业务和影响力,而不仅仅是表面信息。所以需要突出其作为全球最大的电商平台之一,以及在云计算和金融科技领域的领先地位。同时,可能还要提到其社会责任,比如环保、公益项目等。
-# 需要确保回答结构清晰,分点列出,但用户要求简短,所以可能用一两句话概括。比如成立时间、总部、创始人、主要业务、全球地位等。还要注意用词准确,比如“集团”而不是“公司”,因为阿里巴巴有多个子公司。
-# 最后检查有没有遗漏的重要信息,比如是否提到支付宝和阿里云,这些是阿里巴巴的重要组成部分。同时,可能需要提到其对中国经济和全球电商的影响,但保持简洁。确保没有错误,比如成立年份、上市地点等。这样用户就能得到一个全面但简短的介绍。
-#
-# content: 阿里巴巴集团(Alibaba Group)成立于1999年,总部位于中国杭州,由马云创立。它是全球领先的互联网科技公司,核心业务涵盖电子商务(淘宝、天猫)、云计算(阿里云)、金融科技(支付宝)、物流(菜鸟网络)及创新业务(如盒马鲜生、阿里健康等)。阿里巴巴致力于通过数字化技术赋能企业与消费者,推动全球商业变革,旗下拥有
-
-
-```
-
-### Evaluate the model
-
-pip3 install lm-eval
-```bash
-auto-round-eval --model "Intel/Qwen3-14B-250426-int4-sym-AutoRound" --eval_bs 16 --tasks leaderboard_ifeval,leaderboard_mmlu_pro,gsm8k,lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,arc_easy,arc_challenge,mmlu,cmmlu,ceval-valid
-```
-| Metric | BF16 | INT4(best) | INT4(default) |
-| :----------------------------------------- | :----: | :----: | :----: |
-| Avg | 0.6491 | 0.6484 | 0.6467 |
-| arc_easy | 0.8409 | 0.8367 | 0.8396 |
-| arc_challenge | 0.5845 | 0.5845 | 0.5776 |
-| boolq | 0.8933 | 0.8917 | 0.8954 |
-| ceval-valid | 0.8210 | 0.8217 | 0.8098 |
-| cmmlu | 0.8020 | 0.7951 | 0.7942 |
-| gsm8k 5 shots | 0.8832 | 0.8908 | 0.8863 |
-| hellaswag | 0.6095 | 0.6035 | 0.6030 |
-| lambada_openai | 0.6773 | 0.6788 | 0.6761 |
-| leaderboard_mmlu_pro 5 shots | 0.5322 | 0.5281 | 0.5289 |
-| leaderboard_ifeval inst_level_strict_acc | 0.4173 | 0.4245 | 0.4269 |
-| leaderboard_ifeval prompt_level_strict_acc | 0.2717 | 0.2699 | 0.2736 |
-| mmlu | 0.7714 | 0.7671 | 0.7671 |
-| openbookqa | 0.3500 | 0.3440 | 0.3420 |
-| piqa | 0.7992 | 0.7960 | 0.7971 |
-| truthfulqa_mc1 | 0.4027 | 0.4064 | 0.4027 |
-| winogrande | 0.7285 | 0.7348 | 0.7269 |
-
-
-### Generate the model
-
-Here is the sample command to generate the model.
-
-
-```bash
-auto-round-best \
---model Qwen/Qwen3-14B \
---device 0 \
---group_size 128 \
---bits 4 \
---format 'auto_round' \
---output_dir "./tmp_autoround"
-```
-
-## Ethical Considerations and Limitations
-
-The model can produce factually incorrect output, and should not be relied on to produce factually accurate information. Because of the limitations of the pretrained model and the finetuning datasets, it is possible that this model could generate lewd, biased or otherwise offensive outputs.
-
-Therefore, before deploying any applications of the model, developers should perform safety testing.
-
-## Caveats and Recommendations
-
-Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model.
-
-Here are a couple of useful links to learn more about Intel's AI software:
-
-- Intel Neural Compressor [link](https://github.com/intel/neural-compressor)
-
-## Disclaimer
-
-The license on this model does not constitute legal advice. We are not responsible for the actions of third parties who use this model. Please consult an attorney before using this model for commercial purposes.
-
-## Cite
-
-@article{cheng2023optimize, title={Optimize weight rounding via signed gradient descent for the quantization of llms}, author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao and Liu, Yi}, journal={arXiv preprint arXiv:2309.05516}, year={2023} }
-
-[arxiv](https://arxiv.org/abs/2309.05516) [github](https://github.com/intel/auto-round)
\ No newline at end of file
diff --git a/docs/Qwen3-8B-sym-recipe.md b/docs/Qwen3-8B-sym-recipe.md
deleted file mode 100644
index 74c96fa97..000000000
--- a/docs/Qwen3-8B-sym-recipe.md
+++ /dev/null
@@ -1,247 +0,0 @@
----
-datasets:
-- NeelNanda/pile-10k
----
-
-## Model Details
-
-This model is an int4 model with group_size 128 and symmetric quantization of [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) generated by [intel/auto-round](https://github.com/intel/auto-round).
-## How To Use
-
-### INT4 Inference(CPU/CUDA/INTEL GPU)
-```python
-from auto_round import AutoRoundConfig ##must import for auto-round format if transformers <= 4.51.3
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-quantized_model_dir = "Intel/Qwen3-8B-250424-int4-sym-AutoRound"
-
-# load the tokenizer and the model
-tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir)
-model = AutoModelForCausalLM.from_pretrained(quantized_model_dir, torch_dtype="auto", device_map="auto")
-
-# prepare the model input
-prompt = "Give me a short introduction to large language model."
-messages = [{"role": "user", "content": prompt}]
-text = tokenizer.apply_chat_template(
- messages,
- tokenize=False,
- add_generation_prompt=True,
- enable_thinking=True, # Switches between thinking and non-thinking modes. Default is True.
-)
-model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
-
-# conduct text completion
-generated_ids = model.generate(
- **model_inputs,
- max_new_tokens=512, ##change this to align with the official usage
- do_sample=False ##change this to align with the official usage
-)
-output_ids = generated_ids[0][len(model_inputs.input_ids[0]) :].tolist()
-
-# parsing thinking content
-try:
- # rindex finding 151668 ()
- index = len(output_ids) - output_ids[::-1].index(151668)
-except ValueError:
- index = 0
-
-thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
-content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
-
-print("thinking content:", thinking_content)
-print("content:", content)
-##INT4:
-# thinking content:
-# Okay, the user is asking for a short introduction to large language models. Let me start by recalling what I know about them. Large language models are a type of AI that can process and generate human-like text. They're based on deep learning, right? I should mention their training process, using massive datasets. Maybe explain how they work with neural networks, like transformer architectures. Also, their applications are important—like answering questions, writing, coding. But I need to keep it concise. Wait, the user wants a short intro, so I shouldn't go into too much detail. Let me structure it: start with the definition, mention the training data, the technology (transformers), and then the applications. Also, maybe touch on their capabilities, like understanding context and generating coherent text. Oh, and maybe note that they're used in various fields. I should avoid jargon but still be accurate. Let me check if I'm missing anything. Oh, maybe mention that they're pre-trained on a lot of text, which allows them to handle multiple tasks. Yeah, that's a key point. Alright, time to put it all together in a clear, concise way.
-#
-# content: Large language models (LLMs) are advanced AI systems trained on vast amounts of text data to understand and generate human-like language. Built using deep learning techniques, particularly transformer architectures, they process and analyze patterns in text to perform tasks like answering questions, writing stories, coding, and more. These models leverage extensive training data to grasp context, syntax, and semantics, enabling them to engage in complex conversations and adapt to diverse applications across fields like education, healthcare, and technology. Their ability to generate coherent, context-aware responses makes them a cornerstone of modern natural language processing.
-
-##BF16:
-# thinking content:
-# Okay, the user wants a short introduction to large language models. Let me start by defining what they are. They're AI systems trained on vast amounts of text data, right? I should mention their ability to understand and generate human-like text. Maybe include examples like GPT or BERT. Also, highlight their applications in tasks like answering questions, writing, coding, and more. Keep it concise but cover the key points: training data, capabilities, and use cases. Avoid technical jargon to keep it accessible. Let me check if I need to mention the scale of the models, like the number of parameters. That's important for context. Oh, and maybe touch on how they process different languages. Wait, the user said "short," so I shouldn't go into too much detail. Let me structure it: definition, training, capabilities, applications. That should cover it. Make sure it's clear and to the point.
-#
-# content: Large language models (LLMs) are advanced AI systems trained on vast amounts of text data to understand and generate human-like language. They can answer questions, write stories, code, translate languages, and perform various tasks by analyzing patterns in the data. These models, like GPT or BERT, leverage massive datasets and complex algorithms to produce coherent, context-aware responses, making them powerful tools for communication, creativity, and problem-solving across multiple domains.
-
-
-prompt = "9.11和9.8哪个数字大"
-##INT4:
-# thinking content:
-# content:
-# 好的,我现在需要比较9.11和9.8哪个数字更大。首先,我应该回忆一下小数比较的方法。通常,比较小数的时候,我们会从左到右逐位比较,先看整数部分,如果整数部分相同,再比较小数部分。
-# 首先,这两个数的整数部分都是9,所以整数部分相同。接下来比较小数部分。9.11的小数部分是0.11,而9.8的小数部分是0.8。这时候我需要比较0.11和0.8的大小。
-# 为了更清楚地比较,我可以把它们转换成同一位数的小数。比如,0.11可以看作0.110,而0.8可以看作0.800。这样,比较每一位:
-# 第一位小数:0.110的十分位是1,而0.800的十分位是8。显然,8比1大,所以0.800比0.110大。因此,9.8比9.11大。
-# 不过,我是不是应该再检查一下?比如,有没有可能在比较小数时出现其他情况?比如,如果小数位数不同,是否需要补零?比如,9.11和9.8,后者的小数位数更少,所以补零到相同位数的话,确实是0.110和0.800,这样比较更直观。
-# 或者,我可以将它们转换为分数来比较。9.11等于9又11/100,而9.8等于9又80/100。显然,80/100比11/100大,所以9.8更大。
-# 另外,也可以用数值的大小来直观比较。比如,9.8等于9.80,而9.11是9.11,显然9.80比9.11大,因为小数点后第一位8比1大。
-# 不过,有没有可能我哪里弄错了?比如,是否在比较小数时应该先比较整数部分,然后小数部分?是的,没错。整数部分相同的情况下,比较小数部分,所以正确。
-# 或者,有没有可能把9.8看成9.80,而9.11是9.11,所以
-
-##BF16:
-# thinking content:
-# content:
-# 嗯,用户问的是9.11和9.8哪个数字大。首先,我需要确认这两个数字的结构。9.11是一个小数,而9.8也是一个小数。看起来都是以9开头,但后面的小数部分不同。
-# 首先,我应该比较整数部分。两个数的整数部分都是9,所以这时候需要比较小数部分。小数部分的话,9.11的小数部分是0.11,而9.8的小数部分是0.8。这时候可能需要将它们转换成相同的位数来比较,或者直接比较小数点后的数字。
-# 不过,可能用户不太清楚小数点后的位数如何比较。比如,9.11有两位小数,而9.8只有一位。这时候可能需要补零,让它们的小数位数相同。比如,9.8可以写成9.80,这样比较的话,0.80和0.11。这时候,0.80比0.11大,所以9.8比9.11大。
-# 不过,也有可能用户会误以为小数点后的位数多的数字更大,比如认为9.11比9.8大,因为有两位小数。但其实小数点后的位数并不影响大小,而是要看每一位的数值。比如,第一位小数是十分位,第二位是百分位。所以,比较的时候,先看十分位,9.8的十分位是8,而9.11的十分位是1,显然8比1大,所以9.8更大。
-# 不过,也有可能用户会混淆小数点后的位数,比如认为9.11的百分位是1,而9.8的百分位是0,所以可能误以为9.11更大。这时候需要明确说明比较的步骤,先比较整数部分,再比较小数部分,逐位比较。
-# 另外,可能用户是在学习小数比较,所以需要详细解释步骤,确保他们理解如何正确比较小数。比如,先看整数部分,如果整数部分相同,再比较小数部分,从左到右逐位比较,直到找到不同的数字为止。
-# 总结一下,正确的比较方法是:整数部分相同,比较小数部分,十分位8比1大,所以9.8更大。因此,答案应该是9.
-
-
-prompt = "How many r in word strawberry"
-##INT4:
-# thinking content:
-# content:
-# Okay, let's see. The user is asking how many times the letter 'r' appears in the word "strawberry". Hmm, first I need to make sure I spell the word correctly. Let me write it out: S-T-R-A-W-B-E-R-R-Y. Wait, is that right? Let me check again. S-T-R-A-W-B-E-R-R-Y. Yes, that's strawberry. Now, I need to count the number of 'r's in there.
-# Starting from the beginning: S is the first letter, then T, then R. So that's one 'r'. Then the next letters are A, W, B, E, R, R, Y. Wait, after the first R, there's another R later. Let me break it down letter by letter.
-# S - no
-# T - no
-# R - yes, that's the first one
-# A - no
-# W - no
-# B - no
-# E - no
-# R - second one
-# R - third one
-# Y - no
-# So that's three 'r's? Wait, let me count again. The word is S-T-R-A-W-B-E-R-R-Y. So positions:
-# 1. S
-# 2. T
-# 3. R
-# 4. A
-# 5. W
-# 6. B
-# 7. E
-# 8. R
-# 9. R
-# 10. Y
-# So the 'r's are at positions 3, 8, and 9. That's three 'r's. Wait, but sometimes people might miscount. Let me check once more. The word is strawberry. Breaking it down:
-# Strawberry: S-T-R-A-W-B-E-R-R-Y. Yes, so the third letter is R, then after the E, there are two R's. So that's three R's. So the answer should be 3. But wait, maybe I'm missing something. Let me write it out again:
-# S T R A W B E R R Y
-# So letters:
-# 1. S
-# 2. T
-# 3. R
-# 4. A
-# 5. W
-# 6. B
-# 7. E
-# 8. R
-# 9. R
-# 10. Y
-# Yes, three R's. So the answer is 3. I think that's correct. I don't see any other R's. Maybe the user is testing if I can count correctly, so I need to make sure. Let me check an online source or
-
-##BF16:
-# thinking content:
-# content:
-# Okay, let's see. The user is asking how many times the letter 'r' appears in the word "strawberry". Hmm, first I need to make sure I have the correct spelling of the word. Let me write it out: S-T-R-A-W-B-E-R-R-Y. Wait, is that right? Let me check again. Strawberry... yes, that's correct. Now, I need to count the number of 'r's in that spelling.
-
-# Let me go through each letter one by one. Starting with the first letter: S. Not an 'r'. Next is T. Still not. Then R. Okay, that's one. Then A, W, B, E. So far, only one 'r'. Then comes the next letters: R. That's the second 'r'. Then another R. Wait, is there a third 'r'? Let me check again. The word is S-T-R-A-W-B-E-R-R-Y. So after the first R, there's a B, E, then two R's. So that's two R's? Wait, no. Let me count again. Let's break it down:
-
-# 1. S
-# 2. T
-# 3. R (1st)
-# 4. A
-# 5. W
-# 6. B
-# 7. E
-# 8. R (2nd)
-# 9. R (3rd)
-# 10. Y
-
-# Wait, so the letters are S, T, R, A, W, B, E, R, R, Y. So the 'r's are at positions 3, 8, and 9. That's three 'r's. But wait, maybe I miscounted. Let me write it out again:
-
-# S-T-R-A-W-B-E-R-R-Y. So after the first R (position 3), then the next letters are A, W, B, E, then R (position 8), then another R (position 9). So that's three R's. But sometimes people might miss the second R. Let me check again. The word is strawberry. Let me spell it again: S-T-R-A-W-B-E-R-R-Y. Yes, that's correct. So the R's are in the third, eighth, and ninth positions. Therefore, there are three 'r's. Wait, but I thought maybe the answer was two. Maybe I need to confirm. Let me check an online source or think of another way. Alternatively, maybe
-
-
-prompt = "请简短介绍一下阿里巴巴公司"
-##INT4:
-# thinking content:
-# 好的,用户让我简短介绍一下阿里巴巴公司。首先,我需要确定用户的需求是什么。他们可能是在做研究,或者想了解阿里巴巴的基本信息,或者只是好奇。不管怎样,简短是关键,所以我要抓住核心点。
-# 阿里巴巴成立于1999年,由马云创立,这点必须提到。然后是它的主要业务,比如淘宝、天猫、支付宝这些大家比较熟悉的平台。可能还要提到它的国际业务,比如阿里国际站和Lazada,显示其全球影响力。
-# 另外,阿里巴巴集团的结构,比如旗下的不同子公司,比如淘宝、天猫、菜鸟网络、阿里云等,这些可以简要带过。还有它的愿景,比如成为全球最大的电子商务公司,以及在科技和创新方面的投入,比如阿里云和大数据技术。
-# 用户可能还想知道阿里巴巴的影响力,比如用户数量、市场份额,或者它在科技领域的成就。不过因为要简短,可能不需要太多数据,但可以提到它是中国最大的互联网公司之一,全球知名的电商平台。
-# 还要注意避免太技术性的术语,保持简洁易懂。可能需要检查是否有遗漏的重要信息,比如阿里巴巴的上市时间(2014年),或者其在社交媒体和物流方面的业务,比如菜鸟网络。不过这些可能属于次要信息,可以简略带过。
-# 最后,确保整体结构清晰,先介绍成立时间、创始人,然后主要业务,再提到国际业务和科技贡献,最后总结其地位和愿景。这样用户就能快速了解阿里巴巴的基本情况。
-#
-# content: 阿里巴巴集团(Alibaba Group)成立于1999年,由马云创立,是中国乃至全球最具影响力的互联网企业之一。其核心业务涵盖电子商务(如淘宝、天猫)、数字支付(支付宝)、云计算(阿里云)、物流(菜鸟网络)及全球化零售(Lazada、阿里国际站)等领域。阿里巴巴致力于通过技术创新推动商业变革,现已成为全球最大的电商平台之一,并在人工智能、大数据等前沿科技领域持续布局。2014年,其在美国纽约证券交易所上市(股票代码:BABA)。
-
-##BF16:
-# thinking content:
-# 嗯,用户让我简短介绍一下阿里巴巴公司。首先,我需要确定用户的需求是什么。可能他们只是想了解阿里巴巴的基本信息,或者是在做作业、写报告,或者只是好奇。不管怎样,用户需要的是简洁明了的回答,不能太冗长。
-# 接下来,我得回忆一下阿里巴巴的基本信息。阿里巴巴集团成立于1999年,总部在杭州,对吧?创始人是马云,不过现在可能已经卸任了。阿里巴巴主要业务包括电商平台,比如淘宝、天猫,还有菜鸟网络、支付宝、阿里云这些子公司。可能还需要提到它的全球影响力,比如在电子商务、云计算、金融科技方面的成就。
-# 用户可能还想知道阿里巴巴的规模,比如员工数量、市值,或者它在行业中的地位。不过用户要求简短,所以这些细节可能需要简化。另外,可能需要提到它的使命,比如让天下没有难做的生意,或者推动数字化转型。
-# 还要注意避免错误信息,比如马云是否还在任。我记得他已经在2019年卸任董事局主席,现在是董事会成员。另外,阿里巴巴的上市时间是2014年,港交所和纽交所都有。可能需要提到这些关键点。
-# 用户可能没有说出来的深层需求是希望了解阿里巴巴的核心业务和影响力,而不仅仅是表面信息。所以需要突出其作为全球最大的电商平台之一,以及在云计算和金融科技领域的领先地位。同时,可能还要提到其社会责任,比如环保、公益项目等。
-# 需要确保回答结构清晰,分点列出,但用户要求简短,所以可能用一两句话概括。比如成立时间、总部、创始人、主要业务、全球地位等。还要注意用词准确,比如“集团”而不是“公司”,因为阿里巴巴有多个子公司。
-# 最后检查有没有遗漏的重要信息,比如是否提到支付宝和阿里云,这些是阿里巴巴的重要组成部分。同时,可能需要提到其对中国经济和全球电商的影响,但保持简洁。确保没有错误,比如成立年份、上市地点等。这样用户就能得到一个全面但简短的介绍。
-#
-# content: 阿里巴巴集团(Alibaba Group)成立于1999年,总部位于中国杭州,由马云创立。它是全球领先的互联网科技公司,核心业务涵盖电子商务(淘宝、天猫)、云计算(阿里云)、金融科技(支付宝)、物流(菜鸟网络)及创新业务(如盒马鲜生、阿里健康等)。阿里巴巴致力于通过数字化技术赋能企业与消费者,推动全球商业变革,旗下拥有
-```
-
-### Evaluate the model
-
-pip3 install lm-eval
-```bash
-auto-round-eval --model "Intel/Qwen3-8B-250424-int4-sym-AutoRound" --eval_bs 16 --tasks leaderboard_ifeval,leaderboard_mmlu_pro,gsm8k,lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,arc_easy,arc_challenge,mmlu,cmmlu,ceval-valid
-```
-| Metric | BF16 | INT4(best) | INT4(default) |
-| :----------------------------------------- | :----: | :----: | :----: |
-| Avg | 0.6184 | 0.6123 | 0.6063 |
-| arc_easy | 0.8342 | 0.8295 | 0.8224 |
-| arc_challenge | 0.5418 | 0.5496 | 0.5418 |
-| boolq | 0.8673 | 0.8673 | 0.8654 |
-| ceval-valid | 0.7912 | 0.7786 | 0.7741 |
-| cmmlu | 0.7702 | 0.7588 | 0.7527 |
-| gsm8k 5 shots | 0.8810 | 0.8643 | 0.8688 |
-| hellaswag | 0.5708 | 0.5626 | 0.5615 |
-| lambada_openai | 0.6400 | 0.6387 | 0.6305 |
-| leaderboard_mmlu_pro 5 shots | 0.4759 | 0.4687 | 0.4676 |
-| leaderboard_ifeval inst_level_strict_acc | 0.3957 | 0.3957 | 0.3789 |
-| leaderboard_ifeval prompt_level_strict_acc | 0.2532 | 0.2477 | 0.2200 |
-| mmlu | 0.7294 | 0.7209 | 0.7168 |
-| openbookqa | 0.3140 | 0.3120 | 0.8654 |
-| piqa | 0.7666 | 0.7628 | 0.7633 |
-| truthfulqa_mc1 | 0.3672 | 0.3574 | 0.3550 |
-| winogrande | 0.6811 | 0.6827 | 0.6803 |
-
-
-### Generate the model
-
-Here is the sample command to generate the model.
-
-
-```bash
-auto-round-best \
---model Qwen/Qwen3-8B \
---device 0 \
---group_size 128 \
---bits 4 \
---format 'auto_round' \
---output_dir "./tmp_autoround"
-```
-
-## Ethical Considerations and Limitations
-
-The model can produce factually incorrect output, and should not be relied on to produce factually accurate information. Because of the limitations of the pretrained model and the finetuning datasets, it is possible that this model could generate lewd, biased or otherwise offensive outputs.
-
-Therefore, before deploying any applications of the model, developers should perform safety testing.
-
-## Caveats and Recommendations
-
-Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model.
-
-Here are a couple of useful links to learn more about Intel's AI software:
-
-- Intel Neural Compressor [link](https://github.com/intel/neural-compressor)
-
-## Disclaimer
-
-The license on this model does not constitute legal advice. We are not responsible for the actions of third parties who use this model. Please consult an attorney before using this model for commercial purposes.
-
-## Cite
-
-@article{cheng2023optimize, title={Optimize weight rounding via signed gradient descent for the quantization of llms}, author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao and Liu, Yi}, journal={arXiv preprint arXiv:2309.05516}, year={2023} }
-
-[arxiv](https://arxiv.org/abs/2309.05516) [github](https://github.com/intel/auto-round)
\ No newline at end of file
diff --git a/docs/Yi-6B-Chat-asym-recipe.md b/docs/Yi-6B-Chat-asym-recipe.md
deleted file mode 100644
index d62c36c6b..000000000
--- a/docs/Yi-6B-Chat-asym-recipe.md
+++ /dev/null
@@ -1,32 +0,0 @@
- **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command.
-```bash
-auto-round \
---model 01-ai/Yi-6B-Chat \
---device 0 \
---group_size 128 \
---bits 4 \
---iters 1000 \
---nsamples 512 \
---asym \
---minmax_lr 2e-3 \
---format 'auto_gptq,auto_round' \
---output_dir "./tmp_autoround"
-```
-
-
-Due to licensing restrictions, we are unable to release the model. Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, and the git id 96d185fa6232a5ab685ba7c43e45d1dbb3bb906d.
-
-We used the following command for evaluation.
-For reference, the results of official AWQ-INT4 release are listed.
-
-~~~bash
-lm_eval --model hf --model_args pretrained="./",autogptq=True,gptq_use_triton=True,trust_remote_code=True --device cuda:0 --tasks ceval-valid,cmmlu,mmlu,gsm8k --batch_size 16 --num_fewshot 0
-~~~
-
-| Metric | BF16 |[01-ai/Yi-6B-Chat-4bits](https://huggingface.co/01-ai/Yi-6B-Chat-4bits)| INT4 |
-|--------|--------|----------------------|--------|
-| Avg. | 0.6043 | 0.5867 | 0.5939 |
-| mmlu | 0.6163 | 0.6133 | 0.6119 |
-| cmmlu | 0.7431 | 0.7312 | 0.7314 |
-| ceval | 0.7355 | 0.7155 | 0.7281 |
-| gsm8k | 0.3222 | 0.2866 | 0.3040 |
diff --git a/docs/baichuan2-7b-cha-asym-recipe.md b/docs/baichuan2-7b-cha-asym-recipe.md
deleted file mode 100644
index 32b3b8d99..000000000
--- a/docs/baichuan2-7b-cha-asym-recipe.md
+++ /dev/null
@@ -1,32 +0,0 @@
-**This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command.
-```bash
-auto-round \
---model baichuan-inc/Baichuan2-7B-Chat \
---device 0 \
---group_size 128 \
---bits 4 \
---iters 1000 \
---nsamples 512 \
---asym \
---minmax_lr 2e-3 \
---format 'auto_gptq,auto_round' \
---output_dir "./tmp_autoround"
-```
-
-
-
-Due to licensing restrictions, we are unable to release the model. Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, and the git id 96d185fa6232a5ab685ba7c43e45d1dbb3bb906d.
-
-We used the following command for evaluation.
-
-~~~bash
-lm_eval --model hf --model_args pretrained="./",autogptq=True,gptq_use_triton=True,trust_remote_code=True --device cuda:0 --tasks ceval-valid,cmmlu,mmlu,gsm8k --batch_size 16 --num_fewshot 0
-~~~
-
-| Metric | BF16 | INT4 |
-|--------|--------|--------|
-| Avg. | 0.4504 | 0.4470 |
-| mmlu | 0.5096 | 0.5053 |
-| cmmlu | 0.5486 | 0.5426 |
-| ceval | 0.5394 | 0.5223 |
-| gsm8k | 0.2039 | 0.2176 |
diff --git a/docs/bloom-3B-asym-recipe.md b/docs/bloom-3B-asym-recipe.md
deleted file mode 100644
index 4f2096f98..000000000
--- a/docs/bloom-3B-asym-recipe.md
+++ /dev/null
@@ -1,38 +0,0 @@
-**This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command.
-```bash
-auto-round \
---model bigscience/bloom-3b \
---device 0 \
---group_size 128 \
---bits 4 \
---iters 1000 \
---nsamples 512 \
---asym \
---format 'auto_gptq,auto_round' \
---output_dir "./tmp_autoround"
-```
-
-
-Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, we used the git id 96d185fa6232a5ab685ba7c43e45d1dbb3bb906d
-##pip install auto-gptq[triton]
-##pip install triton==2.2.0
-```bash
-lm_eval --model hf --model_args pretrained="./",autogptq=True,gptq_use_triton=True --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,rte,arc_easy,arc_challenge,mmlu --batch_size 32
-```
-
-
-
-| Metric | FP16 | INT4 |
-| -------------- | ------ | ------ |
-| Avg. | 0.4532 | 0.4514 |
-| mmlu | 0.2592 | 0.2537 |
-| lambada_openai | 0.5176 | 0.5135 |
-| hellaswag | 0.4136 | 0.4093 |
-| winogrande | 0.5864 | 0.5856 |
-| piqa | 0.7062 | 0.7095 |
-| truthfulqa_mc1 | 0.2326 | 0.2264 |
-| openbookqa | 0.2160 | 0.2140 |
-| boolq | 0.6156 | 0.6199 |
-| rte | 0.5632 | 0.5632 |
-| arc_easy | 0.5947 | 0.5888 |
-| arc_challenge | 0.2799 | 0.2816 |
diff --git a/docs/cogvlm2-llama3-chat-19B-sym.md b/docs/cogvlm2-llama3-chat-19B-sym.md
deleted file mode 100644
index 61dc7d33d..000000000
--- a/docs/cogvlm2-llama3-chat-19B-sym.md
+++ /dev/null
@@ -1,112 +0,0 @@
-
-## Model Details
-
-This model is an int4 model with group_size 128 and symmetric quantization of [THUDM/cogvlm2-llama3-chat-19B](https://huggingface.co/THUDM/cogvlm2-llama3-chat-19B).
-## How To Use
-### INT4 Inference
-```python
-import torch
-from PIL import Image
-from auto_round import AutoRoundConfig ##must import for auto-round format
-from transformers import AutoModelForCausalLM, AutoTokenizer
-import requests
-
-MODEL_PATH = "Intel/cogvlm2-llama3-chat-19B-inc-private"
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-
-tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
-model = (
- AutoModelForCausalLM.from_pretrained(MODEL_PATH, torch_dtype="auto", trust_remote_code=True, device_map=DEVICE)
- .to(DEVICE)
- .eval()
-)
-
-image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
-content = "Describe this image."
-
-text_only_template = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"
-query = text_only_template.format(content)
-
-image = Image.open(requests.get(image_url, stream=True).raw)
-input_by_model = model.build_conversation_input_ids(tokenizer, query=query, images=[image], template_version="chat")
-inputs = {
- "input_ids": input_by_model["input_ids"].unsqueeze(0).to(DEVICE),
- "token_type_ids": input_by_model["token_type_ids"].unsqueeze(0).to(DEVICE),
- "attention_mask": input_by_model["attention_mask"].unsqueeze(0).to(DEVICE),
- "images": [[input_by_model["images"][0].to(DEVICE).to(model.dtype)]] if image is not None else None,
-}
-gen_kwargs = {
- "max_new_tokens": 2048,
- "pad_token_id": 128002,
-}
-
-with torch.no_grad():
- outputs = model.generate(**inputs, **gen_kwargs)
- outputs = outputs[:, inputs["input_ids"].shape[1] :]
- response = tokenizer.decode(outputs[0])
- response = response.split("<|end_of_text|>")[0]
- print(response)
-##INT4:
-## The image depicts a serene beach scene during what appears to be the golden hour, just before sunset. A woman is seated on the sandy shore, facing the vast expanse of the ocean. The waves are gently crashing on the beach, and the sky is painted with hues of orange and blue. The woman seems to be in a relaxed posture, possibly enjoying the tranquility of the moment. Beside her is a large, golden-colored dog, possibly a Labrador, wearing a colorful harness. The dog is sitting upright, looking at the woman, and they seem to share a bond of trust and affection. The overall mood of the image is peaceful, reflective, and heartwarming.
-
-##BF16:
-## The image showcases a serene beach setting during what appears to be either sunrise or sunset. In the foreground, a woman sits on the sandy beach, dressed in casual attire, including a checkered shirt and jeans. She is engaged in a moment of connection with a golden retriever dog, which is seated beside her. The dog wears a colorful harness and is looking up at the woman, possibly in anticipation of a treat or a playful gesture. The vast expanse of the ocean can be seen in the background, with gentle waves crashing onto the shore. The sky is clear, and the warm hues of the setting or rising sun cast a soft glow over the scene, creating a tranquil and heartwarming atmosphere.
-
-image_url = "http://images.cocodataset.org/train2017/000000411975.jpg"
-content = "图片中的棒球场上有多少人?"
-##INT4:
-## In the image provided, there are four individuals on the baseball field. There are two children in the foreground, presumably engaged in some playful activity or perhaps participating in a game. One child is bending over, possibly picking something up off the ground, while the other stands beside them. Additionally, there is an adult standing further back, holding a microphone, suggesting they might be an announcer or commentator for the game. Lastly, there is another adult, seen from the back, sitting in the stands. So, in total, there are four people on the baseball field.
-
-##BF16:
-## In the image provided, there are five people visible on the baseball field.
-
-image_url = "https://intelcorp.scene7.com/is/image/intelcorp/processor-overview-framed-badge:1920-1080?wid=480&hei=270"
-content = "这张图片代表哪家公司?"
-##INT4:
-## The image represents the company Intel. The logo in the image is the Intel Inside logo, which is commonly used by Intel to signify the presence of their processors in various electronic devices.
-
-##BF16:
-## The image represents the company Intel.
-```
-
-
-
-### Generate the model
-Here is the sample command to reproduce the model.
-```bash
-pip install auto-round
-auto-round-mllm \
---model THUDM/cogvlm2-llama3-chat-19B \
---device 0 \
---group_size 128 \
---bits 4 \
---iters 1000 \
---nsample 512 \
---seqlen 2048 \
---format 'auto_gptq,auto_round' \
---output_dir "./tmp_autoround"
-```
-
-## Ethical Considerations and Limitations
-
-The model can produce factually incorrect output, and should not be relied on to produce factually accurate information. Because of the limitations of the pretrained model and the finetuning datasets, it is possible that this model could generate lewd, biased or otherwise offensive outputs.
-
-Therefore, before deploying any applications of the model, developers should perform safety testing.
-
-## Caveats and Recommendations
-
-Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model.
-
-Here are a couple of useful links to learn more about Intel's AI software:
-
-- Intel Neural Compressor [link](https://github.com/intel/neural-compressor)
-
-## Disclaimer
-
-The license on this model does not constitute legal advice. We are not responsible for the actions of third parties who use this model. Please consult an attorney before using this model for commercial purposes.
-
-## Cite
-
-@article{cheng2023optimize, title={Optimize weight rounding via signed gradient descent for the quantization of llms}, author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao and Liu, Yi}, journal={arXiv preprint arXiv:2309.05516}, year={2023} }
-
-[arxiv](https://arxiv.org/abs/2309.05516) [github](https://github.com/intel/auto-round)
diff --git a/docs/falcon-7b-asym-recipe.md b/docs/falcon-7b-asym-recipe.md
deleted file mode 100644
index d0e101eb5..000000000
--- a/docs/falcon-7b-asym-recipe.md
+++ /dev/null
@@ -1,37 +0,0 @@
-**This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command.
-```bash
-auto-round \
---model tiiuae/falcon-7b \
---device 0 \
---group_size 64 \
---bits 4 \
---iters 1000 \
---nsamples 512 \
---asym \
---format 'auto_gptq,auto_round' \
---output_dir "./tmp_autoround"
-```
-
-
-We generate the model with group_size 64 as there is an issue when evaluating with group_size 128.
-Evaluate the model
-pip3 install lm-eval==0.4.2
-
-```bash
-lm_eval --model hf --model_args pretrained="Intel/falcon-7b-int4-inc",autogptq=True,gptq_use_triton=True --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,arc_easy,arc_challenge,mmlu --batch_size 16
-```
-
-| Metric | BF16 | int4 |
-| -------------- | ------ | ------ |
-| Avg. | 0.5462 | 0.5454 |
-| mmlu | 0.2546 | 0.2562 |
-| lambada_openai | 0.7450 | 0.7485 |
-| hellaswag | 0.5773 | 0.5719 |
-| winogrande | 0.6740 | 0.6835 |
-| piqa | 0.7943 | 0.7905 |
-| truthfulqa_mc1 | 0.2228 | 0.2166 |
-| openbookqa | 0.3080 | 0.3100 |
-| boolq | 0.7361 | 0.7431 |
-| arc_easy | 0.7475 | 0.7424 |
-| arc_challenge | 0.4027 | 0.3908 |
-
diff --git a/docs/gemma-2b-asym-recipe.md b/docs/gemma-2b-asym-recipe.md
deleted file mode 100644
index f1c16c899..000000000
--- a/docs/gemma-2b-asym-recipe.md
+++ /dev/null
@@ -1,44 +0,0 @@
-**This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command.
-```bash
-auto-round \
---model google/gemma-2b \
---device 0 \
---group_size 128 \
---bits 4 \
---iters 400 \
---model_dtype "float16" \
---nsamples 512 \
---asym \
---format 'auto_gptq,auto_round' \
---output_dir "./tmp_autoround"
-```
-
-Evaluate the model
-
-Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, lm-eval 0.4.2 is used
-
-pip install auto-gptq
-
-
-Please note that there is a discrepancy between the baseline result and the official data, which is a known issue within the official model card community.
-Given that the Gemma model family exhibits inconsistent results between FP16 and BF16 on lm-eval, we recommend converting to FP16 for both tuning and evaluation.
-
-```bash
-lm_eval --model hf --model_args pretrained="Intel/gemma-2b-int4-inc",autogptq=True,gptq_use_triton=True,dtype=float16 --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,rte,arc_easy,arc_challenge,mmlu --batch_size 16
-```
-
-
-
-| Metric | BF16 | FP16 | AutoRound v0.1 | AutoRound v0.2 |
-| -------------- | ---- | ------ |----------------|----------------|
-| Avg.| 0.5263 | 0.5277 | 0.5235 | 0.5248 |
-| mmlu | 0.3287 | 0.3287 | 0.3297 | 0.3309 |
-| lambada_openai | 0.6344 | 0.6375 | 0.6307 | 0.6379 |
-| hellaswag | 0.5273 | 0.5281 | 0.5159 | 0.5184 |
-| winogrande | 0.6504 | 0.6488 | 0.6543 | 0.6575 |
-| piqa | 0.7671 | 0.7720 | 0.7612 | 0.7606 |
-| truthfulqa_mc1 | 0.2203 | 0.2203 | 0.2203 | 0.2191 |
-| openbookqa | 0.2980 | 0.3020 | 0.3000 | 0.3060 |
-| boolq | 0.6927 | 0.6936 | 0.6939 | 0.6966 |
-| arc_easy | 0.7420 | 0.7403 | 0.7353 | 0.7357 |
-| arc_challenge | 0.4019 | 0.4061 | 0.3933 | 0.3857 |
diff --git a/docs/gemma-7b-asym-recipe.md b/docs/gemma-7b-asym-recipe.md
deleted file mode 100644
index de0194ccb..000000000
--- a/docs/gemma-7b-asym-recipe.md
+++ /dev/null
@@ -1,42 +0,0 @@
- **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command.
-
-A sample command to generate an INT4 model.
-```bash
-auto-round \
---model google/gemma-7b \
---device 0 \
---group_size 128 \
---bits 4 \
---minmax_lr 2e-3 \
---model_dtype "float16" \
---iters 1000 \
---nsamples 512 \
---asym \
---format 'auto_gptq,auto_round' \
---output_dir "./tmp_autoround"
-```
-
-
-pip install lm-eval==0.4.2
-pip install auto-gptq
-
-Please note that there is a discrepancy between the baseline result and the official data, which is a known issue within the official model card community.
-
-Given that the Gemma model family exhibits inconsistent results between FP16 and BF16 on lm-eval, we recommend converting to FP16 for both tuning and evaluation.
-```bash
-lm_eval --model hf --model_args pretrained="Intel/gemma-7b-int4-inc",autogptq=True,gptq_use_triton=True,dtype=float16 --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,rte,arc_easy,arc_challenge,mmlu --batch_size 32
-```
-| Metric | BF16 | FP16 | AutoRound v0.1 | AutoRound V0.2 |
-| -------------- | ---- | ------ |----------------|----------------|
-| Avg. | 0.6208 | 0.6302 | 0.6242 | 0.6254 |
-| mmlu | 0.6126 | 0.6189 | 0.6085 | 0.6147 |
-| lambada_openai | 0.6707 | 0.7308 | 0.7165 | 0.7270 |
-| hellaswag | 0.6039 | 0.6063 | 0.6017 | 0.6017 |
-| winogrande | 0.7356 | 0.7506 | 0.7482 | 0.7490 |
-| piqa | 0.8014 | 0.8025 | 0.7976 | 0.7982 |
-| truthfulqa_mc1 | 0.3121 | 0.3121 | 0.3060 | 0.2840 |
-| openbookqa | 0.3300 | 0.3220 | 0.3340 | 0.3240 |
-| boolq | 0.8254 | 0.8324 | 0.8300 | 0.8407 |
-| rte | 0.6643 | 0.6859 | 0.6787 | 0.6968 |
-| arc_easy | 0.8068 | 0.8262 | 0.8089 | 0.8194 |
-| arc_challenge | 0.5043 | 0.5000 | 0.4915 | 0.4949 |
diff --git a/docs/gemma-7b-it-asym-recipe.md b/docs/gemma-7b-it-asym-recipe.md
deleted file mode 100644
index 08b3ef0c6..000000000
--- a/docs/gemma-7b-it-asym-recipe.md
+++ /dev/null
@@ -1,40 +0,0 @@
- **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command.
-
-A sample command to generate an INT4 model.
-```bash
-auto-round \
---model google/gemma-7b-it \
---device 0 \
---group_size 128 \
---bits 4 \
---minmax_lr 2e-3 \
---model_dtype "float16" \
---iters 1000 \
---nsamples 512 \
---asym \
---format 'auto_gptq,auto_round' \
---output_dir "./tmp_autoround"
-```
-
-Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, and the git id 96d185fa6232a5ab685ba7c43e45d1dbb3bb906d, Install the latest [AutoGPTQ](https://github.com/AutoGPTQ/AutoGPTQ) from source first
-
-Please note that there is a discrepancy between the baseline result and the official data, which is a known issue within the official model card community.
-
-```bash
-lm_eval --model hf --model_args pretrained="Intel/gemma-7b-it-int4-inc",autogptq=True,gptq_use_triton=True --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,rte,arc_easy,arc_challenge,mmlu --batch_size 32
-```
-
-| Metric | BF16 | int4 |
-| -------------- |--------| ------ |
-| Avg. | 0.6022 | 0.6017 |
-| mmlu | 0.5029 | 0.4993 |
-| lambada_openai | 0.6035 | 0.6286 |
-| hellaswag | 0.5620 | 0.5564 |
-| winogrande | 0.6796 | 0.6788 |
-| piqa | 0.7709 | 0.7731 |
-| truthfulqa_mc1 | 0.3048 | 0.3035 |
-| openbookqa | 0.3740 | 0.3700 |
-| boolq | 0.8138 | 0.8144 |
-| rte | 0.7870 | 0.7870 |
-| arc_easy | 0.7525 | 0.7508 |
-| arc_challenge | 0.4727 | 0.4573 |
diff --git a/docs/glm-4-9b-chat-recipe.md b/docs/glm-4-9b-chat-recipe.md
deleted file mode 100644
index 3a3a6fbb1..000000000
--- a/docs/glm-4-9b-chat-recipe.md
+++ /dev/null
@@ -1,142 +0,0 @@
-## Model Details
-
-This is an int4 recipe with group_size 128 of [THUDM/glm-4-9b-chat](https://huggingface.co/THUDM/glm-4-9b-chat) generated by [intel/auto-round](https://github.com/intel/auto-round). For GPTQ format, please load the model with revision `d45e33e`
-
-## How To Use
-
-### INT4 Inference on CPU/CUDA/HPU
-
-```python
-from auto_round import AutoRoundConfig ## must import for auto-round format
-from transformers import AutoModelForCausalLM, AutoTokenizer
-import torch
-
-quantized_model_dir = "Intel/glm-4-9b-chat-int4-inc"
-
-backend = "auto" ##cuda, hpu, cpu(supported in auto_round>0.3.1),cuda:marlin(supported in auto_round>0.3.1 'pip install -v gptqmodel --no-build-isolation')
-quantization_config = AutoRoundConfig(backend=backend)
-model = AutoModelForCausalLM.from_pretrained(
- quantized_model_dir,
- device_map=backend.split(":")[0],
- torch_dtype=torch.float16,
- quantization_config=quantization_config,
- trust_remote_code=True,
-).eval()
-
-tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir, trust_remote_code=True)
-query = "请介绍一下智谱华章科技有限公司"
-inputs = tokenizer.apply_chat_template(
- [{"role": "user", "content": query}],
- add_generation_prompt=True,
- tokenize=True,
- return_tensors="pt",
- return_dict=True,
-)
-inputs = inputs.to(model.device)
-
-gen_kwargs = {"max_length": 50, "do_sample": False, "top_k": 1} ##change this to follow official usage
-with torch.no_grad():
- outputs = model.generate(**inputs, **gen_kwargs)
- outputs = outputs[:, inputs["input_ids"].shape[1] :]
- print(tokenizer.decode(outputs[0], skip_special_tokens=True))
-
-##请介绍一下智谱华章科技有限公司
-# 智谱华章科技有限公司是一家专注于人工智能、大数据、云计算等前沿技术领域的创新型企业。公司成立于2016年,总部位于中国北京,是一家集技术研发、产品开发、
-
-
-##9.8大还是9.11大
-##9.8比9.11小。在数值上,9.8小于9.11。
-
-
-##Once upon a time,
-
-# In a land where the sun kissed the horizon with a golden glow and the stars whispered secrets to the night, there was a village nestled among rolling hills and whispering forests. This was a place
-
-
-##There is a girl who likes adventure,
-##That's quite the intriguing starting point! If you're looking to create a story or a character, here's a brief introduction to a girl who likes adventure:
-
-##---
-
-##**Name:**
-```
-
-
-
-### Evaluate the model
-
-pip3 install lm-eval==0.4.5
-
-pip3 install langdetect,immutabledict,antlr4-python3-runtime==4.11
-
-```bash
-auto-round --eval --eval_bs 16 --tasks leaderboard_ifeval,leaderboard_mmlu_pro,gsm8k,lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,arc_easy,arc_challenge,cmmlu,ceval-valid
-
-```
-
-| Metric | BF16 | INT4(6.4G) | INT4-quanted-lm-head(5.5G) |
-| ------------------------------------------ | ------ | ---------- | -------------------------- |
-| Avg | 0.6260 | 0.6230 | 0.6204 |
-| leaderboard_mmlu_pro 5shot | 0.3678 | 0.3616 | 0.3610 |
-| leaderboard_ifeval inst_level_strict_acc | 0.5504 | 0.5600 | 0.5588 |
-| leaderboard_ifeval prompt_level_strict_acc | 0.4067 | 0.4233 | 0.4067 |
-| cmmlu | 0.7213 | 0.7137 | 0.7086 |
-| ceval-valid | 0.7065 | 0.7058 | 0.6909 |
-| lambada_openai | 0.6608 | 0.6493 | 0.6470 |
-| hellaswag | 0.6195 | 0.6137 | 0.6134 |
-| winogrande | 0.7561 | 0.7545 | 0.7522 |
-| piqa | 0.8030 | 0.7976 | 0.8003 |
-| truthfulqa_mc1 | 0.4223 | 0.4223 | 0.4284 |
-| openbookqa | 0.3560 | 0.3640 | 0.3580 |
-| boolq | 0.8691 | 0.8606 | 0.8578 |
-| arc_easy | 0.8241 | 0.8249 | 0.8203 |
-| arc_challenge | 0.5469 | 0.5341 | 0.5444 |
-| gsm8k 5shot strict match | 0.7794 | 0.7597 | 0.7589 |
-
-
-
-### Generate the model
-
-Here is the sample command to generate the model. AutoRound should include this pr https://github.com/intel/auto-round/pull/304
-
-```bash
-auto-round \
---model THUDM/glm-4-9b-chat \
---iter 1000 \
---nsamples 512 \
---disable_eval \
---format "auto_round,auto_gptq" \
---model_dtype "fp16" \
---output_dir "./tmp_autoround"
-```
-
-copy all the *.py file to the quantized_model folder
-
-For gptq format, need to add "block_name_to_quantize":"transformer.encoder.layers" to config.json, we only tested it on transformers==4.46.1
-
-## Ethical Considerations and Limitations
-
-The model can produce factually incorrect output, and should not be relied on to produce factually accurate information. Because of the limitations of the pretrained model and the finetuning datasets, it is possible that this model could generate lewd, biased or otherwise offensive outputs.
-
-Therefore, before deploying any applications of the model, developers should perform safety testing.
-
-## Caveats and Recommendations
-
-Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model.
-
-Here are a couple of useful links to learn more about Intel's AI software:
-
-* Intel Neural Compressor [link](https://github.com/intel/neural-compressor)
-* Intel Extension for Transformers [link](https://github.com/intel/intel-extension-for-transformers)
-
-## Disclaimer
-
-The license on this model does not constitute legal advice. We are not responsible for the actions of third parties who use this model. Please consult an attorney before using this model for commercial purposes.
-
-
-
-## Cite
-
-@article{cheng2023optimize, title={Optimize weight rounding via signed gradient descent for the quantization of llms}, author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao and Liu, Yi}, journal={arXiv preprint arXiv:2309.05516}, year={2023} }
-
-[arxiv](https://arxiv.org/abs/2309.05516) [github](https://github.com/intel/auto-round)
\ No newline at end of file
diff --git a/docs/gpt-j-6B-asym-recipe.md b/docs/gpt-j-6B-asym-recipe.md
deleted file mode 100644
index 6b950a344..000000000
--- a/docs/gpt-j-6B-asym-recipe.md
+++ /dev/null
@@ -1,40 +0,0 @@
- **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command.
-
-A sample command to generate an INT4 model.
-```bash
-auto-round \
---model EleutherAI/gpt-j-6b \
---device 0 \
---group_size 128 \
---bits 4 \
---iters 1000 \
---nsamples 512 \
---asym \
---format 'auto_gptq,auto_round' \
---output_dir "./tmp_autoround"
-```
-
-
-Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, we used the git id 96d185fa6232a5ab685ba7c43e45d1dbb3bb906d
-##pip install auto-gptq[triton]
-##pip install triton==2.2.0
-```bash
-lm_eval --model hf --model_args pretrained="./",autogptq=True,gptq_use_triton=True --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,rte,arc_easy,arc_challenge,mmlu --batch_size 32
-```
-
-
-
-| Metric | FP16 | INT4 |
-| -------------- | ------ | ------ |
-| Avg. | 0.5039 | 0.5034 |
-| mmlu | 0.2694 | 0.2793 |
-| lambada_openai | 0.6831 | 0.6790 |
-| hellaswag | 0.4953 | 0.4902 |
-| winogrande | 0.6409 | 0.6401 |
-| piqa | 0.7541 | 0.7465 |
-| truthfulqa_mc1 | 0.2020 | 0.2179 |
-| openbookqa | 0.2900 | 0.2900 |
-| boolq | 0.6544 | 0.6554 |
-| rte | 0.5451 | 0.5271 |
-| arc_easy | 0.6692 | 0.6734 |
-| arc_challenge | 0.3396 | 0.3387 |
diff --git a/docs/llava-v1.5-7b-sym.md b/docs/llava-v1.5-7b-sym.md
deleted file mode 100644
index 065e460ee..000000000
--- a/docs/llava-v1.5-7b-sym.md
+++ /dev/null
@@ -1,135 +0,0 @@
-
-## Model Details
-
-This model is an int4 model with group_size 128 and symmetric quantization of [liuhaotian/llava-v1.5-7b](https://huggingface.co/liuhaotian/llava-v1.5-7b). Load the model with revision="8ab8ff" to use AutoGPTQ format.
-
-## How To Use
-
-### Requirements
-
-1. Clone this repository and navigate to LLaVA folder
-```shell
-git clone https://github.com/haotian-liu/LLaVA.git
-cd LLaVA
-```
-
-2. Refine LLaVA repo
-```
-vi llava/model/language_model/llava_llama.py
-# add 'cache_position = None,' to line 71.
-```
-3. Install Package
-```
-pip install --upgrade pip # enable PEP 660 support
-pip install -e .
-```
-
-### INT4 Inference
-```python
-from auto_round import AutoRoundConfig ## must import for auto-round format
-import requests
-import torch
-from PIL import Image
-from llava.model.builder import load_pretrained_model
-from llava.train.train import preprocess, preprocess_multimodal, DataCollatorForSupervisedDataset
-
-
-class DataArgs:
- is_multimodal = True
- mm_use_im_start_end = False
-
-
-quantized_model_path = "Intel/llava-v1.5-7b-inc-private"
-
-tokenizer, model, image_processor, _ = load_pretrained_model(
- quantized_model_path,
- model_base=None,
- model_name=quantized_model_path,
- torch_dtype="auto",
- device_map="auto",
- ##revision="8ab8ff" ##AutoGPTQ format
-)
-image_url = "http://images.cocodataset.org/train2017/000000116003.jpg"
-messages = [{"from": "human", "value": "What is the tennis player doing in the image?\n"}]
-
-# Preparation for inference
-image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB")
-image_input = image_processor.preprocess(image, return_tensors="pt")["pixel_values"][0].to(model.device)
-input_data = preprocess_multimodal([messages], DataArgs())
-inputs = preprocess(input_data, tokenizer, has_image=(image_input is not None))
-
-output = model.generate(inputs["input_ids"].to(model.device), images=image_input.unsqueeze(0).half(), max_new_tokens=50)
-print(tokenizer.batch_decode(output))
-
-##INT4: The tennis player is celebrating a victory, raising his arms in the air, and holding his tennis racket.
-
-##BF16: The tennis player is celebrating a victory, raising his arms in the air, and holding a tennis racket.
-
-image_url = "http://images.cocodataset.org/train2017/000000411975.jpg"
-messages = [{"from": "human", "value": "How many people are on the baseball field in the picture?\n"}]
-
-##INT4: There are three people on the baseball field in the picture.
-
-##BF16: There are three people on the baseball field in the picture.
-
-
-image_url = "http://images.cocodataset.org/train2017/000000093025.jpg"
-messages = [{"from": "human", "value": "How many people and animals are there in the image?\n"}]
-
-##INT4: There are two people and one animal in the image.
-
-##BF16: There are two people and one animal in the image.
-```
-
-## Evaluation the model
-pip3 install lmms_eval. The evaluation process may encounter errors that require changing model backend or evaluation code. Detailed instructions will be provided in a future update
-```bash
-auto-round-mllm --lmms --model Intel/llava-v1.5-7b-inc-private --tasks pope,textvqa_val,scienceqa,mmbench_en --output_dir "./eval_result" --device cuda:0
-```
-|Metric |16bits|Pile Calib INT4 | Llava Calib INT4 |
-|:-------------------|:------|:------|:--------------|
-|avg |65.40 |65.91 | 65.79 |
-|MMBench_DEV_EN_V11 |64.09 |64.43 |64.43 |
-|ScienceQA_VAL |64.87 |67.20 |66.80 |
-|TextVQA_VAL |45.56 |45.71 |45.81 |
-|POPE |87.09 |86.31 |86.12 |
-
-### Generate the model
-Here is the sample command to reproduce the model.
-```bash
-pip install auto-round
-auto-round-mllm \
---model liuhaotian/llava-v1.5-7b \
---device 0 \
---group_size 128 \
---bits 4 \
---iters 1000 \
---nsample 512 \
---seqlen 2048 \
---format 'auto_gptq,auto_round' \
---output_dir "./tmp_autoround"
-```
-
-## Ethical Considerations and Limitations
-
-The model can produce factually incorrect output, and should not be relied on to produce factually accurate information. Because of the limitations of the pretrained model and the finetuning datasets, it is possible that this model could generate lewd, biased or otherwise offensive outputs.
-
-Therefore, before deploying any applications of the model, developers should perform safety testing.
-
-## Caveats and Recommendations
-
-Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model.
-
-Here are a couple of useful links to learn more about Intel's AI software:
-
-- Intel Neural Compressor [link](https://github.com/intel/neural-compressor)
-
-## Disclaimer
-
-The license on this model does not constitute legal advice. We are not responsible for the actions of third parties who use this model. Please consult an attorney before using this model for commercial purposes.
-
-## Cite
-
-@article{cheng2023optimize, title={Optimize weight rounding via signed gradient descent for the quantization of llms}, author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao and Liu, Yi}, journal={arXiv preprint arXiv:2309.05516}, year={2023} }
-
-[arxiv](https://arxiv.org/abs/2309.05516) [github](https://github.com/intel/auto-round)
diff --git a/docs/neural-chat-7b-v3-1-asym-recipe.md b/docs/neural-chat-7b-v3-1-asym-recipe.md
deleted file mode 100644
index f7e2288a6..000000000
--- a/docs/neural-chat-7b-v3-1-asym-recipe.md
+++ /dev/null
@@ -1,38 +0,0 @@
- **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command.
-
-A sample command to generate an INT4 model.
-```bash
-auto-round \
---model intel/neural-chat-7b-v3-1 \
---device 0 \
---group_size 128 \
---bits 4 \
---iters 1000 \
---nsamples 512 \
---minmax_lr 2e-3 \
---asym \
---format 'auto_gptq,auto_round' \
---output_dir "./tmp_autoround"
-```
-
-
-Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, we used the git id f3b7917091afba325af3980a35d8a6dcba03dc3f
-
-~~~bash
-lm_eval --model hf --model_args pretrained="Intel/neural-chat-v3-1-int4-inc",autogptq=True,gptq_use_triton=True --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,rte,arc_easy,arc_challenge,mmlu --batch_size 128
-~~~
-
-| Metric | FP16 | INT4 |
-| -------------- | ------ | ------ |
-| Avg. | 0.6769 | 0.6721 |
-| mmlu | 0.5919 | 0.5862 |
-| lambada_openai | 0.7394 | 0.7337 |
-| hellaswag | 0.6323 | 0.6272 |
-| winogrande | 0.7687 | 0.7577 |
-| piqa | 0.8161 | 0.8150 |
-| truthfulqa_mc1 | 0.4431 | 0.4394 |
-| openbookqa | 0.3760 | 0.3700 |
-| boolq | 0.8783 | 0.8743 |
-| rte | 0.7690 | 0.7726 |
-| arc_easy | 0.8413 | 0.8384 |
-| arc_challenge | 0.5896 | 0.5785 |
diff --git a/docs/neural-chat-7b-v3-3-asym-recipe.md b/docs/neural-chat-7b-v3-3-asym-recipe.md
deleted file mode 100644
index 7d1334057..000000000
--- a/docs/neural-chat-7b-v3-3-asym-recipe.md
+++ /dev/null
@@ -1,39 +0,0 @@
- **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command.
-
-A sample command to generate an INT4 model.
-```bash
-auto-round \
---model intel/neural-chat-7b-v3-3 \
---device 0 \
---group_size 128 \
---bits 4 \
---iters 1000 \
---nsamples 512 \
---minmax_lr 2e-3 \
---asym \
---format 'auto_gptq,auto_round' \
---output_dir "./tmp_autoround"
-```
-
-
-
-Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, we used the git id f3b7917091afba325af3980a35d8a6dcba03dc3f
-
-~~~bash
-lm_eval --model hf --model_args pretrained="Intel/neural-chat-v3-3-int4-inc",autogptq=True,gptq_use_triton=True --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,rte,arc_easy,arc_challenge,mmlu --batch_size 128
-~~~
-
-| Metric | FP16 | INT4 |
-| -------------- | ------ | ------ |
-| Avg. | 0.6778 | 0.6748 |
-| mmlu | 0.5993 | 0.5926 |
-| lambada_openai | 0.7303 | 0.7370 |
-| hellaswag | 0.6639 | 0.6559 |
-| winogrande | 0.7632 | 0.7735 |
-| piqa | 0.8101 | 0.8074 |
-| truthfulqa_mc1 | 0.4737 | 0.4737 |
-| openbookqa | 0.3880 | 0.3680 |
-| boolq | 0.8694 | 0.8694 |
-| rte | 0.7581 | 0.7509 |
-| arc_easy | 0.8266 | 0.8249 |
-| arc_challenge | 0.5734 | 0.5691 |
diff --git a/docs/opt-2.7b-asym-recipe.md b/docs/opt-2.7b-asym-recipe.md
deleted file mode 100644
index 2ec433660..000000000
--- a/docs/opt-2.7b-asym-recipe.md
+++ /dev/null
@@ -1,41 +0,0 @@
- **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command.
-
-A sample command to generate an INT4 model.
-```bash
-auto-round \
---model facebook/opt-2.7b \
---device 0 \
---group_size 128 \
---bits 4 \
---iters 1000 \
---nsamples 512 \
---minmax_lr 2e-3 \
---asym \
---format 'auto_gptq,auto_round' \
---output_dir "./tmp_autoround"
-```
-
-
-Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, we used the git id 96d185fa6232a5ab685ba7c43e45d1dbb3bb906d
-##pip install auto-gptq[triton]
-##pip install triton==2.2.0
-```bash
-lm_eval --model hf --model_args pretrained="./",autogptq=True,gptq_use_triton=True --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,rte,arc_easy,arc_challenge,mmlu --batch_size 32
-```
-
-
-
-| Metric | FP16 | INT4 |
-| -------------- | ------ | ------ |
-| Avg. | 0.4722 | 0.4757 |
-| mmlu | 0.2568 | 0.2636 |
-| lambada_openai | 0.6359 | 0.6487 |
-| hellaswag | 0.4585 | 0.4519 |
-| winogrande | 0.6077 | 0.5967 |
-| piqa | 0.7367 | 0.7410 |
-| truthfulqa_mc1 | 0.2240 | 0.2338 |
-| openbookqa | 0.2500 | 0.2380 |
-| boolq | 0.6046 | 0.6505 |
-| rte | 0.5451 | 0.5379 |
-| arc_easy | 0.6077 | 0.6035 |
-| arc_challenge | 0.2679 | 0.2671 |
diff --git a/docs/acc.md b/docs/paper_acc.md
similarity index 100%
rename from docs/acc.md
rename to docs/paper_acc.md
diff --git a/docs/phi-2-old-sym-recipe.md b/docs/phi-2-old-sym-recipe.md
deleted file mode 100644
index 868493d49..000000000
--- a/docs/phi-2-old-sym-recipe.md
+++ /dev/null
@@ -1,38 +0,0 @@
- **This recipe is outdated, we recommend using the latest full range symmetric quantization.** You can remove --asym from the command.
-
-A sample command to generate an INT4 model.
-```bash
-auto-round \
---model facebook/opt-2.7b \
---device 0 \
---group_size 128 \
---bits 4 \
---iters 1000 \
---nsamples 512 \
---format 'auto_gptq,auto_round' \
---output_dir "./tmp_autoround"
-```
-
-
-pip install lm-eval==0.4.2
-
-Due to the significant accuracy drop with the asymmetric kernel for this model, we opted to use symmetric quantization.
-
-```bash
-lm_eval --model hf --model_args pretrained="Intel/phi-2-int4-inc" --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,arc_easy,arc_challenge,mmlu --batch_size 16
-```
-
-| Metric | FP16 | INT4 |
-| -------------- | ------ | -------- |
-| Avg. | 0.6155 | 0.6163 |
-| mmlu | 0.5448 | 0.5417 |
-| lambada_openai | 0.6268 | 0.6225 |
-| hellaswag | 0.5585 | 0.5498 |
-| winogrande | 0.7530 | 0.7545 |
-| piqa | 0.7867 | 0.7824 |
-| truthfulqa_mc1 | 0.3133 | 0.3060 |
-| openbookqa | 0.4000 | 0.4100 |
-| boolq | 0.8339 | 0.8327 |
-| rte | 0.6245 | 0.6643 |
-| arc_easy | 0.7997 | 0.7955 |
-| arc_challenge | 0.5290 | 0.5196 |
diff --git a/docs/step_by_step.md b/docs/step_by_step.md
index 3c1c69d1d..39a602c65 100644
--- a/docs/step_by_step.md
+++ b/docs/step_by_step.md
@@ -107,19 +107,25 @@ Please use ',' to split datasets, ':' to split parameters of a dataset and '+' t
## 3 Quantization
-### Supported Quantization Configurations
+### Supported Quantization Schemes
-AutoRound supports several quantization configurations:
+AutoRound supports several schemes:
-- **Int8 Weight Only**
-- **Int4 Weight Only**
-- **Int3 Weight Only**
-- **Int2 Weight Only**
+- **W4A16**(bits:4,group_size:128,sym:True,act_bits:16)
+- **W8A16**(bits:8,group_size:128,sym:True,act_bits:16)
+- **W3A16**(bits:3,group_size:128,sym:True,act_bits:16)
+- **W2A16**(bits:2,group_size:128,sym:True,act_bits:16)
- **Mixed bits Weight only**
+- **NVFP4**(data_type:nvfp4,act_data_type:nvfp4,static_global_scale,group_size 16)
+- **MXFP4**(**Research feature, no real kernel**, data_type:mxfp4, act_data_type:mxfp4, rceil, group_size 32)
+- **FPW8A16**(**Research feature, no real kernel**, data_type:fp8, act_bits:16, group_size 0 -> per tensor)
+- **FP8_STATIC**(**Research feature, no real kernel**, data_type:fp8, act_data_type:fp8, group_size -1 -> per channel, act_group_size 0 -> per tensor)
+
+Besides, you could modify the `group_size`, `bits`, `sym` and many other configs as you want, though there may be no real kernels available.
### Supported export Formats
-**AutoRound Format**: This format is well-suited for CPU, HPU devices, 2 bits, as well as mixed-precision
+**AutoRound Format**: This format is well-suited for CPU, Intel GPU, CUDA and HPU devices, 2 bits, as well as mixed-precision
inference. **[2,3,4,8] bits are supported**.
**GGUF** Format: Experimental feature. This format is well-suited for CPU devices and is widely adopted by the
@@ -133,7 +139,7 @@ models. Besides, recently 3 bits may have some accuracy issues in Transformers.
**AutoAWQ Format**: This format is well-suited for asymmetric 4-bit quantization on CUDA devices and is widely
adopted within the community, **only 4-bits quantization is supported**.
-**LlmCompressor Format**: **only INT8 W8A8 dynamic quantization is supported**.
+**LLM-Compressor Format**: **NVFP4, MXFP (kernel is WIP), INT8 are supported**.
### Hardware Compatibility
@@ -147,7 +153,7 @@ CPU, Intel GPU, HPU and CUDA for both quantization and inference.
This setting offers a better trade-off between accuracy and tuning cost, and is recommended in all scenarios.
```bash
- auto-round --model facebook/opt-125m --bits 4 --group_size 128 --format "auto_gptq,auto_awq,auto_round"
+ auto-round --model facebook/opt-125m --scheme "W4A16" --format "auto_gptq,auto_awq,auto_round"
```
- **Best Settings:**
@@ -155,7 +161,7 @@ CPU, Intel GPU, HPU and CUDA for both quantization and inference.
This setting provides the best accuracy in most scenarios but is 4–5× slower than the standard AutoRound recipe. It is especially recommended for 2-bit quantization and is a good choice if sufficient resources are available.
```bash
- auto-round-best --model facebook/opt-125m --bits 4 --group_size 128 --format "auto_gptq,auto_awq,auto_round"
+ auto-round-best --model facebook/opt-125m --scheme "W4A16" --format "auto_gptq,auto_awq,auto_round"
```
- **Light Settings:**
@@ -163,7 +169,7 @@ CPU, Intel GPU, HPU and CUDA for both quantization and inference.
This setting offers the best speed (2-3X faster than AutoRound), but it may cause a significant accuracy drop for small models and 2-bit quantization. It is recommended for 4-bit settings and models larger than 3B
```bash
- auto-round-light --model facebook/opt-125m --bits 4 --group_size 128 --format "auto_gptq,auto_awq,auto_round"
+ auto-round-light --model facebook/opt-125m --scheme "W4A16" --format "auto_gptq,auto_awq,auto_round"
```
### API usage
@@ -171,25 +177,18 @@ CPU, Intel GPU, HPU and CUDA for both quantization and inference.
This setting offers a better trade-off between accuracy and tuning cost, and is recommended in all scenarios.
```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound
-model_name = "facebook/opt-125m"
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-bits, group_size, sym = 4, 128, True
-autoround = AutoRound(
- model,
- tokenizer,
- bits=bits,
- group_size=group_size,
- sym=sym,
+model_name_or_path = "facebook/opt-125m"
+ar = AutoRound(
+ model_name_or_path,
+ scheme="W4A16",
# enable_torch_compile=True,
)
output_dir = "./tmp_autoround"
# format= 'auto_round'(default), 'auto_gptq', 'auto_awq'
-autoround.quantize_and_save(output_dir, format="auto_gptq,auto_awq,auto_round")
+ar.quantize_and_save(output_dir, format="auto_gptq,auto_awq,auto_round")
```
#### Mixed bits Usage
@@ -198,74 +197,55 @@ Auto-GPTQ and Auto-AWQ only support a limited set of mixed-bit configurations. I
vLLM and SGLang fuse MoE and QKV layers, so it's recommended not to assign different bit widths to these layers.
```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound
-model_name = "facebook/opt-125m"
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-bits, group_size, sym = 4, 128, True
+model_name_or_path = "facebook/opt-125m"
+
layer_config = { # Supports both full layer names and fuzzy (partial) matching
"model.decoder.layers.6.self_attn.out_proj": {"bits": 8, "group_size": 32},
"model.decoder.layers.*k_proj": {"bits": 2, "group_size": 32},
}
-autoround = AutoRound(
- model,
- tokenizer,
- bits=bits,
- group_size=group_size,
- sym=sym,
+ar = AutoRound(
+ model_name_or_path,
layer_config=layer_config,
)
output_dir = "./tmp_autoround"
-autoround.quantize_and_save(output_dir, format="auto_round")
+ar.quantize_and_save(output_dir, format="auto_round")
```
#### AutoRoundBest recipe
This setting provides the best accuracy in most scenarios but is 4–5× slower than the standard AutoRound recipe. It is especially recommended for 2-bit quantization and is a good choice if sufficient resources are available.
```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound
-model_name = "facebook/opt-125m"
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-bits, group_size, sym = 4, 128, True
-autoround = AutoRound(
- model, tokenizer, bits=bits, group_size=group_size, sym=sym, nsamples=512, iters=1000, low_gpu_mem_usage=True
-)
+model_name_or_path = "facebook/opt-125m"
+ar = AutoRound(model=model_name_or_path, scheme="W4A16", nsamples=512, iters=1000, low_gpu_mem_usage=True)
output_dir = "./tmp_autoround"
-autoround.quantize_and_save(output_dir, format="auto_round")
+ar.quantize_and_save(output_dir, format="auto_round")
```
#### AutoRoundLight recipe
This setting offers the best speed (2 - 3X faster than AutoRound), but it may cause a significant accuracy drop for small models and 2-bit quantization. It is recommended for 4-bit settings and models larger than 3B.
```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound
-model_name = "facebook/opt-125m"
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-bits, group_size, sym = 4, 128, True
-autoround = AutoRound(
- model,
- tokenizer,
- bits=bits,
- group_size=group_size,
- sym=sym,
+model_name_or_path = "facebook/opt-125m"
+
+ar = AutoRound(
+ model=model_name_or_path,
+ scheme="W4A16",
iters=50,
lr=5e-3,
)
output_dir = "./tmp_autoround"
-autoround.quantize_and_save(output_dir, format="auto_round")
+ar.quantize_and_save(output_dir, format="auto_round")
```
#### Recipe recommendation
-In conclusion, we recommend using **auto-round for INT4 and auto-round-best for INT2**. However, you may adjust the
+In conclusion, we recommend using **auto-round for W4A16 and auto-round-best for W2A16**. However, you may adjust the
configuration to suit your specific requirements and available resources.
W4G128 Average Accuracy of 13 tasks and Time Cost Results(Testing was conducted on the Nvidia A100 80G using the version
@@ -296,42 +276,31 @@ AutoRound also supports RTN (Round-To-Nearest) mode for fast, calibration-free b
For the GGUF format, we have optimized the RTN algorithm inspired by llamacpp. To use the original (pure) RTN algorithm instead, enable the `--disable_opt_rtn` option.
```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound
-model_name = "facebook/opt-125m"
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-bits, group_size, sym = 4, 32, True
-autoround = AutoRound(
- model,
- tokenizer,
- bits=bits,
- group_size=group_size,
- sym=sym,
+model_name_or_path = "facebook/opt-125m"
+ar = AutoRound(
+ model=model_name_or_path,
+ scheme="W4A16",
iters=0,
)
output_dir = "./tmp_autoround"
-autoround.quantize_and_save(output_dir, format="auto_round")
+ar.quantize_and_save(output_dir, format="auto_round")
```
### GGUF format
Experimental feature. This format is well-suited for CPU devices and is widely adopted by the community.
This format is well-suited for CPU devices and is widely adopted by the community.
```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound
-model_name = "facebook/opt-125m"
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-autoround = AutoRound(
- model,
- tokenizer,
+model_name_or_path = "facebook/opt-125m"
+ar = AutoRound(
+ model=model_name_or_path,
)
output_dir = "./tmp_autoround"
-autoround.quantize_and_save(output_dir, format="gguf:q4_k_m") # gguf:q*_k_s,gguf:q*_k_0,gguf:q*_k_1,
+ar.quantize_and_save(output_dir, format="gguf:q4_k_m") # gguf:q*_k_s,gguf:q*_k_0,gguf:q*_k_1,
```
@@ -360,29 +329,50 @@ The 3B and 14B models were evaluated on Qwen 2.5, the 8X7B model is Mixtral, whi
### Device/Multi-GPU setting in Quantization
-**The tuning device is specified using the `device` argument in AutoRound API, _not_ through the `device_map`
+**The tuning device is specified using the `device_map` argument in the AutoRound API, _not_ through the `device_map`
parameter used by Transformers.from_pretrained.**
+AutoRound tunes the model in a block-by-block manner. Although the block size is much smaller than the model size, it still requires a significant amount of GPU memory for tuning—typically 10 times the block size. This can lead to out-of-memory (OOM) errors when working with extremely large models.
+
+For strategies to reduce GPU memory usage, please refer to the [Reduced GPU Memory Usage](#adjust-hyperparameters)
+section below, where you can adjust hyperparameters to optimize memory consumption.
+
+If adjusting hyperparameters does not resolve the issue, a simple solution is to add more devices in device_map, for example:
+~~~python
+from auto_round import AutoRound
+
+model_name_or_path = "facebook/opt-125m"
+ar = AutoRound(
+ model=model_name_or_path,
+ device_map="0,1,2,3"
+)
+~~~
+
+or
+
+~~~bash
+CUDA_VISIBLE_DEVICES=0,1,2,3 auto-round --model "facebook/opt-125m" --scheme "W4A16" --device_map "auto"
+~~~
+
-There are typically two scenarios that require multi-GPU tuning: one is the calibration phase during LM head quantization, and the other is quantizing extremely large models (e.g., models larger than 100 GB).
+There are typically two scenarios that require multi-GPU tuning: one is the calibration phase mainly for lm-head quantization, and the other is quantizing extremely large models (e.g., models larger than 100 GB).
#### Enable multiple gpus calibration in lm_head quantization
For LM head tuning, AutoRound needs to cache the inputs to the lm-head, which requires the entire model to reside on
- the GPU for efficient calibration. If the model is too large to fit into a single GPU, AutoRound will prompt the user to use `--device '0,1'` to load the model across multiple GPUs.
-
-#### Enable multiple gpus tuning for extremely large model
-AutoRound tunes the model in a block-by-block manner. Although the block size is much smaller than the model size, it still requires a significant amount of GPU memory for tuning—typically 10 times the block size. This can lead to out-of-memory (OOM) errors when working with extremely large models.
+ the GPU for efficient calibration. If there is not enough VRAM, some layers will fall back to RTN mode.
-For strategies to reduce GPU memory usage, please refer to the [Reduced GPU Memory Usage](###Adjust Hyperparameters)
-section below, where you
-can adjust hyperparameters to optimize memory consumption.
+#### Manually set the device_map
-If adjusting hyperparameters does not resolve the issue, we also support mapping different layers within a block to
+
+Customized device map
+If device_map=auto does not correctly map the model, we also support mapping different layers within a block to
different devices by setting the `device_map` argument in the AutoRound API. For reference, we provide an example of
quantizing the DeepSeekV3-BF16 (1.4T) model using five 80GB GPUs.
-~~~python
+
+```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
+
model_name = "opensourcerelease/DeepSeek-R1-bf16"
tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -393,16 +383,23 @@ device_map = {}
for n, m in block.named_modules():
if isinstance(m, (torch.nn.Linear)):
- if "experts" in n and ("shared_experts" not in n) and int(n.split('.')[-2]) < 63:
+ if "experts" in n and ("shared_experts" not in n) and int(n.split(".")[-2]) < 63:
device = "cuda:1"
- elif "experts" in n and ("shared_experts" not in n) and int(n.split('.')[-2]) >= 63 and int(
- n.split('.')[-2]) < 128:
+ elif (
+ "experts" in n
+ and ("shared_experts" not in n)
+ and int(n.split(".")[-2]) >= 63
+ and int(n.split(".")[-2]) < 128
+ ):
device = "cuda:2"
- elif "experts" in n and ("shared_experts" not in n) and int(n.split('.')[-2]) >= 128 and int(
- n.split('.')[-2]) < 192:
+ elif (
+ "experts" in n
+ and ("shared_experts" not in n)
+ and int(n.split(".")[-2]) >= 128
+ and int(n.split(".")[-2]) < 192
+ ):
device = "cuda:3"
- elif "experts" in n and ("shared_experts" not in n) and int(
- n.split('.')[-2]) >= 192:
+ elif "experts" in n and ("shared_experts" not in n) and int(n.split(".")[-2]) >= 192:
device = "cuda:4"
else:
device = "cuda:0"
@@ -412,13 +409,19 @@ for n, m in block.named_modules():
from auto_round import AutoRound
-autoround = AutoRound(model=model, tokenizer=tokenizer, device_map=device_map, nsamples=512,
- batch_size=4, low_gpu_mem_usage=True, seqlen=2048,
- )
+autoround = AutoRound(
+ model=model,
+ tokenizer=tokenizer,
+ device_map=device_map,
+ nsamples=512,
+ batch_size=4,
+ low_gpu_mem_usage=True,
+ seqlen=2048,
+)
autoround.quantize()
autoround.save_quantized(format="auto_awq", output_dir="tmp_autoround")
-~~~
-
+```
+
### Adjust Hyperparameters
@@ -469,7 +472,7 @@ autoround.save_quantized(format="auto_awq", output_dir="tmp_autoround")
Currently only support in AutoRound format inference for this config
```bash
- auto-round --model_name facebook/opt-125m --bits 4 --group_size 128 --quant_lm_head --format "auto_round"
+ auto-round --model_name facebook/opt-125m --scheme "W4A16" --quant_lm_head --format "auto_round"
```