[Neural Speed] Load model from ModelScope (#1382)
Co-authored-by: Wenxin Zhang <wenxin.zhang@intel.com>
intellinjun and VincyZhang committed Mar 27, 2024
1 parent 584ed50 commit 20ae003
Showing 9 changed files with 54 additions and 11 deletions.
16 changes: 16 additions & 0 deletions README.md
@@ -216,6 +216,22 @@ model = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True)
outputs = model.generate(inputs)
```

You can also load a PyTorch model from ModelScope.
>**Note**: requires the `modelscope` package (`pip install modelscope`).
```python
from transformers import TextStreamer
from modelscope import AutoTokenizer
from intel_extension_for_transformers.transformers import AutoModelForCausalLM
model_name = "qwen/Qwen-7B"     # ModelScope model_id or local model path
prompt = "Once upon a time, there existed a little girl,"

model = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True, model_hub="modelscope")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
inputs = tokenizer(prompt, return_tensors="pt").input_ids
streamer = TextStreamer(tokenizer)
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
```
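
If you want to fetch the weights ahead of time (for example, on a machine that will later run offline), ModelScope's `snapshot_download` helper can populate the local cache first. A minimal sketch using the same model id as above (`snapshot_download` comes from the `modelscope` package, not from this commit):

```python
from modelscope import snapshot_download

# Download the ModelScope repo (or reuse a cached copy) and return its local path;
# that path can then be passed to from_pretrained like any local model directory.
local_dir = snapshot_download("qwen/Qwen-7B")
```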

You can also load low-bit models quantized by the GPTQ/AWQ/RTN/AutoRound algorithms.
```python
from transformers import AutoTokenizer
@@ -13,4 +13,4 @@ tiktoken
py-cpuinfo
cmake
gguf
-neural-speed
+neural-speed==1.0a0
2 changes: 1 addition & 1 deletion examples/huggingface/neural_speed/requirements.txt
@@ -1,5 +1,5 @@
intel_extension_for_transformers
-neural-speed
+neural-speed==1.0a0
git+https://github.com/EleutherAI/lm-evaluation-harness.git@cc9778fbe4fa1a709be2abed9deb6180fd40e7e2
sentencepiece
gguf
@@ -0,0 +1,17 @@
--extra-index-url https://download.pytorch.org/whl/cpu
accelerate
auto-gptq
cmake
datasets
einops
gguf
neural-speed==1.0a0
numpy
peft
protobuf<3.20
py-cpuinfo
sentencepiece
tiktoken
torch==2.2.0+cpu
transformers
transformers_stream_generator
@@ -8,7 +8,7 @@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@cc9778fbe4fa1a709be2
huggingface_hub
intel_extension_for_pytorch==2.2.0
neural-compressor
-neural_speed
+neural_speed==1.0a0
numpy==1.23.5
onnx>=1.15.0
optimum
@@ -7,7 +7,7 @@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@cc9778fbe4fa1a709be2
huggingface_hub
intel_extension_for_pytorch==2.2.0
neural-compressor
-neural_speed
+neural_speed==1.0a0
numpy==1.23.5
optimum
optimum-intel
@@ -40,7 +40,7 @@ langid
librosa
markdown
neural-compressor
-neural_speed
+neural_speed==1.0a0
num2words
numba
numpy==1.23.5
@@ -281,6 +281,8 @@ class _BaseQBitsAutoModelClass:
"qwen",
"phi",
"whisper",
"qwen2",
"gemma",
]

model_type_list_for_gptq = [
@@ -361,12 +363,19 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         )

         config = kwargs.pop("config", None)
+        model_hub = kwargs.pop("model_hub", "huggingface")

         if not isinstance(config, PretrainedConfig):
-            config, _ = AutoConfig.from_pretrained(
-                pretrained_model_name_or_path,
-                return_unused_kwargs=True,
-                **kwargs,
+            if model_hub == "modelscope":
+                import modelscope  # pylint: disable=E0401
+                config = modelscope.AutoConfig.from_pretrained(pretrained_model_name_or_path,
+                                                               trust_remote_code=True)
+            else:
+                config, _ = AutoConfig.from_pretrained(
+                    pretrained_model_name_or_path,
+                    return_unused_kwargs=True,
+                    **kwargs,
+
             )

         quantization_config = kwargs.pop("quantization_config", None)
@@ -541,7 +550,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
             from neural_speed import Model

             model = Model()
-            model.init(
+            model.init(  # pylint: disable=E1123
                 pretrained_model_name_or_path,
                 weight_dtype=quantization_config.weight_dtype,
                 alg=quantization_config.scheme,
@@ -557,6 +566,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                 use_gptq=quantization_config.quant_method.value == "gptq"
                 or quantization_config.quant_method.value == "autoround",
                 use_awq=quantization_config.quant_method.value == "awq",
+                model_hub=model_hub,
             )
             model.quantization_config = quantization_config
             return model
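
Taken together, the changes above thread the single new keyword `model_hub` from the public `from_pretrained` API down to Neural Speed's `Model.init`. A minimal sketch of the resulting call, mirroring the README example added in this commit (the model id is illustrative):

```python
from intel_extension_for_transformers.transformers import AutoModelForCausalLM

# model_hub="modelscope" resolves the model id against ModelScope instead of
# the Hugging Face Hub; omitting it keeps the previous default, "huggingface".
model = AutoModelForCausalLM.from_pretrained(
    "qwen/Qwen-7B",          # ModelScope model_id, or a local path
    load_in_4bit=True,       # quantize on load through Neural Speed
    model_hub="modelscope",  # new kwarg introduced by this commit
)
```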
2 changes: 1 addition & 1 deletion tests/requirements.txt
@@ -11,7 +11,7 @@ git+https://github.com/intel/neural-compressor.git
intel-extension-for-pytorch==2.2.0
intel-tensorflow==2.14.0
mlflow
-neural-speed
+neural-speed==1.0a0
nlpaug==1.1.9
onnx==1.15.0
onnxruntime==1.17.1