-
Notifications
You must be signed in to change notification settings - Fork 25.3k
/
quantization_config.py
148 lines (125 loc) 路 6.65 KB
/
quantization_config.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/env python
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import json
import os
from dataclasses import dataclass
from typing import Any, Dict, Union
from ..utils import logging
logger = logging.get_logger(__name__)
@dataclass
class BitsAndBytesConfig:
"""
This is a wrapper class about all possible attributes and features that you can play with a model that has been
loaded using `bitsandbytes`.
This replaces `load_in_8bit` therefore both options are mutually exclusive.
For now, only arguments that are relative to `LLM.int8()` are supported, therefore the arguments are all termed as
`llm_int8_*`. If more methods are added to `bitsandbytes`, then more arguments will be added to this class.
Args:
load_in_8bit (`bool`, *optional*, defaults to `False`):
This flag is used to enable 8-bit quantization with LLM.int8().
llm_int8_threshold (`float`, *optional*, defaults to 6):
This corresponds to the outlier threshold for outlier detection as described in `LLM.int8() : 8-bit Matrix
Multiplication for Transformers at Scale` paper: https://arxiv.org/abs/2208.07339 Any hidden states value
that is above this threshold will be considered an outlier and the operation on those values will be done
in fp16. Values are usually normally distributed, that is, most values are in the range [-3.5, 3.5], but
there are some exceptional systematic outliers that are very differently distributed for large models.
These outliers are often in the interval [-60, -6] or [6, 60]. Int8 quantization works well for values of
magnitude ~5, but beyond that, there is a significant performance penalty. A good default threshold is 6,
but a lower threshold might be needed for more unstable models (small models, fine-tuning).
llm_int8_skip_modules (`List[str]`, *optional*):
An explicit list of the modules that we do not want to convert in 8-bit. This is useful for models such as
Jukebox that has several heads in different places and not necessarily at the last position. For example
for `CausalLM` models, the last `lm_head` is kept in its original `dtype`.
llm_int8_enable_fp32_cpu_offload (`bool`, *optional*, defaults to `False`):
This flag is used for advanced use cases and users that are aware of this feature. If you want to split
your model in different parts and run some parts in int8 on GPU and some parts in fp32 on CPU, you can use
this flag. This is useful for offloading large models such as `google/flan-t5-xxl`. Note that the int8
operations will not be run on CPU.
kwargs (`Dict[str, Any]`, *optional*):
Additional parameters from which to initialize the configuration object.
"""
def __init__(
self,
load_in_8bit=False,
llm_int8_threshold=6.0,
llm_int8_skip_modules=None,
llm_int8_enable_fp32_cpu_offload=False,
**kwargs,
):
self.load_in_8bit = load_in_8bit
self.llm_int8_threshold = llm_int8_threshold
self.llm_int8_skip_modules = llm_int8_skip_modules
self.llm_int8_enable_fp32_cpu_offload = llm_int8_enable_fp32_cpu_offload
self.post_init()
def post_init(self):
r"""
Safety checker that arguments are correct - also replaces some NoneType arguments with their default values.
"""
if not isinstance(self.llm_int8_threshold, float):
raise ValueError("llm_int8_threshold must be a float")
if self.llm_int8_skip_modules is not None and not isinstance(self.llm_int8_skip_modules, list):
raise ValueError("llm_int8_skip_modules must be a list of strings")
if not isinstance(self.llm_int8_enable_fp32_cpu_offload, bool):
raise ValueError("llm_int8_enable_fp32_cpu_offload must be a boolean")
@classmethod
def from_dict(cls, config_dict, return_unused_kwargs, **kwargs):
"""
Instantiates a [`BitsAndBytesConfig`] from a Python dictionary of parameters.
Args:
config_dict (`Dict[str, Any]`):
Dictionary that will be used to instantiate the configuration object.
return_unused_kwargs (`bool`):
Whether or not to return a list of unused keyword arguments. Used for `from_pretrained` method in
`PreTrainedModel`.
kwargs (`Dict[str, Any]`):
Additional parameters from which to initialize the configuration object.
Returns:
[`BitsAndBytesConfig`]: The configuration object instantiated from those parameters.
"""
config = cls(**config_dict)
to_remove = []
for key, value in kwargs.items():
if hasattr(config, key):
setattr(config, key, value)
to_remove.append(key)
for key in to_remove:
kwargs.pop(key, None)
if return_unused_kwargs:
return config, kwargs
else:
return config
def to_json_file(self, json_file_path: Union[str, os.PathLike]):
"""
Save this instance to a JSON file.
Args:
json_file_path (`str` or `os.PathLike`):
Path to the JSON file in which this configuration instance's parameters will be saved.
use_diff (`bool`, *optional*, defaults to `True`):
If set to `True`, only the difference between the config instance and the default
`BitsAndBytesConfig()` is serialized to JSON file.
"""
with open(json_file_path, "w", encoding="utf-8") as writer:
config_dict = self.to_dict()
json_string = json.dumps(config_dict, indent=2, sort_keys=True) + "\n"
writer.write(json_string)
def to_dict(self) -> Dict[str, Any]:
"""
Serializes this instance to a Python dictionary. Returns:
`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
"""
output = copy.deepcopy(self.__dict__)
return output