
[bnb] Fix blip2 4bit #23895

Closed
wants to merge 2 commits

Conversation

younesbelkada
Contributor

What does this PR do?

Fixes #23839

Indeed, for models such as Blip2 that have the lm head inside a submodule (rather than directly at the top level of the model), the lm head gets converted to 4-bit / 8-bit, leading to unexpected behavior for 4-bit models. This PR fixes that by making sure to consider the last term after . when creating modules_not_to_convert.
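A minimal sketch of the matching idea (hypothetical illustration only, not the actual diff; should_convert is an invented helper name):

# Hypothetical sketch: treat an entry such as "lm_head" in modules_to_not_convert
# as also matching the last component of a module's full name, so that
# "language_model.lm_head" (as in Blip2) is kept out of 4-bit / 8-bit conversion.
def should_convert(full_module_name, modules_to_not_convert):
    last = full_module_name.split(".")[-1]
    return full_module_name not in modules_to_not_convert and last not in modules_to_not_convert

print(should_convert("language_model.lm_head", ["lm_head"]))                     # False -> kept in higher precision
print(should_convert("language_model.model.decoder.layers.0.fc1", ["lm_head"]))  # True  -> quantized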

cc @sgugger

younesbelkada mentioned this pull request on May 31, 2023
@HuggingFaceDocBuilderDev

The docs for this PR live here. All of your documentation changes will be reflected on that endpoint.

Collaborator

@sgugger left a comment

This feels a tiny bit brittle. Are you sure it doesn't break any other model quantization?

@younesbelkada
Contributor Author

It should be all good, I have verified that the slow tests pass for 8-bit and 4-bit. Let me know if there is anything in particular I should look at. To my understanding, this only affects Blip2, as it is the only model (that I know of) that has an lm head as part of a submodule.

huggingface deleted a comment from the github-actions bot on Jul 6, 2023
@younesbelkada
Contributor Author

Hmm, I'm getting some gibberish output with the fix; I need to investigate more.

@github-actions

This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread.

Please note that issues that do not follow the contributing guidelines are likely to be ignored.

github-actions bot closed this on Aug 7, 2023
@kevinknights29

I know this was closed, but I'm getting the following error:
FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first.

Runner: a Docker container using the python:3.9 image.

Usage:

input_ids = tokenizer(prompt, return_tensors="pt").input_ids
input_ids = input_ids.to("cuda:0" if torch.cuda.is_available() else "cpu")
outputs = model.generate(input_ids, max_length=500)

Model class:

class ModelLoader:
    def __init__(self, model_path: str):
        self.model_path = model_path
        self.config = AutoConfig.from_pretrained(
            self.model_path,
            trust_remote_code=True,
            use_auth_token=os.getenv("HUGGINGFACE_TOKEN"),
        )
        self.model = self._load_model()
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_path,
            use_auth_token=os.getenv("HUGGINGFACE_TOKEN"),
        )

    def _load_model(self):
        if torch.cuda.is_available():
            return AutoModelForCausalLM.from_pretrained(
                self.model_path,
                config=self.config,
                trust_remote_code=True,
                device_map="cuda:0", # or "auto"
                use_auth_token=os.getenv("HUGGINGFACE_TOKEN"),
            )
        return AutoModelForCausalLM.from_pretrained(
            self.model_path,
            config=self.config,
            trust_remote_code=True,
            load_in_8bit=True,
            device_map="cpu",
            torch_dtype=torch.float16,
            quantization_config=BitsAndBytesConfig(
                load_in_4bit=True, 
                bnb_4bit_compute_dtype=torch.bfloat16,
                llm_int8_enable_fp32_cpu_offload=True,
            ),
            use_auth_token=os.getenv("HUGGINGFACE_TOKEN"),
        )

Complete traceback:

2023-09-01 18:34:47 Traceback (most recent call last):
2023-09-01 18:34:47   File "/usr/local/lib/python3.9/site-packages/celery/app/trace.py", line 477, in trace_task
2023-09-01 18:34:47     R = retval = fun(*args, **kwargs)
2023-09-01 18:34:47   File "/usr/local/lib/python3.9/site-packages/celery/app/trace.py", line 760, in __protected_call__
2023-09-01 18:34:47     return self.run(*args, **kwargs)
2023-09-01 18:34:47   File "/app/src/celery/celery.py", line 29, in generate_text_task
2023-09-01 18:34:47     time, memory, outputs = generate_output(
2023-09-01 18:34:47   File "/app/src/utils/utils.py", line 36, in wrapper
2023-09-01 18:34:47     result, exec_time = func(*args, **kwargs)
2023-09-01 18:34:47   File "/app/src/utils/utils.py", line 16, in wrapper
2023-09-01 18:34:47     result = func(*args, **kwargs)
2023-09-01 18:34:47   File "/app/src/utils/utils.py", line 53, in generate_output
2023-09-01 18:34:47     outputs = model.generate(input_ids, max_length=500)
2023-09-01 18:34:47   File "/usr/local/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
2023-09-01 18:34:47     return func(*args, **kwargs)
2023-09-01 18:34:47   File "/usr/local/lib/python3.9/site-packages/transformers/generation/utils.py", line 1572, in generate
2023-09-01 18:34:47     return self.sample(
2023-09-01 18:34:47   File "/usr/local/lib/python3.9/site-packages/transformers/generation/utils.py", line 2619, in sample
2023-09-01 18:34:47     outputs = self(
2023-09-01 18:34:47   File "/usr/local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
2023-09-01 18:34:47     return forward_call(*args, **kwargs)
2023-09-01 18:34:47   File "/usr/local/lib/python3.9/site-packages/accelerate/hooks.py", line 165, in new_forward
2023-09-01 18:34:47     output = old_forward(*args, **kwargs)
2023-09-01 18:34:47   File "/usr/local/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 688, in forward
2023-09-01 18:34:47     outputs = self.model(
2023-09-01 18:34:47   File "/usr/local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
2023-09-01 18:34:47     return forward_call(*args, **kwargs)
2023-09-01 18:34:47   File "/usr/local/lib/python3.9/site-packages/accelerate/hooks.py", line 165, in new_forward
2023-09-01 18:34:47     output = old_forward(*args, **kwargs)
2023-09-01 18:34:47   File "/usr/local/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 578, in forward
2023-09-01 18:34:47     layer_outputs = decoder_layer(
2023-09-01 18:34:47   File "/usr/local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
2023-09-01 18:34:47     return forward_call(*args, **kwargs)
2023-09-01 18:34:47   File "/usr/local/lib/python3.9/site-packages/accelerate/hooks.py", line 165, in new_forward
2023-09-01 18:34:47     output = old_forward(*args, **kwargs)
2023-09-01 18:34:47   File "/usr/local/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 292, in forward
2023-09-01 18:34:47     hidden_states, self_attn_weights, present_key_value = self.self_attn(
2023-09-01 18:34:47   File "/usr/local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
2023-09-01 18:34:47     return forward_call(*args, **kwargs)
2023-09-01 18:34:47   File "/usr/local/lib/python3.9/site-packages/accelerate/hooks.py", line 165, in new_forward
2023-09-01 18:34:47     output = old_forward(*args, **kwargs)
2023-09-01 18:34:47   File "/usr/local/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 194, in forward
2023-09-01 18:34:47     query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
2023-09-01 18:34:47   File "/usr/local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
2023-09-01 18:34:47     return forward_call(*args, **kwargs)
2023-09-01 18:34:47   File "/usr/local/lib/python3.9/site-packages/accelerate/hooks.py", line 165, in new_forward
2023-09-01 18:34:47     output = old_forward(*args, **kwargs)
2023-09-01 18:34:47   File "/usr/local/lib/python3.9/site-packages/bitsandbytes/nn/modules.py", line 248, in forward
2023-09-01 18:34:47     out = bnb.matmul_4bit(x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state)
2023-09-01 18:34:47   File "/usr/local/lib/python3.9/site-packages/bitsandbytes/autograd/_functions.py", line 567, in matmul_4bit
2023-09-01 18:34:47     assert quant_state is not None
2023-09-01 18:34:47 AssertionError

@SunMarc
Member

SunMarc commented Sep 22, 2023

Hi @kevinknights29, I see that in your script you are trying to load in 8-bit and in 4-bit at the same time. Please select only one option.

return AutoModelForCausalLM.from_pretrained(
            self.model_path,
            config=self.config,
            trust_remote_code=True,
            # either remove the load_in_8bit arg
            load_in_8bit=True,
            device_map="cpu",
            torch_dtype=torch.float16,
            # or remove quantization_config
            quantization_config=BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
                llm_int8_enable_fp32_cpu_offload=True,
            ),
        )
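For reference, a 4-bit-only variant of that call (a sketch, not a verified drop-in: it keeps the quantization_config, drops load_in_8bit, and uses device_map="auto" as in the script below and in the later comments):

        # Sketch: a single quantization option (4-bit); device placement left to accelerate.
        return AutoModelForCausalLM.from_pretrained(
            self.model_path,
            config=self.config,
            trust_remote_code=True,
            device_map="auto",
            quantization_config=BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
            ),
            use_auth_token=os.getenv("HUGGINGFACE_TOKEN"),
        )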

On my side, I was able to execute the following script with:

  • transformers version: 4.34.0.dev0 (main branch)
  • accelerate version: 0.23
  • bitsandbytes version: 0.41.1
import torch
from transformers import Blip2ForConditionalGeneration, Blip2Processor, BitsAndBytesConfig
from PIL import Image
import requests

nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-6.7b-coco")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-6.7b-coco", device_map='auto', quantization_config=nf4_config)

def prepare_img():
    url = "https://huggingface.co/hf-internal-testing/blip-test-image/resolve/main/demo.jpg"
    image = Image.open(requests.get(url, stream=True).raw)
    return image

image = prepare_img()
inputs = processor(images=[image, image], return_tensors="pt").to(dtype=torch.float16)

predictions = model.generate(**inputs, num_beams=2)
print(processor.batch_decode(predictions, skip_special_tokens=True)[0].strip())
# print -> a woman sitting on the beach with her dog

@robinsonmhj

I am getting a similar error while using the Llama 2 7B model, and I am using the latest version of transformers.

Here is the code:

import torch
from transformers import AutoTokenizer, set_seed, BitsAndBytesConfig, AutoModelForCausalLM

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
model_name = 'llm-models/Llama-2-7b-hf'
model = AutoModelForCausalLM.from_pretrained(
    model_name,  
    quantization_config=bnb_config,
    device_map="cuda",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name, device_map='cuda')
def generate_text(prompt):
    # Tokenize the prompt
    inputs = tokenizer.encode(prompt, return_tensors='pt')
    
    print(f'inputs is {inputs} on {inputs.device}')
    
    inputs = inputs.to('cuda:0')
    
    print(f'inputs is {inputs} on {inputs.device}')
    
    # Generate a response
    outputs = model.generate(inputs)
    
    # Decode the response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return response

prompt = 'User1: Hey, I need a new laptop. Which one should I buy?'
response = generate_text(prompt)
print(response)
Package info:
transformers==4.38.1
accelerate==0.21.0
bitsandbytes==0.42.0

I also tried 4.34; it doesn't work either. Besides that, I checked #23895, and it doesn't look like it is in any release branch or the master branch.

Here is the error I get:

FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first.
AssertionError Traceback (most recent call last)
Cell In[18], line 2
1 prompt = 'User1: Hey, I need a new laptop. Which one should I buy?'
----> 2 response = generate_text(prompt)
3 print(response)

Cell In[17], line 13, in generate_text(prompt)
10 print(f'inputs is {inputs} on {inputs.device}')
12 # Generate a response
---> 13 outputs = model.generate(inputs)
15 # Decode the response
16 response = tokenizer.decode(outputs[0], skip_special_tokens=True)

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/torch/utils/_contextlib.py:115, in context_decorator..decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/transformers/generation/utils.py:1345, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, **kwargs)
1337 logger.warning(
1338 "A decoder-only architecture is being used, but right-padding was detected! For correct "
1339 "generation results, please set padding_side='left' when initializing the tokenizer."
1340 )
1342 if self.config.is_encoder_decoder and "encoder_outputs" not in model_kwargs:
1343 # if model is encoder decoder encoder_outputs are created
1344 # and added to model_kwargs
-> 1345 model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(
1346 inputs_tensor, model_kwargs, model_input_name
1347 )
1349 # 5. Prepare input_ids which will be used for auto-regressive generation
1350 if self.config.is_encoder_decoder:

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/transformers/generation/utils.py:644, in GenerationMixin._prepare_encoder_decoder_kwargs_for_generation(self, inputs_tensor, model_kwargs, model_input_name)
642 encoder_kwargs["return_dict"] = True
643 encoder_kwargs[model_input_name] = inputs_tensor
--> 644 model_kwargs["encoder_outputs"]: ModelOutput = encoder(**encoder_kwargs)
646 return model_kwargs

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
-> 1518 return self._call_impl(*args, **kwargs)

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
1522 # If we don't have any hooks, we want to skip the rest of the logic in
1523 # this function, and just call forward.
1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527 return forward_call(*args, **kwargs)
1529 try:
1530 result = None

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/accelerate/hooks.py:165, in add_hook_to_module..new_forward(*args, **kwargs)
163 output = old_forward(*args, **kwargs)
164 else:
--> 165 output = old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py:1094, in T5Stack.forward(self, input_ids, attention_mask, encoder_hidden_states, encoder_attention_mask, inputs_embeds, head_mask, cross_attn_head_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
1081 layer_outputs = checkpoint(
1082 create_custom_forward(layer_module),
1083 hidden_states,
(...)
1091 None, # past_key_value is always None with gradient checkpointing
1092 )
1093 else:
-> 1094 layer_outputs = layer_module(
1095 hidden_states,
1096 attention_mask=extended_attention_mask,
1097 position_bias=position_bias,
1098 encoder_hidden_states=encoder_hidden_states,
1099 encoder_attention_mask=encoder_extended_attention_mask,
1100 encoder_decoder_position_bias=encoder_decoder_position_bias,
1101 layer_head_mask=layer_head_mask,
1102 cross_attn_layer_head_mask=cross_attn_layer_head_mask,
1103 past_key_value=past_key_value,
1104 use_cache=use_cache,
1105 output_attentions=output_attentions,
1106 )
1108 # layer_outputs is a tuple with:
1109 # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights)
1110 if use_cache is False:

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
-> 1518 return self._call_impl(*args, **kwargs)

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
1522 # If we don't have any hooks, we want to skip the rest of the logic in
1523 # this function, and just call forward.
1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527 return forward_call(*args, **kwargs)
1529 try:
1530 result = None

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/accelerate/hooks.py:165, in add_hook_to_module..new_forward(*args, **kwargs)
163 output = old_forward(*args, **kwargs)
164 else:
--> 165 output = old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py:694, in T5Block.forward(self, hidden_states, attention_mask, position_bias, encoder_hidden_states, encoder_attention_mask, encoder_decoder_position_bias, layer_head_mask, cross_attn_layer_head_mask, past_key_value, use_cache, output_attentions, return_dict)
691 else:
692 self_attn_past_key_value, cross_attn_past_key_value = None, None
--> 694 self_attention_outputs = self.layer[0](
695 hidden_states,
696 attention_mask=attention_mask,
697 position_bias=position_bias,
698 layer_head_mask=layer_head_mask,
699 past_key_value=self_attn_past_key_value,
700 use_cache=use_cache,
701 output_attentions=output_attentions,
702 )
703 hidden_states, present_key_value_state = self_attention_outputs[:2]
704 attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
-> 1518 return self._call_impl(*args, **kwargs)

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
1522 # If we don't have any hooks, we want to skip the rest of the logic in
1523 # this function, and just call forward.
1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527 return forward_call(*args, **kwargs)
1529 try:
1530 result = None

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/accelerate/hooks.py:165, in add_hook_to_module..new_forward(*args, **kwargs)
163 output = old_forward(*args, **kwargs)
164 else:
--> 165 output = old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py:601, in T5LayerSelfAttention.forward(self, hidden_states, attention_mask, position_bias, layer_head_mask, past_key_value, use_cache, output_attentions)
590 def forward(
591 self,
592 hidden_states,
(...)
598 output_attentions=False,
599 ):
600 normed_hidden_states = self.layer_norm(hidden_states)
--> 601 attention_output = self.SelfAttention(
602 normed_hidden_states,
603 mask=attention_mask,
604 position_bias=position_bias,
605 layer_head_mask=layer_head_mask,
606 past_key_value=past_key_value,
607 use_cache=use_cache,
608 output_attentions=output_attentions,
609 )
610 hidden_states = hidden_states + self.dropout(attention_output[0])
611 outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
-> 1518 return self._call_impl(*args, **kwargs)

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
1522 # If we don't have any hooks, we want to skip the rest of the logic in
1523 # this function, and just call forward.
1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527 return forward_call(*args, **kwargs)
1529 try:
1530 result = None

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/accelerate/hooks.py:165, in add_hook_to_module..new_forward(*args, **kwargs)
163 output = old_forward(*args, **kwargs)
164 else:
--> 165 output = old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py:520, in T5Attention.forward(self, hidden_states, mask, key_value_states, position_bias, past_key_value, layer_head_mask, query_length, use_cache, output_attentions)
517 return hidden_states
519 # get query states
--> 520 query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head)
522 # get key/value states
523 key_states = project(
524 hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None
525 )

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
-> 1518 return self._call_impl(*args, **kwargs)

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
1522 # If we don't have any hooks, we want to skip the rest of the logic in
1523 # this function, and just call forward.
1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527 return forward_call(*args, **kwargs)
1529 try:
1530 result = None

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/accelerate/hooks.py:165, in add_hook_to_module..new_forward(*args, **kwargs)
163 output = old_forward(*args, **kwargs)
164 else:
--> 165 output = old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/bitsandbytes/nn/modules.py:256, in Linear4bit.forward(self, x)
253 x = x.to(self.compute_dtype)
255 bias = None if self.bias is None else self.bias.to(self.compute_dtype)
--> 256 out = bnb.matmul_4bit(x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state)
258 out = out.to(inp_dtype)
260 return out

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:566, in matmul_4bit(A, B, quant_state, out, bias)
565 def matmul_4bit(A: tensor, B: tensor, quant_state: F.QuantState, out: tensor = None, bias=None):
--> 566 assert quant_state is not None
567 if A.numel() == A.shape[-1] and A.requires_grad == False:
568 if A.shape[-1] % quant_state.blocksize != 0:

AssertionError:

@robinsonmhj

robinsonmhj commented Feb 22, 2024

Found the issue: changing device_map to 'auto' fixes it. Can anyone explain why?
Instead of

model = AutoModelForCausalLM.from_pretrained(
    model_name,  
    quantization_config=bnb_config,
    device_map="cuda",
    trust_remote_code=True,
)

it should be

model = AutoModelForCausalLM.from_pretrained(
    model_name,  
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

The transformers and accelerate versions are as below:
transformers==4.31.0
accelerate==0.21.0

Development

Successfully merging this pull request may close these issues.

4bit Blip2 compatibility
6 participants