In [None]:
#!pip install transformers

In [1]:
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration, InstructBlipConfig, AutoModelForVision2Seq
import torch
from PIL import Image
import requests

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
#!pip install accelerate

In [2]:
from accelerate import infer_auto_device_map, init_empty_weights

In [3]:
# Load the model configuration.
config = InstructBlipConfig.from_pretrained("Salesforce/instructblip-vicuna-13b")

# Initialize the model with the given configuration.
with init_empty_weights():
    model = AutoModelForVision2Seq.from_config(config)
    model.tie_weights()

# Infer device map based on the available resources.
device_map = infer_auto_device_map(model, max_memory={0: "11GiB", 1: "11GiB", 2: "11GiB", 3: "11GiB"},
                                   no_split_module_classes=['InstructBlipEncoderLayer', 'InstructBlipQFormerLayer',
                                                            'LlamaDecoderLayer'])
device_map['language_model.lm_head'] = device_map['language_projection'] = device_map[('language_model.model'
                                                                                       '.embed_tokens')]

offload = ""
# Load the processor and model for image processing.
processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-13b", device_map="auto")
model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-13b",
                                                             device_map=device_map,
                                                             offload_folder=offload, offload_state_dict=True)

Downloading (…)lve/main/config.json: 100%|██████████| 6.66k/6.66k [00:00<00:00, 15.6MB/s]
The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
Downloading (…)okenizer_config.json: 100%|██████████| 343/343 [00:00<00:00, 1.10MB/s]
Downloading (…)_tokenizer/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 13.3MB/s]
Downloading (…)nizer/tokenizer.json: 100%|██████████| 712k/712k [00:00<00:00, 1.38MB/s]
Downloading (…)er/added_tokens.json: 100%|██████████| 21.0/21.0 [00:00<00:00, 77.5kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 149/149 [00:00<00:00, 555kB/s]
Downloading (…)rocessor_config.json: 100%|██████████| 439/439 [00:00<00:00, 1.68MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 442/442 [00:00<00:00, 1.45MB/s]
Downloading tokenizer.model: 100%|██████████| 500k/500k [00:00<00:00, 12.3MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.84M/1.84M [00:00<00:00, 5.30MB/s]
Downloading (…)in/

In [2]:
model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map = "auto")
processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

Downloading (…)l-00002-of-00004.bin: 100%|██████████| 9.96G/9.96G [17:34<00:00, 9.45MB/s]
Downloading (…)l-00003-of-00004.bin: 100%|██████████| 9.92G/9.92G [17:29<00:00, 9.45MB/s]
Downloading (…)l-00004-of-00004.bin: 100%|██████████| 1.87G/1.87G [03:18<00:00, 9.47MB/s]
Downloading shards: 100%|██████████| 4/4 [38:23<00:00, 575.79s/it]
The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
Loading checkpoint shards: 100%|██████████| 4/4 [00:17<00:00,  4.44s/it]
Downloading (…)okenizer_config.json: 100%|██████████| 343/343 [00:00<00:00, 1.21MB/s]
Downloading (…)_tokenizer/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 14.3MB/s]
Downloading (…)nizer/tokenizer.json: 100%|██████████| 712k/712k [00:00<00:00, 12.2MB/s]
Downloading (…)er/added_tokens.json: 100%|██████████| 21.0/21.0 [00:00<00:00, 82.2kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 149/149 [00:00<00:00, 584kB/s]
Downloading (…)rocessor_config.json: 100

# Accelerate

In [None]:
!pip install accelerate

Collecting accelerate
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━[0m [32m143.4/244.2 kB[0m [31m4.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.21.0


In [None]:
!accelerate config

In which compute environment are you running?
Please input a choice index (starting from 0), and press enter
 ➔  [32mThis machine[0m
    AWS (Amazon SageMaker)
[2A[?25l
[32mThis machine[0m
Which type of machine are you using?
Please input a choice index (starting from 0), and press enter
 ➔  [32mNo distributed training[0m
    multi-CPU
    multi-XPU
    multi-GPU
    multi-NPU
    TPU
[6A[?25l
[32mNo distributed training[0m
[?25hDo you want to run your training on CPU only (even if a GPU / Apple Silicon device is available)? [yes/NO]:NO
Do you wish to optimize your script with torch dynamo?[yes/NO]:NO
Do you want to use DeepSpeed? [yes/NO]: NO
What GPU(s) (by id) should be used for training on this machine as a comma-seperated list? [all]:NO
Do you wish to use FP16 or BF16 (mixed precision)?
Please input a choice index (starting from 0), and press enter
 ➔  [32mno[0m
    fp16
    bf16
    fp8
[4A[?25lno
[32mno[0m
[?25haccelerate configuration saved at /root/.cache/h

In [None]:
!cat /root/.cache/huggingface/accelerate/default_config.yaml

compute_environment: LOCAL_MACHINE
distributed_type: 'NO'
downcast_bf16: 'no'
gpu_ids: 'NO'
machine_rank: 0
main_training_function: main
mixed_precision: 'no'
num_machines: 1
num_processes: 1
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false


In [None]:
from huggingface_hub import snapshot_download
checkpoint = "Salesforce/instructblip-vicuna-7b"
weights_location = snapshot_download(repo_id=checkpoint)

Fetching 19 files:   0%|          | 0/19 [00:00<?, ?it/s]

Downloading (…)22/added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Downloading (…)2ef22/.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/439 [00:00<?, ?B/s]

Downloading (…)944312ef22/README.md:   0%|          | 0.00/2.14k [00:00<?, ?B/s]

Downloading (…)4312ef22/config.json:   0%|          | 0.00/6.66k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/149 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/107k [00:00<?, ?B/s]

Downloading (…)er/added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Downloading (…)nizer/tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/343 [00:00<?, ?B/s]

Downloading (…)_tokenizer/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/209 [00:00<?, ?B/s]

Downloading (…)2ef22/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)l-00001-of-00004.bin:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/442 [00:00<?, ?B/s]

Downloading (…)l-00003-of-00004.bin:   0%|          | 0.00/9.92G [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)l-00002-of-00004.bin:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

Downloading (…)l-00004-of-00004.bin:   0%|          | 0.00/1.87G [00:00<?, ?B/s]

In [None]:
from accelerate import infer_auto_device_map, init_empty_weights, load_checkpoint_and_dispatch

with init_empty_weights():
  model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b")
  #LlamaForCausalLM

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
model = load_checkpoint_and_dispatch(
    model, checkpoint=weights_location,
    max_memory = {0: "10GiB", "cpu":"25GiB"},
    #device_map="auto",
    no_split_module_classes=["LlamaDecoderLayer"]
)



In [None]:
with init_empty_weights():
  processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

In [None]:
processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

In [None]:
model

InstructBlipForConditionalGeneration(
  (vision_model): InstructBlipVisionModel(
    (embeddings): InstructBlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
    )
    (encoder): InstructBlipEncoder(
      (layers): ModuleList(
        (0-38): 39 x InstructBlipEncoderLayer(
          (self_attn): InstructBlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=1408, out_features=4224, bias=True)
            (projection): Linear(in_features=1408, out_features=1408, bias=True)
          )
          (layer_norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
          (mlp): InstructBlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1408, out_features=6144, bias=True)
            (fc2): Linear(in_features=6144, out_features=1408, bias=True)
          )
          (layer_norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        

# Inference

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
#model.to(device)
url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
prompt = "What is unusual about this image?"
inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)

In [7]:
print(repr(model))

InstructBlipForConditionalGeneration(
  (vision_model): InstructBlipVisionModel(
    (embeddings): InstructBlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
    )
    (encoder): InstructBlipEncoder(
      (layers): ModuleList(
        (0-38): 39 x InstructBlipEncoderLayer(
          (self_attn): InstructBlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=1408, out_features=4224, bias=True)
            (projection): Linear(in_features=1408, out_features=1408, bias=True)
          )
          (layer_norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
          (mlp): InstructBlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1408, out_features=6144, bias=True)
            (fc2): Linear(in_features=6144, out_features=1408, bias=True)
          )
          (layer_norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        

In [5]:
outputs = model.generate(
    **inputs,
    do_sample=False,
    num_beams=5,
    max_length=256,
    min_length=1,
    top_p=0.9,
    repetition_penalty=1.5,
    length_penalty=1.0,
    temperature=1,
)
generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
print(generated_text)



RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0!