#### Install dependencies

In [1]:
# Fetch the DeepSeek-VL sources; later cells change into this checkout and install it.
!git clone https://github.com/deepseek-ai/DeepSeek-VL

Cloning into 'DeepSeek-VL'...
remote: Enumerating objects: 95, done.[K
remote: Counting objects: 100% (34/34), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 95 (delta 19), reused 5 (delta 5), pack-reused 61[K
Receiving objects: 100% (95/95), 11.16 MiB | 4.82 MiB/s, done.
Resolving deltas: 100% (28/28), done.


In [1]:
import os

# Enter the cloned repository so the editable install runs from its root.
# The guard keeps this cell idempotent: re-running it while already inside the
# checkout would otherwise look for a nested "DeepSeek-VL" directory and fail.
if os.path.basename(os.getcwd()) != "DeepSeek-VL":
    os.chdir("DeepSeek-VL")

In [None]:
!pip install -U -e .
!pip install -U -q quanto
!pip install git+https://github.com/huggingface/transformers.git

#### Create quanto configuration

In [2]:
from transformers import QuantoConfig

# Request int8 weights; this config is handed to `from_pretrained` when the model is loaded.
quanto_config = QuantoConfig(weights="int8")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Imports, plus a monkey patch of `collections` that must run before `deepseek_vl` is imported
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import collections
import collections.abc
# Re-export every public name of `collections.abc` on the `collections` module itself.
# NOTE(review): presumably a dependency of `deepseek_vl` still uses the old aliases
# (e.g. `collections.Mapping`), which is why this runs before the imports below -- confirm.
for type_name in collections.abc.__all__:
    setattr(collections, type_name, getattr(collections.abc, type_name))
from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM
from deepseek_vl.utils.io import load_pil_images

#### Load model

In [4]:
# specify the path to the model
# model identifier handed to every `from_pretrained` call below
model_path = "deepseek-ai/deepseek-vl-1.3b-chat"

# the chat processor also carries the tokenizer used for decoding later on
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

# load the model with the Quanto quantization config created earlier
vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    quantization_config=quanto_config,
)
# cast to bfloat16, move to the GPU, then switch to evaluation mode
vl_gpt = vl_gpt.to(torch.bfloat16)
vl_gpt = vl_gpt.cuda()
vl_gpt = vl_gpt.eval()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Let's check that the quantized weight has **dtype=torch.int8** and that its scale tensor has **dtype=torch.bfloat16**.

In [5]:
# Inspect the qkv projection weight of the first vision-tower block.
vl_gpt.vision_model.vision_tower.blocks[0].attn.qkv.weight

QTensor(tensor([[ 16,   2,  23,  ...,  -7,  -3,   0],
        [  3,  10,   3,  ..., -59,  -8, -31],
        [  3,   4,  41,  ...,  31,  16, -13],
        ...,
        [  1,  22,  21,  ...,  -1,  19,  -6],
        [-17,  30,   5,  ...,   1, -16,  -9],
        [-24, -53, -28,  ...,  -5,  -4,  28]], device='cuda:0',
       dtype=torch.int8), scale=tensor([[0.0014],
        [0.0010],
        [0.0010],
        ...,
        [0.0006],
        [0.0005],
        [0.0006]], device='cuda:0', dtype=torch.bfloat16), public_dtype=torch.bfloat16)

*Run the following cell only if you are using Google Colab.*

In [8]:
from google.colab import files

# choose and upload local images
# NOTE(review): the uploaded file name has to match the path listed under "images"
# in the conversation cell; the recorded run uploaded a screenshot while the
# conversation references "image_13.png" -- confirm which file is intended.
uploaded_images = files.upload()

Saving Screenshot from 2024-03-18 09-43-21.png to Screenshot from 2024-03-18 09-43-21.png


#### Let's test the new quantized model! :)

In [None]:
# Path of the image to describe. If you uploaded a file with the Colab cell,
# set this to the name of that uploaded file.
image_path = "image_13.png"

conversation = [
    {
        "role": "User",
        "content": "<image_placeholder>Describe this image.",
        "images": [image_path]
    },
    {
        "role": "Assistant",
        "content": ""
    }
]

# load images and prepare for inputs
pil_images = load_pil_images(conversation)
prepare_inputs = vl_chat_processor(
    conversations=conversation,
    images=pil_images,
    force_batchify=True
).to(vl_gpt.device)

# run image encoder to get the image embeddings
# (inference only: no autograd graph is needed, so skip gradient tracking)
with torch.no_grad():
    inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

# run the model to get the response
outputs = vl_gpt.language_model.generate(
    inputs_embeds=inputs_embeds,
    attention_mask=prepare_inputs.attention_mask,
    pad_token_id=tokenizer.eos_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=512,
    do_sample=False,  # greedy decoding
    use_cache=True
)

# decode the generated token ids and print them after the formatted prompt
answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
print(prepare_inputs["sft_format"][0], answer)


In [None]:
state_dict = vl_gpt.state_dict()
# NOTE(review): this prints the whole state dict, which produces a very large output.
print(state_dict)

# can't be pushed to the hub due to serialization errors
# NOTE(review): `save_file` is not imported in the visible cells (presumably
# `safetensors.torch.save_file`) -- add the import before re-enabling these lines.
# save_file(state_dict, "vl_gpt.safetensors")
# vl_gpt.push_to_hub("jucamohedano/deepseek-vl-1.3b-chat-Quanto-8bit")

#### Check the size of the model

In [7]:
# monkey patched for quanto
def named_module_tensors(module, recurse=False):
    """
    Iterate over the named tensors of `module`: parameters first, then buffers.

    A parameter that exposes a `_data` and/or `_scale` attribute is not yielded
    itself; its inner tensors are yielded instead under the names
    `<name>._data` and `<name>._scale`. Every other parameter and every buffer
    is yielded unchanged.

    Args:
        module: object providing `named_parameters(recurse=...)` and
            `named_buffers(recurse=...)`.
        recurse: forwarded to both of those calls.

    Yields:
        `(name, tensor)` pairs.
    """
    for name, val in module.named_parameters(recurse=recurse):
        has_data = hasattr(val, "_data")
        has_scale = hasattr(val, "_scale")
        if has_data:
            yield name + "._data", val._data
        if has_scale:
            yield name + "._scale", val._scale
        if not (has_data or has_scale):
            # regular parameter: pass it through as-is
            yield name, val

    yield from module.named_buffers(recurse=recurse)

def dtype_byte_size(dtype):
    """
    Returns the size (in bytes) occupied by one parameter of type `dtype`.

    The bit width is parsed from the trailing digits of `str(dtype)`
    (e.g. `torch.bfloat16` -> 16 bits -> 2 bytes). Special cases:

    - `torch.bool` is counted as 1/8 of a byte.
    - 8-bit float dtypes (names containing `float8`) are 1 byte. Their names do
      not end with the bit width (`float8_e5m2` ends in the mantissa size,
      `float8_e4m3fn` ends in letters), so the suffix parse would be wrong.
    - A width that is not a whole number of bytes yields a fractional size
      instead of being floored to 0.

    Raises:
        ValueError: if no trailing bit width can be found in `str(dtype)`.
    """
    import re
    if dtype == torch.bool:
        return 1 / 8
    dtype_name = str(dtype)
    if "float8" in dtype_name:
        return 1
    bit_search = re.search(r"[^\d](\d+)$", dtype_name)
    if bit_search is None:
        raise ValueError(f"`dtype` is not a valid dtype: {dtype}.")
    bit_size = int(bit_search.groups()[0])
    if bit_size % 8:
        # sub-byte width: keep the fraction rather than reporting 0 bytes
        return bit_size / 8
    return bit_size // 8

def compute_module_sizes(model):
    """
    Map every dotted name prefix found in `model` to the number of bytes held
    by the tensors below it.

    The empty-string key is a prefix of every tensor name and therefore
    accumulates the size of the whole model.
    """
    from collections import defaultdict

    sizes_by_prefix = defaultdict(int)
    for tensor_name, tensor in named_module_tensors(model, recurse=True):
        n_bytes = tensor.numel() * dtype_byte_size(tensor.dtype)
        parts = tensor_name.split(".")
        # Credit the tensor to "", "a", "a.b", ... up to its full dotted name.
        prefix_count = len(parts) + 1
        for depth in range(prefix_count):
            prefix = ".".join(parts[:depth])
            sizes_by_prefix[prefix] += n_bytes

    return sizes_by_prefix

In [8]:
# Sizes in bytes, keyed by dotted name prefix.
module_sizes = compute_module_sizes(vl_gpt)

# The '' key is the prefix shared by every tensor, i.e. the total for the whole model.
print(f"The model size is {module_sizes[''] * 1e-9} GB")

The model size is 2.6328679000000004 GB


In [9]:
# Display the module tree of the loaded model.
vl_gpt

MultiModalityCausalLM(
  (vision_model): CLIPVisionTower(
    (vision_tower): VisionTransformer(
      (patch_embed): PatchEmbed(
        (proj): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
        (norm): Identity()
      )
      (pos_drop): Dropout(p=0.0, inplace=False)
      (patch_drop): Identity()
      (norm_pre): Identity()
      (blocks): Sequential(
        (0): Block(
          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
          (attn): Attention(
            (qkv): QLinear(in_features=1024, out_features=3072, bias=True)
            (q_norm): Identity()
            (k_norm): Identity()
            (attn_drop): Dropout(p=0.0, inplace=False)
            (proj): QLinear(in_features=1024, out_features=1024, bias=True)
            (proj_drop): Identity()
          )
          (ls1): Identity()
          (drop_path1): Identity()
          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
          (mlp): Mlp(
            (fc1): QLinear