In [1]:
from transformers import (
    LlavaForConditionalGeneration,
    LlavaConfig
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Local checkpoint directories of the two pretrained components used in this notebook.
# CLIP ViT-L/14 (336px): its vision_model becomes the LLaVA vision tower.
clip_model_name_or_path = "/root/autodl-tmp/Models/clip-vit-large-patch14-336"
# Qwen1.5-4B-Chat: provides the language model and the tokenizer.
qwen_model_name_or_path = "/root/autodl-tmp/Models/Qwen1.5-4B-Chat"

In [3]:
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoProcessor

# Load both pretrained components directly onto the first GPU.
# The complete CLIP model is loaded here; later cells only reuse its vision_model.
clip_model = AutoModel.from_pretrained(
    clip_model_name_or_path,
    device_map="cuda:0",
)
# Causal-LM variant of the Qwen checkpoint.
llm_model = AutoModelForCausalLM.from_pretrained(qwen_model_name_or_path, device_map="cuda:0")

Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.77s/it]


In [4]:
# Load the tokenizer that ships with the Qwen checkpoint, then display how the
# "<image>" placeholder is encoded. A single-element result means the placeholder
# maps to exactly one token id.
# NOTE(review): presumably "<image>" was added to this checkpoint's tokenizer
# beforehand (the load log mentions added special tokens) — confirm.
llm_tokenizer = AutoTokenizer.from_pretrained(qwen_model_name_or_path)
llm_tokenizer.encode("<image>")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[151646]

In [5]:
from transformers import (
    LlavaForConditionalGeneration,
    LlavaConfig
)

In [6]:
# Reuse the vision-tower config of the pretrained CLIP model
vision_config = clip_model.vision_model.config

# Reuse the config of the pretrained language model (Qwen1.5-4B-Chat here, not Llama)
text_config = llm_model.config

# Combine both sub-configs into a single LLaVA configuration
configuration = LlavaConfig(vision_config, text_config)

# Build a LLaVA model from the configuration alone; no checkpoint is loaded in this
# cell, so the pretrained vision tower and language model are assigned in later cells
model = LlavaForConditionalGeneration(configuration)

In [7]:
# Display the module structure of the vision tower inside the freshly built LLaVA model
model.vision_tower.vision_model

CLIPVisionTransformer(
  (embeddings): CLIPVisionEmbeddings(
    (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
    (position_embedding): Embedding(577, 1024)
  )
  (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  (encoder): CLIPEncoder(
    (layers): ModuleList(
      (0-23): 24 x CLIPEncoderLayer(
        (self_attn): CLIPAttention(
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): CLIPMLP(
          (activation_fn): QuickGELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias

In [8]:
# Display the pretrained CLIP vision transformer, to compare its structure with
# model.vision_tower.vision_model
clip_model.vision_model

CLIPVisionTransformer(
  (embeddings): CLIPVisionEmbeddings(
    (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
    (position_embedding): Embedding(577, 1024)
  )
  (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  (encoder): CLIPEncoder(
    (layers): ModuleList(
      (0-23): 24 x CLIPEncoderLayer(
        (self_attn): CLIPAttention(
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): CLIPMLP(
          (activation_fn): QuickGELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias

In [9]:
# Replace the LLaVA vision transformer with the pretrained CLIP one
model.vision_tower.vision_model = clip_model.vision_model

In [10]:
# Replace the LLaVA language model with the pretrained Qwen causal LM
model.language_model = llm_model

In [11]:
# Show the first two columns of the pretrained LLM's token-embedding weights
llm_model.model.embed_tokens.weight.data[:, :2]

tensor([[ 4.7302e-03, -6.2866e-03],
        [-5.9814e-03,  1.1841e-02],
        [-5.8899e-03,  1.5747e-02],
        ...,
        [-3.3379e-05,  8.5831e-06],
        [-3.2425e-05,  7.1824e-06],
        [-3.1948e-05, -2.9206e-06]], device='cuda:0')

In [12]:
# Same slice, read through the LLaVA wrapper; it should show the same values as
# llm_model because model.language_model was set to that very object
model.language_model.model.embed_tokens.weight.data[:, :2]

tensor([[ 4.7302e-03, -6.2866e-03],
        [-5.9814e-03,  1.1841e-02],
        [-5.8899e-03,  1.5747e-02],
        ...,
        [-3.3379e-05,  8.5831e-06],
        [-3.2425e-05,  7.1824e-06],
        [-3.1948e-05, -2.9206e-06]], device='cuda:0')

In [13]:
# Check the pad token id stored in the LLaVA config before it is overridden
model.config.pad_token_id

In [14]:
# Copy the pad token id from the Qwen tokenizer into the LLaVA config,
# then display it to confirm the new value
model.config.pad_token_id = llm_tokenizer.pad_token_id
model.config.pad_token_id

151643

In [15]:
# Check the image-placeholder token id currently stored in the config; it has not
# been aligned with this tokenizer's "<image>" id yet
model.config.image_token_index

32000

In [16]:
# Token id that the Qwen tokenizer assigns to the "<image>" placeholder
llm_tokenizer.encode("<image>")[0]

151646

In [17]:
# Point the config's image-placeholder id at the tokenizer's "<image>" id.
# Taking element [0] relies on encode() returning only that one id for this string.
model.config.image_token_index = llm_tokenizer.encode("<image>")[0]
model.config.image_token_index

151646

In [18]:
# Write the assembled LLaVA model to disk; the test section reloads it from this directory
model.save_pretrained("show_model/model001")

[2024-07-06 21:39:35,115] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/root/autodl-tmp/conda/envs/llama_train/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




In [19]:
# Save the Qwen tokenizer files into the same directory as the model
llm_tokenizer.save_pretrained("show_model/model001")

('show_model/model001/tokenizer_config.json',
 'show_model/model001/special_tokens_map.json',
 'show_model/model001/vocab.json',
 'show_model/model001/merges.txt',
 'show_model/model001/added_tokens.json',
 'show_model/model001/tokenizer.json')

In [20]:
# Load the processor that ships with the CLIP checkpoint (image processor + CLIP tokenizer).
autoprocessor = AutoProcessor.from_pretrained(clip_model_name_or_path)

# The test section loads LlavaProcessor from "show_model/model001", which needs the
# image-processor config (preprocessor_config.json) next to the model and tokenizer.
# Save ONLY the image processor there: saving the whole CLIP processor into model001
# would also write CLIP tokenizer files over the Qwen tokenizer saved earlier.
autoprocessor.image_processor.save_pretrained("show_model/model001")

# Keep the full CLIP processor dump in its own directory, as before.
autoprocessor.save_pretrained("show_model/model002")

[]

## 开始测试效果 (Testing the merged model)

In [1]:
from transformers import LlavaProcessor, LlavaForConditionalGeneration
import torch


# Directory holding the merged model, tokenizer and image-processor config.
# (Alternative checkpoint used while experimenting: "test_model_copy/model001")
model_name_or_path = "show_model/model001"

# Processor and model are both read from the same directory.
llava_processor = LlavaProcessor.from_pretrained(model_name_or_path)

# Load the weights in bfloat16 on the first GPU.
model = LlavaForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    device_map="cuda:0",
)

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.06s/it]


In [7]:
from PIL import Image

# Sample picture the question refers to.
image_path = "000000039769.jpg"
image = Image.open(image_path)

# User turn: the "<image>" placeholder followed by the question.
prompt_text = "<image>\nWhat are these?"
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt_text},
]

# Render the conversation with the tokenizer's chat template as plain text
# (not token ids), with the generation prompt appended.
prompt = llava_processor.tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# Tokenize the text and preprocess the image into PyTorch tensors.
inputs = llava_processor(text=prompt, images=image, return_tensors="pt")

# Generation is currently disabled. To try it, move the tensors to the model's
# device and decode the generated ids:
# for tk in inputs.keys():
#     inputs[tk] = inputs[tk].to(model.device)
# generate_ids = model.generate(**inputs, max_new_tokens=20)
# gen_text = llava_processor.batch_decode(
#     generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False
# )[0]

# print(gen_text)

In [10]:
# inputs
llava_processor.decode([151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198,  74785,    279,   2168,   3529,
            285,    974,    624, 151646, 151645,    198, 151644,  77091,    198], skip_special_tokens=False)

'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nDescribe the image concisely.\n<image><|im_end|>\n<|im_start|>assistant\n'

In [5]:
# Display the processor output (input_ids, attention_mask and pixel_values)
inputs

{'input_ids': tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198, 151646,    198,   3838,    525,
           1493,     30, 151645,    198, 151644,  77091,    198]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]], device='cuda:0'), 'pixel_values': tensor([[[[ 0.5435,  0.6457,  0.5581,  ...,  0.0909,  0.0033, -0.0696],
          [ 0.5435,  0.6165,  0.5435,  ...,  0.1201,  0.0179,  0.0617],
          [ 0.5581,  0.5581,  0.6603,  ...,  0.0909,  0.0763,  0.0617],
          ...,
          [ 1.8281,  1.8865,  1.8281,  ...,  1.4048,  1.4486,  1.5654],
          [ 1.8573,  1.9011,  1.8719,  ...,  1.4778,  1.4048,  1.4924],
          [ 1.8719,  1.9011,  1.9011,  ...,  1.4048,  1.2150,  1.4778]],

         [[-1.3619, -1.2718, -1.3769,  ..., -1.4219, -1.4820, -1.5120],
          [-1.3319, -1.2418, -1.3469,  ..., -1.4219, -1.4820, -1

In [6]:
# Display the reloaded model's configuration to verify that the saved pad and
# image token ids were persisted
model.config

LlavaConfig {
  "_name_or_path": "show_model/model001",
  "architectures": [
    "LlavaForConditionalGeneration"
  ],
  "ignore_index": -100,
  "image_token_index": 151646,
  "model_type": "llava",
  "pad_token_id": 151643,
  "projector_hidden_act": "gelu",
  "text_config": {
    "_name_or_path": "/root/autodl-tmp/Models/Qwen1.5-4B-Chat",
    "architectures": [
      "Qwen2ForCausalLM"
    ],
    "bos_token_id": 151643,
    "eos_token_id": 151645,
    "hidden_size": 2560,
    "intermediate_size": 6912,
    "max_position_embeddings": 32768,
    "max_window_layers": 21,
    "model_type": "qwen2",
    "num_attention_heads": 20,
    "num_hidden_layers": 40,
    "num_key_value_heads": 20,
    "rope_theta": 5000000.0,
    "sliding_window": 32768,
    "torch_dtype": "bfloat16",
    "use_sliding_window": false,
    "vocab_size": 151936
  },
  "torch_dtype": "float32",
  "transformers_version": "4.42.3",
  "vision_config": {
    "dropout": 0.0,
    "hidden_size": 1024,
    "image_size": 336,
  