In [1]:
from colpali_engine.models.late_interaction.colclip_architecture import ColClip

In [3]:
from transformers import SiglipModel
model = SiglipModel.from_pretrained("google/siglip-large-patch16-256", device_map="auto")

In [14]:
model

SiglipModel(
  (text_model): SiglipTextTransformer(
    (embeddings): SiglipTextEmbeddings(
      (token_embedding): Embedding(32000, 1024)
      (position_embedding): Embedding(64, 1024)
    )
    (encoder): SiglipEncoder(
      (layers): ModuleList(
        (0-23): 24 x SiglipEncoderLayer(
          (self_attn): SiglipSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (layer_norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
          (mlp): SiglipMLP(
            (activation_fn): PytorchGELUTanh()
            (fc1): Linear(in_features=1024, out_features=4096, bias=True)
            (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (layer_norm2): L

In [8]:
from transformers import SiglipProcessor
from PIL import Image
import requests

processor = SiglipProcessor.from_pretrained("google/siglip-large-patch16-256")

url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(text = ["Hello how are you?"], images=image, return_tensors="pt", padding=True).to(model.device)

In [9]:
inputs

{'input_ids': tensor([[14647,   364,   280,   273,     1]], device='cuda:0'), 'pixel_values': tensor([[[[ 0.4431,  0.4667,  0.4824,  ...,  0.9137,  0.9059,  0.9059],
          [ 0.4588,  0.4745,  0.4980,  ...,  0.9137,  0.9059,  0.9059],
          [ 0.4745,  0.4980,  0.5216,  ...,  0.9137,  0.9137,  0.9059],
          ...,
          [-0.4667, -0.4431, -0.3961,  ..., -0.2000, -0.2235, -0.2078],
          [-0.4588, -0.4353, -0.4275,  ..., -0.2000, -0.2000, -0.2235],
          [-0.4431, -0.4510, -0.4510,  ..., -0.1765, -0.2392, -0.2863]],

         [[ 0.5608,  0.5765,  0.6000,  ...,  0.9137,  0.9059,  0.9059],
          [ 0.5765,  0.5922,  0.6078,  ...,  0.9137,  0.9059,  0.9059],
          [ 0.5843,  0.6000,  0.6157,  ...,  0.9137,  0.9137,  0.9059],
          ...,
          [-0.4039, -0.3804, -0.3333,  ..., -0.2863, -0.3098, -0.2784],
          [-0.3961, -0.3725, -0.3647,  ..., -0.2784, -0.2784, -0.2941],
          [-0.3804, -0.3882, -0.3882,  ..., -0.2549, -0.3020, -0.3569]],

        

In [12]:
out = model(input_ids = inputs["input_ids"], pixel_values = inputs["pixel_values"])

In [15]:
out['text_model_output']['last_hidden_state'].shape

torch.Size([1, 5, 1024])

In [2]:
model = ColClip.from_pretrained("openai/clip-vit-large-patch14-336", ignore_mismatched_sizes=True)

Some weights of ColClip were not initialized from the model checkpoint at openai/clip-vit-large-patch14-336 and are newly initialized because the shapes did not match:
- visual_projection.weight: found shape torch.Size([768, 1024]) in the checkpoint and torch.Size([128, 1024]) in the model instantiated
- text_projection.weight: found shape torch.Size([768, 768]) in the checkpoint and torch.Size([128, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
model

ColClip(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 768)
      (position_embedding): Embedding(77, 768)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-0

In [16]:
from transformers import CLIPProcessor
from PIL import Image
import requests

processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14-336")

url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(text = ["Hello how are you?"], images=image, return_tensors="pt", padding=True)



In [18]:
inputs

KeyError: 'position_ids'

In [5]:
outputs = model(**inputs)

In [6]:
outputs["text"].shape

torch.Size([1, 7, 128])

In [7]:
outputs["vision"].shape

torch.Size([1, 576, 128])

In [8]:
import configue
config = configue.load("colphi3/scripts/configs/clip/train_colclip_model.yaml", sub_path="config")

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
Some weights of ColClip were not initialized from the model checkpoint at openai/clip-vit-large-patch14-336 and are newly initialized because the shapes did not match:
- visual_projection.weight: found shape torch.Size([768, 1024]) in the checkpoint and torch.Size([128, 1024]) in the model instantiated
- text_projection.weight: found shape torch.Size([768, 768]) in the checkpoint and torch.Size([128, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


pynvml not found. GPU stats will not be printed.
Configurating PEFT model
trainable params: 8,679,424 || all params: 435,476,737 || trainable%: 1.9931


In [9]:
inputs.keys()

dict_keys(['input_ids', 'attention_mask', 'pixel_values'])

In [10]:
config.model

PeftModelForFeatureExtraction(
  (base_model): LoraModel(
    (model): ColClip(
      (text_model): CLIPTextTransformer(
        (embeddings): CLIPTextEmbeddings(
          (token_embedding): Embedding(49408, 768)
          (position_embedding): Embedding(77, 768)
        )
        (encoder): CLIPEncoder(
          (layers): ModuleList(
            (0-11): 12 x CLIPEncoderLayer(
              (self_attn): CLIPSdpaAttention(
                (k_proj): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=768, out_features=32, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=32, out_features=768, bias=False)
                  )
                  (lora_embedding

In [11]:
inputs

{'input_ids': tensor([[49406,  3306,   829,   631,   592,   286, 49407]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]]), 'pixel_values': tensor([[[[ 1.7114,  1.7114,  1.7114,  ...,  1.8281,  1.8281,  1.8281],
          [ 1.7114,  1.7114,  1.7260,  ...,  1.8281,  1.8281,  1.8281],
          [ 1.7114,  1.7260,  1.7260,  ...,  1.8281,  1.8281,  1.8281],
          ...,
          [ 0.0033,  0.1055,  0.1201,  ...,  0.3683,  0.3391,  0.3683],
          [-0.0259,  0.0909,  0.0617,  ...,  0.4267,  0.3975,  0.3975],
          [-0.0696, -0.0550, -0.0113,  ...,  0.4559,  0.3975,  0.3391]],

         [[ 1.8498,  1.8498,  1.8498,  ...,  1.9698,  1.9698,  1.9698],
          [ 1.8498,  1.8498,  1.8648,  ...,  1.9698,  1.9698,  1.9698],
          [ 1.8498,  1.8648,  1.8648,  ...,  1.9698,  1.9698,  1.9698],
          ...,
          [ 0.4691,  0.5741,  0.5891,  ...,  0.5891,  0.5741,  0.6041],
          [ 0.4240,  0.5591,  0.5441,  ...,  0.6191,  0.5891,  0.6041],
          [ 0.3940,  0.3940,  0.45

In [12]:
config.model(input_ids = inputs["input_ids"], attention_mask = inputs["attention_mask"], pixel_values = inputs["pixel_values"])

{'text': tensor([[[ 9.5703e-02, -1.3477e-01,  1.8848e-01, -1.4062e-01, -1.6479e-02,
            1.4648e-01, -4.0039e-02,  8.1055e-02, -1.3867e-01, -1.3428e-03,
           -5.7129e-02,  3.9551e-02,  2.1484e-02, -5.3955e-02, -1.0205e-01,
           -1.2268e-02, -4.8828e-02,  6.2500e-02, -9.8145e-02, -1.2988e-01,
           -9.4238e-02, -7.4707e-02, -9.0820e-02, -1.9727e-01, -6.5918e-03,
            3.9795e-02,  5.5176e-02,  1.1621e-01,  2.5024e-02,  1.2695e-01,
           -3.8330e-02, -8.5449e-02, -3.0396e-02,  1.2158e-01,  4.2480e-02,
            9.4238e-02, -3.1738e-02, -7.6172e-02, -1.0840e-01, -3.6865e-02,
           -1.8677e-02, -1.8750e-01,  4.6387e-02, -3.6621e-02, -9.7656e-03,
            5.3711e-02, -1.5625e-01, -6.2500e-02,  2.8809e-02, -5.0354e-03,
            5.4443e-02,  9.0820e-02, -6.2012e-02,  8.1543e-02,  6.8665e-03,
           -1.9043e-01, -3.7842e-02,  7.5378e-03,  5.9326e-02,  1.5332e-01,
            1.1353e-02,  1.1719e-01, -9.4727e-02, -2.9419e-02, -4.2480e-02,
    