In [None]:
!pip install qwen-vl-utils

In [None]:
import torch
import os

# NOTE: these imports may require transformers 4.51.3 or newer
from transformers import AutoProcessor, AutoModel, AutoConfig, Qwen2VLForConditionalGeneration
from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VisionTransformerPretrainedModel

from glob import glob
from PIL import Image
import requests

from safetensors.torch import load_file
from qwen_vl_utils import process_vision_info

In [None]:
# Choose the torch device in order of preference: MPS, then CUDA, then CPU.
# The hasattr guard is checked first so that torch.backends.mps is only
# touched when the attribute exists.
device = torch.device(
    'mps' if hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
    else 'cuda' if torch.cuda.is_available()
    else 'cpu'
)
print('Using device: {0}'.format(device))

In [None]:
# Lower and upper bounds on the per-image pixel count. They are the default
# min_pixels / max_pixels values that format_prompt attaches to each image
# entry; bounding the pixel count is intended to reduce memory usage.
min_pixels = 28 * 28 * 256
max_pixels = 28 * 28 * 1280

In [None]:
# Load the configuration and the processor published for Qwen2-VL-2B-Instruct.
# The processor is used by get_embedding and get_response to build model inputs.
config = AutoConfig.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
# Load a stand-alone Qwen2VisionTransformerPretrainedModel from a separate
# checkpoint repository; get_embedding calls it directly on pixel values.
# NOTE(review): presumably the vision tower extracted from Qwen2-VL-2B-Instruct —
# confirm against the checkpoint's model card.
model = Qwen2VisionTransformerPretrainedModel.from_pretrained('jeddobson/qwen2-vl-2b-instruct-vision')
# Move the vision model to the selected device (last expression of the cell).
model.to(device)

In [None]:
# Load the full Qwen2-VL-2B-Instruct model; get_response calls its generate()
# method to produce text.
pretrained = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
# Move it to the selected device, where get_response also places its inputs.
pretrained.to(device)

In [None]:
def format_prompt(image, prompt="", min_pixels = min_pixels, max_pixels = max_pixels):
    """Build a single-turn chat message list pairing one image with a text prompt.

    Args:
        image: Image reference stored under the "image" key (this notebook
            passes a URL string).
        prompt: Text stored in the message's text entry; defaults to "".
        min_pixels: Value stored under the image entry's "min_pixels" key.
        max_pixels: Value stored under the image entry's "max_pixels" key.

    Returns:
        A one-element list containing a "user" message whose content is the
        image entry followed by the text entry.
    """
    image_entry = {
        "type": "image",
        "image": image,
        "max_pixels": max_pixels,
        "min_pixels": min_pixels,
    }
    text_entry = {"type": "text", "text": prompt}
    return [{"role": "user", "content": [image_entry, text_entry]}]

In [None]:
def get_embedding(image):
    """Return one pooled embedding for an image using the stand-alone vision model.

    The image is run through the processor to obtain pixel values and the
    image grid, passed through ``model``, and the output is averaged over
    its first dimension.

    Args:
        image: Image reference accepted by format_prompt / process_vision_info
            (this notebook passes a URL string).

    Returns:
        torch.Tensor: ``model``'s output averaged over dim 0
        (presumably the per-patch token axis — TODO confirm).
    """
    # Build the conversation once and reuse it for both the chat template and
    # the vision inputs; process_vision_info is used here only to obtain the
    # image/video inputs.
    messages = format_prompt(image, prompt="Describe this image")
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt")
    inputs = inputs.to(device)
    # Match the dtype the vision model's weights were loaded in rather than
    # hard-coding bfloat16: a forced bfloat16 cast discards precision when the
    # weights are float32 and can fail on backends lacking bfloat16 support.
    pixel_values = inputs["pixel_values"].to(dtype=model.dtype)
    with torch.no_grad():
        image_embeds = model(pixel_values, grid_thw=inputs["image_grid_thw"])
    return image_embeds.mean(dim=0)

In [None]:
# URL of the example image passed to get_embedding and get_response below.
img = 'https://collections.dartmouth.edu/xcdas-derivative/college-photographer/jpeg-1k/college-photographer-2006-1807375212.jpg?disposition=download'

In [None]:
# Compute the pooled vision-model embedding for the example image.
embs = get_embedding(img)

In [None]:
# Show the shape of the pooled embedding returned by get_embedding.
embs.shape

In [None]:
def get_response(image, prompt="Describe this image", max_new_tokens=256):
    """Generate text about an image with the full Qwen2-VL model.

    Args:
        image: Image reference accepted by format_prompt / process_vision_info
            (this notebook passes a URL string).
        prompt: Instruction placed in the user message next to the image.
            Defaults to "Describe this image".
        max_new_tokens: Upper bound on the number of tokens generated.
            Defaults to 256.

    Returns:
        The result of ``processor.batch_decode`` on the newly generated
        tokens (one decoded entry per input sequence).
    """
    # Build the conversation once and reuse it for both the chat template and
    # the vision inputs.
    messages = format_prompt(image, prompt=prompt)
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt")
    inputs = inputs.to(device)
    with torch.no_grad():
        outputs = pretrained.generate(**inputs, max_new_tokens=max_new_tokens)
    # Each output sequence starts with its input ids; slice them off so that
    # only the newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, outputs)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return response

In [None]:
# Generate a response for the example image with the full model.
get_response(img)

In [None]:
# Download the example image and render it inline.
# A timeout keeps the cell from hanging indefinitely on an unresponsive server,
# and raise_for_status surfaces HTTP errors instead of handing an error page to PIL.
response = requests.get(img, stream=True, timeout=30)
response.raise_for_status()
# Ask urllib3 to undo any Content-Encoding (e.g. gzip) before PIL reads the raw stream.
response.raw.decode_content = True
display(Image.open(response.raw))