In [1]:
import torch
from datasets import load_dataset
from helpers import *
import pandas as pd
from qwen_vl_utils import process_vision_info
from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText
import prefix_tuning
import importlib
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
importlib.reload(prefix_tuning)

<module 'prefix_tuning' from '/Users/floriandreyer/Library/Mobile Documents/com~apple~CloudDocs/Python Projekte/foundation_models/prefix_tuning.py'>

In [59]:
prefix_tuning_layer = torch.load("QWEN_PREFIX_TUNING_10/prefix_tuning.pt", map_location=torch.device('cpu'))

In [2]:
model_name = "Qwen/Qwen2-VL-2B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModelForImageTextToText.from_pretrained(
    model_name,
    torch_dtype=torch.float32
)

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [61]:
model_prefix_tuning_layer= prefix_tuning.loaded_prefix_tuning = prefix_tuning.PrefixTuning(model.config, prefix_length=10)
model_prefix_tuning_layer.load_state_dict(prefix_tuning_layer)

<All keys matched successfully>

In [62]:
prefix_tuning_model = prefix_tuning.PrefixTuningModel(model, tokenizer, prefix_length=10)
prefix_tuning_model.prefix_tuning = model_prefix_tuning_layer

In [81]:
prefix_tuning_model

PrefixTuningModel(
  (model): Qwen2VLForConditionalGeneration(
    (visual): Qwen2VisionTransformerPretrainedModel(
      (patch_embed): PatchEmbed(
        (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
      )
      (rotary_pos_emb): VisionRotaryEmbedding()
      (blocks): ModuleList(
        (0-31): 32 x Qwen2VLVisionBlock(
          (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
          (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
          (attn): VisionSdpaAttention(
            (qkv): Linear(in_features=1280, out_features=3840, bias=True)
            (proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (mlp): VisionMlp(
            (fc1): Linear(in_features=1280, out_features=5120, bias=True)
            (act): QuickGELUActivation()
            (fc2): Linear(in_features=5120, out_features=1280, bias=True)
          )
        )
      )
      (merger): PatchMerger(
        (ln_q

In [9]:
data = pd.DataFrame(load_dataset('derek-thomas/ScienceQA', split='test'))

In [15]:
data.dropna(inplace=True)
data

Unnamed: 0,image,question,choices,answer,hint,task,grade,subject,topic,category,skill,lecture,solution
2,<PIL.PngImagePlugin.PngImageFile image mode=RG...,What is the name of the colony shown?,"[Maryland, New Hampshire, Rhode Island, Vermont]",1,,closed choice,grade5,social science,us-history,English colonies in North America,Identify the Thirteen Colonies,,The colony is New Hampshire.\nDuring the colon...
5,<PIL.PngImagePlugin.PngImageFile image mode=RG...,Which of these organisms contains matter that ...,"[bilberry, mushroom]",1,Below is a food web from a tundra ecosystem in...,closed choice,grade5,natural science,biology,Ecosystems,Interpret food webs II,A food web is a model.\nA food web shows where...,Use the arrows to follow how matter moves thro...
9,<PIL.PngImagePlugin.PngImageFile image mode=RG...,What is the expected ratio of offspring with a...,"[0:4, 4:0, 2:2, 1:3, 3:1]",1,This passage describes the fleece type trait i...,closed choice,grade8,natural science,biology,Genes to traits,Use Punnett squares to calculate ratios of off...,Offspring phenotypes: dominant or recessive?\n...,To determine how many boxes in the Punnett squ...
10,<PIL.PngImagePlugin.PngImageFile image mode=RG...,Which property do these three objects have in ...,"[shiny, slippery, opaque]",2,Select the best answer.,closed choice,grade4,natural science,physics,Materials,Compare properties of objects,An object has different properties. A property...,"Look at each object.\nFor each object, decide ..."
13,<PIL.PngImagePlugin.PngImageFile image mode=RG...,Think about the magnetic force between the mag...,[The magnitude of the magnetic force is the sa...,2,The images below show two pairs of magnets. Th...,closed choice,grade8,natural science,physics,"Velocity, acceleration, and forces",Compare magnitudes of magnetic forces,Magnets can pull or push on each other without...,Magnet sizes affect the magnitude of the magne...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4234,<PIL.PngImagePlugin.PngImageFile image mode=RG...,Which continent is highlighted?,"[North America, South America, Antarctica, Aus...",0,,closed choice,grade3,social science,geography,Geography,Identify oceans and continents,A continent is one of the seven largest areas ...,This continent is North America.
4235,<PIL.PngImagePlugin.PngImageFile image mode=RG...,Which continent is highlighted?,"[Africa, South America, North America, Asia]",1,,closed choice,grade5,social science,geography,Oceans and continents,Identify oceans and continents,A continent is one of the major land masses on...,This continent is South America.
4237,<PIL.PngImagePlugin.PngImageFile image mode=RG...,Which of these states is farthest west?,"[Alabama, Illinois, South Carolina, Connecticut]",1,,closed choice,grade3,social science,geography,Geography,Read a map: cardinal directions,"Maps have four cardinal directions, or main di...","To find the answer, look at the compass rose. ..."
4238,<PIL.PngImagePlugin.PngImageFile image mode=RG...,Which continent is highlighted?,"[Asia, Europe, Australia, North America]",1,,closed choice,grade5,social science,geography,Oceans and continents,Identify oceans and continents,A continent is one of the major land masses on...,This continent is Europe.


In [16]:
data['input'] = data.apply(lambda row: build_prompt(row)[0], axis=1)

In [18]:
data['message'] = data.apply(lambda row: build_message(row), axis=1)

In [97]:
test_data = []
for i in range(20):
    row = data.iloc[i]
    message = processor.apply_chat_template(row['message'], tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(row['message'])
    inputs = processor(
        text=[message],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    test_data.append(inputs)
    data["inputs"] = inputs

In [85]:
generated_ids = prefix_tuning_model.generate(test_data[0], max_new_tokens=128)

In [93]:
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(test_data[0].input_ids, generated_ids)
]
output_text_prefix = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)

In [79]:
output_text # output of base model for index 0

['Answer: (1) New Hampshire']

In [77]:
generated_ids = model.generate(**test_data[0], max_new_tokens=128)

In [94]:
output_text_prefix

['']

In [3]:
help(model.forward)

Help on method forward in module transformers.models.qwen2_vl.modeling_qwen2_vl:

forward(input_ids: torch.LongTensor = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, pixel_values: Optional[torch.Tensor] = None, pixel_values_videos: Optional[torch.FloatTensor] = None, image_grid_thw: Optional[torch.LongTensor] = None, video_grid_thw: Optional[torch.LongTensor] = None, rope_deltas: Optional[torch.LongTensor] = None) -> Union[Tuple, transformers.models.qwen2_vl.modeling_qwen2_vl.Qwen2VLCausalLMOutputWithPast] method of transformers.models.qwen2_vl.modeling_qwen2_vl.Qwen2VLForConditionalGeneration instance
    The [`Qwen2VLForCond

In [88]:
inputs_embeds = model.get_input_embeddings()(test_data[0]["input_ids"])

In [90]:
generated_ids = model.generate(inputs_embeds=inputs_embeds, attention_mask=test_data[0]["attention_mask"], pixel_values=test_data[0]["pixel_values"], max_new_tokens=128)

In [120]:
output_texts = []
with torch.inference_mode():
    for i in tqdm(range(30)):
        max_length = test_data[i]["input_ids"].size(1) + 10 # +10 for later prefix
        labels = tokenizer(data.iloc[i]["solution"], padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")["input_ids"]
        outputs = prefix_tuning_model(test_data[i], labels=labels)
        output_ids = outputs.logits.argmax(-1)
        output_text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        output_texts.append(output_text)

 67%|██████▋   | 20/30 [11:17<05:38, 33.86s/it]


IndexError: list index out of range

In [121]:
output_texts

[[')1))'],
 [') matter. web the. a is.'],
 [')4)24)4, cut toeces, for. is are aly.. a of the is. is is is for hairyly fleece isF)'],
 [')'],
 [' of the) of the magnetic force is smaller in pair 1 of the., are...'],
 [')3)'],
 [') the is'],
 [')1) matter in matter web model matter matter eaten through the. a matter.'],
 [' between force is2) magnetic is is force is stronger magnetic of.'],
 [' forest year. has forestree forest forest'],
 [')))'],
 [')1)) ('],
 ["ieie) Allie can trade get oranges) All for Allie's can All All the All get. something something something or to directly.ie for the and. something tomatoes...ie. something. tomatoes. the of."],
 ['))3)'],
 ['))'],
 [''],
 [''],
 [')'],
 [')))'],
 ['']]