## Imports & Setup

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import re
import ast
from datasets import load_dataset
from tqdm import tqdm
import helpers
import prefix_tuning
import importlib
from PIL import Image
from qwen_vl_utils import process_vision_info
importlib.reload(helpers)
importlib.reload(prefix_tuning)

<module 'prefix_tuning' from '/Users/floriandreyer/Library/Mobile Documents/com~apple~CloudDocs/Python Projekte/foundation_models/prefix_tuning.py'>

In [2]:
#device = torch.device('mps' if (torch.backends.mps.is_available() and torch.backends.mps.is_built()) else 'cpu')
device = torch.device('cpu')

# CONSTANTS
NUM_EPOCHS_FT = 100
NUM_EPOCHS_KD = 100
BATCH_SIZE = 32

In [3]:
device

device(type='cpu')

## Get Data and preprocess it

In [25]:
### train data
# data with label and image data
df_train_label = pd.DataFrame(load_dataset('derek-thomas/ScienceQA', split='train'))

df_train_label = df_train_label[df_train_label['solution'] != ''].reset_index()
df_train_label['input'] = df_train_label.apply(lambda row: helpers.build_prompt(row)[0], axis=1)
df_train_label['message'] = df_train_label.apply(lambda row: helpers.build_message(row), axis=1)
"""transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor()
    ])
print("test")
df_train_label['image'] = df_train_label.apply(lambda row: transform(row['image']) if row['image'] else transform(Image.new("RGB", (224, 224), (0, 0, 0))), axis=1)
print("test")"""
df_train_label[['index', 'message', 'answer', 'explanation', 'image']] = df_train_label[['index', 'message', 'answer', 'solution', 'image']]

# # data from Gemini for KD
df_train_gemini = pd.read_csv('gemini_1_5_flash_output_train.csv', sep="\t")[['index', 'input', 'answer', 'explanation']]
df_train_gemini = pd.merge(df_train_gemini, df_train_label[['index', 'image']], on='index')
df_train_gemini['message'] = df_train_gemini.apply(lambda row: helpers.build_message(row), axis=1)

In [26]:
### val data
df_val = pd.DataFrame(load_dataset('derek-thomas/ScienceQA', split='validation'))
df_val['input'] = df_val.apply(lambda row: helpers.build_prompt(row)[0], axis=1)
df_val['message'] = df_val.apply(lambda row: helpers.build_message(row), axis=1)
"""transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor()
    ])
df_val['image'] = df_val.apply(lambda row: transform(row['image']) if row['image'] else transform(Image.new("RGB", (224, 224), (0, 0, 0))), axis=1)"""

'transform = transforms.Compose([\n        transforms.Resize((224, 224)),\n        transforms.ToTensor()\n    ])\ndf_val[\'image\'] = df_val.apply(lambda row: transform(row[\'image\']) if row[\'image\'] else transform(Image.new("RGB", (224, 224), (0, 0, 0))), axis=1)'

In [6]:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

# default: Load the model on the available device(s)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct", torch_dtype="float32", device_map={"": "cpu"}
)

# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#     "Qwen/Qwen2-VL-2B-Instruct",
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# default processer
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

In [9]:
def tokenize_dataset(df, tokenizer, input_column="input"):

    tokenized_data = []
    for _, row in df.iterrows():
        sample = tokenizer.encode(row[input_column], return_tensors="pt").squeeze(0)
        tokenized_data.append(sample)
    df["input_ids"] = tokenized_data
    return df


In [9]:
# The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": df_train_label.iloc[10]['image'],
            },
            {"type": "text", "text": df_train_label.iloc[10]['question'] + " " + ' '.join(df_train_label.iloc[10]['choices'])},
        ],
    }
]
# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to(device)

"""# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)"""


'# Inference: Generation of the output\ngenerated_ids = model.generate(**inputs, max_new_tokens=128)\ngenerated_ids_trimmed = [\n    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n]\noutput_text = processor.batch_decode(\n    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n)\nprint(output_text)'

In [11]:
inputs

{'input_ids': tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198, 151652, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151653,   3838,   1558,  78684,   1492,
            264,   6008,    653,     30,   1281,  19056,   3063,  11243,   3063,
            501,  10901, 151645,    198, 151644,  77091,    198]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'pixel_values': tensor([[-0.2156, -0.0842, -0.0842,  ...,  2.1459,  2.1459,  2.1459],
        [-0.0842, -0.0842, -0.0842,  ...,  2.1459,  2.1459,  1.7477],
        [-0.0842,  1.9303,  1.9303,  ...,  2.

In [None]:
df_train_label[11]['image']

In [None]:
df_train_gemini.head(1)

In [33]:
"""model_name = "Qwen2-VL-2B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)


# Datensatz tokenisieren
tokenized_data = tokenize_dataset(df_train_gemini, tokenizer, input_column="input", label_column="answer")

# Dataset erstellen
dataset = SoftPromptingDataset(tokenized_data)

# Zugriff auf ein Beispiel
print(dataset[0])"""

'model_name = "Qwen2-VL-2B-Instruct"\n\nmodel = AutoModelForCausalLM.from_pretrained(\n    model_name,\n    torch_dtype="auto",\n    device_map="auto"\n)\ntokenizer = AutoTokenizer.from_pretrained(model_name)\n\n\n# Datensatz tokenisieren\ntokenized_data = tokenize_dataset(df_train_gemini, tokenizer, input_column="input", label_column="answer")\n\n# Dataset erstellen\ndataset = SoftPromptingDataset(tokenized_data)\n\n# Zugriff auf ein Beispiel\nprint(dataset[0])'

## Functions for model training

In [16]:
def train(model, tokenizer, processor, optimizer, dataloader_train, dataloader_val):
    train_errors = []
    val_errors = []
    for epoch in tqdm(range(NUM_EPOCHS_FT)):
        error = 0
        num_samples = 0
        for X, y in dataloader_train:
            labels = tokenizer(y, return_tensors="pt")
            text = processor.apply_chat_template(
                X, tokenize=False, add_generation_prompt=True
            )
            image_inputs = X[1]
            inputs = processor(
                text=[text],
                images=image_inputs,
                padding=True,
                return_tensors="pt",
            )
            outputs = model(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                labels=labels["input_ids"],
            )
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            error += loss.item() * X.size(0)
            num_samples += X.size(0)
        error /= num_samples
        print(f'Error after epoch {epoch}: {error}')
        train_errors.append((epoch, error))
        if epoch % 10:
            val_error = 0
            num_samples = 0
            for X, y in dataloader_val:
                labels = tokenizer(y, return_tensors="pt")
                text = processor.apply_chat_template(
                    X['message'], tokenize=False, add_generation_prompt=True
                )
                image_inputs = X[1]
                inputs = processor(
                    text=[text],
                    images=image_inputs,
                    padding=True,
                    return_tensors="pt",
                )
                outputs = model(
                    input_ids=inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    labels=labels["input_ids"],
                )
                loss = outputs.loss
                val_error += loss.item() * X.size(0)
                num_samples += X.size(0)
            val_error /= num_samples
            print(f'Validation error after epoch {epoch}: {val_error}')
            val_errors.append((epoch, val_error))
    return train_errors_ft, val_error

In [11]:
def visualize_error(train_errors, val_errors):
    plt.plot(zip(*train_errors), label="Train Error", marker="o", linestyle="-")
    plt.plot(zip(*val_errors), label="Train Error", marker="o", linestyle="-")
    plt.title("Train and Validation Error over Epochs")
    plt.xlabel("Epochs")
    plt.ylabel("Error")
    plt.show()

In [12]:
class PrefixDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        """image = row['image']
        transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor()
        ])
        image_tensor = transform(image) if image else transform(Image.new("RGB", (224, 224), (0, 0, 0)))"""
        return {'message': row['message'], 'image': row['image']}, row['answer'].astype(str)

In [13]:
def prefix_collate(batch):
    X, y = zip(*batch)
    message = X[0]
    image = X[1]
    return [message, image], y

## PrefixTuning using labels

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor

model_name = "Qwen/Qwen2-VL-2B-Instruct"

model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype="float32",
    device_map={"": "cpu"}
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name)

model_prefix = prefix_tuning.PrefixTuningModel(model, tokenizer)
optimizer = torch.optim.Adam(model_prefix.prefix_tuning.parameters(), lr=5e-5)
# DataLoader for train data
dataset_label_train = PrefixDataset(df_train_label)
dataloader_label_train=DataLoader(dataset_label_train, collate_fn=prefix_collate, batch_size=32, shuffle=True)
# DataLoader for val data
dataset_label_val = PrefixDataset(df_val)
dataloader_label_val=DataLoader(dataset_label_val, collate_fn=prefix_collate, batch_size=32, shuffle=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [21]:
train_errors_ft, val_errors_ft = train(model_prefix, tokenizer, processor, optimizer, dataloader_label_train, dataloader_label_val)

  0%|          | 0/100 [00:00<?, ?it/s]


ValueError: Could not make batched images from {'message': [{'role': 'user', 'content': [{'type': 'image', 'image': tensor([[[1.0000, 1.0000, 0.9882,  ..., 0.9882, 1.0000, 1.0000],
         [1.0000, 0.9843, 0.9137,  ..., 0.9137, 0.9843, 1.0000],
         [0.9725, 0.8941, 0.8980,  ..., 0.8980, 0.8941, 0.9765],
         ...,
         [0.9765, 0.8941, 0.8980,  ..., 0.8980, 0.8941, 0.9765],
         [1.0000, 0.9804, 0.9137,  ..., 0.9137, 0.9843, 1.0000],
         [1.0000, 1.0000, 0.9882,  ..., 0.9882, 1.0000, 1.0000]],

        [[1.0000, 1.0000, 0.9882,  ..., 0.9882, 1.0000, 1.0000],
         [1.0000, 0.9843, 0.9137,  ..., 0.9137, 0.9843, 1.0000],
         [0.9725, 0.8941, 0.8980,  ..., 0.8980, 0.8941, 0.9765],
         ...,
         [0.9765, 0.8941, 0.8980,  ..., 0.8980, 0.8941, 0.9765],
         [1.0000, 0.9804, 0.9137,  ..., 0.9137, 0.9843, 1.0000],
         [1.0000, 1.0000, 0.9882,  ..., 0.9882, 1.0000, 1.0000]],

        [[1.0000, 1.0000, 0.9882,  ..., 0.9882, 1.0000, 1.0000],
         [1.0000, 0.9843, 0.9137,  ..., 0.9137, 0.9843, 1.0000],
         [0.9725, 0.8941, 0.8980,  ..., 0.8980, 0.8941, 0.9765],
         ...,
         [0.9765, 0.8941, 0.8980,  ..., 0.8980, 0.8941, 0.9765],
         [1.0000, 0.9804, 0.9137,  ..., 0.9137, 0.9843, 1.0000],
         [1.0000, 1.0000, 0.9882,  ..., 0.9882, 1.0000, 1.0000]]])}, {'type': 'text', 'text': 'Question: Which property matches this object?\n Task: closed choice\n Choices: (0) sour (1) salty\n Hint: Select the better answer.'}]}], 'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=172x208 at 0x3246CB0A0>}

In [None]:
visualize_error(train_errors_ft, val_errors_ft)

## Knowledge Distillation

In [None]:
model_knowledge_distillation = soft_prompting.MultimodalSoftPrompting.from_pretrained(model)
# DataLoader for train data
dataset_gemini_train = SoftPromptingDataset(df_train_gemini, model_fine_tuned)
dataloader_gemini_train=DataLoader(dataset_gemini_train, batch_size=32, shuffle=True)
# DataLoader for val data
dataset_gemini_val = SoftPromptingDataset(df_val, model_fine_tuned)
dataloader_gemini_val=DataLoader(dataset_gemini_val, batch_size=32, shuffle=True)

In [None]:
train_errors_kd, val_errors_kd = train(dataset_gemini_train, dataloader_gemini_train)

In [None]:
visualize_error(train_errors_kd, val_errors_kd)