## Imports & Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from tensorflow.python.keras.backend import dtype
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import re, os, pickle
import ast
from datasets import load_dataset
from tqdm import tqdm
#import helpers
#import prefix_tuning
import importlib
from PIL import Image
from qwen_vl_utils import process_vision_info
#importlib.reload(helpers)
#importlib.reload(prefix_tuning)

2025-01-05 22:11:41.062185: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-05 22:11:41.076287: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736111501.093236   17571 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736111501.098569   17571 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-05 22:11:41.116764: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
import torch
torch.cuda.empty_cache()

In [3]:
from PIL import Image


def get_question_text(problem):
    question = problem['question']
    return question


def get_choice_text(probelm, options):
    choices = probelm['choices']
    choice_list = []
    for i, c in enumerate(choices):
        choice_list.append("({}) {}".format(options[i], c))
    choice_txt = " ".join(choice_list)
    return choice_txt


def get_context_text(problem, use_caption):
    txt_context = problem['hint']
    img_context = problem['caption'] if use_caption else ""
    context = " ".join([txt_context, img_context]).strip()
    if context == "":
        context = "N/A"
    return context


def build_prompt(question_data, use_lecture=False, use_solution=False):
    question = get_question_text(question_data)
    choices = get_choice_text(question_data, [choice_num for choice_num in range(5)])
    hint = get_context_text(question_data, False)
    #image = question_data['image']
    task = question_data['task']
    input_prompt = f'Question: {question}\n Task: {task}\n Choices: {choices}\n Hint: {hint}'
    if use_lecture:
        lecture = f'\n Lecture: {question_data["lecture"]}'
        input_prompt += lecture
    if use_solution and question_data["solution"]:
        solution = f'\n Solution: {question_data["solution"]}'
        input_prompt += solution
    prompt = [input_prompt]
    #if image:
    #    prompt.append(image)
    return prompt

def build_message(row):
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": row["image"],
                },
                {"type": "text", "text": row['input']},
            ],
        }
    ]
    return messages

In [4]:
import torch
from torch import nn

class PrefixTuning(nn.Module):
    def __init__(self, config, prefix_length=10):
        super().__init__()
        self.prefix_length = prefix_length
        self.hidden_size = config.hidden_size
        self.prefix_embeddings = nn.Parameter(torch.randn(prefix_length, config.hidden_size))

    def forward(self, inputs_embeds):
        batch_size = inputs_embeds.size(0)
        prefix = self.prefix_embeddings.unsqueeze(0).expand(batch_size, -1, -1)
        return torch.cat([prefix.to(device, dtype=torch.bfloat16), inputs_embeds.to(device, dtype=torch.bfloat16)], dim=1)


class PrefixTuningModel(nn.Module):
    def __init__(self, model, tokenizer, prefix_length=10):
        super().__init__()
        self.model = model
        self.freeze_main_model()
        self.tokenizer = tokenizer
        self.prefix_tuning = PrefixTuning(self.model.config, prefix_length)

    def freeze_main_model(self):
        for param in self.model.parameters():
            param.requires_grad = False

    def forward(self, inputs, labels):
        inputs_embeds = self.model.get_input_embeddings()(inputs["input_ids"])
        # Add Prefix
        inputs_embeds = self.prefix_tuning(inputs_embeds)

        # Modify attention mask for prefix
        prefix_mask = torch.ones((inputs["input_ids"].size(0), self.prefix_tuning.prefix_length), device=device)
        attention_mask = torch.cat([prefix_mask, inputs["attention_mask"]], dim=1)

        return self.model(inputs_embeds=inputs_embeds, attention_mask=attention_mask, pixel_values=inputs["pixel_values"], labels=labels)

In [5]:
# setting device on GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device("mps")
print('Using device:', device)
print()


#device = torch.device('cuda:0,1' if torch.cuda.is_available() else 'cpu')
#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

# CONSTANTS
NUM_EPOCHS_FT = 20
NUM_EPOCHS_KD = 20
BATCH_SIZE = 4

Using device: cuda

NVIDIA RTX A6000
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [6]:
device

device(type='cuda')

In [7]:
!nvidia-smi

Sun Jan  5 22:11:43 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX A6000               Off |   00000000:15:00.0 Off |                  Off |
| 30%   39C    P8             21W /  300W |      18MiB /  49140MiB |      1%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA RTX A6000               Off |   00

In [8]:
# !kill -9 15374

In [9]:
# !nvidia-smi | grep 'python' | awk '{ print $3 }' | xargs -n1 kill -9

## Get Data and preprocess it

In [10]:
import base64
def encode_image(image_file):
    """Getting the base64 string"""
    return base64.b64encode(image_file).decode("utf-8")

In [11]:
### train data
# data with label and image data
df_train_label = pd.DataFrame(load_dataset('derek-thomas/ScienceQA', split='train'))

from PIL import Image
from torchvision import transforms

def encode_image(image_file):
    """Getting the base64 string"""
    return base64.b64encode(image_file.read()).decode("utf-8")

df_train_label = df_train_label[df_train_label['solution'] != '']
# df_train_label['image'] = df_train_label.apply(lambda row: encode_image(row['image']) if row['image'] else encode_image(Image.new("RGB", (224, 224), (0, 0, 0))), axis=1)
df_train_label['input'] = df_train_label.apply(lambda row: build_prompt(row)[0], axis=1)
df_train_label['message'] = df_train_label.apply(lambda row: build_message(row), axis=1)
df_train_label = df_train_label.reset_index()

In [12]:
### val data
df_val = pd.DataFrame(load_dataset('derek-thomas/ScienceQA', split='validation'))
# df_val['image'] = df_val.apply(lambda row: encode_image(row['image']) if row['image'] else encode_image(Image.new("RGB", (224, 224), (0, 0, 0))), axis=1)
df_val['input'] = df_val.apply(lambda row: build_prompt(row)[0], axis=1)
df_val['message'] = df_val.apply(lambda row: build_message(row), axis=1)
df_val = df_val.reset_index()

## Functions for model training

In [13]:
def preprocess_input_qwen(tokenizer, processor, prompts, texts, images, y, device):
    messages = [processor.apply_chat_template(
                text, tokenize=False, add_generation_prompt=False
    ) for text in texts]
    image_inputs, video_inputs = process_vision_info(texts)
    inputs = processor(
        text=messages,
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    max_length = inputs["input_ids"].size(1)
    labels = tokenizer(y, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")["input_ids"]
    return inputs.to(device), labels.to(device)

In [14]:
class PrefixDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        return row['input'], row['message'], row['image'], row['solution']

In [15]:
def prefix_collate(batch):
    input, message, image, y = zip(*batch)
    return input, message, image, y

In [16]:
# def encode_image(image_path):
#     """Getting the base64 string"""
#     with open(image_path, "rb") as image_file:
#         return base64.b64encode(image_file.read()).decode("utf-8")

In [17]:
### TODO: givcen path to Katja's code!!!!!!!

## PrefixTuning using labels

In [18]:
# DataLoader for train data
dataset_label_train = PrefixDataset(df_train_label)
dataloader_label_train=DataLoader(dataset_label_train, collate_fn=prefix_collate, batch_size=BATCH_SIZE, shuffle=True)
# DataLoader for val data
dataset_label_val = PrefixDataset(df_val)
dataloader_label_val=DataLoader(dataset_label_val, collate_fn=prefix_collate, batch_size=BATCH_SIZE, shuffle=True)

### Qwen

In [19]:
from transformers import AutoModelForImageTextToText, AutoTokenizer, AutoProcessor, Qwen2VLForConditionalGeneration
import gc, os

model_name = "Qwen/Qwen2-VL-2B-Instruct"

model = AutoModelForImageTextToText.from_pretrained(
    model_name,
    torch_dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name)

model_prefix = PrefixTuningModel(model, tokenizer).to(device)
optimizer = torch.optim.AdamW(model_prefix.prefix_tuning.parameters(), lr=5e-5)

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [20]:
from transformers import AutoModelForImageTextToText, AutoTokenizer, AutoProcessor
from peft import LoraConfig, get_peft_model
from datasets import load_dataset

# Load model, tokenizer, and processor
model_name = "Qwen/Qwen2-VL-2B-Instruct"
model = AutoModelForImageTextToText.from_pretrained(
    model_name,
    torch_dtype="bfloat16",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name)

# Configure LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none"
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [21]:
from helpers import *
from qwen_vl_utils import process_vision_info

In [22]:
from datasets import load_dataset,DatasetDict
from transformers import DataCollatorForSeq2Seq
from datasets import Dataset, DatasetDict, load_dataset

# processed_dataset = DatasetDict({"train": Dataset.from_pandas(df_train_label),
#                                  "val": Dataset.from_pandas(df_val)})
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True, max_length=512)

In [23]:
import pandas as pd
import os

def add_image_paths(df, temp_images):
    def construct_path(idx):
        image_path = os.path.join(temp_images, f"image_{idx}.png")
        return image_path if os.path.isfile(image_path) else None
    df['image'] = df.index.to_series().apply(construct_path)
    return df

In [24]:
# from datasets import Dataset, DatasetDict, load_dataset
# from transformers import DataCollatorForSeq2Seq
# import pandas as pd

# # # Load train dataset
df_train_label = pd.DataFrame(load_dataset('derek-thomas/ScienceQA', split='train'))
temp_images_path = "./temp_images"
df_train_label = add_image_paths(df_train_label, temp_images_path)
# df_train_label['image'] = df_train_label.apply(lambda row: row['image'] if row['image'] else Image.new("RGB", (224, 224), (0, 0, 0)), axis=1)
df_train_label['input'] = df_train_label.apply(lambda row: build_prompt(row)[0], axis=1)
df_train_label['message'] = df_train_label.apply(lambda row: build_message(row), axis=1)
df_train_label = df_train_label[df_train_label['solution'] != ''].reset_index(drop=True)


df_val = pd.DataFrame(load_dataset('derek-thomas/ScienceQA', split='validation'))
temp_images_path = "./temp_images_val/"
df_val = add_image_paths(df_val, temp_images_path)
# df_val['image'] = df_val.apply(lambda row: row['image'] if row['image'] else Image.new("RGB", (224, 224), (0, 0, 0)), axis=1)
df_val['input'] = df_val.apply(lambda row: build_prompt(row)[0], axis=1)
df_val['message'] = df_val.apply(lambda row: build_message(row), axis=1)
df_val = df_val[df_val['solution'] != ''].reset_index(drop=True)

# # Convert DataFrames to Hugging Face Dataset objects
# hf_train_dataset = Dataset.from_pandas(df_train_label)
# hf_val_dataset = Dataset.from_pandas(df_val)

# import os
# from PIL import Image
from datasets import Dataset, DatasetDict, load_dataset
# import pandas as pd

# Create a directory for temporary images
# os.makedirs("temp_images", exist_ok=True)
# os.makedirs("temp_images_val", exist_ok=True)
# import os.path

# Save images as temporary files
# def save_image(image, idx, temp_dir):
#     if isinstance(image, Image.Image):  # Check if it's a PIL Image
        
#         file_path = f"{temp_dir}/image_{idx}.png"
#         if os.path.isfile(file_path):
#             pass
#         else:
#             image.save(file_path)
#         return file_path
#     return image  # If already a valid path, return as is

# # Apply the function to save images in the train dataset
# df_train_label['image_path'] = df_train_label['image'].apply(
#     lambda img: save_image(img, df_train_label.index[df_train_label['image'] == img][0], "temp_images")
# )

# Apply the function to save images in the validation dataset
# df_val['image_path'] = df_val['image'].apply(
#     lambda img: save_image(img, df_val.index[df_val['image'] == img][0], "temp_images_val")
# )

# # Drop the original image column
# df_train_label = df_train_label.drop(columns=["image"])
# df_val = df_val.drop(columns=["image"])

# # Convert to Hugging Face Dataset
hf_train_dataset = Dataset.from_pandas(df_train_label)
hf_val_dataset = Dataset.from_pandas(df_val)

# # Create DatasetDict
dataset = DatasetDict({"train": hf_train_dataset, "val": hf_val_dataset})

# # Debugging: Check dataset structure
# print(processed_dataset)

# # # Create DatasetDict
# processed_dataset = DatasetDict({"train": hf_train_dataset, "val": hf_val_dataset})

# # Data collator for seq2seq
# data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True, max_length=512)

# # Debugging: Check dataset structure
# print(processed_dataset)

In [25]:
# processed_dataset = hf_train_dataset.map(
#     preprocess_function,
#     batched=True,  # Process multiple examples at once
#     remove_columns=hf_train_dataset.column_names  # Remove original columns
# )

In [26]:
def preprocess_function(examples):
    # Tokenize input and target text
    model_inputs = tokenizer(examples["question"], padding="max_length", truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["solution"], padding="max_length", truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing
# dataset = DatasetDict({"train": hf_train_dataset, "val": hf_val_dataset})
# processed_dataset = dataset.map(preprocess_function, batched=True)

In [27]:
processed_dataset_val = hf_val_dataset.map(
    preprocess_function,
    batched=True,  # Process multiple examples at once
    remove_columns=hf_val_dataset.column_names  # Remove original columns
)

processed_dataset_train = hf_train_dataset.map(
    preprocess_function,
    batched=True,  # Process multiple examples at once
    remove_columns=hf_val_dataset.column_names  # Remove original columns
)

Map:   0%|          | 0/3848 [00:00<?, ? examples/s]



Map:   0%|          | 0/11515 [00:00<?, ? examples/s]

In [28]:
# processed_dataset['train']

In [29]:
# from datasets import Dataset, Image

# Dataset.from_dict({"image": df_train_label["image"].tolist()}).cast_column("image", Image())

In [30]:
from transformers import Seq2SeqTrainingArguments


SAVE_DIR = "./qwen_lora_model"

training_args = Seq2SeqTrainingArguments(
    output_dir=SAVE_DIR,                # Directory to save the model
    per_device_train_batch_size=8,        # Batch size for training
    per_device_eval_batch_size=8,         # Batch size for evaluation
    predict_with_generate=True,           # Generate sequences for evaluation
    evaluation_strategy="steps",          # Evaluation frequency
    logging_steps=500,                    # Logging frequency
    save_steps=500,                       # Checkpoint saving frequency
    save_total_limit=2,                   # Number of checkpoints to keep
    num_train_epochs=3,                   # Number of epochs
    fp16=True,                            # Use mixed precision training
    learning_rate=5e-5,                   # Learning rate
    lr_scheduler_type="linear",           # Learning rate scheduler
    warmup_steps=500,                     # Warmup steps
    report_to="none",                     # Disable logging services
    remove_unused_columns=False
)



from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset_train,    # Pass training dataset
    eval_dataset=processed_dataset_val,     # Optionally pass validation dataset
    tokenizer=tokenizer,
    data_collator=data_collator
    
)
# Dataset.from_dict({"image": df_train_label["image"].tolist()}).cast_column("image", Image())

trainer.train()


model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

  trainer = Seq2SeqTrainer(


OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU 0 has a total capacity of 47.43 GiB of which 5.49 GiB is free. Including non-PyTorch memory, this process has 41.92 GiB memory in use. Of the allocated memory 41.16 GiB is allocated by PyTorch, and 472.73 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [31]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Sun Jan  5 22:17:46 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX A6000               Off |   00000000:15:00.0 Off |                  Off |
| 30%   38C    P8             21W /  300W |   42949MiB /  49140MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA RTX A6000               Off |   00

In [85]:
import gc
import time


def clear_memory():
    # Delete variables if they exist in the current global scope
    if "inputs" in globals():
        del globals()["inputs"]
    if "model" in globals():
        del globals()["model"]
    if "processor" in globals():
        del globals()["processor"]
    if "trainer" in globals():
        del globals()["trainer"]
    if "peft_model" in globals():
        del globals()["peft_model"]
    if "bnb_config" in globals():
        del globals()["bnb_config"]
    time.sleep(2)

    # Garbage collection and clearing CUDA memory
    gc.collect()
    time.sleep(2)
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    time.sleep(2)
    gc.collect()
    time.sleep(2)

    print(f"GPU allocated memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print(f"GPU reserved memory: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")


clear_memory()

GPU allocated memory: 27.45 GB
GPU reserved memory: 27.95 GB


In [20]:
help(model.forward)

Help on method forward in module peft.peft_model:

forward(input_ids=None, attention_mask=None, inputs_embeds=None, decoder_input_ids=None, decoder_attention_mask=None, decoder_inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, task_ids=None, **kwargs) method of peft.peft_model.PeftModelForSeq2SeqLM instance
    Forward pass of the model.



In [22]:
processed_dataset["train"][0]

{'image': './temp_images/image_0.png',
 'question': 'Which of these states is farthest north?',
 'choices': ['West Virginia', 'Louisiana', 'Arizona', 'Oklahoma'],
 'answer': 0,
 'hint': '',
 'task': 'closed choice',
 'grade': 'grade2',
 'subject': 'social science',
 'topic': 'geography',
 'category': 'Geography',
 'skill': 'Read a map: cardinal directions',
 'lecture': 'Maps have four cardinal directions, or main directions. Those directions are north, south, east, and west.\nA compass rose is a set of arrows that point to the cardinal directions. A compass rose usually shows only the first letter of each cardinal direction.\nThe north arrow points to the North Pole. On most maps, north is at the top of the map.',
 'solution': 'To find the answer, look at the compass rose. Look at which way the north arrow is pointing. West Virginia is farthest north.',
 'input': 'Question: Which of these states is farthest north?\n Task: closed choice\n Choices: (0) West Virginia (1) Louisiana (2) Ari

### Paligemma

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 

In [None]:
!huggingface-cli whoami

In [None]:
!huggingface_hub

In [23]:
from transformers import (
    PaliGemmaProcessor,
    PaliGemmaForConditionalGeneration,
)
from transformers.image_utils import load_image
import torch

SAVE_DIR = "PALIGEMMA"

model_id = "google/paligemma2-3b-pt-224"

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg"
image = load_image(url)

model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto", device_map="auto").eval()
processor = PaliGemmaProcessor.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_name)

model_prefix = PrefixTuningModel(model, tokenizer).to(device)
optimizer = torch.optim.AdamW(model_prefix.prefix_tuning.parameters(), lr=5e-5)

OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/google/paligemma2-3b-pt-224.
403 Client Error. (Request ID: Root=1-67786c54-641d66ae698d6f7251ff790b;afe73b60-caae-43a4-b10c-391f97057cb5)

Cannot access gated repo for url https://huggingface.co/google/paligemma2-3b-pt-224/resolve/main/config.json.
Access to model google/paligemma2-3b-pt-224 is restricted and you are not in the authorized list. Visit https://huggingface.co/google/paligemma2-3b-pt-224 to ask for access.

In [None]:
train_errors_ft_paligemma, val_errors_ft_paligemma = train(model_prefix, tokenizer, processor, optimizer, dataloader_label_train, dataloader_label_val, preprocess_input_paligemma)

In [None]:
visualize_error(train_errors_ft_paligemma, val_errors_ft_paligemma)

In [16]:
from transformers import (
    PaliGemmaProcessor,
    PaliGemmaForConditionalGeneration,
)
from transformers.image_utils import load_image
import torch

model_id = "google/paligemma2-3b-pt-224"

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg"
image = load_image(url)

model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto", device_map="auto").eval()
processor = PaliGemmaProcessor.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Leaving the prompt blank for pre-trained models
prompt = ""
model_inputs = processor(text=prompt, images=image, return_tensors="pt").to(torch.bfloat16).to(model.device)
input_len = model_inputs["input_ids"].shape[-1]

with torch.inference_mode():
    generation = model.generate(**model_inputs, max_new_tokens=100, do_sample=False)
    generation = generation[0][input_len:]
    decoded = processor.decode(generation, skip_special_tokens=True)
    print(decoded)


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/google/paligemma2-3b-pt-224.
403 Client Error. (Request ID: Root=1-6776d825-0dcebebc236e29cf2bda920b;c83f6a1c-e043-4ccc-94f2-e377569cd004)

Cannot access gated repo for url https://huggingface.co/google/paligemma2-3b-pt-224/resolve/main/config.json.
Your request to access model google/paligemma2-3b-pt-224 is awaiting a review from the repo authors.

## Knowledge Distillation

In [None]:
# DataLoader for train data
dataset_gemini_train = PrefixDataset(df_train_gemini)
dataloader_gemini_train=DataLoader(dataset_gemini_train, collate_fn=prefix_collate, batch_size=BATCH_SIZE, shuffle=True)
# DataLoader for val data
dataset_label_val = PrefixDataset(df_val)
dataloader_label_val=DataLoader(dataset_label_val, collate_fn=prefix_collate, batch_size=BATCH_SIZE, shuffle=True)

### Qwen

In [None]:
from transformers import AutoModelForImageTextToText, AutoTokenizer, AutoProcessor, Qwen2VLForConditionalGeneration
import gc

model_name = "Qwen/Qwen2-VL-2B-Instruct"

model = AutoModelForImageTextToText.from_pretrained(
    model_name,
    torch_dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name)

model_prefix = PrefixTuningModel(model, tokenizer).to(device)
optimizer = torch.optim.AdamW(model_prefix.prefix_tuning.parameters(), lr=5e-5)

In [None]:
train_errors_kd, val_errors_kd = train(dataset_gemini_train, dataloader_gemini_train)

In [None]:
visualize_error(train_errors_kd, val_errors_kd)

### Paligemma

In [None]:
from transformers import (
    PaliGemmaProcessor,
    PaliGemmaForConditionalGeneration,
)
from transformers.image_utils import load_image
import torch

model_id = "google/paligemma2-3b-pt-224"

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg"
image = load_image(url)

model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto", device_map="auto").eval()
processor = PaliGemmaProcessor.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_name)

model_prefix = PrefixTuningModel(model, tokenizer).to(device)
optimizer = torch.optim.AdamW(model_prefix.prefix_tuning.parameters(), lr=5e-5)

In [None]:
train_errors_kd_paligemma, val_errors_kd_paligemma = train(model_prefix, tokenizer, processor, optimizer, dataloader_gemini_train, dataloader_label_val, preprocess_input_paligemma)

In [None]:
visualize_error(train_errors_kd_paligemma, val_errors_kd_paligemma)