In [6]:
import kagglehub
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
from torchvision import transforms
from torch.utils.data import Dataset
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, GPT2Config , BitsAndBytesConfig
from transformers import ViTModel
from peft import LoraConfig, get_peft_model
from transformers import GPT2Model
from peft import LoraConfig, get_peft_model, TaskType

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)
# Load Models and Tokenizer
base_model = GPT2LMHeadModel.from_pretrained(
    "gpt2",
    device_map="auto",
    load_in_4bit=True,   # QLoRA requires 4-bit or 8-bit
)
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
# ViT is not a language model, so we don't need to quantize it unless we're memory-constrained
# For simplicity, let's load it normally on the GPU
vit = ViTModel.from_pretrained("google/vit-base-patch16-224")


# --- LoRA Configuration for GPT-2 ---
lora_config1 = LoraConfig(
    r=8,
    lora_alpha=16,
    # Best target modules for GPT-2
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# --- LoRA Configuration for ViT ---
# Note: ViT is not a Causal LM, so TaskType should not be set or set appropriately
# for the task you are doing (e.g., image classification).
# Since you're just adapting the base ViTModel, we can omit task_type.
lora_config2 = LoraConfig(
    r=8,
    lora_alpha=16,
    # Best target modules for ViT
    target_modules=["query", "key", "value", "dense"],
    lora_dropout=0.05,
    bias="none",
)

# Wrap GPT-2 with QLoRA
lora_gpt2_model = get_peft_model(base_model, lora_config1)
print("--- GPT-2 with LoRA ---")
lora_gpt2_model.print_trainable_parameters()


# Wrap ViT with LoRA
lora_vit_model = get_peft_model(vit, lora_config2)
print("\n--- ViT with LoRA ---")
lora_vit_model.print_trainable_parameters()

# Download latest version
path = kagglehub.dataset_download("hsankesara/flickr-image-dataset")

print("Path to dataset files:", path)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


--- GPT-2 with LoRA ---
trainable params: 811,008 || all params: 125,250,816 || trainable%: 0.6475

--- ViT with LoRA ---
trainable params: 1,339,392 || all params: 87,728,640 || trainable%: 1.5267
Path to dataset files: /kaggle/input/flickr-image-dataset


In [7]:
import pandas as pd
df1= pd.read_csv(r"/kaggle/input/flickr-image-dataset/flickr30k_images/results.csv", delimiter='\t', engine='python')


In [8]:
  transform=transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) # Added normalization as a common practice
    ])

In [9]:
class FlickrDataset(Dataset):
  def __init__(self, csv_file, root_dir, transform=None):
    self.annotations = csv_file
    self.root_dir = root_dir
    self.transform = transform
  def __len__(self):
    return len(self.annotations)
  def __getitem__(self, index):
    img_path = os.path.join(self.root_dir, self.annotations.iloc[index, 0])
    image = io.imread(img_path)
    txt=tokenizer(self.annotations.iloc[index, 1], return_tensors='pt')
    txt=torch.tensor(txt)
    if self.transform:
      image = self.transform(image)
    return (image, txt)
data=FlickrDataset(df1,r"/kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images",transform=transform)
dataloader=DataLoader(data, batch_size=16, shuffle=True)

In [10]:
import torch
import torch.nn as nn

class Img2GPT(nn.Module):
    def __init__(self, gptt, vitt):
        super(Img2GPT, self).__init__()
        self.vit = vitt
        self.gpt2 = gptt

        vit_dim = self.vit.config.hidden_size
        gpt_dim = self.gpt2.config.hidden_size

        # project ViT outputs to GPT2 input size
        self.proj = nn.Linear(vit_dim, gpt_dim)

    def forward(self, images):
        # ViT forward
        vit_outputs = self.vit(images)
        img_embeds = vit_outputs.last_hidden_state  # (batch, seq_len, vit_dim)

        # Project to GPT2 dimension
        img_embeds = self.proj(img_embeds)  # (batch, seq_len, gpt_dim)

        # Feed into GPT2 as inputs_embeds
        outputs = self.gpt2(inputs_embeds=img_embeds)

        return outputs

In [14]:
model = Img2GPT(lora_gpt2_model, lora_vit_model)
def count_all_parameters(model):
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return total, trainable

total, trainable = count_all_parameters(model)
print(f"Total parameters: {total}")
print(f"Trainable parameters: {trainable}")
print(f"Frozen parameters: {total - trainable}")



Total parameters: 171102720
Trainable parameters: 3331584
Frozen parameters: 167771136
