In [1]:
!pip install numpy
!pip install datasets
!pip install torch
!pip install transformers datasets accelerate bitsandbytes peft safetensors --upgrade
!pip install matplotlib

Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
bigframes 1.42.0 requires rich<14,>=12.4.4, but you have rich 14.0.0 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine =

In [10]:
from transformers import (
    BlipForConditionalGeneration,
    VisionEncoderDecoderModel,
    AutoTokenizer,
    AutoProcessor,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
)
from datasets import load_dataset
import torch

In [3]:
dataset = load_dataset(
    "CADCODER/GenCAD-Code",
    num_proc=16,
    split={"train": "train", "test": "test"},
    cache_dir="/Volumes/BIG-DATA/HUGGINGFACE_CACHE"
)

README.md:   0%|          | 0.00/706 [00:00<?, ?B/s]

Setting num_proc from 16 to 2 for the train split as it only contains 2 shards.


Generating train split:   0%|          | 0/147289 [00:00<?, ? examples/s]

Setting num_proc from 16 back to 1 for the test split to disable multiprocessing as it only contains one shard.


Generating test split:   0%|          | 0/7355 [00:00<?, ? examples/s]

Setting num_proc from 16 back to 1 for the validation split to disable multiprocessing as it only contains one shard.


Generating validation split:   0%|          | 0/8204 [00:00<?, ? examples/s]

In [14]:
# Load BLIP encoder (vision + text encoder)
encoder_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

decoder = AutoModelForCausalLM.from_pretrained("cerebras/Cerebras-GPT-590M")
tokenizer = AutoTokenizer.from_pretrained("cerebras/Cerebras-GPT-590M")

encoder_decoder_model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    "Salesforce/blip-image-captioning-base",
    "cerebras/Cerebras-GPT-590M"
)

# Configuration
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.vocab_size = model.config.decoder.vocab_size

`BlipModel` is going to be deprecated in future release, please use `BlipForConditionalGeneration`, `BlipForQuestionAnswering` or `BlipForImageTextRetrieval` depending on your usecase.
Some weights of BlipModel were not initialized from the model checkpoint at Salesforce/blip-image-captioning-base and are newly initialized: ['logit_scale', 'text_model.embeddings.LayerNorm.bias', 'text_model.embeddings.LayerNorm.weight', 'text_model.embeddings.position_embeddings.weight', 'text_model.embeddings.word_embeddings.weight', 'text_model.encoder.layer.0.attention.output.LayerNorm.bias', 'text_model.encoder.layer.0.attention.output.LayerNorm.weight', 'text_model.encoder.layer.0.attention.output.dense.bias', 'text_model.encoder.layer.0.attention.output.dense.weight', 'text_model.encoder.layer.0.attention.self.key.bias', 'text_model.encoder.layer.0.attention.self.key.weight', 'text_model.encoder.layer.0.attention.self.query.bias', 'text_model.encoder.layer.0.attention.self.query.weight', 'text_mo

AttributeError: 'BlipConfig' object has no attribute 'hidden_size'

In [None]:
def preprocess(example):
    image = example["image"]
    text = example["code"]
    
    # Encode image
    pixel_values = processor(images=image, return_tensors="pt").pixel_values[0]
    
    # Encode text
    labels = tokenizer(text, padding="max_length", truncation=True, max_length=512).input_ids
    labels = [l if l != tokenizer.pad_token_id else -100 for l in labels]

    return {
        "pixel_values": pixel_values,
        "labels": torch.tensor(labels)
    }

In [None]:
train_dataset = dataset["train"].map(preprocess, remove_columns=dataset["train"].column_names)
eval_dataset = dataset["test"].map(preprocess, remove_columns=dataset["test"].column_names)

In [None]:
training_args = TrainingArguments(
    output_dir="./checkpoints/baseline",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    evaluation_strategy="steps",
    save_strategy="steps",
    num_train_epochs=1,
    save_steps=500,
    eval_steps=500,
    logging_steps=100,
    learning_rate=1e-5,
    fp16=True,
    report_to="none"
)

In [None]:
trainer = Trainer(
    model=encoder_decoder_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [None]:
trainer.train()

In [None]:
encoder_decoder_model.save_pretrained("./genCAD-blip-zephyr")
tokenizer.save_pretrained("./genCAD-blip-zephyr")