In [1]:
import torch
import numpy as np
from datasets import load_dataset
import ast
import numpy as np
from PIL import Image
from io import BytesIO
import matplotlib.pyplot as plt
from transformers import (
    LlavaForConditionalGeneration,
    LlavaProcessor,
    AutoModelForCausalLM,
    AutoTokenizer,
    VisionEncoderDecoderModel,
    TrainingArguments,
    Trainer
)

2025-06-25 04:16:47.565765: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750825008.055213      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750825008.170427      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Verify GPU availability
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU: {torch.cuda.get_device_name(0)}")

PyTorch version: 2.6.0+cu124
CUDA available: True
GPU: Tesla T4


In [19]:
# Load dataset with caching
dataset = load_dataset(
    "CADCODER/GenCAD-Code",
    num_proc=4,  # Reduced for stability
    split={"train": "train", "test": "test"},
)

# Inspect dataset structure
print("Dataset features:", dataset["train"].features)
print("Sample:", dataset["train"][0])

Dataset features: {'image': Image(mode=None, decode=True, id=None), 'deepcad_id': Value(dtype='string', id=None), 'cadquery': Value(dtype='string', id=None), 'token_count': Value(dtype='int64', id=None), 'prompt': Value(dtype='string', id=None), 'hundred_subset': Value(dtype='bool', id=None)}
Sample: {'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=448x448 at 0x7D72F763DED0>, 'deepcad_id': '0000/00006371', 'cadquery': 'import cadquery as cq\n# Generating a workplane for sketch 0\nwp_sketch0 = cq.Workplane(cq.Plane(cq.Vector(-0.015625, -0.0078125, 0.0), cq.Vector(1.0, 0.0, 0.0), cq.Vector(0.0, 0.0, 1.0)))\nloop0=wp_sketch0.moveTo(0.0, 0.0).threePointArc((0.0007948582418457166, -0.0019189575476279677), (0.0027138157894736844, -0.0027138157894736844)).lineTo(0.021217105263157895, -0.0027138157894736844).threePointArc((0.022787161438489866, -0.00206347722796355), (0.0234375, -0.000493421052631579)).lineTo(0.0234375, 0.018256578947368422).threePointArc((0.02283825686147997, 0.

In [32]:
from transformers import (
    VisionEncoderDecoderModel,
    CLIPVisionModel,
    CLIPImageProcessor,
    AutoModelForCausalLM,
    AutoTokenizer
)
import torch

# Model identifiers
CLIP_ENCODER_ID = "openai/clip-vit-large-patch14"
CEREBRAS_MODEL_ID = "cerebras/Cerebras-GPT-590M"

print("Loading CLIP vision encoder...")
vision_encoder = CLIPVisionModel.from_pretrained(
    CLIP_ENCODER_ID,
    torch_dtype=torch.float16,
    device_map="auto" 
)
vision_encoder.eval()  # Optional: freeze for now

print("Loading Cerebras-GPT decoder...")
decoder = AutoModelForCausalLM.from_pretrained(
    CEREBRAS_MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto"
)

print("Loading tokenizer and image processor...")
tokenizer = AutoTokenizer.from_pretrained(CEREBRAS_MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token  # Required for decoder

image_processor = CLIPImageProcessor.from_pretrained(CLIP_ENCODER_ID)

# Build encoder-decoder model
print("Building VisionEncoderDecoder model...")
model = VisionEncoderDecoderModel(encoder=vision_encoder, decoder=decoder)

# Set config values
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.vocab_size = model.config.decoder.vocab_size

# Dimension compatibility check
encoder_dim = model.encoder.config.hidden_size
decoder_dim = model.decoder.config.n_embd
print(f"Encoder hidden size: {encoder_dim}")
print(f"Decoder embedding size: {decoder_dim}")

# Optional: add projection layer if dimensions don't match
if encoder_dim != decoder_dim:
    print("⚠️ Dimension mismatch detected. Adding projection layer...")
    model.encoder.projection = torch.nn.Linear(encoder_dim, decoder_dim)
    model.config.encoder_hidden_size = decoder_dim

print("✅ Model setup complete.")

Loading CLIP vision encoder...
Loading Cerebras-GPT decoder...
Loading tokenizer and image processor...
Building VisionEncoderDecoder model...
Encoder hidden size: 1024
Decoder embedding size: 1536
⚠️ Dimension mismatch detected. Adding projection layer...
✅ Model setup complete.


In [33]:
def preprocess_function(examples):
    """Preprocess image-text pairs for VisionEncoderDecoder training."""
    
    # Process images using CLIPImageProcessor (batch of PIL images)
    image_outputs = image_processor(
        examples["image"],  # List of PIL images
        return_tensors="pt"  # Return PyTorch tensors directly
    )
    
    # Tokenize the 'cadquery' text with padding and truncation
    text_outputs = tokenizer(
        examples["cadquery"],
        padding="max_length",
        truncation=True,
        max_length=256,
        return_tensors="pt"
    )
    
    # Replace pad tokens in labels with -100 to ignore in loss calculation
    labels = text_outputs.input_ids.clone()
    labels[labels == tokenizer.pad_token_id] = -100
    
    return {
        "pixel_values": image_outputs.pixel_values,
        "labels": labels,
        "attention_mask": text_outputs.attention_mask
    }

In [36]:
# Columns to remove after processing (all original columns)
columns_to_remove = dataset["train"].column_names

# Preprocess and map the train dataset subset
train_dataset = dataset["train"].select(range(4000)).map(
    preprocess_function,
    batched=True,
    batch_size=4,
    remove_columns=columns_to_remove  # Remove original columns after mapping
)

# Preprocess and map the eval dataset subset
eval_dataset = dataset["test"].select(range(100)).map(
    preprocess_function,
    batched=True,
    batch_size=4,
    remove_columns=columns_to_remove  # Remove original columns after mapping
)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [37]:
model.gradient_checkpointing_enable()

training_args = TrainingArguments(
    output_dir="./genCAD-vision-coder",
    per_device_train_batch_size=2,  # Reduced batch size
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    save_strategy="epoch",
    num_train_epochs=3,
    learning_rate=5e-5,
    fp16=True,
    logging_steps=50,
    report_to="none",
    save_total_limit=2,
    dataloader_num_workers=4,
    remove_unused_columns=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=default_data_collator,
)

NotImplementedError: Cannot copy out of meta tensor; no data! Please use torch.nn.Module.to_empty() instead of torch.nn.Module.to() when moving module from meta to a different device.

In [38]:
### Start Training
print("Starting training...")
train_results = trainer.train()
print("Training completed!")

# %% [code]
### Save Final Model
model.save_pretrained("./genCAD-vision-coder-final")
tokenizer.save_pretrained("./genCAD-vision-coder-final")
print("Model saved successfully!")

Starting training...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/parallel/parallel_apply.py", line 96, in _worker
    output = module(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py", line 606, in forward
    encoder_hidden_states = self.enc_to_dec_proj(encoder_hidden_states)
                            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/linear.py", line 125, in forward
    return F.linear(input, self.weight, self.bias)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: mat1 and mat2 must have the same dtype, but got Half and Float


In [None]:
### Run Evaluation on Test Subset
model.eval()  # Set model to evaluation mode
device = model.device

# Select a small subset for evaluation
subset_size = 10
eval_subset = dataset["test"].select(range(subset_size))

# Store results
syntax_results = []
iou_results = []

for example in eval_subset:
    # Preprocess image
    pixel_values = llava_processor.image_processor(
        example["image"],
        return_tensors="pt"
    ).pixel_values.to(device)
    
    # Generate CAD code
    outputs = model.generate(
        pixel_values=pixel_values,
        max_length=256,
        num_beams=5,
        early_stopping=True
    )
    generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # 1. Calculate Syntax Validity
    is_valid = valid_syntax_rate_simple(generated_code)
    syntax_results.append(is_valid)
    
    # 2. Calculate IoU
    try:
        # Render generated CAD code
        generated_image = render_cad_code(generated_code)
        
        # Get ground truth image
        gt_image = example["image"]
        
        # Calculate IoU
        iou = get_iou_best(gt_image, generated_image)
        iou_results.append(iou)
        
        # Visualization (optional)
        fig, axes = plt.subplots(1, 2, figsize=(10, 5))
        axes[0].imshow(gt_image)
        axes[0].set_title("Ground Truth")
        axes[0].axis('off')
        
        axes[1].imshow(generated_image)
        axes[1].set_title(f"Generated (IoU: {iou:.2f})")
        axes[1].axis('off')
        
        plt.suptitle(f"Syntax: {'Valid' if is_valid else 'Invalid'}")
        plt.show()
        
    except Exception as e:
        print(f"Rendering failed: {str(e)}")
        iou_results.append(0.0)

)

In [None]:
### Calculate Final Metrics
# 1. Syntax Validity Rate
valid_syntax_rate = np.mean(syntax_results)
print(f"\nSyntax Validity Rate: {valid_syntax_rate:.2%}")

In [None]:
# 2. Average IoU
average_iou = np.mean(iou_results)
print(f"Average IoU: {average_iou:.4f}")

In [None]:
# 3. Combined Metric
composite_score = 0.7 * valid_syntax_rate + 0.3 * average_iou
print(f"Composite Score: {composite_score:.4f}")