In [1]:
# ===== Step 1: Environment & Imports =====
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"      # Disable TensorFlow, use PyTorch only
os.environ['WANDB_DISABLED'] = "true"       # Disable Weights & Biases logging
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'


In [2]:
!pip uninstall -y transformers trl peft accelerate
!pip install -U "transformers==4.45.2" "trl==0.9.4" "peft==0.12.0" "accelerate==0.34.2" "datasets>=2.20.0" safetensors einops lxml defusedxml cairosvg pillow scikit-image

Found existing installation: transformers 4.55.2
Uninstalling transformers-4.55.2:
  Successfully uninstalled transformers-4.55.2
[0mFound existing installation: peft 0.17.0
Uninstalling peft-0.17.0:
  Successfully uninstalled peft-0.17.0
Found existing installation: accelerate 1.10.0
Uninstalling accelerate-1.10.0:
  Successfully uninstalled accelerate-1.10.0
Collecting transformers==4.45.2
  Downloading transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl==0.9.4
  Downloading trl-0.9.4-py3-none-any.whl.metadata (11 kB)
Collecting peft==0.12.0
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting accelerate==0.34.2
  Downloading accelerate-0.34.2-py3-none-any.whl.metadata (19 kB)
Collecting lxml
  Downloading lxml-6.0.1-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Collecting cairosvg
  Downloadi

In [3]:
import trl
trl.__version__

'0.9.4'

In [4]:
!pip install -q -U bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
# ===== Library Imports =====
import re
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    GenerationConfig,
    DataCollatorForLanguageModeling
)
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

removed huggingface user information

In [7]:
# ===== Step 2: Parameters =====
SEED = 42
MODEL_NAME = "meta-llama/CodeLlama-7b-Instruct-hf"
MAX_LEN = 6400                              # Based on p99≈5985, saves compute

# ===== Step 3: Load CSV =====
dataset = pd.read_csv("/content/data10k.csv")
dataset = dataset[['description', 'svg']]

# Create the text column
dataset["text"] = dataset.apply(
    lambda row: f"Given the following description: {row['description']}, generate the corresponding SVG string.\n{row['svg']}",
    axis=1
)

In [8]:
# ===== Step 4: Tokenizer & SVG token expansion =====
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

svg_corpus = dataset['svg'].tolist()
svg_tokens = set()
for svg in svg_corpus:
    tags = re.findall(r"</?\w+", svg)        # Match SVG/XML tags
    svg_tokens.update(tags)
    attrs = re.findall(r'\b\w+=', svg)       # Match attributes (fill=, stroke=, etc.)
    svg_tokens.update(attrs)

new_tokens = [t for t in svg_tokens if t not in tokenizer.get_vocab()]
print(f"Adding {len(new_tokens)} new tokens...")
tokenizer.add_tokens(new_tokens)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.model_max_length = MAX_LEN

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.59k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Adding 10 new tokens...


In [9]:
# ===== Step 5: 8-1-1 split =====
ds_all = Dataset.from_pandas(dataset[["text"]], preserve_index=False)
training, temp = ds_all.train_test_split(test_size=0.2, seed=SEED).values()
validation, testing = temp.train_test_split(test_size=0.5, seed=SEED).values()
print(f"training: {len(training)}, validation: {len(validation)}, testing: {len(testing)}")

training: 8011, validation: 1001, testing: 1002


In [None]:
# SKIP
# ===== Step 6: Tokenization + masking =====
def tokenize_and_mask(ex):
    """
    对单个样本进行分词，但不进行填充。
    我们将填充任务交给 DataCollator。
    """
    # 不填充，不返回张量，只返回 input_ids
    # DataCollatorForLanguageModeling 需要 token_ids 的列表
    enc = tokenizer(
        ex["text"],
        truncation=True,
        max_length=MAX_LEN,
        padding=False,  # 禁用填充
        return_tensors=None,  # 不返回张量
    )

    input_ids = enc["input_ids"]

    # 处理 labels
    text = ex["text"]
    svg_start = text.find("<svg")

    if svg_start == -1:
        # 直接复制 input_ids 作为 labels
        labels = input_ids.copy()
    else:
        prompt_part = text[:svg_start]
        # 注意：这里我们使用 add_special_tokens=False，因为我们不想在 prompt 部分重复添加特殊 token
        prompt_tokens = tokenizer(prompt_part, add_special_tokens=False)["input_ids"]
        prompt_len = len(prompt_tokens)

        # 创建一个与 input_ids 长度相同的列表，并用 -100 填充
        labels = [-100] * len(input_ids)
        # 仅保留 SVG 部分的真实标签
        labels[prompt_len:] = input_ids[prompt_len:]

    return {"input_ids": input_ids, "labels": labels}


for name, ds in [("training", training), ("validation", validation), ("testing", testing)]:
    # 调整 map 函数，它将返回一个包含 'input_ids' 和 'labels' 列表的字典
    ds = ds.map(tokenize_and_mask, batched=False, remove_columns=["text"])

    # 移除 set_format，因为 DataCollator 期望的是 Python 列表而不是 PyTorch 张量
    # ds.set_format(type="torch") 这一行现在可以删除了

    if name == "training": training = ds
    elif name == "validation": validation = ds
    else: testing = ds

# Data collator for Trainer
# 现在 DataCollator 会负责填充和将列表转换为张量
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, pad_to_multiple_of=8)

print("Dataset is ready.")

Map:   0%|          | 0/8011 [00:00<?, ? examples/s]

Map:   0%|          | 0/1001 [00:00<?, ? examples/s]

Map:   0%|          | 0/1002 [00:00<?, ? examples/s]

Dataset is ready.


In [10]:
# ===== Step 7: Model loading (4-bit QLoRA) =====
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16,
    use_cache=False
)
model.resize_token_embeddings(len(tokenizer))  # adjust for new tokens

# Prepare for LoRA
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05, # some regularization
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.gradient_checkpointing_enable()
model.print_trainable_parameters()

config.json:   0%|          | 0.00/646 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

trainable params: 4,194,304 || all params: 6,742,822,912 || trainable%: 0.0622


one epoch: 8011 / 16 ≈ 501 steps

In [11]:
# Create save directories
os.mkdir("/content/fine_tune")
os.mkdir("/content/fine_tune/codellama_svg_qlora")
os.mkdir("/content/fine_tune/checkpoints")
os.mkdir("/content/fine_tune/tuned_codellama")
os.mkdir("/content/tokenizer_svg_extended")

In [12]:
# ===== Step 8: Training setup + ETA callback =====
from transformers import TrainerCallback
from transformers.trainer_callback import EarlyStoppingCallback
from datetime import datetime, timedelta
import time
from trl import SFTTrainer

training_args = TrainingArguments(
    output_dir="/content/fine_tune/checkpoints",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    num_train_epochs=1,
    learning_rate=5e-6,
    bf16=True,
    logging_steps=1,          # update logs every step → smoother tqdm + ETA
    eval_strategy="steps",   # perform evaluation based on steps
    eval_steps=50,                # calculate eval loss every 50 steps
    save_strategy="steps",          # save checkpoints accordingly
    save_steps=50,
    save_total_limit=10,
    load_best_model_at_end=True,
    report_to="none",
    disable_tqdm=False,       # keep tqdm enabled
    gradient_checkpointing=True,
    optim="paged_adamw_8bit", # use 8-bit AdamW optimizer instead of default 32-bit
)

class ETAProgressCallback(TrainerCallback):
    """Show elapsed time, ETA and estimated finish time during training."""
    def __init__(self, every_steps: int = 20):
        self.every_steps = every_steps
        self._start_time = None

    def on_train_begin(self, args, state, control, **kwargs):
        self._start_time = time.time()
        print(f"🚀 Training started at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        if state.max_steps:
            print(f"Total steps (estimated): {state.max_steps}")

    def on_log(self, args, state, control, **kwargs):
        if state.global_step and state.global_step % self.every_steps == 0:
            elapsed = time.time() - self._start_time
            # fall back if max_steps is not set yet
            total_steps = state.max_steps or (state.global_step / max(1e-9, state.epoch)) * state.num_train_epochs
            progress = min(1.0, state.global_step / max(1, total_steps))
            if progress > 0:
                remaining = elapsed * (1 / progress - 1)
                eta_dt = datetime.now() + timedelta(seconds=remaining)
                from tqdm import tqdm as _tqdm
                _tqdm.write(
                    f"[step {state.global_step}/{int(total_steps)}] "
                    f"elapsed: {int(elapsed//3600)}h {int((elapsed%3600)//60)}m {int(elapsed%60)}s | "
                    f"ETA: {int(remaining//3600)}h {int((remaining%3600)//60)}m {int(remaining%60)}s "
                    f"(~{eta_dt.strftime('%H:%M:%S')})"
                )

    def on_train_end(self, args, state, control, **kwargs):
        total = time.time() - self._start_time
        print(f"✅ Training finished at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"⏱ Total time: {int(total//3600)}h {int((total%3600)//60)}m {int(total%60)}s")

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=training,
    eval_dataset=validation,
    dataset_text_field="text",
    peft_config=lora_config,
    tokenizer=tokenizer,
    max_seq_length=MAX_LEN,
    callbacks=[ETAProgressCallback(every_steps=20),
               EarlyStoppingCallback(early_stopping_patience=2)],
)



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/8011 [00:00<?, ? examples/s]

Map:   0%|          | 0/1001 [00:00<?, ? examples/s]



In [13]:
# ===== Step 9: Train + save =====
start_time = time.time()
train_output = trainer.train()
elapsed = time.time() - start_time
print(f"\n✅ Training complete in {int(elapsed//3600)}h {int(elapsed%3600//60)}m {int(elapsed%60)}s")
print("Metrics:", train_output.metrics)

🚀 Training started at 2025-08-22 16:48:47
Total steps (estimated): 500


Step,Training Loss,Validation Loss
50,1.1517,1.150117
100,1.074,1.066724
150,0.9904,0.995244
200,0.972,0.937074
250,0.8925,0.888628
300,0.8498,0.844675
350,0.8331,0.813638


[step 20/500] elapsed: 0h 12m 4s | ETA: 4h 49m 43s (~21:50:35)
[step 40/500] elapsed: 0h 24m 15s | ETA: 4h 38m 55s (~21:51:57)




[step 60/500] elapsed: 0h 51m 37s | ETA: 6h 18m 35s (~23:59:00)
[step 80/500] elapsed: 1h 3m 50s | ETA: 5h 35m 12s (~23:27:50)
[step 100/500] elapsed: 1h 15m 37s | ETA: 5h 2m 28s (~23:06:53)
[step 100/500] elapsed: 1h 30m 45s | ETA: 6h 3m 0s (~00:22:32)




[step 120/500] elapsed: 1h 42m 43s | ETA: 5h 25m 18s (~23:56:49)
[step 140/500] elapsed: 1h 54m 50s | ETA: 4h 55m 17s (~23:38:54)




[step 160/500] elapsed: 2h 22m 11s | ETA: 5h 2m 9s (~00:13:07)
[step 180/500] elapsed: 2h 34m 7s | ETA: 4h 33m 59s (~23:56:54)
[step 200/500] elapsed: 2h 46m 12s | ETA: 4h 9m 18s (~23:44:17)
[step 200/500] elapsed: 3h 1m 19s | ETA: 4h 31m 59s (~00:22:07)




[step 220/500] elapsed: 3h 13m 27s | ETA: 4h 6m 13s (~00:08:28)
[step 240/500] elapsed: 3h 25m 24s | ETA: 3h 42m 31s (~23:56:42)




[step 260/500] elapsed: 3h 52m 47s | ETA: 3h 34m 53s (~00:16:28)
[step 280/500] elapsed: 4h 4m 48s | ETA: 3h 12m 21s (~00:05:57)
[step 300/500] elapsed: 4h 16m 56s | ETA: 2h 51m 17s (~23:57:00)
[step 300/500] elapsed: 4h 32m 3s | ETA: 3h 1m 22s (~00:22:13)




[step 320/500] elapsed: 4h 44m 8s | ETA: 2h 39m 50s (~00:12:46)
[step 340/500] elapsed: 4h 56m 2s | ETA: 2h 19m 19s (~00:04:09)




[step 360/500] elapsed: 5h 23m 3s | ETA: 2h 5m 38s (~00:17:29)
[step 380/500] elapsed: 5h 35m 8s | ETA: 1h 45m 50s (~00:09:46)


Step,Training Loss,Validation Loss
50,1.1517,1.150117
100,1.074,1.066724
150,0.9904,0.995244
200,0.972,0.937074
250,0.8925,0.888628
300,0.8498,0.844675
350,0.8331,0.813638
400,0.7986,0.80008
450,0.7993,0.79487
500,0.801,0.793413


[step 400/500] elapsed: 5h 47m 10s | ETA: 1h 26m 47s (~00:02:44)
[step 400/500] elapsed: 6h 2m 17s | ETA: 1h 30m 34s (~00:21:39)




[step 420/500] elapsed: 6h 14m 38s | ETA: 1h 11m 21s (~00:14:46)
[step 440/500] elapsed: 6h 26m 43s | ETA: 0h 52m 44s (~00:08:15)




[step 460/500] elapsed: 6h 54m 3s | ETA: 0h 36m 0s (~00:18:51)
[step 480/500] elapsed: 7h 6m 19s | ETA: 0h 17m 45s (~00:12:53)
[step 500/500] elapsed: 7h 18m 25s | ETA: 0h 0m 0s (~00:07:13)
[step 500/500] elapsed: 7h 33m 33s | ETA: 0h 0m 0s (~00:22:20)




[step 500/500] elapsed: 7h 33m 36s | ETA: 0h 0m 0s (~00:22:23)
✅ Training finished at 2025-08-23 00:22:23
⏱ Total time: 7h 33m 36s

✅ Training complete in 7h 33m 36s
Metrics: {'train_runtime': 27216.557, 'train_samples_per_second': 0.294, 'train_steps_per_second': 0.018, 'total_flos': 1.8002112423330447e+18, 'train_loss': 0.9352982071638107, 'epoch': 0.9986268880289602}


In [14]:
# Save QLoRA adapter weights
model.save_pretrained("/content/fine_tune/codellama_svg_qlora")
# Save tokenizer
tokenizer.save_pretrained("/content/tokenizer_svg_extended")
print("💾 QLoRA adapter and tokenizer saved.")



💾 QLoRA adapter and tokenizer saved.


In [15]:
!zip -r /content/fine_tune/codellama_svg_qlora.zip /content/fine_tune/codellama_svg_qlora

  adding: content/fine_tune/codellama_svg_qlora/ (stored 0%)
  adding: content/fine_tune/codellama_svg_qlora/adapter_model.safetensors


zip error: Interrupted (aborting)


In [16]:
# Save full model
merged = model.merge_and_unload()
merged.save_pretrained("/content/fine_tune/tuned_codellama")

