# SRL GRPO Training (Colab, A100)
End-to-end notebook to build SRL data, split 95/5, and train with GRPO + LoRA on an A100.

**Memory-optimized configuration for single GPU training.**

In [1]:
# ============================================================================
# Setup and Installation (vLLM Edition - 6x faster GRPO training)
# ============================================================================
# Runtime: GPU (A100 recommended)
# Make sure to select GPU: Runtime -> Change runtime type -> GPU

import os
from pathlib import Path

# Repository configuration
REPO_URL = "https://github.com/iroblesrazzaq/SRL-reasoning.git"
BRANCH = "main"
WORKDIR = "/content/SRL-reasoning"

# ============================================================================
# CRITICAL: Install packages with compatible versions
# vLLM 0.10.2 pins numpy to 2.0.x, so we need scipy compatible with numpy 2.0.x
# ============================================================================

# Step 1: Install vLLM first (this pins numpy to 2.0.x)
!pip install -q vllm==0.10.2
print("✓ Step 1: vLLM 0.10.2 installed (numpy pinned to 2.0.x)")

# Step 2: Install scipy version compatible with numpy 2.0.x
# scipy 1.14.x works with numpy >=1.23.5,<2.3 (includes 2.0.x)
!pip install -q "scipy>=1.14.0,<1.15"
print("✓ Step 2: scipy 1.14.x installed (compatible with numpy 2.0.x)")

# Step 3: Install TRL and other dependencies
!pip install -q git+https://github.com/huggingface/trl.git
!pip install -q bitsandbytes datasets peft accelerate
print("✓ Step 3: TRL and dependencies installed")

# Clone repo if not exists
if not os.path.exists(WORKDIR):
    !git clone --branch $BRANCH $REPO_URL $WORKDIR

%cd $WORKDIR
!git pull

# Install package
!pip install -e . --no-deps

# Verify GPU
import torch
print("=" * 80)
print("SETUP COMPLETE (vLLM Edition)")
print("=" * 80)
print(f"✓ Repository: {WORKDIR}")
print(f"✓ CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"✓ GPU: {torch.cuda.get_device_name(0)}")
    gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"✓ GPU Memory: {gpu_mem:.2f} GB")
print("=" * 80)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m436.4/436.4 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.0/180.0 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.0/111.0 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.4/45.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.9/3.9 MB[0m [31m83.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m76.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.9/887.9 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Verify all installations
from inspect import signature
from trl import GRPOConfig

# Check TRL supports vLLM
supported = set(signature(GRPOConfig.__init__).parameters)
assert 'use_vllm' in supported, "TRL version doesn't support vLLM"
print("✓ TRL with vLLM support verified")

# Print versions
import torch
import trl
import vllm
import scipy
import numpy as np

print(f"TRL version: {trl.__version__}")
print(f"vLLM version: {vllm.__version__}")
print(f"NumPy version: {np.__version__}")
print(f"SciPy version: {scipy.__version__}")
print(f"PyTorch {torch.__version__}, CUDA {torch.version.cuda}")

# Verify vLLM compatibility
if vllm.__version__ == "0.10.2":
    print("✓ vLLM version compatible - fast generation enabled!")
else:
    print(f"⚠️ vLLM {vllm.__version__} may not be compatible with TRL")

✓ TRL with vLLM support verified
TRL version: 0.26.0.dev0
vLLM version: 0.10.2
NumPy version: 2.0.2
SciPy version: 1.14.1
PyTorch 2.8.0+cu128, CUDA 12.8
✓ vLLM version compatible - fast generation enabled!


In [3]:
#@title 1. Global config
import random, numpy as np
from pathlib import Path
import gc

SEED = 42
BASE_MODEL = 'Qwen/Qwen3-1.7B'  # Base model, GRPO-compatible with vLLM
REPO_DIR = Path('/content/SRL-reasoning')
OUTPUT_DIR = Path('/content/outputs/srl_grpo')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
DATA_DIR = REPO_DIR / 'data'
DATA_DIR.mkdir(parents=True, exist_ok=True)

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
print('DATA_DIR:', DATA_DIR)

DATA_DIR: /content/SRL-reasoning/data


In [4]:
#@title 2. Build SRL data (s1K-1.1 -> step-wise JSONL)
from src.shared.build_srl_data import load_teacher_dataset, normalize_dataset, build_srl_dataset, save_jsonl
from src.shared.splits import split_by_trajectory

raw_ds = load_teacher_dataset('simplescaling/s1K-1.1', split='train')
norm_trajs = normalize_dataset(raw_ds)
srl_examples = build_srl_dataset(norm_trajs)

all_path = DATA_DIR / 'srl_steps.jsonl'
save_jsonl(srl_examples, all_path)

train_examples, val_examples, _ = split_by_trajectory(
    str(all_path),
    train_ratio=0.95,
    val_ratio=0.05,
    test_ratio=0.0,
    seed=SEED,
)

train_path = DATA_DIR / 'train.jsonl'
val_path = DATA_DIR / 'val.jsonl'
save_jsonl(train_examples, train_path)
save_jsonl(val_examples, val_path)

print(f'Train examples: {len(train_examples)}')
print(f'Val examples:   {len(val_examples)}')

Loading dataset: simplescaling/s1K-1.1 (split: train)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/22.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Loaded 1000 examples


Normalizing trajectories: 100%|██████████| 1000/1000 [00:00<00:00, 4904.63example/s]
Building SRL examples: 100%|██████████| 606/606 [00:00<00:00, 108302.37trajectory/s]
Saving to JSONL: 100%|██████████| 2675/2675 [00:00<00:00, 54536.35example/s]
Saving to JSONL: 100%|██████████| 2528/2528 [00:00<00:00, 53894.21example/s]
Saving to JSONL: 100%|██████████| 143/143 [00:00<00:00, 45531.43example/s]

Train examples: 2528
Val examples:   143





In [5]:
#@title 3. Prepare HF datasets for GRPO
from scripts.train_srl import load_srl_dataset

train_dataset = load_srl_dataset(str(train_path))
val_dataset = load_srl_dataset(str(val_path))

print(f'HF datasets -> train {len(train_dataset)}, val {len(val_dataset)}')

✓ vLLM available - using optimized generation
INFO 12-08 01:39:47 [__init__.py:216] Automatically detected platform cuda.
HF datasets -> train 2528, val 143


In [6]:
#@title 4. Load model with transformers + PEFT (vLLM handles generation)
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import get_peft_model, LoraConfig, TaskType

# Clear any existing GPU memory
gc.collect()
torch.cuda.empty_cache()

# Load base model with standard transformers
# vLLM will handle fast generation, so we use standard loading here
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)

# Set padding
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'

# Apply LoRA with standard PEFT
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    task_type=TaskType.CAUSAL_LM,
    bias="none",
)
model = get_peft_model(model, lora_config)
model.enable_input_require_grads()
model.gradient_checkpointing_enable()

# Print memory usage
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f'\n✓ Model loaded with PEFT LoRA!')
print(f'  Trainable params: {trainable_params/1e6:.1f}M / {total_params/1e6:.1f}M')
if torch.cuda.is_available():
    print(f'  GPU Memory used: {torch.cuda.memory_allocated(0)/1e9:.2f} GB')

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/622M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]


✓ Model loaded with PEFT LoRA!
  Trainable params: 17.4M / 1738.0M
  GPU Memory used: 3.51 GB


# CODE TO LOAD MODEL FROM SAVED CHECKPOIN

In [7]:
import random, numpy as np
from pathlib import Path
import gc

SEED = 42
BASE_MODEL = 'rd211/Qwen3-1.7B-Instruct'
REPO_DIR = Path('/content/SRL-reasoning')
OUTPUT_DIR = Path('/content/outputs/srl_grpo')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
DATA_DIR = REPO_DIR / 'data'
DATA_DIR.mkdir(parents=True, exist_ok=True)

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
print('DATA_DIR:', DATA_DIR)
from inspect import signature
from trl import GRPOConfig
from scripts.train_srl import SRLGRPOTrainer, create_reward_function

reward_fn = create_reward_function(tokenizer)

# Check which parameter names are supported
supported = set(signature(GRPOConfig.__init__).parameters)

# ============================================================================
# GENERATION MODE
# ============================================================================
# vLLM 0.10.2 enabled for 6-10x faster generation
# numpy/scipy conflicts resolved via correct installation order
# ============================================================================

use_vllm = True
print("✓ Using vLLM for fast generation (6-10x speedup)")

# ============================================================================
# GRPO CONFIGURATION
# ============================================================================

grpo_kwargs = {
    'output_dir': str(OUTPUT_DIR),

    # === VLLM SETTINGS (colocate mode - simpler for single GPU) ===
    'use_vllm': use_vllm,
    'vllm_mode': 'colocate' if use_vllm else None,  # No separate server needed!
    'vllm_gpu_memory_utilization': 0.6 if use_vllm else None,  # Lower for colocate
    'vllm_init_kwargs': {'task': 'generate', 'trust_remote_code': True} if use_vllm else None,  # Required for Qwen models


    # === BATCH SIZE (adjusted based on vLLM availability) ===
    # With vLLM: can use larger batches; Without: need smaller for memory
    'per_device_train_batch_size': 2 if not use_vllm else 4,
    'num_generations': 4,                 # Reduced from 8 (still effective)
    'per_device_eval_batch_size': 4,     # Must be divisible by num_generations
    'gradient_accumulation_steps': 64 if not use_vllm else 32,  # Adjust for batch size

    # === GRPO-SPECIFIC ===
    'beta': 0.0,                         # Paper: no KL penalty for SRL
    'temperature': 1.0,                  # Paper: 1.0 for rollouts

    # === TOKEN LIMITS (reduced for faster generation) ===
    'max_prompt_length': 512,            # Reduced from 1024
    'max_completion_length': 256,        # Reduced from 1024 (paper uses 256-512)

    # === CHECKPOINTING ===
    'save_strategy': 'epoch',
    'save_total_limit': 2,
    'load_best_model_at_end': True,
    'metric_for_best_model': 'eval_reward',
    'greater_is_better': True,

    # === OPTIMIZATION ===
    'optim': 'adamw_8bit',               # Memory-efficient optimizer
    'bf16': True,                        # bfloat16 precision

    # === LOGGING ===
    'logging_steps': 1,
    'report_to': 'none',
    'seed': SEED,
}

# Handle eval_strategy naming
if 'eval_strategy' in supported:
    grpo_kwargs['eval_strategy'] = 'epoch'
else:
    grpo_kwargs['evaluation_strategy'] = 'epoch'

# Remove None values (for when vLLM is disabled)
grpo_kwargs = {k: v for k, v in grpo_kwargs.items() if v is not None}

# Filter to only supported parameters
grpo_config = GRPOConfig(**{k: v for k, v in grpo_kwargs.items() if k in supported})

print("=" * 80)
print("GRPO Config Summary" + (" (vLLM)" if use_vllm else " (HuggingFace)"))
print("=" * 80)
print(f"  use_vllm: {use_vllm}")
if use_vllm:
    print(f"  vllm_gpu_memory_utilization: 0.5")
    if 'vllm_engine_kwargs' in grpo_kwargs:
        print(f"  vllm_engine_kwargs: {grpo_kwargs['vllm_engine_kwargs']}")
print(f"  per_device_train_batch_size: {grpo_kwargs['per_device_train_batch_size']}")
print(f"  num_generations: {grpo_kwargs['num_generations']}")
print(f"  Sequences per step: {grpo_kwargs['per_device_train_batch_size']} x {grpo_kwargs['num_generations']} = {grpo_kwargs['per_device_train_batch_size'] * grpo_kwargs['num_generations']}")
print(f"  Effective batch size: {grpo_kwargs['per_device_train_batch_size']} x {grpo_kwargs['gradient_accumulation_steps']} = {grpo_kwargs['per_device_train_batch_size'] * grpo_kwargs['gradient_accumulation_steps']}")
print(f"  Max tokens: prompt={grpo_kwargs['max_prompt_length']}, completion={grpo_kwargs['max_completion_length']}")
print(f"  Total max sequence: {grpo_kwargs['max_prompt_length'] + grpo_kwargs['max_completion_length']}")
if not use_vllm:
    print("  NOTE: Training will be slower without vLLM (~2-3 min/step)")
print("=" * 80)

DATA_DIR: /content/SRL-reasoning/data
✓ Using vLLM for fast generation (6-10x speedup)
GRPO Config Summary (vLLM)
  use_vllm: True
  vllm_gpu_memory_utilization: 0.5
  per_device_train_batch_size: 4
  num_generations: 4
  Sequences per step: 4 x 4 = 16
  Effective batch size: 4 x 32 = 128
  Max tokens: prompt=512, completion=256
  Total max sequence: 768


In [8]:
#@title 6. Initialize trainer and start training

# Clear cache before training
gc.collect()
torch.cuda.empty_cache()

# Initialize trainer
trainer = SRLGRPOTrainer(
    model=model,
    args=grpo_config,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    reward_funcs=reward_fn,
    filter_epsilon=1e-4,
)

print("\n" + "=" * 80)
print("STARTING TRAINING" + (" (vLLM)" if use_vllm else " (HuggingFace)"))
print("=" * 80)
print(f"GPU Memory before training: {torch.cuda.memory_allocated(0)/1e9:.2f} GB")
if use_vllm:
    print("vLLM will handle fast generation (6-10x speedup)")
else:
    print("Using HuggingFace generation (slower, ~2-3 min/step)")
    print("This is expected when vLLM 0.10.2 is not available")

train_result = trainer.train()
print(train_result)

The model is already on multiple devices. Skipping the move to device specified in `args`.


INFO 12-08 01:40:06 [utils.py:328] non-default args: {'seed': 0, 'max_model_len': 768, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 128, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'Qwen/Qwen3-1.7B'}
INFO 12-08 01:40:22 [__init__.py:742] Resolved architecture: Qwen3ForCausalLM
INFO 12-08 01:40:22 [__init__.py:1815] Using max model len 768
INFO 12-08 01:40:27 [parallel.py:348] Disabling V1 multiprocessing for external launcher.
INFO 12-08 01:40:27 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096.
INFO 12-08 01:40:28 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='Qwen/Qwen3-1.7B', speculative_config=None, tokenizer='Qwen/Qwen3-1.7B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=768, download_dir=

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 12-08 01:40:32 [default_loader.py:268] Loading weights took 1.18 seconds
INFO 12-08 01:40:33 [gpu_model_runner.py:2392] Model loading took 3.2152 GiB and 1.933934 seconds
INFO 12-08 01:40:41 [backends.py:539] Using cache directory: /root/.cache/vllm/torch_compile_cache/643a8dbc90/rank_0_0/backbone for vLLM's torch.compile
INFO 12-08 01:40:41 [backends.py:550] Dynamo bytecode transform time: 7.50 s
INFO 12-08 01:40:47 [backends.py:194] Cache the graph for dynamic shape for later use
INFO 12-08 01:41:12 [backends.py:215] Compiling a graph for dynamic shape takes 31.09 s
INFO 12-08 01:41:14 [monitor.py:34] torch.compile takes 38.59 s in total
INFO 12-08 01:41:16 [gpu_worker.py:298] Available KV cache memory: 27.78 GiB
INFO 12-08 01:41:17 [kv_cache_utils.py:864] GPU KV cache size: 260,096 tokens
INFO 12-08 01:41:17 [kv_cache_utils.py:868] Maximum concurrency for 768 tokens per request: 338.67x


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 35/35 [00:01<00:00, 25.23it/s]


INFO 12-08 01:41:19 [gpu_model_runner.py:3118] Graph capturing finished in 3 secs, took 0.28 GiB
INFO 12-08 01:41:19 [gpu_worker.py:391] Free memory on device (75.54/79.32 GiB) on startup. Desired GPU memory utilization is (0.4, 31.73 GiB). Actual usage is 3.22 GiB for weight, 0.71 GiB for peak activation, 0.02 GiB for non-torch memory, and 0.28 GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memory=29377396121` to fit into requested memory, or `--kv-cache-memory=76421915648` to fully utilize gpu memory. Current kv cache memory in use is 29830380953 bytes.
INFO 12-08 01:41:19 [core.py:218] init engine (profile, create kv cache, warmup model) took 46.62 seconds
INFO 12-08 01:41:21 [llm.py:295] Supported_tasks: ('generate',)
INFO 12-08 01:41:21 [__init__.py:36] No IOProcessor plugins requested by the model


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.



STARTING TRAINING (vLLM)
GPU Memory before training: 36.84 GB
vLLM will handle fast generation (6-10x speedup)
INFO 12-08 01:41:22 [block_pool.py:292] Successfully reset prefix cache




Epoch,Training Loss,Validation Loss
1,0.0226,0.034935


INFO 12-08 01:42:10 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 01:42:57 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 01:43:44 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 01:44:31 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 01:45:18 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 01:46:05 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 01:46:52 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 01:47:39 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 01:48:25 [block_pool.py:292] Successfully reset prefix cache


Exception ignored in: <function _xla_gc_callback at 0x78204986dbc0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/jax/_src/lib/__init__.py", line 127, in _xla_gc_callback
    def _xla_gc_callback(*args):
    
KeyboardInterrupt: 


INFO 12-08 01:49:12 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 01:49:59 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 01:50:47 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 01:51:34 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 01:52:21 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 01:53:08 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 01:53:54 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 01:54:42 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 01:55:29 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 01:56:15 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 01:57:02 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 01:57:49 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 01:58:36 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 01:59:23 [block_pool.py:292] Successfully reset prefi



INFO 12-08 02:09:35 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 02:10:22 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 02:11:09 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 02:11:56 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 02:12:43 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 02:13:30 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 02:14:17 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 02:15:03 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 02:15:51 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 02:16:37 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 02:17:24 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 02:18:11 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 02:18:59 [block_pool.py:292] Successfully reset prefix cache
INFO 12-08 02:19:46 [block_pool.py:292] Successfully reset prefi

KeyError: "The `metric_for_best_model` training argument is set to 'eval_reward', which is not found in the evaluation metrics. The available evaluation metrics are: ['eval_loss']. Consider changing the `metric_for_best_model` via the TrainingArguments."

In [17]:
# Re-copy to Drive (after mounting works)
import os

src = "/content/srl_model_epoch1_emergency"
dst = "/content/drive/MyDrive/SRL-reasoning/outputs/srl_model_epoch1"

if os.path.exists(src):
    os.makedirs(dst, exist_ok=True)
    !cp -r {src}/* "{dst}/"
    !ls -la "{dst}"
    print("✓ Model saved to Drive!")
else:
    print(f"⚠️ Source not found: {src}")
    print("Check if model is still in memory with: trainer.save_model('/content/backup')")

total 83704
drwxr-xr-x 2 root root     4096 Dec  8 02:51 .
drwxr-xr-x 3 root root     4096 Dec  8 02:51 ..
-rw-r--r-- 1 root root     1046 Dec  8 03:07 adapter_config.json
-rw-r--r-- 1 root root 69782384 Dec  8 03:07 adapter_model.safetensors
-rw-r--r-- 1 root root      707 Dec  8 03:07 added_tokens.json
-rw-r--r-- 1 root root     4168 Dec  8 03:07 chat_template.jinja
-rw-r--r-- 1 root root  1671853 Dec  8 03:07 merges.txt
-rw-r--r-- 1 root root     5197 Dec  8 03:07 README.md
-rw-r--r-- 1 root root      613 Dec  8 03:07 special_tokens_map.json
-rw-r--r-- 1 root root     5404 Dec  8 03:07 tokenizer_config.json
-rw-r--r-- 1 root root 11422654 Dec  8 03:07 tokenizer.json
-rw-r--r-- 1 root root     7505 Dec  8 03:07 training_args.bin
-rw-r--r-- 1 root root  2776833 Dec  8 03:07 vocab.json
✓ Model saved to Drive!


In [19]:
trainer.save_model('/content/backup')

In [21]:
# DIRECT DOWNLOAD - No Drive needed
import shutil

# Create zip of the model
src = "/content/srl_model_epoch1_emergency"
shutil.make_archive("/content/srl_model_epoch1", 'zip', src)

# Download directly to your Mac
from google.colab import files
files.download("/content/srl_model_epoch1.zip")
print("✓ Downloading to your Mac...")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✓ Downloading to your Mac...


In [None]:
#@title 8. (Optional) Merge LoRA and save full model
# Uncomment to merge LoRA weights into the base model for easier deployment

# MERGED_DIR = Path('/content/drive/MyDrive/SRL_Merged_Model')
# MERGED_DIR.mkdir(parents=True, exist_ok=True)

# model.save_pretrained_merged(
#     str(MERGED_DIR),
#     tokenizer,
#     save_method="merged_16bit",  # Full 16-bit merged model
# )
# print('Saved merged model to', MERGED_DIR)

In [12]:
from google.colab import drive

# Method 1: Flush and sync
drive.flush_and_unmount()


Drive not mounted, so nothing to flush and unmount.
