In [1]:
# Cell 1: Installation and Setup
# Step 1: Install PyTorch and ML libraries
# !pip install -q torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124
!pip install -q --upgrade transformers accelerate datasets trl bitsandbytes scipy deepspeed wandb

# Step 2: Verification and imports
import os
import json
import torch
import transformers
import trl
import logging
import warnings
import wandb
from dataclasses import dataclass, field, asdict
from typing import Optional
from datetime import datetime
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import HfApi, create_repo, upload_folder, login
from dotenv import load_dotenv

# Restrict CUDA to device index 0. Note this assignment runs after `import torch`.
# NOTE(review): presumably still effective because no CUDA call has been made
# yet at this point — confirm, or move the assignment above the imports.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Load variables from a .env file into the environment. WANDB_API_KEY and
# HF_WRITE_TOKEN are read via os.getenv further down (presumably supplied
# through that file — verify for your setup).
load_dotenv() 

# Configure logging
# FutureWarning messages are silenced; INFO-level records are emitted with a
# "timestamp - level - message" layout.
warnings.filterwarnings("ignore", category=FutureWarning)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Print versions and GPU info
print(f"✅ PyTorch: {torch.__version__}")
print(f"✅ Transformers: {transformers.__version__}")
print(f"✅ TRL: {trl.__version__}")
print(f"✅ CUDA: {torch.cuda.is_available()}")

# List every visible GPU with its name and total memory in GiB.
if torch.cuda.is_available():
    gpu_count = torch.cuda.device_count()
    print(f"✅ GPUs detected: {gpu_count}")
    for i in range(gpu_count):
        props = torch.cuda.get_device_properties(i)
        print(f"  GPU {i}: {props.name} ({props.total_memory / 1024**3:.1f}GB)")

# Step 3: Login to services
# Weights & Biases: log in with the key from the environment (forcing a fresh
# login), then print the username of the authenticated account as a sanity check.
wandb.login(key=os.getenv("WANDB_API_KEY"), relogin=True)
wb_api  = wandb.Api()        
wb_user = wb_api.viewer
print(wb_user.username)

# Hugging Face Hub: log in with the write token and print only the "name" and
# "auth" entries of the whoami response (whichever of the two are present).
login(token=os.getenv("HF_WRITE_TOKEN"))
hf_info = HfApi().whoami(token=os.getenv("HF_WRITE_TOKEN"))
print(json.dumps({k: hf_info[k] for k in ("name", "auth") if k in hf_info}, indent=2))

# Reminder for the reader after the install step above.
print("🔄 Please restart kernel and continue with configuration")


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/jaeh8nkim/.netrc


✅ PyTorch: 2.6.0+cu124
✅ Transformers: 4.53.0
✅ TRL: 0.19.0
✅ CUDA: True
✅ GPUs detected: 1
  GPU 0: NVIDIA RTX A6000 (47.5GB)


[34m[1mwandb[0m: Currently logged in as: [33mjaeh8nkim[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


jaeh8nkim
{
  "name": "jaeh8nkim",
  "auth": {
    "type": "access_token",
    "accessToken": {
      "displayName": "notebooks",
      "role": "write",
      "createdAt": "2025-06-29T15:24:25.718Z"
    }
  }
}
🔄 Please restart kernel and continue with configuration


In [None]:
# Cell 2: Training Configuration

# Canonical identifiers for the base model and the fine-tuned (SFT-ed) model.
# For each model: *_NAME is the bare model name, *_REMOTE_PATH is the
# Hugging Face Hub repo id, and *_LOCAL_PATH is the folder name used for the
# local copy written by the download step.
BASE_MODEL_NAME = "Qwen3-0.6B"
BASE_MODEL_REMOTE_PATH = f"Qwen/{BASE_MODEL_NAME}"
BASE_MODEL_LOCAL_PATH = f"{BASE_MODEL_NAME}-local"

# Alternative output model, kept for reference:
# SFTD_MODEL_NAME = "s1K-Distill-Qwen3-0.6B" + "-250710"
# SFTD_MODEL_REMOTE_PATH = "jaeh8nkim/" + SFTD_MODEL_NAME
# SFTD_MODEL_LOCAL_PATH = SFTD_MODEL_NAME + "-local"

# Output model name = descriptive stem + run tag, joined with a hyphen.
SFTD_MODEL_NAME = "-".join(("s1K4Q3p6B-Distill-Qwen3-0.6B", "250710"))
SFTD_MODEL_REMOTE_PATH = f"jaeh8nkim/{SFTD_MODEL_NAME}"
SFTD_MODEL_LOCAL_PATH = f"{SFTD_MODEL_NAME}-local"

# Training dataset repo id on the Hub (alternative kept for reference).
# DATASET_REMOTE_PATH = "simplescaling/s1K-1.1_tokenized"
DATASET_REMOTE_PATH = "jaeh8nkim/s1K-for-Qwen3-0.6B"

@dataclass
class TrainingConfig:
    """Settings shared by the training cells.

    Defaults are taken from the module-level model and dataset constants.
    Creating an instance also exports the Weights & Biases project and entity
    to the process environment (see ``__post_init__``).
    """
    # Checkpoint the model and tokenizer are loaded from.
    base_model_name: str = BASE_MODEL_REMOTE_PATH
    # Name intended for the fine-tuned output model.
    output_model_name: str = SFTD_MODEL_NAME
    # Maximum sequence length handed to the SFT configuration.
    block_size: int = 8192
    wandb_project: Optional[str] = SFTD_MODEL_NAME
    wandb_entity: Optional[str] = "jaeh8nkim"
    # Dataset identifier passed to `load_dataset`.
    train_file_path: Optional[str] = DATASET_REMOTE_PATH
    # Not referenced by the cells visible in this notebook.
    dagger: bool = False

    def __post_init__(self):
        """Export each non-empty W&B setting as its environment variable."""
        wandb_env = (
            ("WANDB_PROJECT", self.wandb_project),
            ("WANDB_ENTITY", self.wandb_entity),
        )
        for env_key, env_value in wandb_env:
            if env_value:
                os.environ[env_key] = env_value

# Initialise the shared configuration object used by all later cells
config = TrainingConfig()

# Echo the key settings in a single write so they can be checked at a glance.
print(
    "Training Configuration:\n"
    f"  Base model (loading from): {config.base_model_name}\n"
    f"  Output model (saving to): {config.output_model_name}\n"
    f"  Block size: {config.block_size}\n"
    f"  Dataset: {config.train_file_path}"
)

Training Configuration:
  Base model (loading from): Qwen/Qwen3-0.6B
  Output model (saving to): s1-4q36-qwen3-0.6b
  Block size: 8192
  Dataset: jaeh8nkim/s1K_for_Qwen3-0.6B


In [3]:
# Cell 3: Model and Dataset Loading (FIXED)
def load_model_and_tokenizer():
    """Load the causal LM and tokenizer named by ``config.base_model_name``.

    The model is loaded in bfloat16. If the tokenizer has no pad token, the
    EOS token is used for padding.

    Returns:
        tuple: ``(model, tokenizer, instruction_template, response_template)``.
        The two templates are the marker strings for the start of a user turn
        and of an assistant turn; the training cell hands them to the
        completion-only data collator.
    """
    checkpoint = config.base_model_name
    logging.info(f"Loading model: {checkpoint}")

    lm = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
    )
    tok = AutoTokenizer.from_pretrained(
        checkpoint,
        use_fast=True,
        trust_remote_code=True,
    )

    # Fall back to EOS for padding when the tokenizer defines no pad token.
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    # Turn markers used to locate prompt and response spans.
    user_marker = "<|im_start|>user"
    assistant_marker = "<|im_start|>assistant\n"

    # Count all parameters and the trainable subset in one pass.
    n_total = 0
    n_trainable = 0
    for param in lm.parameters():
        size = param.numel()
        n_total += size
        if param.requires_grad:
            n_trainable += size
    print(f"✅ Model loaded: {n_total:,} total, {n_trainable:,} trainable parameters")

    return lm, tok, user_marker, assistant_marker

def load_s1_dataset(qwen3_style=True):
    """Load the dataset at ``config.train_file_path`` reduced to its 'text' column.

    Args:
        qwen3_style: When true, rewrite the reasoning markers in every example:
            '<|im_start|>think' becomes '<think>' and '<|im_start|>answer'
            becomes '</think>'.

    Returns:
        DatasetDict with 'train' and 'test' splits. If the source has no
        'test' split, the train split is reused as 'test'.
    """
    logging.info(f"Loading dataset: {config.train_file_path}")

    raw = load_dataset(config.train_file_path)
    print(f"✅ Dataset loaded: {len(raw['train'])} train samples")

    # Keep only the text column in each split.
    train_split = raw['train'].select_columns(['text'])
    if 'test' in raw:
        test_split = raw['test'].select_columns(['text'])
    else:
        test_split = train_split
    splits = DatasetDict(train=train_split, test=test_split)

    if not qwen3_style:
        return splits

    # (old marker, new marker) pairs, applied in this order.
    marker_swaps = (
        ('<|im_start|>think', '<think>'),
        ('<|im_start|>answer', '</think>'),
    )

    def swap_tokens(example):
        text = example['text']
        for old_marker, new_marker in marker_swaps:
            text = text.replace(old_marker, new_marker)
        example['text'] = text
        return example

    splits = splits.map(swap_tokens)
    print("✅ Applied Qwen3 token style conversion")
    return splits

# Load model and data
# These module-level names (model, tokenizer, instruction_template,
# response_template, dataset) are read directly by train_model() in the
# training cell, so this cell must run before it.
model, tokenizer, instruction_template, response_template = load_model_and_tokenizer()
dataset = load_s1_dataset(qwen3_style=True)

2025-07-01 07:37:13,961 - INFO - Loading model: Qwen/Qwen3-0.6B
2025-07-01 07:37:16,220 - INFO - Loading dataset: jaeh8nkim/s1K_for_Qwen3-0.6B


✅ Model loaded: 596,049,920 total, 596,049,920 trainable parameters
✅ Dataset loaded: 157 train samples


Map:   0%|          | 0/157 [00:00<?, ? examples/s]

✅ Applied Qwen3 token style conversion


In [4]:
# Cell 4: Training Configuration and Execution
def create_sft_config(dataset_size: Optional[int] = None):
    """Create the SFTConfig with the S1 hyperparameters.

    Args:
        dataset_size: Number of training examples, used only for the printed
            step estimate. When None (the default), the dataset at
            ``config.train_file_path`` is loaded to count its 'train' split.

    Returns:
        trl.SFTConfig: Training arguments. The output directory, logging
        directory and run name are derived from ``config.output_model_name``.
    """
    # Only load the dataset when the caller did not already supply its size.
    if dataset_size is None:
        dataset_size = len(load_dataset(config.train_file_path)['train'])

    # S1 hyperparameters
    lr = 1e-5
    epochs = 5
    weight_decay = 1e-4
    micro_batch_size = 1
    gradient_accumulation_steps = 16
    warmup_ratio = 0.05

    # Local estimate of the schedule, for display only: none of these step
    # counts are passed to SFTConfig, which receives epochs and warmup_ratio.
    effective_batch_size = micro_batch_size * gradient_accumulation_steps
    steps_per_epoch = dataset_size // effective_batch_size
    total_steps = steps_per_epoch * epochs
    warmup_steps = int(total_steps * warmup_ratio)

    print(f"🎯 Detected dataset size: {dataset_size}")
    print(f"Training setup: {effective_batch_size} batch size, {total_steps} steps, {warmup_steps} warmup")

    # Derive every output location from the config so that overriding
    # `output_model_name` actually changes where the model is written.
    output_name = config.output_model_name
    output_dir = f"./{output_name}"

    return trl.SFTConfig(
        # Model and data
        # `max_length` is the current name of the deprecated `max_seq_length`.
        max_length=config.block_size,
        dataset_text_field='text',

        # Training schedule
        num_train_epochs=epochs,
        per_device_train_batch_size=micro_batch_size,
        per_device_eval_batch_size=micro_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,

        # Optimizer
        learning_rate=lr,
        weight_decay=weight_decay,
        adam_beta1=0.9,
        adam_beta2=0.95,
        warmup_ratio=warmup_ratio,
        lr_scheduler_type="cosine",

        # Precision and memory
        bf16=True,
        gradient_checkpointing=True,
        dataloader_pin_memory=True,
        dataloader_num_workers=0,

        # Evaluation and logging
        eval_strategy="no",
        save_strategy="no",
        logging_steps=1,

        # Output
        output_dir=output_dir,
        logging_dir=f"{output_dir}/logs",
        save_only_model=True,

        # Data handling
        remove_unused_columns=True,
        report_to=["wandb"] if config.wandb_project else [],
        run_name=output_name,
        dataloader_drop_last=True,
        packing=False,
    )

def train_model():
    """Run supervised fine-tuning and save the resulting model and tokenizer.

    Reads the module-level ``model``, ``tokenizer``, ``dataset``,
    ``instruction_template`` and ``response_template`` created by the loading
    cell.

    Returns:
        tuple: ``(trainer, sft_args)`` — the SFTTrainer after training and the
        SFTConfig it was built with.
    """
    sft_args = create_sft_config()

    # Collator built from the instruction/response marker strings.
    completion_collator = trl.DataCollatorForCompletionOnlyLM(
        instruction_template=instruction_template,
        response_template=response_template,
        tokenizer=tokenizer,
        mlm=False,
    )

    # Use the 'test' split for evaluation when present, else the train split.
    eval_split = dataset['test'] if 'test' in dataset else dataset['train']
    sft_trainer = trl.SFTTrainer(
        model=model,
        train_dataset=dataset['train'],
        eval_dataset=eval_split,
        args=sft_args,
        data_collator=completion_collator,
    )

    # Report allocated vs. total memory of GPU 0 before training starts.
    if torch.cuda.is_available():
        used_gb = torch.cuda.memory_allocated(0) / 1024**3
        capacity_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
        print(f"GPU memory before training: {used_gb:.1f}GB / {capacity_gb:.1f}GB")

    print("🚀 Starting training...")
    sft_trainer.train()

    # Write model and tokenizer side by side into the configured output dir.
    save_dir = sft_args.output_dir
    sft_trainer.save_model(output_dir=save_dir)
    tokenizer.save_pretrained(save_dir)

    print(f"✅ Training completed! Model saved to: {save_dir}")
    return sft_trainer, sft_args

# Execute training
# `sft_args` is kept at module level because the upload step reads
# `sft_args.output_dir` to find the saved model folder.
trainer, sft_args = train_model()

🎯 Detected dataset size: 157
Training setup: 16 batch size, 45 steps, 2 warmup


Adding EOS to train dataset:   0%|          | 0/157 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/157 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/157 [00:00<?, ? examples/s]

[2025-07-01 07:37:35,908] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)


2025-07-01 07:37:36,022 - INFO - /home/jaeh8nkim/equigranular/.conda/bin/x86_64-conda-linux-gnu-cc -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /home/jaeh8nkim/equigranular/.conda/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /home/jaeh8nkim/equigranular/.conda/include -fPIC -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /home/jaeh8nkim/equigranular/.conda/include -c /tmp/tmpftlcyqz_/test.c -o /tmp/tmpftlcyqz_/test.o
2025-07-01 07:37:36,065 - INFO - /home/jaeh8nkim/equigranular/.conda/bin/x86_64-conda-linux-gnu-cc -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,-rpath,/home/jaeh8nkim/equigranular/.conda/lib -Wl,-rpath-link,/home/jaeh8nkim/equigranular/.conda/lib -L/home/jaeh8nkim/equigranular/.conda/lib /tmp/tmpftlcyqz_/test.o -laio -o /tmp/tmpftlcyqz_/a.out
/home

[2025-07-01 07:37:37,424] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
GPU memory before training: 1.1GB / 47.5GB
🚀 Starting training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
1,0.5238
2,0.492
3,0.5331
4,0.4928
5,0.512
6,0.5238
7,0.4726
8,0.3993
9,0.4958
10,0.4552


2025-07-01 07:39:14,145 - INFO - The used dataset had no length, returning gathered tensors. You should drop the remainder yourself.
2025-07-01 07:39:14,245 - INFO - The used dataset had no length, returning gathered tensors. You should drop the remainder yourself.
2025-07-01 07:39:14,246 - INFO - The used dataset had no length, returning gathered tensors. You should drop the remainder yourself.
2025-07-01 07:39:14,678 - INFO - The used dataset had no length, returning gathered tensors. You should drop the remainder yourself.
2025-07-01 07:39:14,781 - INFO - The used dataset had no length, returning gathered tensors. You should drop the remainder yourself.
2025-07-01 07:39:14,782 - INFO - The used dataset had no length, returning gathered tensors. You should drop the remainder yourself.
2025-07-01 07:39:15,218 - INFO - The used dataset had no length, returning gathered tensors. You should drop the remainder yourself.
2025-07-01 07:39:15,280 - INFO - The used dataset had no length, retu

✅ Training completed! Model saved to: ./s1-4q36-qwen3-0.6b


In [5]:
# Cell 5: Model Upload and Download

def setup_sft_repository():
    """Ensure the Hub model repository for the fine-tuned model exists.

    The repository id is ``SFTD_MODEL_REMOTE_PATH``; it is created as a public
    model repo, and an already existing repo is accepted.

    Returns:
        The repository id on success, or None if any exception was raised
        (the exception is printed, not re-raised).
    """
    target_repo = SFTD_MODEL_REMOTE_PATH

    try:
        create_repo(repo_id=target_repo, repo_type="model", private=False, exist_ok=True)
        print(f"✅ Repository ready: https://huggingface.co/{target_repo}")
        ready_repo = target_repo
    except Exception as err:
        print(f"⚠️ Repository setup issue: {err}")
        ready_repo = None

    return ready_repo

def upload_sft_model(repo_id):
    """Upload the saved fine-tuned model folder to the Hub repository.

    The folder uploaded is ``sft_args.output_dir`` (module-level, set by the
    training cell).

    Args:
        repo_id: Target Hub repository id. A falsy value (None or "") means
            repository setup did not succeed; the upload is then skipped.

    Returns:
        bool: True if the upload call completed, False if it was skipped or
        raised. Failures are printed rather than re-raised so the notebook
        keeps running; callers can use the return value to react.
    """
    # Make the skip visible instead of silently doing nothing.
    if not repo_id:
        print("⚠️ Upload skipped: no repository id available")
        return False

    try:
        print(f"📤 Uploading SFT-ed model to {repo_id}...")
        upload_folder(
            folder_path=sft_args.output_dir,
            repo_id=repo_id,
            repo_type="model",
            commit_message="Upload S1-faithful fine-tuned Qwen3-0.6B"
        )
        print("✅ SFT-ed model uploaded successfully!")
        return True
    except Exception as e:
        print(f"❌ Upload failed: {e}")
        return False

def download_model(repo_name, local_folder_name, description):
    """Load a model and tokenizer from ``repo_name`` and save a local copy.

    Args:
        repo_name: Identifier passed to ``from_pretrained`` for both the model
            and the tokenizer.
        local_folder_name: Folder name (relative to the working directory)
            where both are saved.
        description: Human-readable label used in the progress messages.

    Returns:
        dict with keys "model", "tokenizer" and "path" on success, or None if
        any exception was raised (the exception is printed, not re-raised).
    """
    print(f"📥 Downloading {description} from {repo_name}...")

    try:
        loaded_model = AutoModelForCausalLM.from_pretrained(
            repo_name,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
        )
        loaded_tokenizer = AutoTokenizer.from_pretrained(
            repo_name,
            trust_remote_code=True,
        )

        # Persist model first, then tokenizer, into the same local folder.
        target_dir = f"./{local_folder_name}"
        for artifact in (loaded_model, loaded_tokenizer):
            artifact.save_pretrained(target_dir)

        print(f"✅ {description} ready at {target_dir}")

        result = {
            "model": loaded_model,
            "tokenizer": loaded_tokenizer,
            "path": target_dir,
        }
    except Exception as err:
        print(f"❌ Download failed for {description}: {err}")
        result = None

    return result

# Execute model management
# Step 1 creates (or reuses) the Hub repo and uploads the saved model folder.
# Both helpers print failures instead of raising, so execution continues to
# Step 2 even if the upload did not succeed.
print("🚀 Step 1: Setting up repository and uploading SFT-ed model...")
sft_repo_id = setup_sft_repository()
upload_sft_model(sft_repo_id)

print("\n🚀 Step 2: Downloading both models for local inference...")

# Download the official base model.
# download_model returns a dict with "model", "tokenizer" and "path",
# or None when the download failed.
base_model = download_model(
    repo_name=BASE_MODEL_REMOTE_PATH,
    local_folder_name=BASE_MODEL_LOCAL_PATH,
    description="Official base Qwen3-0.6B",
)

# Download the fine-tuned model back from the Hub repo it was uploaded to.
# NOTE(review): if the upload above failed, this fetches whatever is currently
# published in that repo (or fails) — confirm the upload succeeded first.
sftd_model = download_model(
    repo_name=SFTD_MODEL_REMOTE_PATH,
    local_folder_name=SFTD_MODEL_LOCAL_PATH,
    description="SFT-ed model from HuggingFace",
)

🚀 Step 1: Setting up repository and uploading SFT-ed model...
✅ Repository ready: https://huggingface.co/jaeh8nkim/s1-4q36-qwen3-0.6b
📤 Uploading SFT-ed model to jaeh8nkim/s1-4q36-qwen3-0.6b...


training_args.bin:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

✅ SFT-ed model uploaded successfully!

🚀 Step 2: Downloading both models for local inference...
📥 Downloading Official base Qwen3-0.6B from Qwen/Qwen3-0.6B...
✅ Official base Qwen3-0.6B ready at ./vanilla-qwen3-0.6b-local
📥 Downloading SFT-ed model from HuggingFace from jaeh8nkim/s1-4q36-qwen3-0.6b...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

✅ SFT-ed model from HuggingFace ready at ./s1-4q36-qwen3-0.6b-local
