In [None]:
import wandb
import os

In [14]:
# 0.1 Setup WANDB for training metrics
os.environ["WANDB_PROJECT"] = "rlinv-sft-demo"  # Change to your desired project!
wandb.init(project=os.environ["WANDB_PROJECT"])

In [17]:
from huggingface_hub import notebook_login 
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

In [34]:
# 1. LOAD DATASET
from datasets import load_dataset


dataset = load_dataset("HuggingFaceH4/Multilingual-Thinking", split="train")
print(f"Total examples: {len(dataset)}")
print("Sample:", dataset[0])


Total examples: 1000
Sample: {'reasoning_language': 'French', 'developer': 'You are an AI chatbot with a lively and energetic personality.', 'user': 'Can you show me the latest trends on Twitter right now?', 'analysis': "D'accord, l'utilisateur demande les tendances Twitter les plus r√©centes. Tout d'abord, je dois v√©rifier si j'ai acc√®s √† des donn√©es en temps r√©el. √âtant donn√© que je ne peux pas naviguer sur Internet ou acc√©der directement √† l'API de Twitter, je ne peux pas fournir des tendances en direct. Cependant, je peux donner quelques conseils g√©n√©raux sur la fa√ßon de les trouver.\n\nJe devrais pr√©ciser que les tendances Twitter √©voluent rapidement et sont sp√©cifiques √† chaque r√©gion. Je pourrais sugg√©rer de consulter la section ¬´\xa0En vogue\xa0¬ª sur l'application ou le site web. Aussi, l'utilisation de hashtags et le suivi d'utilisateurs pertinents pourraient √™tre utiles. Il est important de souligner que les tendances varient selon la r√©gion et l'heure d

In [19]:
messages = dataset[0]['messages']
messages

[{'content': 'reasoning language: French\n\nYou are an AI chatbot with a lively and energetic personality.',
  'role': 'system',
  'thinking': None},
 {'content': 'Can you show me the latest trends on Twitter right now?',
  'role': 'user',
  'thinking': None},
 {'content': 'Hey there!  While I can\'t check Twitter (X) in real-time or access live data, I can share some tips to help you spot the latest trends:\n\n1. **Open the "Trending" tab** on the Twitter app or website ‚Äì it updates constantly!  \n2. **Search for hashtags** like #Trending or #Viral to see what‚Äôs blowing up.  \n3. **Follow accounts** that curate trends (e.g., @TrendingNow, @ViralThreads).  \n4. **Check regional trends** ‚Äì they often differ by location!  \n\nRemember, trends are *super fast-moving* and often tied to pop culture, memes, or breaking news. For example, recent trends have included viral challenges (like the "Distracted Boyfriend" meme revival), celebrity drama, or unexpected events (hello, weather dis

In [35]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-20b")
conversation = tokenizer.apply_chat_template(messages, tokenize=False)
print(conversation)  # Shows how dialogue is formatted


<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
Knowledge cutoff: 2024-06
Current date: 2025-11-09

Reasoning: medium

# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>developer<|message|># Instructions

reasoning language: French

You are an AI chatbot with a lively and energetic personality.

<|end|><|start|>user<|message|>Can you show me the latest trends on Twitter right now?<|end|><|start|>assistant<|channel|>analysis<|message|>D'accord, l'utilisateur demande les tendances Twitter les plus r√©centes. Tout d'abord, je dois v√©rifier si j'ai acc√®s √† des donn√©es en temps r√©el. √âtant donn√© que je ne peux pas naviguer sur Internet ou acc√©der directement √† l'API de Twitter, je ne peux pas fournir des tendances en direct. Cependant, je peux donner quelques conseils g√©n√©raux sur la fa√ßon de les trouver.

Je devrais pr√©ciser que les tendances Twitter √©voluent rapidement et sont sp√©ci

In [25]:
# MEMORY CLEANUP - Run this cell to free GPU memory and start clean
import gc
import torch

def clean_memory():
    """Comprehensive memory cleanup function"""
    # Delete common model variables if they exist
    variables_to_delete = ['model', 'peft_model', 'trainer', 'tokenizer', 'dataset']
    for var_name in variables_to_delete:
        if var_name in globals():
            del globals()[var_name]
            print(f"‚úì Deleted {var_name}")
    
    # Python garbage collection
    gc.collect()
    print("‚úì Python garbage collection completed")
    
    # CUDA cleanup
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()
        print("‚úì CUDA cache cleared and stats reset")
        
        # Print memory status
        allocated = torch.cuda.memory_allocated() / 1024**3
        reserved = torch.cuda.memory_reserved() / 1024**3
        print(f"\nüìä GPU Memory Status:")
        print(f"   Allocated: {allocated:.2f} GB")
        print(f"   Reserved: {reserved:.2f} GB")
        print(f"   Free: {torch.cuda.get_device_properties(0).total_memory / 1024**3 - reserved:.2f} GB")
    else:
        print("‚ö† CUDA not available")
    
    print("\n‚úÖ Memory cleanup complete! You can now start fresh.")

# Run cleanup
clean_memory()


‚úì Deleted model
‚úì Deleted peft_model
‚úì Deleted trainer
‚úì Deleted tokenizer
‚úì Deleted dataset
‚úì Python garbage collection completed
‚úì CUDA cache cleared and stats reset

üìä GPU Memory Status:
   Allocated: 1.91 GB
   Reserved: 45.79 GB
   Free: 94.01 GB

‚úÖ Memory cleanup complete! You can now start fresh.


In [27]:
# AGGRESSIVE MEMORY CLEANUP - Use if regular cleanup doesn't work
# This will try to delete all PyTorch/transformers objects

def aggressive_cleanup():
    """More aggressive cleanup that searches for large objects"""
    import gc
    import torch
    import sys
    
    # Get all variables in current namespace
    current_vars = list(globals().keys())
    
    # Delete any object that might be holding GPU memory
    deleted_count = 0
    for var_name in current_vars:
        if var_name.startswith('_'):
            continue
        try:
            obj = globals()[var_name]
            # Check if it's a model, tensor, or large object
            if hasattr(obj, 'cuda') or hasattr(obj, 'to'):
                del globals()[var_name]
                deleted_count += 1
                print(f"‚úì Deleted {var_name}")
        except:
            pass
    
    # Multiple rounds of garbage collection
    for i in range(3):
        collected = gc.collect()
        if collected > 0:
            print(f"‚úì GC round {i+1}: collected {collected} objects")
    
    # CUDA cleanup
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()
        
        allocated = torch.cuda.memory_allocated() / 1024**3
        reserved = torch.cuda.memory_reserved() / 1024**3
        print(f"\nüìä GPU Memory after aggressive cleanup:")
        print(f"   Allocated: {allocated:.2f} GB")
        print(f"   Reserved: {reserved:.2f} GB")
    
    print(f"\n‚úÖ Aggressive cleanup complete! Deleted {deleted_count} variables.")
    print("üí° If memory is still high, consider restarting the kernel (Kernel ‚Üí Restart)")

# Uncomment to run aggressive cleanup:
aggressive_cleanup()


‚úì Deleted torch
‚úì Deleted param
‚úì Deleted test_input
‚úì Deleted test_loss
‚úì GC round 1: collected 482 objects

üìä GPU Memory after aggressive cleanup:
   Allocated: 0.83 GB
   Reserved: 45.79 GB

‚úÖ Aggressive cleanup complete! Deleted 4 variables.
üí° If memory is still high, consider restarting the kernel (Kernel ‚Üí Restart)


In [None]:
# MEMORY STATUS CHECKER - Interpret your current GPU memory state
import torch

def check_memory_status():
    """Check and interpret current GPU memory status"""
    if not torch.cuda.is_available():
        print("‚ö† CUDA not available")
        return
    
    allocated = torch.cuda.memory_allocated() / 1024**3
    reserved = torch.cuda.memory_reserved() / 1024**3
    total = torch.cuda.get_device_properties(0).total_memory / 1024**3
    free = total - reserved
    
    print("=" * 60)
    print("üìä CURRENT GPU MEMORY STATUS")
    print("=" * 60)
    print(f"Total GPU Memory:     {total:.2f} GB")
    print(f"Reserved Memory:      {reserved:.2f} GB")
    print(f"Allocated Memory:     {allocated:.2f} GB")
    print(f"Available Memory:     {free:.2f} GB")
    print("=" * 60)
    
    # Interpretation
    print("\nüí° INTERPRETATION:")
    
    if allocated < 1:
        print("‚úÖ Excellent! Almost no memory allocated.")
    elif allocated < 10:
        print("‚úÖ Good! Low memory usage.")
    elif allocated < 50:
        print("‚ö†Ô∏è  Moderate memory usage.")
    else:
        print("‚ùå High memory usage - consider cleanup!")
    
    if reserved > total * 0.8:
        print("‚ö†Ô∏è  High reserved memory - may indicate fragmentation")
        print("   Consider kernel restart if training fails")
    elif reserved > total * 0.5:
        print("‚ö†Ô∏è  Moderate reserved memory - should be fine for training")
    else:
        print("‚úÖ Low reserved memory - good for training")
    
    if free > 50:
        print("‚úÖ Plenty of free memory available for training!")
    elif free > 20:
        print("‚ö†Ô∏è  Limited free memory - use memory optimizations")
    else:
        print("‚ùå Very little free memory - cleanup required!")
    
    print("\nüéØ RECOMMENDATION:")
    if free > 50 and allocated < 10:
        print("‚úÖ Ready to load model and start training!")
    elif free > 20:
        print("‚ö†Ô∏è  Can proceed, but use memory optimizations:")
        print("   - Reduce batch size")
        print("   - Enable gradient checkpointing")
        print("   - Reduce max_length")
    else:
        print("‚ùå Need more cleanup or kernel restart")
        print("   Run clean_memory() or restart kernel")

# Run check
check_memory_status()


In [28]:
import torch
torch.cuda.is_available()

True

In [29]:
# Check GPU usage
torch.cuda.memory_summary(device=None, abbreviated=False)
# 3. LOAD BASE MODEL WITH QUANTIZATION



In [30]:
# 3. LOAD BASE MODEL WITH QUANTIZATION
import torch
from transformers import AutoModelForCausalLM, Mxfp4Config

quant_config = Mxfp4Config(dequantize=True)
model_kwargs = dict(
    attn_implementation="eager",
    dtype=torch.bfloat16,
    quantization_config=quant_config,
    use_cache=False,
    device_map="auto",
)
model = AutoModelForCausalLM.from_pretrained("openai/gpt-oss-20b", **model_kwargs)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [31]:
# 4. WRAP WITH LORA (PEFT)
from peft import LoraConfig, get_peft_model

peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules="all-linear",
    target_parameters=[
        "7.mlp.experts.gate_up_proj",
        "7.mlp.experts.down_proj",
        "15.mlp.experts.gate_up_proj",
        "15.mlp.experts.down_proj",
        "23.mlp.experts.gate_up_proj",
        "23.mlp.experts.down_proj",
    ],
)
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

# Ensure model is in training mode
peft_model.train()

# Freeze base model parameters (should already be done by PEFT, but ensure it)
for name, param in peft_model.named_parameters():
    if 'lora' not in name.lower():
        param.requires_grad = False

# Verify trainable parameters have gradients enabled
trainable_params = [p for p in peft_model.parameters() if p.requires_grad]
print(f"Number of trainable parameters with requires_grad=True: {len(trainable_params)}")
if len(trainable_params) > 0:
    print(f"Sample trainable param requires_grad: {trainable_params[0].requires_grad}")

# Test forward pass to ensure computation graph is properly connected
# This helps identify if quantization is breaking the graph
try:
    test_input = tokenizer("Test input", return_tensors="pt").to(peft_model.device)
    with torch.enable_grad():
        test_output = peft_model(**test_input)
        if hasattr(test_output, 'logits'):
            test_loss = test_output.logits.sum()
            # Try a backward pass to verify gradients work
            test_loss.backward()
            print("‚úì Forward and backward pass test successful - gradients are working!")
            # Clear gradients after test
            peft_model.zero_grad()
except Exception as e:
    print(f"‚ö† Warning: Forward/backward test failed: {e}")
    print("This might indicate an issue with quantization compatibility.")


trainable params: 15,040,512 || all params: 20,929,797,696 || trainable%: 0.0719
Number of trainable parameters with requires_grad=True: 204
Sample trainable param requires_grad: True
This might indicate an issue with quantization compatibility.




In [None]:
# 5. CONFIGURE TRAINING (& use wandb)
from trl import SFTConfig



training_args = SFTConfig(
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    num_train_epochs=1,
    logging_steps=1,
    max_length=2048,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine_with_min_lr",
    lr_scheduler_kwargs={"min_lr_rate": 0.1},
    output_dir="gpt-oss-20b-multilingual-reasoner",
    report_to="wandb",  # ‚Üê use wandb
    push_to_hub=False,  # ‚Üê DO NOT push to hub
)

In [36]:
# 6. START TRAINING
from trl import SFTTrainer


trainer = SFTTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=dataset,
    processing_class=tokenizer,
)
trainer.train()


The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 199998}.


Step,Training Loss
1,1.9809
2,2.0533
3,1.8119
4,1.8299
5,1.6146
6,1.5726
7,1.4115
8,1.411
9,1.2115
10,1.3222


TrainOutput(global_step=63, training_loss=1.1419532696406047, metrics={'train_runtime': 653.271, 'train_samples_per_second': 1.531, 'train_steps_per_second': 0.096, 'total_flos': 1.995467289064105e+17, 'train_loss': 1.1419532696406047, 'epoch': 1.0})

In [37]:
# 7. SAVE MODEL LOCALLY (no pushing to hub)
trainer.save_model(training_args.output_dir)

del trainer
del peft_model
del model

In [38]:
dataset

Dataset({
    features: ['reasoning_language', 'developer', 'user', 'analysis', 'final', 'messages'],
    num_rows: 1000
})

In [45]:
import json

with open("dataset/training/train_format.jsonl", "r") as f:
    train_format = json.load(f)
    
train_format

{'messages': [{'role': 'developer',
   'content': 'You are a helpful assistant and an expert C programmer.'},
  {'role': 'user',
   'content': 'Generate a strong loop invariant that helps prove the target property of the following C program: \n```c\n{program}\n```\n\nAvailable locations for placing the invariant:\n{locations}\n\nOutput Format:\nassert(<invariant>); // Line <line_number>'},
  {'role': 'assistant',
   'content': 'assert(<invariant>); // Line <line_number>'}]}

In [50]:
conversation = tokenizer.apply_chat_template(train_format['messages'], tokenize=False)
print(conversation)  # Shows how dialogue is formatted

<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
Knowledge cutoff: 2024-06
Current date: 2025-11-09

Reasoning: medium

# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>developer<|message|># Instructions

You are a helpful assistant and an expert C programmer.

<|end|><|start|>user<|message|>Generate a strong loop invariant that helps prove the target property of the following C program: 
```c
{program}
```

Available locations for placing the invariant:
{locations}

Output Format:
assert(<invariant>); // Line <line_number><|end|><|start|>assistant<|channel|>final<|message|>assert(<invariant>); // Line <line_number><|return|>


In [51]:
len(conversation)

726

In [52]:
conversation

'<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.\nKnowledge cutoff: 2024-06\nCurrent date: 2025-11-09\n\nReasoning: medium\n\n# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>developer<|message|># Instructions\n\nYou are a helpful assistant and an expert C programmer.\n\n<|end|><|start|>user<|message|>Generate a strong loop invariant that helps prove the target property of the following C program: \n```c\n{program}\n```\n\nAvailable locations for placing the invariant:\n{locations}\n\nOutput Format:\nassert(<invariant>); // Line <line_number><|end|><|start|>assistant<|channel|>final<|message|>assert(<invariant>); // Line <line_number><|return|>'

In [54]:
# How to create a HuggingFace Dataset from train_format (dict loaded from JSON)?
from datasets import Dataset

# Here train_format["messages"] is a list of turn dicts, but we want to keep the list-of-dicts per example.
# For 1 conversation example:

# If you have a list of conversations (list-of-dicts, each with "messages" field), you can do:
# dataset = Dataset.from_list(list_of_conversations)
conversations = [{'messages': train_format['messages']}]
my_dataset = Dataset.from_list(conversations)
my_dataset

Dataset({
    features: ['messages'],
    num_rows: 1
})

In [55]:
my_dataset[0]

{'messages': [{'content': 'You are a helpful assistant and an expert C programmer.',
   'role': 'developer'},
  {'content': 'Generate a strong loop invariant that helps prove the target property of the following C program: \n```c\n{program}\n```\n\nAvailable locations for placing the invariant:\n{locations}\n\nOutput Format:\nassert(<invariant>); // Line <line_number>',
   'role': 'user'},
  {'content': 'assert(<invariant>); // Line <line_number>',
   'role': 'assistant'}]}

In [66]:
import os
import json

from datasets import Dataset

# Paths
training_folder = "dataset/training"
programs_path = os.path.join(training_folder, "Programs")

invariants_path = os.path.join(training_folder, "invariants.json")

In [76]:
# Load invariants.json
with open(invariants_path, 'r') as f:
    invariants_data = json.load(f)

# Get first 5 file names
files = list(invariants_data.keys())[:5]

# Build list of dicts for each file: {'code', 'invariant', 'line'}
data  = []
for filename in files:
    # Try to read code
    program_path = os.path.join(programs_path, filename)
    try:
        with open(program_path, 'r') as code_file:
            code = code_file.read()
    except FileNotFoundError:
        code = None  # or handle otherwise

    # Get invariants for this file (may be a list)
    invariants = invariants_data[filename]
    # For each invariant/line pair, make a dict, but include the code each time
    for inv in invariants:
        data.append({
            'code': code,
            'invariant': inv.get('invariant'),
            'line': inv.get('line'),
            'file': filename
        })

# Now, first_5_data is a list of dicts, one per (filename, invariant/line).


In [77]:
data

[{'code': '#include <assert.h>\nvoid reach_error(void) { assert(0); }\n\nextern int __VERIFIER_nondet_int(void);\nextern _Bool __VERIFIER_nondet_bool(void);\n\nvoid __VERIFIER_assert(int cond) {\n    if (!cond) {\n        reach_error();\n    }\n}\n\n/* Custom CFG:\nnames=i count isPositive\nbeforeloop=\nbeforeloopinit=\nprecondition=i==0 && count==0 && isPositive==1\nloopcondition=i<200 && isPositive\nloop=count += i; i += 2; isPositive = (i % 5 != 0);\npostcondition=count >= 0\nafterloop=\nlearners= conj\n*/\nint main() {\n    int i = __VERIFIER_nondet_int();\n    int count = __VERIFIER_nondet_int();\n    _Bool isPositive = __VERIFIER_nondet_bool();\n\n    if (!(i == 0 && count == 0 && isPositive == 1)) {\n        return 0;\n    }\n\n    while (i < 200 && isPositive) {\n        count += i;\n        i += 2;\n        isPositive = (i % 5 != 0);\n    }\n\n    __VERIFIER_assert(count >= 0);\n    return 0;\n}',
  'invariant': '0 <= count && 0 <= i',
  'line': 33,
  'file': '1003_1.c'},
 {'c

In [None]:
developer_msg = "You are a helpful assistant and an expert C programmer."
user_msg = "Generate a strong loop invariant that helps prove the target property of the following C program: \n```c\n{program}\n```\n\nAvailable locations for placing the invariant:\n{locations}\n\nOutput Format:\nassert(<invariant>); // Line <line_number>"
response_msg = "assert({invariant}); // Line {line_number}"
samples = []
for example in data:
    sample = {
        "messages": [
            {"role": "system", "content": developer_msg},
            {"role": "user", "content": user_msg.format(program=example['code'], locations=example['line'])},
            {"role": "assistant", "content": response_msg.format(invariant=example['invariant'], line_number=example['line'])}
        ]
    }
    samples.append(sample)
print(samples[0])

{'messages': [{'role': 'system', 'content': 'You are a helpful assistant and an expert C programmer.'}, {'role': 'user', 'content': 'Generate a strong loop invariant that helps prove the target property of the following C program: \n```c\n#include <assert.h>\nvoid reach_error(void) { assert(0); }\n\nextern int __VERIFIER_nondet_int(void);\nextern _Bool __VERIFIER_nondet_bool(void);\n\nvoid __VERIFIER_assert(int cond) {\n    if (!cond) {\n        reach_error();\n    }\n}\n\n/* Custom CFG:\nnames=i count isPositive\nbeforeloop=\nbeforeloopinit=\nprecondition=i==0 && count==0 && isPositive==1\nloopcondition=i<200 && isPositive\nloop=count += i; i += 2; isPositive = (i % 5 != 0);\npostcondition=count >= 0\nafterloop=\nlearners= conj\n*/\nint main() {\n    int i = __VERIFIER_nondet_int();\n    int count = __VERIFIER_nondet_int();\n    _Bool isPositive = __VERIFIER_nondet_bool();\n\n    if (!(i == 0 && count == 0 && isPositive == 1)) {\n        return 0;\n    }\n\n    while (i < 200 && isPos