# 🔒 Fine-Tune OLMo for Security Remediation

This notebook fine-tunes OLMo-2-1B on a WebAuthn security vulnerability dataset to improve its security remediation guidance.

**Prerequisites:**
1. Upload your fine-tuning dataset files to Google Drive:
   - `train_YYYYMMDD_HHMMSS.jsonl` (training data)
   - `validation_YYYYMMDD_HHMMSS.jsonl` (validation data)
2. Enable GPU runtime: Runtime → Change runtime type → T4 GPU
3. Generated dataset from: https://huggingface.co/datasets/hitoshura25/webauthn-security-vulnerabilities-olmo


In [None]:
# Install required packages
!pip install -q transformers datasets torch accelerate huggingface_hub

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# List available dataset files
print('📁 Available dataset files in Drive:')
!ls -la /content/drive/MyDrive/ | grep -E '(train_|validation_).*\.jsonl'

# Update these paths to match your uploaded files
TRAIN_FILE = '/content/drive/MyDrive/train_20250908_154659.jsonl'  # Update with your file
VAL_FILE = '/content/drive/MyDrive/validation_20250908_154659.jsonl'  # Update with your file

In [None]:
# Load dataset from JSONL files
import json
from datasets import Dataset, DatasetDict

def load_jsonl(file_path):
    """Read a JSONL file of prompt/completion records into text examples.

    Args:
        file_path: Path to a JSON Lines file. Every non-blank line must be a
            JSON object containing 'prompt' and 'completion' keys.

    Returns:
        A list with one dict per record, each holding a single 'text' key whose
        value is "Prompt: <prompt>\\n\\nCompletion: <completion>".

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the 'prompt' or 'completion' key.
    """
    data = []
    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at end of file) are not
            # records; json.loads would raise on them.
            if not line.strip():
                continue
            item = json.loads(line)
            # Convert to text format for language modeling
            text = f"Prompt: {item['prompt']}\n\nCompletion: {item['completion']}"
            data.append({'text': text})
    return data

# Read the training split and report how many examples it holds
print('📚 Loading training data...')
train_data = load_jsonl(TRAIN_FILE)
n_train = len(train_data)
print(f'Training examples: {n_train}')

# Same for the validation split
print('📖 Loading validation data...')
val_data = load_jsonl(VAL_FILE)
n_val = len(val_data)
print(f'Validation examples: {n_val}')

# Bundle both splits into one DatasetDict keyed by split name
train_split = Dataset.from_list(train_data)
validation_split = Dataset.from_list(val_data)
dataset = DatasetDict({'train': train_split, 'validation': validation_split})

print('✅ Dataset loaded successfully')
print('Sample training example:')
# Preview only the first 200 characters of the first training example
first_example_text = dataset['train'][0]['text']
print(first_example_text[:200] + '...')

In [None]:
# Load OLMo-2-1B model and tokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

print('🚀 Loading OLMo-2-1B model...')
# The 1B OLMo-2 base checkpoint is published on the Hub under this repo id.
model_name = "allenai/OLMo-2-0425-1B"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True
)

# Load the weights in float32. The training cell enables fp16 mixed precision
# (fp16=True), which needs full-precision master weights: with float16 weights
# the gradient scaler aborts with "Attempting to unscale FP16 gradients".
# NOTE: float32 weights need more GPU memory than float16; if the GPU runs out
# of memory, reduce the tokenization max_length.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float32,
    device_map='auto'
)

# Set pad token (fall back to EOS when the tokenizer does not define one)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print('✅ Model and tokenizer loaded successfully')
print(f'Model: {model_name}')
print(f'Parameters: ~1B')
print(f'Device: {model.device}')

In [None]:
# Tokenize dataset
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch with the notebook's tokenizer.

    Each sequence is truncated and padded to exactly 1024 tokens (increased
    for security analysis context).
    """
    encoding_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 1024,
    }
    texts = examples['text']
    return tokenizer(texts, **encoding_options)

print('🔧 Tokenizing dataset...')
# Keep only the tokenizer outputs: a leftover raw string column cannot be
# batched into tensors if it reaches the data collator.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['text']
)

print('✅ Tokenization complete')
print(f'Train samples: {len(tokenized_dataset["train"])}')
print(f'Validation samples: {len(tokenized_dataset["validation"])}')

In [None]:
# Fine-tune the model
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Training arguments optimized for Google Colab T4
training_args = TrainingArguments(
    output_dir='./olmo-security-finetuned',
    num_train_epochs=3,
    per_device_train_batch_size=1,  # Very small batch for T4 with 1B model
    gradient_accumulation_steps=8,   # Effective batch size of 8
    per_device_eval_batch_size=2,
    warmup_steps=50,
    weight_decay=0.01,
    logging_steps=10,
    save_steps=100,
    eval_steps=100,
    # Named `eval_strategy` in current transformers releases; the former
    # `evaluation_strategy` keyword is no longer accepted.
    eval_strategy='steps',
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    fp16=True,
    report_to='none',
    dataloader_pin_memory=False,  # Reduce memory usage
    # Let the Trainer drop columns the model does not accept (such as the raw
    # 'text' strings); the collator cannot convert strings into tensors.
    remove_unused_columns=True,
    learning_rate=5e-5,
    lr_scheduler_type='cosine'
)

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Causal language modeling
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation']
)

print('🚀 Starting fine-tuning...')
print(f'Training for {training_args.num_train_epochs} epochs')
print(f'Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}')

# Start training
trainer.train()
print('✅ Fine-tuning complete!')

In [None]:
# Save the fine-tuned model
print('💾 Saving fine-tuned model...')
trainer.save_model('./olmo-security-finetuned')
tokenizer.save_pretrained('./olmo-security-finetuned')

# Save to Google Drive
!cp -r ./olmo-security-finetuned /content/drive/MyDrive/
print('✅ Model saved to Google Drive at: /content/drive/MyDrive/olmo-security-finetuned')

# Create model info file
model_info = f"""
# OLMo Security Fine-tuned Model

**Base Model**: {model_name}
**Training Data**: WebAuthn Security Vulnerabilities Dataset
**Training Examples**: {len(tokenized_dataset['train'])}
**Validation Examples**: {len(tokenized_dataset['validation'])}
**Epochs**: {training_args.num_train_epochs}
**Fine-tuned**: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}

## Usage:
```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained('./olmo-security-finetuned')
tokenizer = AutoTokenizer.from_pretrained('./olmo-security-finetuned')

# Generate security remediation
prompt = "Analyze this security vulnerability and provide remediation guidance:"
inputs = tokenizer(prompt, return_tensors='pt')
outputs = model.generate(**inputs, max_new_tokens=200)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
"""

with open('/content/drive/MyDrive/olmo-security-finetuned/README.md', 'w') as f:
    f.write(model_info)

print('📝 Model documentation saved')

In [None]:
# Test the fine-tuned model
print('🧪 Testing fine-tuned model...')

test_prompts = [
    "Analyze this security vulnerability and provide remediation guidance:\n\nVulnerability ID: CKV_GHA_7",
    "Analyze this security vulnerability and provide remediation guidance:\n\nVulnerability ID: semgrep-rules.webauthn-credential-validation-bypass",
    "Analyze this security vulnerability and provide remediation guidance:\n\nVulnerability ID: kotlin.lang.security.gcm-detection"
]

# Switch off training-only behaviour (e.g. dropout) before generating.
model.eval()

for i, prompt in enumerate(test_prompts, 1):
    print(f"\n{'='*60}")
    # The text after the last ':' is the vulnerability ID; show it as the test label.
    print(f"🔍 Test {i}/{len(test_prompts)}: {prompt.split(':')[-1].strip()[:50]}...")
    print(f"{'='*60}")

    inputs = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=512)
    # The tokenizer returns CPU tensors; generate() needs them on the same
    # device as the model.
    inputs = inputs.to(model.device)
    prompt_token_count = inputs['input_ids'].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens. Slicing by token count is exact,
    # whereas slicing the decoded string by len(prompt) assumes the prompt
    # decodes back to identical characters.
    generated_tokens = outputs[0][prompt_token_count:]
    generated = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    print(f"**Prompt**: {prompt}")
    print(f"**Response**: {generated[:300]}{'...' if len(generated) > 300 else ''}")

print('\n✅ Testing complete! Your model is ready to generate security remediation guidance.')

In [None]:
# Optional: Push to Hugging Face Hub
print('🤗 Optional: Upload to Hugging Face Hub')
print('Copy the following code into a new cell and run it to upload your model:')
print()

# The snippet is only printed, never executed here. It pushes with
# model.push_to_hub(repo_id, ...): Trainer.push_to_hub takes a commit message
# as its first positional argument, not a repository id. A separate `repo_id`
# name is used so the notebook's `model_name` (the base model) is not overwritten.
upload_code = '''
# 1. Login to Hugging Face
from huggingface_hub import notebook_login
notebook_login()

# 2. Push model to your HF account
repo_id = "your-username/olmo-security-finetuned"  # Change this
model.push_to_hub(repo_id, private=True)
tokenizer.push_to_hub(repo_id, private=True)

print(f"✅ Model uploaded to: https://huggingface.co/{repo_id}")
'''

print(upload_code)

# Alternatively, you can download the model from Google Drive
print('\n💾 Your fine-tuned model is saved in Google Drive at:')
print('/content/drive/MyDrive/olmo-security-finetuned/')
print('\nYou can download it and use it locally or upload to any model hosting service.')