# Secure MLOps Pipeline for LLM Fine-Tuning

This notebook demonstrates a complete secure MLOps pipeline with:
- Secure model download from HuggingFace
- Container security with ECR vulnerability scanning
- SageMaker training with security controls
- Experiment tracking
- Performance threshold validation
- Model registry and versioning
- Secure endpoint deployment

## Setup and Configuration

In [None]:
# Install required packages
# The leading `!` is IPython/Jupyter syntax that runs the line in a shell.
# `-q` keeps pip's output quiet and `-r` reads the package list from the
# requirements file one directory above the notebook.
!pip install -q -r ../requirements.txt

In [None]:
import sys
# Add the parent directory to the import path so the `src.*` packages
# imported below can be resolved (presumably the notebook lives in a
# subdirectory of the project root — confirm against the repo layout).
sys.path.append('..')

import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.estimator import Estimator
from sagemaker.experiments.run import Run
import yaml
import json
from datetime import datetime

# Import custom modules
from src.model_management.secure_model_downloader import SecureModelDownloader
from src.security.ecr_manager import SecureECRManager
from src.deployment.model_registry import SecureModelRegistry
from src.deployment.deploy import SecureEndpointDeployer

# Reaching this line means every import above succeeded.
print('✓ All imports successful')

In [None]:
# --- AWS context -----------------------------------------------------------
# Later cells reuse these values for S3 paths, ECR, training and deployment.
session = sagemaker.Session()
region, role, bucket = (
    session.boto_region_name,
    get_execution_role(),  # Or specify your role ARN
    session.default_bucket(),
)

# --- Pipeline configuration ------------------------------------------------
# safe_load parses plain YAML data and does not construct arbitrary objects.
with open('../config/training_config.yaml', 'r') as training_file:
    training_config = yaml.safe_load(training_file)

with open('../config/security_config.yaml', 'r') as security_file:
    security_config = yaml.safe_load(security_file)

# Echo the resolved context so a wrong account/region is spotted early.
print(f'Region: {region}', f'Role: {role}', f'Bucket: {bucket}', sep='\n')

## Step 1: Securely Download Model from HuggingFace

In [None]:
# HuggingFace credentials: the token is stored in AWS Secrets Manager.
# Alternatively, set an environment variable before running this cell:
#   os.environ['HUGGINGFACE_TOKEN'] = 'your_token'
# NOTE(review): how SecureModelDownloader resolves the token is not visible
# in this notebook — confirm in src/model_management.

# Arguments for the download, kept together so they are easy to swap.
download_options = {
    'model_id': 'gpt2',  # Use a smaller model for demo
    'local_dir': './models/base-model',
    'cache_dir': './cache',
}

downloader = SecureModelDownloader()
model_path = downloader.download_model(**download_options)

print(f'✓ Model downloaded to: {model_path}')

In [None]:
# Destination prefix for the base model inside the session's default bucket
s3_model_path = f's3://{bucket}/models/base-model/'

# Push the downloaded model files to S3 with encryption requested
downloader.upload_to_s3(local_path=model_path, s3_uri=s3_model_path, encrypt=True)

print(f'✓ Model uploaded to: {s3_model_path}')

## Step 2: Build and Push Secure Container to ECR

In [None]:
# Name of the ECR repository that will hold the training image
repository_name = 'secure-mlops-training'

# ECR manager bound to the session's region
ecr_manager = SecureECRManager(region=region)

# Request the repository with scan-on-push and encryption switched on
repository_options = {
    'repository_name': repository_name,
    'scan_on_push': True,
    'enable_encryption': True,
}
repo = ecr_manager.create_secure_repository(**repository_options)

print(f"✓ Repository created: {repo['repositoryUri']}")

In [None]:
# Build and push image with vulnerability scanning
# Using the shell script for better control
# The command runs from the parent directory (`cd ..`). IPython substitutes
# {repository_name} with the Python variable of that name, and `v1.0` is
# passed as the second argument (the same tag is used later when querying
# scan results).
# NOTE(review): presumably the script writes `.ecr_image_uri`, which the
# following cell reads — confirm in scripts/build_and_push.sh.
!cd .. && bash scripts/build_and_push.sh {repository_name} v1.0

In [None]:
# Pick up the image URI from the file in the project root
with open('../.ecr_image_uri', 'r') as uri_file:
    image_uri = uri_file.read().strip()

print(f'Training image URI: {image_uri}')

# Query the scan findings for the v1.0 tag; report only when something
# truthy comes back from the manager.
scan_results = ecr_manager.get_scan_results(repository_name, 'v1.0')
if scan_results:
    print('\nVulnerability Scan Results:')
    severity_counts = scan_results.get('findingSeverityCounts', {})
    print(json.dumps(severity_counts, indent=2))

## Step 3: Prepare Training Data

In [None]:
# Demo data only: small slices of a public dataset.
# In production, you would upload your own data to S3.

from datasets import load_dataset

# Dataset name and configuration shared by both splits
wikitext_source = ('wikitext', 'wikitext-2-raw-v1')

# First 1000 training rows and first 100 validation rows
dataset = load_dataset(*wikitext_source, split='train[:1000]')
eval_dataset = load_dataset(*wikitext_source, split='validation[:100]')

# Persist both splits locally; these directories are synced to S3 afterwards
for split_data, target_dir in (
    (dataset, './data/train'),
    (eval_dataset, './data/validation'),
):
    split_data.save_to_disk(target_dir)

print(
    f'✓ Training samples: {len(dataset)}',
    f'✓ Validation samples: {len(eval_dataset)}',
    sep='\n',
)

In [None]:
# Upload data to S3 with encryption
import subprocess

s3_train_path = f's3://{bucket}/data/train/'
s3_eval_path = f's3://{bucket}/data/validation/'

# Sync each local split to its S3 prefix, requesting SSE-KMS encryption.
# check=True makes a non-zero exit from the AWS CLI raise CalledProcessError,
# so a failed upload stops the cell instead of falling through to the
# success message and surfacing later as a confusing training-job error.
for local_dir, s3_uri in (
    ('./data/train/', s3_train_path),
    ('./data/validation/', s3_eval_path),
):
    subprocess.run(
        ['aws', 's3', 'sync', local_dir, s3_uri, '--sse', 'aws:kms'],
        check=True,
    )

print('✓ Data uploaded to S3')
print(f'  Train: {s3_train_path}')
print(f'  Eval: {s3_eval_path}')

## Step 4: Run Secure Training Job with Experiment Tracking

In [None]:
# Experiment identity: a fixed experiment name plus a timestamped run name
experiment_name = 'secure-llm-finetuning'
launch_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
run_name = f'trial-{launch_stamp}'

# Training hyperparameters
hyperparameters = {
    # Model and optimisation settings
    'model_name': 'gpt2',
    'epochs': 1,  # Reduced for demo
    'batch_size': 4,
    'learning_rate': 2e-5,
    'max_seq_length': 512,

    # LoRA adapter settings
    'lora_r': 16,
    'lora_alpha': 32,

    # Data locations inside the training container; the last path component
    # matches the 'train' / 'validation' channel names passed to fit()
    'train_data': '/opt/ml/input/data/train',
    'eval_data': '/opt/ml/input/data/validation',

    # Performance limits forwarded to the training script
    'max_perplexity': 20.0,
    'max_eval_loss': 1.5,

    # Experiment tracking identifiers
    'experiment_name': experiment_name,
    'run_name': run_name,
}

print('Hyperparameters:', json.dumps(hyperparameters, indent=2), sep='\n')

In [None]:
# KMS key ids are read from the security config; .get() yields None when a
# key is not configured.
encryption_settings = security_config['security']['encryption']

# Tags attached to the training job
training_tags = [
    {'Key': 'Project', 'Value': 'SecureMLOps'},
    {'Key': 'Environment', 'Value': 'Development'},
]

# SageMaker Estimator with security settings
estimator = Estimator(
    # What to run, and under which identity/session
    image_uri=image_uri,
    role=role,
    sagemaker_session=session,
    hyperparameters=hyperparameters,

    # Compute: one GPU instance on spot capacity
    instance_count=1,
    instance_type='ml.g5.xlarge',
    use_spot_instances=True,
    max_run=3600,   # training time limit, in seconds
    max_wait=7200,  # overall limit including the wait for spot capacity, in seconds

    # Where the trained artifacts are written
    output_path=f's3://{bucket}/models/output/',

    # Security settings
    encrypt_inter_container_traffic=True,
    # Uncomment if using VPC:
    # subnets=security_config['security']['vpc']['subnets'],
    # security_group_ids=security_config['security']['vpc']['security_group_ids'],

    # Encryption for the training volume and the output artifacts
    volume_kms_key=encryption_settings.get('volume_kms_key_id'),
    output_kms_key=encryption_settings.get('s3_kms_key_id'),

    tags=training_tags,
)

print('✓ Estimator configured')

In [None]:
# Map the channel names to the S3 prefixes holding each split
input_channels = {
    'train': s3_train_path,
    'validation': s3_eval_path,
}

# Launch the training job inside an experiment Run context and block
# (wait=True) until the job has finished.
with Run(experiment_name=experiment_name, run_name=run_name, sagemaker_session=session) as run:
    estimator.fit(input_channels, wait=True)

print('✓ Training completed', f'Model artifacts: {estimator.model_data}', sep='\n')

## Step 5: Evaluate Model and Check Thresholds

In [None]:
# Download model artifacts for evaluation
import subprocess
import tarfile

# Download model.tar.gz.
# check=True raises CalledProcessError when the copy fails, so a stale
# ./model.tar.gz left over from an earlier run is never extracted and
# evaluated by mistake.
model_data = estimator.model_data
subprocess.run(['aws', 's3', 'cp', model_data, './model.tar.gz'], check=True)

# Extract.
# Where the interpreter supports extraction filters, use the 'data' filter:
# it refuses members that would be written outside the target directory and
# special files such as devices. Older interpreters without filter support
# fall back to the plain extractall() call.
extract_options = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}
with tarfile.open('./model.tar.gz', 'r:gz') as tar:
    tar.extractall('./trained_model', **extract_options)

print('✓ Model artifacts downloaded')

In [None]:
# Evaluate the trained model on the saved validation split
from src.training.evaluator import ModelEvaluator
from datasets import load_from_disk

# Evaluator pointed at the extracted artifacts directory
evaluator = ModelEvaluator('./trained_model')

# Reload the validation split written to disk during data preparation
eval_dataset = load_from_disk('./data/validation')

# Run evaluation
metrics = evaluator.evaluate_metrics(eval_dataset)

# Report every metric between 60-character rules
rule = '=' * 60
print('\n' + rule)
print('EVALUATION METRICS')
print(rule)
for metric_name, metric_value in metrics.items():
    print(f'{metric_name}: {metric_value}')
print(rule)

In [None]:
# Limits handed to the evaluator's threshold check (keys are named as maxima)
thresholds = {'perplexity_max': 20.0, 'eval_loss_max': 1.5}

passed, failures = evaluator.check_thresholds(metrics, thresholds)

# Quality gate: on failure, list every reported breach, mark the model as
# rejected and stop the notebook here by raising.
if not passed:
    print('\n✗ Model FAILED performance thresholds')
    for failure in failures:
        print(f'  - {failure}')
    approval_status = 'Rejected'
    raise ValueError('Model does not meet performance requirements')

# Only reached when the gate passed: the model still needs explicit approval.
print('\n✓ Model PASSED all performance thresholds')
approval_status = 'PendingManualApproval'

## Step 6: Register Model in Model Registry

In [None]:
# Group under which all versions of this model are registered
model_package_group_name = 'secure-llm-models'

# Registry client bound to the session's region
registry = SecureModelRegistry(region=region)

# Create the model package group
registry.create_model_package_group(
    group_name=model_package_group_name,
    description='Secure LLM models with performance validation',
)

print(f'✓ Model package group: {model_package_group_name}')

In [None]:
# Everything the registry needs for a new version: the trained artifacts,
# the image used for training, the evaluation metrics, and the approval
# status decided by the threshold check.
registration_request = {
    'model_package_group_name': model_package_group_name,
    'model_data_url': estimator.model_data,
    'image_uri': image_uri,
    'model_metrics': metrics,
    'approval_status': approval_status,
}

model_package_arn = registry.register_model(**registration_request)

print(
    f'✓ Model registered: {model_package_arn}',
    f'  Approval status: {approval_status}',
    sep='\n',
)

## Step 7: Review and Approve Model

In [None]:
# Fetch every registered version in the group
versions = registry.list_model_versions(model_package_group_name)

# One line per version; a missing 'version' key is shown as N/A
print('Model Versions:')
for entry in versions:
    version_label = entry.get('version', 'N/A')
    status = entry['status']
    created = entry['created']
    print(f"  Version {version_label}: {status} (created {created})")

In [None]:
# Approval is a manual step - in production this would be done by ML team.
# The demo approves only when the threshold check passed and records the
# measured perplexity in the approval description.

if not passed:
    print('✗ Model not approved - does not meet performance thresholds')
else:
    registry.update_approval_status(
        model_package_arn=model_package_arn,
        approval_status='Approved',
        approval_description=f'Model approved with perplexity={metrics["perplexity"]:.2f}',
    )
    print('✓ Model approved for deployment')

## Step 8: Deploy to SageMaker Endpoint

In [None]:
# Name of the endpoint to create
endpoint_name = 'secure-llm-endpoint'

# Tags attached to the deployment
deployment_tags = [
    {'Key': 'Project', 'Value': 'SecureMLOps'},
    {'Key': 'Environment', 'Value': 'Production'},
]

# Deployer configured from the shared security config file
deployer = SecureEndpointDeployer(
    security_config_path='../config/security_config.yaml',
    region=region,
)

# Deploy the registered model package to one GPU instance with monitoring
# and autoscaling switched on
endpoint_arn = deployer.deploy_model(
    model_package_arn=model_package_arn,
    endpoint_name=endpoint_name,
    instance_type='ml.g5.xlarge',
    instance_count=1,
    enable_monitoring=True,
    enable_autoscaling=True,
    tags=deployment_tags,
)

print(f'✓ Endpoint deployed: {endpoint_arn}')

## Step 9: Test Endpoint

In [None]:
# Smoke-test request: a short prompt plus generation parameters
generation_request = {
    'inputs': 'Once upon a time',
    'parameters': {
        'max_new_tokens': 50,
        'temperature': 0.7,
    },
}
test_payload = json.dumps(generation_request)

# Send the JSON payload to the deployed endpoint
result = deployer.invoke_endpoint(
    endpoint_name=endpoint_name,
    payload=test_payload,
    content_type='application/json',
)

print('\nEndpoint Response:', result, sep='\n')

## Summary

### Security Controls Implemented:

1. **Model Download Security**
   - Token-based authentication with HuggingFace
   - Credentials stored in AWS Secrets Manager
   - Model integrity verification
   - Audit logging

2. **Container Security**
   - ECR vulnerability scanning (basic + enhanced)
   - Immutable image tags
   - KMS encryption at rest
   - Non-root container execution

3. **Training Security**
   - VPC isolation (optional)
   - Encrypted inter-container traffic
   - KMS encryption for volumes and output
   - IAM least-privilege roles

4. **Model Governance**
   - Performance threshold validation
   - Model versioning in registry
   - Approval workflow
   - Experiment tracking

5. **Deployment Security**
   - VPC endpoint deployment (optional)
   - Data capture for monitoring
   - Auto-scaling
   - Encryption at rest and in transit

### Compliance Features:
- Complete audit trail in CloudWatch Logs
- Model lineage tracking
- Automated quality gates
- Regular vulnerability scanning
- Data encryption throughout pipeline

## Cleanup (Optional)

In [None]:
# Uncomment to delete endpoint and associated resources
# Left commented out on purpose so running the whole notebook does not
# tear down the endpoint created in the deployment step.
# NOTE(review): delete_config / delete_model presumably also remove the
# endpoint configuration and the model — confirm in SecureEndpointDeployer.
# deployer.delete_endpoint(
#     endpoint_name=endpoint_name,
#     delete_config=True,
#     delete_model=True
# )
# print('✓ Endpoint deleted')