# AILib Tutorial 12: Safety and Moderation

Learn how to build safe and responsible AI applications with AILib's built-in safety features:

- Content moderation and filtering
- Rate limiting to prevent abuse
- Custom safety rules
- Integration with OpenAI's moderation API
- Safety hooks for LLM clients

## Setup

In [None]:
from ailib.safety import (
    enable_safety, disable_safety, check_content,
    set_rate_limit, check_rate_limit, reset_rate_limit,
    with_moderation, add_custom_filter
)
from ailib import create_agent, create_chain, OpenAIClient
import os

# Make sure your OpenAI API key is set; uncomment the next line if it is not already in your environment
# os.environ['OPENAI_API_KEY'] = 'your-key'

## Basic Content Safety

In [None]:
# Turn on the global safety layer: harmful-content blocking, a length cap,
# and a short list of blocked words.
MAX_INPUT_LENGTH = 1000
enable_safety(
    block_harmful=True,
    max_length=MAX_INPUT_LENGTH,
    blocked_words=["violence", "hate", "self-harm"],
)

# First probe: an ordinary greeting
greeting = "Hello, how can I help you today?"
greeting_ok, greeting_issues = check_content(greeting)
print(f"Text: '{greeting}'")
print(f"Safe: {greeting_ok}, Violations: {greeting_issues}")

# Second probe: twice the configured max_length, intended to trip the length check
oversized = "a" * (2 * MAX_INPUT_LENGTH)
oversized_ok, oversized_issues = check_content(oversized)
print(f"\nLong text safe: {oversized_ok}")
print(f"Violations: {oversized_issues}")

## Custom Safety Filters

In [None]:
# Add custom filters for your specific needs
import re

# Register regex-based filters. Each one is given a name, a pattern, and a
# message (presumably reported as the violation on a match - confirm against
# the ailib.safety documentation).

# Phone numbers written as 3-3-4 digits, optionally separated by "-" or "."
add_custom_filter(
    name="phone_filter",
    pattern=r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
    message="Phone numbers are not allowed"
)

# Email addresses. The TLD class is [A-Za-z]: inside a character class "|" is
# a literal character, not alternation, so it must not appear there.
add_custom_filter(
    name="email_filter",
    pattern=r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
    message="Email addresses are not allowed"
)

# Try the filters on one sample per filter plus a plain control message
filter_samples = (
    "Call me at 555-123-4567",
    "Email me at user@example.com",
    "This is a safe message",
)

for sample in filter_samples:
    sample_ok, sample_issues = check_content(sample)
    print(f"\nText: '{sample}'")
    print(f"Safe: {sample_ok}, Violations: {sample_issues}")

## OpenAI Moderation Integration

In [None]:
# Get moderation hooks for OpenAI.
# The key is read from the environment; os.environ.get() yields None when the
# variable is unset.
pre_hook, post_hook = with_moderation(
    api_key=os.environ.get("OPENAI_API_KEY")
)

# Use with an LLM client (created for illustration only - the hooks are not
# attached to it in this cell)
client = OpenAIClient()

# Add hooks to client (if supported)
# Note: This is a demonstration - actual implementation may vary
print("Moderation hooks created:")
# Plain string literals: these messages contain no placeholders, so no f-prefix
print("- Pre-hook: Checks input before sending to LLM")
print("- Post-hook: Checks LLM output before returning")

# Example of how it would work:
def safe_completion(prompt: str) -> str:
    """Run a mock completion guarded by the moderation hooks.

    The prompt is passed to ``pre_hook`` and the canned reply to ``post_hook``.
    If a hook raises, a string starting with "Input blocked: " or
    "Output blocked: " (followed by the exception text) is returned instead of
    the reply.
    """
    # Gate 1: the user's prompt
    try:
        pre_hook(prompt)
    except Exception as input_error:
        return f"Input blocked: {input_error}"

    # Stand-in for a real LLM call
    mock_reply = "This is a mock response"

    # Gate 2: the generated text
    try:
        post_hook(mock_reply)
    except Exception as output_error:
        outcome = f"Output blocked: {output_error}"
    else:
        outcome = mock_reply
    return outcome

# Send a benign prompt through the guarded completion
story_result = safe_completion("Tell me a story")
print(f"\nResult: {story_result}")

## Rate Limiting

In [None]:
# Configure the limiter: 5 requests per 60-second window
set_rate_limit(max_requests=5, window_seconds=60)

# Request stream: user1, user2, then five more requests from user1
request_stream = ["user1", "user2"] + ["user1"] * 5

for request_no, requester in enumerate(request_stream, start=1):
    status = '✅ Allowed' if check_rate_limit(requester) else '❌ Blocked'
    print(f"Request {request_no} from {requester}: {status}")

# Clear user1's counter and try once more
print("\nResetting rate limit for user1...")
reset_rate_limit("user1")
status = '✅ Allowed' if check_rate_limit("user1") else '❌ Blocked'
print(f"After reset: {status}")

## Safety with Agents

In [None]:
# Create a safety-enabled agent
from ailib import tool

@tool
def process_user_data(data: str) -> str:
    """Process user-provided data."""
    # NOTE(review): docstring left unchanged - @tool may expose it to the agent
    # as the tool description; confirm before rewording.
    # Run the content check first; only safe text gets upper-cased.
    passed, problems = check_content(data)
    return f"Processed: {data.upper()}" if passed else f"Cannot process: {problems}"

# Build an agent whose only tool runs the content check before touching data
SAFE_ASSISTANT_INSTRUCTIONS = """You are a safety-conscious assistant.
    Always check content safety before processing.
    Refuse any requests that might be harmful."""

safe_agent = create_agent(
    "safe_assistant",
    tools=[process_user_data],
    instructions=SAFE_ASSISTANT_INSTRUCTIONS,
    temperature=0.3,  # Lower temperature for more consistent behavior
)

# Test with various inputs
test_queries = [
    "Process this data: Hello World",
    # Full 3-3-4 digit number so the phone_filter pattern can actually match it;
    # a 7-digit number such as "555-1234" would not be caught by that filter.
    "Process this data: Call 555-123-4567",
    "What's the weather like?"
]

for query in test_queries:
    print(f"\nQuery: {query}")

    # Check rate limit first so a throttled user never reaches the agent
    if check_rate_limit("demo_user"):
        result = safe_agent.run(query)
        print(f"Result: {result}")
    else:
        print("Rate limit exceeded!")

## Safety Configuration Patterns

In [None]:
# Different safety configurations for different use cases

# 1. Strict mode for children's applications
def configure_child_safety():
    """Apply the strict profile meant for children's applications.

    Enables harmful-content blocking with ``max_length=500`` and an extended
    blocked-word list, sets the rate limit to 10 requests per 3600 seconds,
    and prints a confirmation line.
    """
    restricted_words = [
        "violence", "adult", "drugs", "weapons",
        "horror", "profanity",
    ]
    enable_safety(block_harmful=True, max_length=500, blocked_words=restricted_words)
    set_rate_limit(max_requests=10, window_seconds=3600)  # 10 per hour
    print("✅ Child safety mode enabled")

# 2. Professional mode for business applications  
def configure_professional_safety():
    """Apply the business profile: moderate word list plus PII filters.

    Enables harmful-content blocking with ``max_length=4000``, registers
    custom filters for SSN-shaped and credit-card-shaped numbers, and prints a
    confirmation line. It does not call ``set_rate_limit``.
    """
    enable_safety(
        block_harmful=True,
        max_length=4000,
        blocked_words=["harassment", "discrimination"],
    )

    # PII filters as (name, pattern, message), registered in this order
    pii_filters = (
        ("ssn", r'\b\d{3}-\d{2}-\d{4}\b', "SSN detected"),
        ("credit_card", r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b', "Credit card detected"),
    )
    for filter_spec in pii_filters:
        add_custom_filter(*filter_spec)

    print("✅ Professional safety mode enabled")

# 3. Research mode with minimal restrictions
def configure_research_safety():
    """Apply the research profile with minimal restrictions.

    Calls ``enable_safety`` with blocking turned off, ``max_length=10000`` and
    an empty blocked-word list, raises the rate limit to 100 requests per 60
    seconds, and prints a confirmation line.
    """
    relaxed_settings = {
        "block_harmful": False,  # Don't block, just flag
        "max_length": 10000,
        "blocked_words": [],  # No word restrictions
    }
    enable_safety(**relaxed_settings)
    set_rate_limit(max_requests=100, window_seconds=60)  # Higher limits
    print("✅ Research safety mode enabled")

# Run the same sentence through two profiles and compare the verdicts
print("Testing different safety configurations:\n")

probe = "This contains mild violence"
for mode_label, apply_profile in (
    ("Child", configure_child_safety),
    ("Research", configure_research_safety),
):
    apply_profile()
    probe_ok, _ = check_content(probe)
    print(f"{mode_label} mode - Violence check: {'Allowed' if probe_ok else 'Blocked'}")

## Building a Content Moderation Pipeline

In [None]:
# Complete moderation pipeline
class ModerationPipeline:
    """Moderation pipeline combining a rate-limit check and a content check.

    On construction the pipeline applies a safety profile: the child profile
    when ``strict_mode`` is true, otherwise the professional profile.
    """

    def __init__(self, strict_mode=False):
        self.strict_mode = strict_mode
        self.configure_safety()

    def configure_safety(self):
        """Apply the safety profile selected by ``self.strict_mode``."""
        if self.strict_mode:
            configure_child_safety()
        else:
            configure_professional_safety()

    def process_input(self, user_id: str, text: str, *, enforce_rate_limit: bool = True) -> dict:
        """Process a piece of text through the safety checks.

        Args:
            user_id: Identifier passed to the rate limiter and echoed in the result.
            text: The text to check.
            enforce_rate_limit: When true (the default) the rate limiter is
                consulted first. Pass False for text that is not a new user
                request, such as model output, so it does not use up the
                user's quota.

        Returns:
            A dict with ``user_id``, ``input``, ``timestamp`` and ``checks``,
            plus ``blocked``. Blocked results also carry ``reason``; accepted
            results carry ``processed`` set to True.
        """
        # Imported here so the method does not depend on a module-level import
        # that appears later in the file than this class.
        from datetime import datetime

        result = {
            "user_id": user_id,
            "input": text,
            "timestamp": datetime.now().isoformat(),
            "checks": {}
        }

        # Rate limit check
        if enforce_rate_limit and not check_rate_limit(user_id):
            result["blocked"] = True
            result["reason"] = "Rate limit exceeded"
            return result

        # Content safety check
        is_safe, violations = check_content(text)
        result["checks"]["content_safety"] = {
            "passed": is_safe,
            "violations": violations
        }

        if not is_safe:
            result["blocked"] = True
            result["reason"] = f"Content violations: {violations}"
            return result

        # All checks passed
        result["blocked"] = False
        result["processed"] = True
        return result

    def get_safe_response(self, user_id: str, query: str) -> str:
        """Get a (mock) LLM response with safety checks on input and output.

        Returns the response text, a "Request blocked: ..." message when the
        query is rejected, or "Response filtered for safety reasons" when the
        response fails the content check.
        """
        # Check input (counts against the user's rate limit)
        input_check = self.process_input(user_id, query)
        if input_check.get("blocked"):
            return f"Request blocked: {input_check['reason']}"

        # Process with LLM (mock)
        response = f"Response to: {query[:50]}..."

        # Check output: content only. The request was already counted above, so
        # running the limiter again would charge the user twice and could
        # misreport a rate-limit hit as a content problem.
        output_check = self.process_input(user_id, response, enforce_rate_limit=False)
        if output_check.get("blocked"):
            return "Response filtered for safety reasons"

        return response

# Create and test pipeline
from datetime import datetime

# Non-strict pipeline (professional profile)
pipeline = ModerationPipeline(strict_mode=False)

# (user_id, query) pairs to push through the pipeline
test_cases = [
    ("user1", "What's the weather today?"),
    ("user2", "My SSN is 123-45-6789"),  # PII
    ("user3", "Tell me about machine learning"),
    ("user1", "Another question"),  # Rate limit test
]

divider = "=" * 50
for requester, question in test_cases:
    print(f"\n{divider}")
    print(f"User: {requester}")
    print(f"Query: {question}")
    print(f"Result: {pipeline.get_safe_response(requester, question)}")

## Best Practices

In [None]:
# Example: Safe agent factory
def create_safe_agent(name: str, purpose: str, **kwargs):
    """Create an agent with safety features pre-configured.

    Args:
        name: Agent name, passed as the first argument to ``create_agent``.
        purpose: Short description interpolated into the safety instructions.
        **kwargs: Forwarded to ``create_agent``. ``instructions`` (if given)
            is appended after the safety guidelines; ``temperature`` and
            ``max_steps`` default to 0.3 and 5 when not supplied.

    Returns:
        Whatever ``create_agent`` returns.
    """

    # Set safety instructions
    safety_instructions = f"""
You are a safe and responsible AI assistant designed for {purpose}.

Safety guidelines:
- Never process or output personal information (SSN, credit cards, etc.)
- Refuse requests for harmful, illegal, or unethical content
- Keep responses appropriate and professional
- If unsure about safety, err on the side of caution
"""

    # Merge with user instructions. `or ''` makes an explicit instructions=None
    # behave like a missing value instead of failing on str + None.
    user_instructions = kwargs.get('instructions') or ''
    kwargs['instructions'] = safety_instructions + "\n" + user_instructions

    # Set conservative defaults (caller-supplied values win)
    kwargs.setdefault('temperature', 0.3)
    kwargs.setdefault('max_steps', 5)

    return create_agent(name, **kwargs)

# One safe agent per use case, each with its own model
customer_service = create_safe_agent(
    name="customer_service", purpose="customer support", model="gpt-3.5-turbo"
)
educational = create_safe_agent(
    name="tutor", purpose="educational assistance", model="gpt-4"
)

print("✅ Created safe agents for different purposes")
for role, agent in (("Customer Service", customer_service), ("Educational", educational)):
    print(f"- {role}: {agent.name}")

## Summary

AILib's safety features help you build responsible AI applications:

- ✅ **Content filtering** - Block harmful or inappropriate content
- ✅ **Rate limiting** - Prevent abuse and manage costs
- ✅ **Custom filters** - Add domain-specific safety rules
- ✅ **Moderation API** - Integrate with OpenAI's moderation
- ✅ **Flexible configuration** - Adapt to different use cases

Key takeaways:
1. Always enable safety features in production
2. Configure based on your use case (strict for children's apps, moderate for business, minimal for research)
3. Add custom filters for PII and sensitive data
4. Implement rate limiting to prevent abuse
5. Test safety features thoroughly before deployment

## Next Steps

- Explore [Tracing and Debugging](13_tracing_and_debugging.ipynb)
- See [Real-World Examples](10_real_world_examples.ipynb) with safety integrated
- Read about safety best practices in production

Stay safe! 🛡️