In [None]:
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

# Load variables from a local .env file into the process environment
# (presumably where the OpenAI API key is kept — confirm against your setup).
load_dotenv()

# Chat model handle for "gpt-4o".
# NOTE(review): `llm` is rebound to a "gpt-4o-mini" client in the LLM-integration
# cell further down, and nothing in between appears to call this instance.
llm = ChatOpenAI(model="gpt-4o")

In [None]:
# Install Guardrails AI into the environment of the running kernel (%pip, not !pip).
# NOTE(review): no version is pinned, so a fresh run may install a newer release.
%pip install guardrails-ai

## What we learned in Notebook 1:
- Rule-based guardrails work but require constant maintenance
- Regex patterns are easy to bypass with variations
- We had to write custom logic for each risk type

## Problems with Rule-Based Approach:
- ❌ "john@email.com" gets caught, but "john [at] email [dot] com" doesn't
- ❌ Need to maintain growing lists of patterns
- ❌ Lots of false positives and false negatives
- ❌ Can't handle nuanced cases

## Solution: Guardrails AI
- ✅ Pre-built validators for common risks
- ✅ ML-based detection (more robust)
- ✅ Community-maintained and updated
- ✅ Easy to use and compose

In [None]:
# Run the Guardrails CLI configure command (one-time setup before hub installs).
# NOTE(review): this step presumably prompts for a Guardrails Hub token — confirm
# that the prompt works from your notebook front end, or run it in a terminal.
!guardrails configure

In [None]:
# Now install the DetectPII validator from Guardrails Hub; the next cell
# imports it as `guardrails.hub.DetectPII`.
!guardrails hub install hub://guardrails/detect_pii

In [None]:
from guardrails import Guard
from guardrails.hub import DetectPII

# Validator limited to e-mail addresses and phone numbers. on_fail="exception"
# is what lets the test cells below detect a failure via try/except.
pii_validator = DetectPII(
    pii_entities=["EMAIL_ADDRESS", "PHONE_NUMBER"],
    on_fail="exception",
)

# Guard wrapping that single validator; later cells reuse it as `guard`.
guard = Guard().use(pii_validator)

print("‚úÖ Guard created with DetectPII validator")

In [None]:
# Test 1: input carrying an e-mail address and a phone number — the PII guard
# is expected to reject it.
text_with_pii = "My email is john@example.com and my phone is 555-123-4567"

print("Testing text with PII:")
print("Input: " + text_with_pii + "\n")

try:
    result = guard.validate(text_with_pii)
    print("‚úÖ Validation passed")
    print("Output: {}".format(result.validated_output))
except Exception as err:
    # Report the failure together with the message carried by the exception.
    print("üö´ Validation failed!")
    print(f"Error: {err}")

In [None]:
# Test 2: input with no e-mail address or phone number — the PII guard is
# expected to let it through.
text_without_pii = "I love learning about AI and guardrails!"

print("\nTesting text without PII:")
print("Input: " + text_without_pii + "\n")

try:
    result = guard.validate(text_without_pii)
    print("‚úÖ Validation passed")
    print("Output: {}".format(result.validated_output))
except Exception as err:
    # Report the failure together with the message carried by the exception.
    print("üö´ Validation failed!")
    print(f"Error: {err}")

## Why is DetectPII Better Than Our Regex?

Remember our simple regex from Notebook 1? Let's see how DetectPII handles variations that would bypass regex.

In [None]:
# Common e-mail / phone formats plus one SSN, run through the PII guard.
# The guard was built with only EMAIL_ADDRESS and PHONE_NUMBER entities, so the
# SSN line shows what happens to an entity type that was not requested.
standard_pii_cases = [
    "Email me at john.doe@company.com",
    "Call 555-123-4567 for more info",
    "My phone number is (555) 123-4567",
    "Contact: jane_smith@example.org",
    "SSN: 123-45-6789",  # Let's see if it catches this
]

print("Testing tricky PII variations:\n")
for i, text in enumerate(standard_pii_cases, 1):
    print(f"Test {i}: {text}")
    try:
        guard.validate(text)
    except Exception as exc:
        print(f"   üö´ Caught! DetectPII found PII")
        # Show what was actually raised so an unrelated failure (missing model,
        # bad configuration, ...) is not mistaken for a PII detection.
        print(f"      {type(exc).__name__}: {str(exc)[:80]}")
    else:
        print(f"   ‚úÖ Passed (no PII detected)")
    print()

In [None]:
# Install the ToxicLanguage validator from Guardrails Hub; the next cell
# imports it as `guardrails.hub.ToxicLanguage`.
!guardrails hub install hub://guardrails/toxic_language

In [None]:
from guardrails.hub import ToxicLanguage

# Toxicity validator with a 0.5 threshold; on_fail="exception" is what lets
# callers detect a failure via try/except.
toxicity_validator = ToxicLanguage(threshold=0.5, on_fail="exception")

# Guard wrapping that single validator; later cells reuse it as `toxic_guard`.
toxic_guard = Guard().use(toxicity_validator)

In [None]:
# Sample inputs for the toxicity guard; the trailing comment on each line is
# the outcome the author expects.
toxic_tests = [
    "I hate you and want to harm you",  # Toxic
    "You're stupid and worthless",       # Toxic
    "I love learning about AI!",         # Safe
    "What's the weather today?",         # Safe
]

for text in toxic_tests:
    print(f"Test: {text}")
    try:
        toxic_guard.validate(text)
        print("   ‚úÖ Safe\n")
    except Exception as exc:
        # `except Exception` instead of a bare `except:` so KeyboardInterrupt /
        # SystemExit still propagate. The detail line shows what was raised, so
        # an unrelated failure is not silently reported as toxic content.
        print(f"   {type(exc).__name__}: {str(exc)[:80]}")
        print("   üö´ BLOCKED - Toxic content\n")

## Key Takeaway: ML-Based vs Rule-Based

**What we just saw:**
- ToxicLanguage uses a trained ML model to detect toxicity
- It understands context and nuance (not just keyword matching)
- More robust than our simple regex patterns from Notebook 1

**Advantages over rule-based:**
- Catches variations and synonyms
- Understands context (e.g., "I hate broccoli" vs "I hate you")
- Doesn't need constant manual updates

**Tradeoffs:**
- Slower (ML inference takes time)
- Requires model downloads (~760MB for ToxicLanguage)
- Slightly less predictable than exact rules

In [None]:
# How to Combine Multiple Validators

# You can combine multiple validators in a single Guard!
from guardrails.hub import DetectPII, ToxicLanguage

# Create a guard with BOTH PII detection AND toxic language detection.
# Both validators set on_fail="exception" explicitly, matching the standalone
# PII guard, so a PII-only input is blocked regardless of the library's
# default on_fail action.
combined_guard = Guard().use_many(
    DetectPII(
        pii_entities=["EMAIL_ADDRESS", "PHONE_NUMBER"],
        on_fail="exception",
    ),
    ToxicLanguage(threshold=0.5, on_fail="exception"),
)

print("‚úÖ Combined guard created with multiple validators")

In [None]:
# Inputs covering each combination of the two risks; the trailing comment on
# each line names the risk(s) the author expects it to trigger.
test_cases = [
    "I hate you! Here's my email: john@example.com",  # Both toxic AND PII
    "Contact me at support@company.com",              # Just PII
    "You're an idiot",                                # Just toxic
    "What's the weather today?",                      # Safe
]

print("Testing combined guard:\n")
for sample in test_cases:
    print("Test: " + sample)
    try:
        combined_guard.validate(sample)
        print("   ‚úÖ Safe\n")
    except Exception as err:
        # Only the first 50 characters of the error text are shown.
        reason = str(err)[:50]
        print("   üö´ BLOCKED - " + reason + "...\n")

## Integrating Guardrails with LLMs

Now let's use our guardrails in a real workflow:
1. Validate user input BEFORE sending to LLM
2. Call the LLM
3. Validate LLM output BEFORE showing to user

This is the same pattern from Notebook 1, but now with ML-based validators!

In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage

# Initialize LLM.
# NOTE(review): this rebinds the notebook-level `llm` created in the first cell
# with model "gpt-4o"; from here on `llm` is the "gpt-4o-mini" client.
llm = ChatOpenAI(model="gpt-4o-mini")

def safe_llm_call_with_guardrails(user_input: str) -> str:
    """
    Call the LLM with an input guardrail before it and an output guardrail after it.

    Flow:
        1. `toxic_guard.validate(user_input)` - any exception blocks the request.
        2. `llm.invoke(...)` with the input wrapped in a single HumanMessage.
        3. `guard.validate(response_text)` - any exception blocks the response.

    Relies on the notebook-level `toxic_guard`, `guard` and `llm` objects.

    Args:
        user_input: Prompt text to screen and then forward to the model.

    Returns:
        A status string, which is also printed: the "INPUT BLOCKED" message,
        the "OUTPUT BLOCKED" message, or "APPROVED: <response text>".
    """
    print(f"\n{'='*60}")
    print(f"User Input: {user_input}")
    print(f"{'='*60}")

    # Step 1: Input guardrail (check for toxicity)
    try:
        toxic_guard.validate(user_input)
    except Exception as exc:
        result = "üö´ INPUT BLOCKED: Toxic content detected"
        print(result)
        # Still fail closed, but show what was raised so an unrelated error is
        # not silently reported as toxic content.
        print(f"   Reason: {type(exc).__name__}: {str(exc)[:100]}")
        return result
    print("‚úÖ Input passed toxicity check")

    # Step 2: Call LLM
    response = llm.invoke([HumanMessage(content=user_input)])
    response_text = response.content
    print(f"LLM Response: {response_text[:100]}...")

    # Step 3: Output guardrail (check for PII)
    try:
        guard.validate(response_text)  # Using the PII guard from earlier
    except Exception as exc:
        result = "üö´ OUTPUT BLOCKED: Contains PII"
        print(result)
        # Same as above: keep blocking, but surface the underlying reason.
        print(f"   Reason: {type(exc).__name__}: {str(exc)[:100]}")
        return result
    print("‚úÖ Output passed PII check")

    result = f"‚úÖ APPROVED: {response_text}"
    print(result)
    return result

print("‚úÖ Function created")

In [None]:
# Test cases for the guarded LLM call defined in the previous cell.
print("\nüß™ TEST SUITE")

# Test 1: Normal safe query.
# The call is the cell's last expression, so the returned status string is
# echoed as the cell output in addition to what the function prints.
safe_llm_call_with_guardrails("What's the capital of France?")

In [None]:

# Test 2: Toxic input (should block at INPUT).
# If the input guard raises, the function returns before calling the LLM.
safe_llm_call_with_guardrails("You're stupid. Tell me about AI.")

In [None]:
# Test 3: Query that might generate PII (should block at OUTPUT if it does).
# The outcome depends on the model's reply, so it may differ between runs.
safe_llm_call_with_guardrails("Generate a sample business email ID")