In [7]:
!pip install sentence-transformers boto3 numpy

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [8]:
# Complete Testing Notebook - All Required Functions
# Run this cell first to set up everything needed for testing

import json
import logging
import os
import time
from datetime import datetime, timedelta

import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.huggingface import HuggingFaceModel
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

print("📦 Loading dependencies and setting up clients...")

# Initialize AWS clients
s3 = boto3.client('s3')
dynamodb = boto3.resource('dynamodb')
sagemaker_runtime = boto3.client('sagemaker-runtime')
sagemaker_session = sagemaker.Session()

# Configuration - override with environment variables instead of editing the notebook.
# The fallbacks are the values this notebook was originally written with, so behaviour is
# unchanged when no variable is set. `or` (rather than a .get default) also covers a
# variable that is set but empty.
# The role ARN is account-specific; get_execution_role() is the alternative when the
# notebook runs under a SageMaker execution role.
role = os.environ.get("SAGEMAKER_ROLE_ARN") or "arn:aws:iam::467383999568:role/LabRole"
BUCKET_NAME = os.environ.get("TECH_TRANSLATOR_BUCKET") or "tech-translator-s3-knowledge-base"  # S3 bucket
TABLE_NAME = os.environ.get("TECH_TRANSLATOR_TABLE") or "tech-translator-dynamodb-vector-storage"  # DynamoDB table

print("📝 Configuration:")
print(f"  S3 Bucket: {BUCKET_NAME}")
print(f"  DynamoDB Table: {TABLE_NAME}")

# Initialize sentence transformer model (used by vector_search to embed queries)
print("🤖 Loading sentence transformer model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("✅ Model loaded successfully!")

# ===== RAG FUNCTIONS (copied from your previous notebook) =====

def extract_concept_and_audience(query):
    """Classify a free-text query into a concept id and an audience id.

    Matching is a case-insensitive substring test against fixed keyword lists.
    Categories are tried in declaration order and the first one with any
    matching keyword wins.

    Args:
        query: The user's question as a string.

    Returns:
        dict with keys 'concept' (falls back to 'predictive-model') and
        'audience' (falls back to 'general').
    """
    lowered = query.lower()

    def first_matching_label(keyword_table, fallback):
        # Walk the table in insertion order; the first category that has any
        # keyword contained in the query is the answer.
        for label, phrases in keyword_table.items():
            for phrase in phrases:
                if phrase in lowered:
                    return label
        return fallback

    concepts = {
        'r-squared': ['r squared', 'r-squared', 'r2', 'coefficient of determination'],
        'loss-ratio': ['loss ratio', 'claims ratio', 'incurred losses'],
        'predictive-model': ['predictive model', 'prediction model', 'machine learning', 'ml model'],
    }
    audiences = {
        'underwriter': ['underwriter', 'underwriting'],
        'actuary': ['actuary', 'actuarial', 'actuaries'],
        'executive': ['executive', 'ceo', 'manager', 'leadership'],
    }

    return {
        'concept': first_matching_label(concepts, 'predictive-model'),
        'audience': first_matching_label(audiences, 'general'),
    }

def vector_search(query, concept_id=None, top_k=5):
    """Rank stored embeddings by cosine similarity to the query.

    The query is embedded with the module-level `model`. Candidate items are
    read from the DynamoDB table named by `TABLE_NAME`: a key query on
    `concept_id` when one is given, otherwise a full table scan. Each item's
    'embedding' attribute is parsed with `json.loads` before comparison.

    DynamoDB returns query/scan results in pages, so every page is fetched by
    following `LastEvaluatedKey`; otherwise only the first page would be ranked.

    Args:
        query: Text to search for.
        concept_id: Optional partition key value that restricts the search.
        top_k: Maximum number of results to return.

    Returns:
        List of {'item': <DynamoDB item>, 'similarity': <float>} dicts sorted
        by similarity, highest first, truncated to `top_k`. An empty list is
        returned when no items are found or when any error occurs (the error
        is printed, not raised).
    """

    # Generate query embedding
    query_embedding = model.encode(query)

    # Query DynamoDB
    table = dynamodb.Table(TABLE_NAME)

    try:
        if concept_id:
            print(f"🔍 Searching in concept: {concept_id}")
            fetch_page = table.query
            request = {
                "KeyConditionExpression": "concept_id = :concept_id",
                "ExpressionAttributeValues": {":concept_id": concept_id},
            }
        else:
            print("🔍 Searching across all concepts")
            fetch_page = table.scan
            request = {}

        # Collect every page: a response carries LastEvaluatedKey while more data remains.
        items = []
        while True:
            response = fetch_page(**request)
            items.extend(response.get('Items', []))
            last_key = response.get('LastEvaluatedKey')
            if not last_key:
                break
            request['ExclusiveStartKey'] = last_key

        print(f"Found {len(items)} items to search")

        if not items:
            return []

        # Calculate similarities
        results = []
        for item in items:
            # Stored embeddings are JSON-encoded in the 'embedding' attribute
            stored_embedding = json.loads(item['embedding'])

            # scipy's `cosine` is a distance, so similarity is its complement
            similarity = 1 - cosine(query_embedding, stored_embedding)

            results.append({
                'item': item,
                'similarity': similarity
            })

        # Sort by similarity (highest first)
        results.sort(key=lambda x: x['similarity'], reverse=True)

        return results[:top_k]

    except Exception as e:
        # Best-effort search: report the problem and let callers treat it as "no results"
        print(f"❌ Vector search error: {str(e)}")
        return []

# ===== DEPLOYMENT FUNCTIONS =====
def deploy_model():
    """Create a HuggingFace model for google/flan-t5-large and deploy it to SageMaker.

    Instance types are tried in order; each attempt uses a fresh, timestamped
    endpoint name. Any failure on one instance type is printed and the next
    type is tried.

    Returns:
        (predictor, endpoint_name) for the first successful deployment, or
        (None, None) when every attempt fails or the model cannot be created.
    """
    try:
        print("📦 Creating HuggingFace model configuration...")

        # Container environment: which hub model to load and which pipeline task to serve.
        # (Smaller alternatives tried previously: google/flan-t5-base, google/flan-t5-small.)
        huggingface_model = HuggingFaceModel(
            transformers_version="4.37.0",
            pytorch_version="2.1.0",
            py_version="py310",
            env={
                'HF_MODEL_ID': 'google/flan-t5-large',
                'HF_TASK': 'text2text-generation',
            },
            role=role,
        )

        print("🚀 Deploying model...")
        print("⏱️  This may take 5-10 minutes...")

        # Candidates in order of preference (ml.m5.large / ml.c5.large are disabled)
        candidate_instances = ("ml.m5.xlarge", "ml.c5.xlarge")
        known_failure_markers = ("ResourceLimitExceeded", "InsufficientCapacity", "ValidationException")

        for instance_type in candidate_instances:
            try:
                print(f"\n🔄 Trying deployment on {instance_type}...")

                # Timestamp suffix keeps the endpoint name unique per attempt
                endpoint_name = f"tech-translator-model-{int(time.time())}"

                predictor = huggingface_model.deploy(
                    initial_instance_count=1,
                    instance_type=instance_type,
                    endpoint_name=endpoint_name,
                    container_startup_health_check_timeout=600,
                    model_data_download_timeout=600,
                    wait=True
                )

                print(f"✅ Successfully deployed on {instance_type}!")
            except Exception as e:
                failure_text = str(e)
                print(f"❌ Failed to deploy on {instance_type}: {failure_text}")
                # Both kinds of failure move on to the next instance type; only the note differs
                if any(marker in failure_text for marker in known_failure_markers):
                    print("   Trying next instance type...")
                else:
                    print("   Unexpected error, continuing...")
            else:
                print("✅ Model deployed successfully!")
                print(f"📍 Endpoint name: {endpoint_name}")
                return predictor, endpoint_name

        print("❌ All deployment attempts failed!")
        return None, None

    except Exception as e:
        print(f"❌ Model creation failed: {str(e)}")
        return None, None




def cleanup_endpoint(endpoint_name):
    """Request deletion of a SageMaker endpoint so it stops incurring charges.

    Args:
        endpoint_name: Name of the endpoint to delete.

    Returns:
        True when the delete request was accepted, False when it raised
        (the error is printed).
    """
    print(f"\n🧹 Cleaning up endpoint: {endpoint_name}")

    try:
        boto3.client('sagemaker').delete_endpoint(EndpointName=endpoint_name)
    except Exception as e:
        print(f"❌ Error deleting endpoint: {str(e)}")
        return False

    print("✅ Endpoint deletion initiated!")
    print("💰 This will stop incurring charges.")
    return True



INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


📦 Loading dependencies and setting up clients...
📝 Configuration:
  S3 Bucket: tech-translator-s3-knowledge-base
  DynamoDB Table: tech-translator-dynamodb-vector-storage
🤖 Loading sentence transformer model...
✅ Model loaded successfully!


In [9]:
# ===== TESTING FUNCTIONS =====
def call_endpoint(endpoint_name, prompt, max_new_tokens=100):
    """Send a prompt to the deployed FLAN-T5 endpoint and return the generated text.

    Args:
        endpoint_name: Name of the SageMaker endpoint to invoke.
        prompt: Instruction-style prompt text.
        max_new_tokens: Generation length limit passed to the model.

    Returns:
        The stripped generated text, or None when the call or the response
        handling raises (the error and traceback are printed).
    """
    # Sampling settings; max_new_tokens is used rather than max_length, and
    # return_full_text / pad_token_id are deliberately not sent for FLAN-T5.
    generation_settings = {
        "max_new_tokens": max_new_tokens,
        "temperature": 0.7,
        "do_sample": True,
        "top_p": 0.9,
        "repetition_penalty": 1.1,
    }
    request_body = {"inputs": prompt, "parameters": generation_settings}

    try:
        raw_response = sagemaker_runtime.invoke_endpoint(
            EndpointName=endpoint_name,
            ContentType='application/json',
            Body=json.dumps(request_body)
        )
        decoded = json.loads(raw_response['Body'].read().decode())

        # Accept a bare dict, a non-empty list (first element is used), or
        # anything else stringified.
        if isinstance(decoded, dict):
            text = decoded.get('generated_text', '')
        elif isinstance(decoded, list) and decoded:
            head = decoded[0]
            text = head.get('generated_text', '') if isinstance(head, dict) else str(head)
        else:
            text = str(decoded)

        return text.strip()

    except Exception as e:
        print(f"❌ Endpoint call failed: {str(e)}")
        import traceback
        traceback.print_exc()
        return None


def quick_test(endpoint_name, test_query="What is R-squared?"):
    """Smoke-test the endpoint with one short instruction prompt.

    Args:
        endpoint_name: Name of the SageMaker endpoint to call.
        test_query: Concept question wrapped into an "Explain ..." instruction.

    Returns:
        True when a non-empty response came back, False otherwise.
    """
    print(f"⚡ Quick Test: '{test_query}'")

    # Wrap the query as an instruction for FLAN-T5
    answer = call_endpoint(
        endpoint_name,
        f"Explain the following concept: {test_query}",
        max_new_tokens=50,
    )

    if not answer:
        print("❌ Test failed")
        return False

    print(f"Response: {answer}")
    print("✅ Endpoint is working!")
    return True


def test_simple_prompts(endpoint_name):
    """Send three short instruction prompts and print a keyword-based relevance verdict.

    A response is rated by how many terms from a fixed insurance/statistics
    vocabulary appear in it (case-insensitive substring match): two or more is
    "good", exactly one is "okay", none is "poor".

    Args:
        endpoint_name: Name of the SageMaker endpoint to call.
    """
    print("🧪 Testing Simple Prompts for FLAN-T5")
    print("=" * 30)

    # (description, prompt) pairs
    cases = (
        ("R-squared definition", "Define R-squared in statistics."),
        ("Loss ratio explanation", "Explain what loss ratio means in insurance."),
        ("Predictive models in insurance", "Describe how predictive models help insurance companies."),
    )

    # Insurance terms followed by statistics terms
    vocabulary = [
        'insurance', 'claim', 'premium', 'risk', 'policy', 'coverage',
        'measure', 'ratio', 'model', 'predict', 'data', 'statistical',
    ]

    for number, (description, prompt_text) in enumerate(cases, 1):
        print(f"\n{number}. {description}")
        print(f"Prompt: '{prompt_text}'")

        reply = call_endpoint(endpoint_name, prompt_text, max_new_tokens=60)

        if not reply:
            print("❌ No response generated")
        else:
            print(f"Response: {reply}")

            # Count vocabulary terms present in the reply
            reply_lower = reply.lower()
            hits = sum(term in reply_lower for term in vocabulary)

            if hits >= 2:
                print("✅ Good response - contains relevant terms")
            elif hits == 1:
                print("⚠️  Okay response - some relevant content")
            else:
                print("❌ Poor response - lacks relevant content")

        print("-" * 25)


def test_model_basic(predictor):
    """Basic test of the deployed FLAN-T5 model"""
    print("\n🧪 Running Basic Model Tests for FLAN-T5...")
    print("-" * 30)
    
    # Test cases optimized for FLAN-T5 instruction format
    test_cases = [
        {
            "name": "Simple Definition",
            "prompt": "Define R-squared in statistics.",
            "max_new_tokens": 50
        },
        {
            "name": "Insurance Context",
            "prompt": "Explain what loss ratio means in insurance.",
            "max_new_tokens": 60
        },
        {
            "name": "Concept Explanation", 
            "prompt": "What is a predictive model and how is it used?",
            "max_new_tokens": 70
        },
        {
            "name": "Professional Context",
            "prompt": "Explain how R-squared helps underwriters assess risk models.",
            "max_new_tokens": 80
        }
    ]
    
    for i, test_case in enumerate(test_cases, 1):
        print(f"\nTest {i}: {test_case['name']}")
        print(f"Input: '{test_case['prompt']}'")
        
        try:
            # Prepare payload for FLAN-T5 model
            payload = {
                "inputs": test_case["prompt"],
                "parameters": {
                    "max_new_tokens": test_case["max_new_tokens"],
                    "temperature": 0.7,
                    "do_sample": True,
                    "top_p": 0.9,
                    "repetition_penalty": 1.1,
                    # No return_full_text or pad_token_id for FLAN-T5
                }
            }
            
            # Call the model
            result = predictor.predict(payload)
            
            # Process result - FLAN-T5 format
            if isinstance(result, list) and len(result) > 0:
                if isinstance(result[0], dict):
                    generated_text = result[0].get('generated_text', 'No text generated')
                else:
                    generated_text = str(result[0])
            elif isinstance(result, dict):
                generated_text = result.get('generated_text', 'No text generated')
            else:
                generated_text = str(result)
            
            # Clean up the output
            generated_text = generated_text.strip()
            
            print(f"Output: '{generated_text}'")
            
            # Check if output looks reasonable for FLAN-T5
            if len(generated_text) > 5 and not generated_text.startswith("Error"):
                # Check for relevant terms
                relevant_terms = ['ratio', 'model', 'measure', 'data', 'insurance', 'risk', 'statistical']
                has_relevant_content = any(term in generated_text.lower() for term in relevant_terms)
                
                if has_relevant_content:
                    print("✅ Test passed - relevant content generated!")
                else:
                    print("⚠️  Test passed but content may not be domain-specific")
            else:
                print("⚠️  Test passed but output seems short")
            
        except Exception as e:
            print(f"❌ Test failed: {str(e)}")
            import traceback
            traceback.print_exc()
    
    return True


def test_rag_integration(predictor, endpoint_name):
    """Run one retrieval-augmented round trip: classify, search, prompt, generate.

    A fixed test query is classified with extract_concept_and_audience, the
    top chunks are retrieved with vector_search, the first two chunks (150
    characters each) become the prompt context, and the prompt is sent via
    call_endpoint.

    Args:
        predictor: Accepted but not used by this function.
        endpoint_name: Name of the SageMaker endpoint to call.

    Returns:
        True when a non-empty response was generated; False when the search
        returned nothing, the endpoint gave no response, or an exception was
        raised (the error and traceback are printed).
    """
    print("\n🔗 Testing RAG Integration with FLAN-T5...")
    print("-" * 30)

    test_query = "What is R-squared for an underwriter?"

    try:
        # Classify the query
        classification = extract_concept_and_audience(test_query)
        print(f"📊 Extracted - Concept: {classification['concept']}, Audience: {classification['audience']}")

        # Retrieve candidate chunks for the detected concept
        hits = vector_search(test_query, classification['concept'], top_k=3)
        print(f"🔍 Found {len(hits)} relevant chunks")

        if not hits:
            print("⚠️  No search results found - make sure your DynamoDB table has data")
            return False

        # Only the two best hits are shown and used as context
        top_hits = hits[:2]
        for rank, hit in enumerate(top_hits, 1):
            print(f"  {rank}. {hit['item']['vector_id']} (similarity: {hit['similarity']:.3f})")

        context_text = "".join(f"- {hit['item']['text'][:150]}...\n" for hit in top_hits)

        audience = classification['audience']

        # Instruction-style prompt: task, context, question, then the answer cue
        prompt = (
            f"Based on the following information, explain R-squared to an {audience} in the insurance industry.\n"
            "\n"
            "Context:\n"
            f"{context_text}\n"
            "\n"
            f"Question: {test_query}\n"
            "\n"
            "Provide a clear, professional explanation:"
        )

        print(f"📝 Created instruction prompt ({len(prompt.split())} words)")

        answer = call_endpoint(endpoint_name, prompt, max_new_tokens=120)

        if not answer:
            print("❌ Failed to get FLAN-T5 response")
            return False

        print(f"🤖 FLAN-T5 Response: {answer}")
        print("✅ RAG integration test passed!")
        return True

    except Exception as e:
        print(f"❌ RAG integration test failed: {str(e)}")
        import traceback
        traceback.print_exc()
        return False


In [10]:
# Confirmation banner plus the order in which the remaining cells should be run
next_steps_lines = [
    "🎉 All functions loaded successfully!",
    "\nNext steps:",
    "1. Run: predictor, endpoint_name = deploy_model()",
    "2. Wait for deployment to complete",
    "3. Run: test_model_basic(predictor)",
    "4. Run: test_rag_integration(predictor, endpoint_name)",
    "5. When done: cleanup_endpoint(endpoint_name)",
]
print("\n".join(next_steps_lines))

🎉 All functions loaded successfully!

Next steps:
1. Run: predictor, endpoint_name = deploy_model()
2. Wait for deployment to complete
3. Run: test_model_basic(predictor)
4. Run: test_rag_integration(predictor, endpoint_name)
5. When done: cleanup_endpoint(endpoint_name)


# STEP 1: Deploy the model

In [11]:
# STEP 1: Deploy the model
print("🚀 STEP 1: Deploying model...")
print("=" * 50)

predictor, endpoint_name = deploy_model()

# Always (re)assign the shared variable: if this cell is re-run and the deployment fails,
# later steps must not pick up an endpoint name left over from an earlier run.
DEPLOYED_ENDPOINT_NAME = endpoint_name if (predictor and endpoint_name) else None

if DEPLOYED_ENDPOINT_NAME:
    print(f"\n✅ Deployment successful!")
    print(f"📍 Endpoint: {endpoint_name}")

    # Stored for the later steps
    print(f"📝 Stored in variable: DEPLOYED_ENDPOINT_NAME")

    # Extra settling time on top of deploy_model(), which deploys with wait=True
    print("\n⏳ Waiting for endpoint to be fully ready...")
    time.sleep(60)  # Wait 1 minute for endpoint to stabilize

else:
    print("❌ Deployment failed!")
    print("Check error messages above and try troubleshooting")

🚀 STEP 1: Deploying model...
📦 Creating HuggingFace model configuration...
🚀 Deploying model...
⏱️  This may take 5-10 minutes...

🔄 Trying deployment on ml.m5.xlarge...


INFO:sagemaker:Creating model with name: huggingface-pytorch-inference-2025-05-24-14-57-51-240
INFO:sagemaker:Creating endpoint-config with name tech-translator-model-1748098670
INFO:sagemaker:Creating endpoint with name tech-translator-model-1748098670


----------!✅ Successfully deployed on ml.m5.xlarge!
✅ Model deployed successfully!
📍 Endpoint name: tech-translator-model-1748098670

✅ Deployment successful!
📍 Endpoint: tech-translator-model-1748098670
📝 Stored in variable: DEPLOYED_ENDPOINT_NAME

⏳ Waiting for endpoint to be fully ready...


# STEP 2: Test the deployed model

In [12]:
# STEP 2: Test the deployed model
# Requires DEPLOYED_ENDPOINT_NAME and predictor from Step 1.
if 'DEPLOYED_ENDPOINT_NAME' in locals() and DEPLOYED_ENDPOINT_NAME:
    print("🧪 STEP 2: Testing the deployed FLAN-T5 model...")
    print("=" * 50)

    # Quick verification first
    print("⚡ Quick verification test...")
    if quick_test(DEPLOYED_ENDPOINT_NAME):
        print("\n✅ Basic functionality confirmed!")

        # Run comprehensive tests
        print("\n🔍 Running comprehensive tests...")
        basic_success = test_model_basic(predictor)
        test_simple_prompts(DEPLOYED_ENDPOINT_NAME)  # prints its own verdicts, returns nothing

        # Test RAG integration
        print("\n🔗 Testing RAG integration...")
        rag_success = test_rag_integration(predictor, DEPLOYED_ENDPOINT_NAME)

        # Only claim overall success when every reported result is positive
        if not rag_success:
            print("\n⚠️  RAG integration has issues - check your DynamoDB data")
        elif not basic_success:
            print("\n⚠️  RAG integration works, but the basic model tests reported failures - review the output above")
        else:
            print("\n🎉 All tests passed! Your TechTranslator system is working!")
    else:
        print("❌ Basic test failed - check the error messages above")
else:
    # Same guard message style as Step 3, so a skipped Step 1 is not silent
    print("❌ No endpoint available - complete Step 1 first!")

🧪 STEP 2: Testing the deployed FLAN-T5 model...
⚡ Quick verification test...
⚡ Quick Test: 'What is R-squared?'
Response: r squar
✅ Endpoint is working!

✅ Basic functionality confirmed!

🔍 Running comprehensive tests...

🧪 Running Basic Model Tests for FLAN-T5...
------------------------------

Test 1: Simple Definition
Input: 'Define R-squared in statistics.'
Output: 'r = r * 2'
⚠️  Test passed but content may not be domain-specific

Test 2: Insurance Context
Input: 'Explain what loss ratio means in insurance.'
Output: 'In an insurance policy, a loss ratio is the percentage of a policyholder's loss attributable to insurance policies.'
✅ Test passed - relevant content generated!

Test 3: Concept Explanation
Input: 'What is a predictive model and how is it used?'
Output: 'A predictive model is a method for predicting the behavior of an individual or a group of individuals'
✅ Test passed - relevant content generated!

Test 4: Professional Context
Input: 'Explain how R-squared helps unde

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

🔍 Searching in concept: r-squared
Found 8 items to search
🔍 Found 3 relevant chunks
  1. r-squared-underwriter (similarity: 0.727)
  2. r-squared-definition (similarity: 0.609)
📝 Created instruction prompt (78 words)
🤖 FLAN-T5 Response: As an underwriter, you can think of R-squared as a measure of how well your pricing model captures risk factors. If your pricing model has an R-squared, you can think of R-squared as a measure of how well your pricing model captures risk factors.
✅ RAG integration test passed!

🎉 All tests passed! Your TechTranslator system is working!


# STEP 3: Advanced testing and demonstration

In [13]:
# STEP 3: Advanced testing and demonstration - Updated for FLAN-T5
# Runs three end-to-end RAG scenarios (classify -> vector search -> prompt -> generate),
# then three plain instruction prompts, and finishes with a cost reminder.

if 'DEPLOYED_ENDPOINT_NAME' in locals() and DEPLOYED_ENDPOINT_NAME:
    print("🎯 STEP 3: Advanced Testing & Demonstration (FLAN-T5)")
    print("=" * 50)

    # Test different insurance scenarios with instruction-style prompts
    test_scenarios = [
        {
            "query": "What is R-squared for an underwriter?",
            "description": "R-squared explanation for underwriters"
        },
        {
            "query": "Explain loss ratio to an executive",
            "description": "Loss ratio for executives"
        },
        {
            "query": "How do predictive models help actuaries?",
            "description": "Predictive models for actuaries"
        }
    ]

    # Keyword lists used for the rough quality verdict (case-insensitive substring match)
    insurance_terms = ['insurance', 'premium', 'risk', 'claim', 'policy', 'underwriting', 'actuarial']
    concept_terms = ['ratio', 'model', 'squared', 'predictive', 'statistical', 'measure']

    for i, scenario in enumerate(test_scenarios, 1):
        print(f"\n🧪 Scenario {i}: {scenario['description']}")
        print("-" * 40)

        query = scenario['query']
        print(f"User Query: '{query}'")

        # Extract concept and audience
        concept_and_audience = extract_concept_and_audience(query)
        print(f"📊 Concept: {concept_and_audience['concept']}, Audience: {concept_and_audience['audience']}")

        # Vector search
        search_results = vector_search(query, concept_and_audience['concept'], top_k=3)

        if search_results:
            print(f"🔍 Found {len(search_results)} relevant chunks")
            print(f"   Top match: {search_results[0]['item']['vector_id']} (similarity: {search_results[0]['similarity']:.3f})")

            # Context is the first 120 characters of each of the two best chunks
            context_text = ""
            for result in search_results[:2]:
                context_text += f"- {result['item']['text'][:120]}...\n"

            # Create instruction-style prompt optimized for FLAN-T5
            audience = concept_and_audience['audience']
            concept = concept_and_audience['concept'].replace('-', ' ').title()

            # Written with explicit newlines so the exact prompt text (including the space
            # after the first sentence) does not depend on editor whitespace handling.
            prompt = (
                f"You are explaining {concept} to an insurance {audience}. \n\n"
                f"Context information:\n{context_text}\n\n"
                f"Task: Based on the context above, explain {concept} in a way that an insurance {audience} would understand. Focus on practical applications in insurance.\n\n"
                "Explanation:"
            )

            # Get FLAN-T5 response with appropriate parameters
            response = call_endpoint(DEPLOYED_ENDPOINT_NAME, prompt, max_new_tokens=100)

            if response:
                print(f"🤖 TechTranslator (FLAN-T5) Response:\n{response}")

                # Evaluate response quality
                response_lower = response.lower()
                insurance_count = sum(1 for term in insurance_terms if term in response_lower)
                concept_count = sum(1 for term in concept_terms if term in response_lower)

                if insurance_count >= 1 and concept_count >= 1:
                    print("✅ High-quality response - contains both insurance and technical terms")
                elif insurance_count >= 1 or concept_count >= 1:
                    print("⚠️  Good response - contains relevant domain terms")
                else:
                    print("⚠️  Basic response - may lack domain specificity")

                print("✅ Scenario completed successfully!")
            else:
                print("❌ Failed to generate response")
        else:
            print("⚠️  No relevant chunks found in vector search")

        print("-" * 40)
        time.sleep(2)  # Brief pause between scenarios

    # Additional FLAN-T5 specific tests
    print(f"\n🎯 FLAN-T5 Specific Instruction Tests")
    print("-" * 40)

    instruction_tests = [
        {
            "instruction": "Summarize the key benefits of using R-squared in insurance pricing models.",
            "description": "Summarization task"
        },
        {
            "instruction": "List three ways predictive models improve insurance operations.",
            "description": "List generation task"
        },
        {
            "instruction": "Compare loss ratio and combined ratio in insurance.",
            "description": "Comparison task"
        }
    ]

    for j, test in enumerate(instruction_tests, 1):
        print(f"\n📋 Instruction Test {j}: {test['description']}")
        print(f"Instruction: '{test['instruction']}'")

        response = call_endpoint(DEPLOYED_ENDPOINT_NAME, test['instruction'], max_new_tokens=80)

        if response:
            print(f"Response: {response}")

            # Only the list task has a format check; the others just need a response
            if test['description'] == "List generation task":
                has_list_format = any(marker in response for marker in ['1.', '2.', '3.', '-', '•'])
                if has_list_format:
                    print("✅ Good - followed list format instruction")
                else:
                    print("⚠️  Okay - content relevant but didn't follow list format")
            else:
                print("✅ Response generated successfully")
        else:
            print("❌ No response generated")

        print("-" * 25)

    print(f"\n🏁 Advanced FLAN-T5 testing completed!")

    # Cost reminder. deploy_model() only tries ml.m5.xlarge and ml.c5.xlarge, so the
    # reminder must not quote a price for a different instance type.
    print(f"\n💰 COST REMINDER:")
    print(f"Your FLAN-T5 endpoint '{DEPLOYED_ENDPOINT_NAME}' is running and incurring charges.")
    print("It is billed hourly at the rate of its instance type (ml.m5.xlarge or ml.c5.xlarge) - check current SageMaker pricing.")
    print(f"Run cleanup_endpoint('{DEPLOYED_ENDPOINT_NAME}') when done!")

else:
    print("❌ No endpoint available - complete Steps 1 and 2 first!")

🎯 STEP 3: Advanced Testing & Demonstration (FLAN-T5)

🧪 Scenario 1: R-squared explanation for underwriters
----------------------------------------
User Query: 'What is R-squared for an underwriter?'
📊 Concept: r-squared, Audience: underwriter


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

🔍 Searching in concept: r-squared
Found 8 items to search
🔍 Found 3 relevant chunks
   Top match: r-squared-underwriter (similarity: 0.727)
🤖 TechTranslator (FLAN-T5) Response:
R-squared (R2) is a statistical measure that represents the proportion of the variance for a dependent variable that's e.g. the average of the odds of a driver being killed in an accident.
⚠️  Good response - contains relevant domain terms
✅ Scenario completed successfully!
----------------------------------------

🧪 Scenario 2: Loss ratio for executives
----------------------------------------
User Query: 'Explain loss ratio to an executive'
📊 Concept: loss-ratio, Audience: executive


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

🔍 Searching in concept: loss-ratio
Found 8 items to search
🔍 Found 3 relevant chunks
   Top match: loss-ratio-definition (similarity: 0.626)
🤖 TechTranslator (FLAN-T5) Response:
The basic formula is: Loss Ratio = (Incurred Losses + Loss Adjustment Expenses) / Earned Premiums  100%. A combined rat...
✅ High-quality response - contains both insurance and technical terms
✅ Scenario completed successfully!
----------------------------------------

🧪 Scenario 3: Predictive models for actuaries
----------------------------------------
User Query: 'How do predictive models help actuaries?'
📊 Concept: predictive-model, Audience: actuary


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

🔍 Searching in concept: predictive-model
Found 8 items to search
🔍 Found 3 relevant chunks
   Top match: predictive-model-context (similarity: 0.602)
🤖 TechTranslator (FLAN-T5) Response:
A predictive model is a statistical algorithm that uses historical data to predict future outcomes or classify new data.
⚠️  Good response - contains relevant domain terms
✅ Scenario completed successfully!
----------------------------------------

🎯 FLAN-T5 Specific Instruction Tests
----------------------------------------

📋 Instruction Test 1: Summarization task
Instruction: 'Summarize the key benefits of using R-squared in insurance pricing models.'
Response: Use R-squared to calculate model parameters.
✅ Response generated successfully
-------------------------

📋 Instruction Test 2: List generation task
Instruction: 'List three ways predictive models improve insurance operations.'
Response: predict loss.
⚠️  Okay - content relevant but didn't follow list format
-------------------------

📋 Instr

In [15]:
# Improved instruction tests with better prompts for FLAN-T5
def test_improved_instructions(endpoint_name):
    """Test FLAN-T5 with more explicit, better-structured instructions"""
    print(f"\n🎯 Improved FLAN-T5 Instruction Tests")
    print("-" * 40)
    
    improved_tests = [
        {
            "instruction": "Based on insurance knowledge, summarize in 2-3 sentences: What are the key benefits of using R-squared in insurance pricing models?",
            "description": "Improved summarization task",
            "max_tokens": 60
        },
        {
            "instruction": "List exactly 3 ways predictive models improve insurance operations. Format your answer as:\n1. [first way]\n2. [second way]\n3. [third way]",
            "description": "Explicit list format task",
            "max_tokens": 80
        },
        {
            "instruction": "In insurance, explain the difference between loss ratio and combined ratio. Loss ratio is... Combined ratio is...",
            "description": "Structured comparison task",
            "max_tokens": 70
        }
    ]
    
    for j, test in enumerate(improved_tests, 1):
        print(f"\n📋 Improved Test {j}: {test['description']}")
        print(f"Instruction: '{test['instruction']}'")
        
        response = call_endpoint(endpoint_name, test['instruction'], max_new_tokens=test['max_tokens'])
        
        if response:
            print(f"Response: {response}")
            
            # Evaluate improvement
            if "1." in response or "2." in response or "3." in response:
                print("✅ Excellent - followed numbered list format!")
            elif any(word in response.lower() for word in ['ratio', 'model', 'insurance', 'risk']):
                print("✅ Good - contains relevant insurance terms")
            else:
                print("⚠️  Basic response generated")
        else:
            print("❌ No response generated")
        
        print("-" * 25)

# Test with context-enhanced prompts
def test_context_enhanced_prompts(endpoint_name):
    """Test using your RAG context for better responses"""
    print(f"\n🎯 Context-Enhanced Instruction Tests")
    print("-" * 40)
    
    # Use your vector search to get context
    context_query = "What is loss ratio?"
    search_results = vector_search(context_query, "loss-ratio", top_k=2)
    
    if search_results:
        context = search_results[0]['item']['text'][:200] + "..."
        
        enhanced_prompt = f"""Context: {context}

Task: Based on the context above, list 3 key points about loss ratio in insurance.

Format your answer as:
1. [point one]
2. [point two]  
3. [point three]

Answer:"""
        
        print("📝 Testing context-enhanced prompt...")
        response = call_endpoint(endpoint_name, enhanced_prompt, max_new_tokens=90)
        
        if response:
            print(f"🤖 Context-Enhanced Response: {response}")
            if "1." in response and "2." in response:
                print("✅ Excellent - used context AND followed format!")
            else:
                print("✅ Good - generated relevant response")
        else:
            print("❌ No response")
    else:
        print("⚠️ Could not retrieve context for enhanced test")

# Execute both improved test suites, but only when an endpoint name is
# defined in this session and is non-empty.
if 'DEPLOYED_ENDPOINT_NAME' not in locals() or not DEPLOYED_ENDPOINT_NAME:
    print("No endpoint available")
else:
    test_improved_instructions(DEPLOYED_ENDPOINT_NAME)
    test_context_enhanced_prompts(DEPLOYED_ENDPOINT_NAME)


🎯 Improved FLAN-T5 Instruction Tests
----------------------------------------

📋 Improved Test 1: Improved summarization task
Instruction: 'Based on insurance knowledge, summarize in 2-3 sentences: What are the key benefits of using R-squared in insurance pricing models?'
Response: In insurance pricing models, R-squared is a key factor in determining the best pricing model.
✅ Good - contains relevant insurance terms
-------------------------

📋 Improved Test 2: Explicit list format task
Instruction: 'List exactly 3 ways predictive models improve insurance operations. Format your answer as:
1. [first way]
2. [second way]
3. [third way]'
Response: 3.
✅ Excellent - followed numbered list format!
-------------------------

📋 Improved Test 3: Structured comparison task
Instruction: 'In insurance, explain the difference between loss ratio and combined ratio. Loss ratio is... Combined ratio is...'
Response: The ratio between the amount of insurance paid and the amount of loss.
✅ Good - conta

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

🔍 Searching in concept: loss-ratio
Found 8 items to search
📝 Testing context-enhanced prompt...
🤖 Context-Enhanced Response: 1. Loss ratio is a key insurance metric that measures the relationship between incurred losses and earned premiums, expressed as a percentage. 2. Loss ratio is used to estimate the risk of loss. 3. Loss ratio is used to measure the risk of loss.
✅ Excellent - used context AND followed format!


# STEP 4: CRITICAL - Cleanup to avoid charges

In [15]:
# STEP 4: CRITICAL - Cleanup to avoid charges
#
# Deletes the deployed endpoint (if any) through cleanup_endpoint() and then
# prints a session summary.
#
# cleanup_succeeded records what actually happened so the summary never claims
# a cleanup that failed or never ran:
#   None  -> no endpoint was defined, nothing was attempted
#   True  -> cleanup_endpoint() reported success
#   False -> cleanup_endpoint() reported failure
cleanup_succeeded = None

if 'DEPLOYED_ENDPOINT_NAME' in locals() and DEPLOYED_ENDPOINT_NAME:
    print("🧹 STEP 4: Cleanup (CRITICAL for cost management)")
    print("=" * 50)

    print(f"Current endpoint: {DEPLOYED_ENDPOINT_NAME}")

    # Check endpoint status before cleanup. This is informational only: a
    # failure here is reported and the deletion below is still attempted.
    try:
        sagemaker_client = boto3.client('sagemaker')
        response = sagemaker_client.describe_endpoint(EndpointName=DEPLOYED_ENDPOINT_NAME)
        status = response['EndpointStatus']
        print(f"📊 Current status: {status}")

        if status == 'InService':
            print("✅ Endpoint is running (and charging you money!)")
        else:
            print(f"ℹ️  Endpoint status: {status}")

    except Exception as e:
        print(f"⚠️  Could not check endpoint status: {str(e)}")

    # Confirm cleanup
    print(f"\n⚠️  WARNING: You are about to delete endpoint: {DEPLOYED_ENDPOINT_NAME}")
    print("This will stop all charges but you'll need to redeploy to continue testing.")

    # For notebook use, we'll proceed with cleanup
    # In production, you might want user confirmation

    print("🗑️  Proceeding with endpoint deletion...")

    cleanup_succeeded = bool(cleanup_endpoint(DEPLOYED_ENDPOINT_NAME))

    if cleanup_succeeded:
        print("✅ Cleanup completed successfully!")
        print("💰 Your endpoint is no longer incurring charges.")

        # Clear the variable so re-running this cell does not retry the deletion
        DEPLOYED_ENDPOINT_NAME = None
        print("📝 Endpoint variable cleared.")

    else:
        print("❌ Cleanup failed - you may need to delete manually from SageMaker console")
        print(f"Endpoint name: {DEPLOYED_ENDPOINT_NAME}")

else:
    print("ℹ️  No active endpoint found to cleanup.")

print("\n🏁 Session complete!")
print("Summary of what we accomplished:")
print("✅ Deployed model to SageMaker")
print("✅ Tested basic model functionality")
print("✅ Integrated with RAG system")
print("✅ Demonstrated prompt engineering")
# Report the cleanup outcome that was actually observed in this cell.
if cleanup_succeeded:
    print("✅ Cleaned up resources to avoid charges")
elif cleanup_succeeded is None:
    print("ℹ️  No active endpoint in this session - nothing was cleaned up")
else:
    print("❌ Cleanup FAILED - the endpoint may still be incurring charges")
print("\nYour TechTranslator system is working! 🎉")

🧹 STEP 4: Cleanup (CRITICAL for cost management)
Current endpoint: tech-translator-gpt2-1748092833
📊 Current status: InService
✅ Endpoint is running (and charging you money!)

This will stop all charges but you'll need to redeploy to continue testing.
🗑️  Proceeding with endpoint deletion...

🧹 Cleaning up endpoint: tech-translator-gpt2-1748092833
✅ Endpoint deletion initiated!
💰 This will stop incurring charges.
✅ Cleanup completed successfully!
💰 Your endpoint is no longer incurring charges.
📝 Endpoint variable cleared.

🏁 Session complete!
Summary of what we accomplished:
✅ Deployed model to SageMaker
✅ Tested basic model functionality
✅ Integrated with RAG system
✅ Demonstrated prompt engineering
✅ Cleaned up resources to avoid charges

Your TechTranslator system is working! 🎉
