In [2]:
!pip install sentence-transformers boto3 numpy

Collecting sentence-transformers
  Using cached sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Downloading sentence_transformers-4.1.0-py3-none-any.whl (345 kB)
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-4.1.0


# Cell 1: Install Dependencies and Setup

In [3]:
# Import required libraries
import json
import boto3
import uuid
import time
from datetime import datetime, timedelta
import logging
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
import numpy as np

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

print("Dependencies installed and imported successfully!")

2025-05-23 20:38:11.692063: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Dependencies installed and imported successfully!


# Cell 2: Upload to S3 Knowledge Base

In [4]:
# S3 client shared by the cells below
s3 = boto3.client('s3')

# Pre-provisioned bucket that holds the knowledge-base documents
BUCKET_NAME = "tech-translator-s3-knowledge-base"

print(f"Using existing S3 bucket: {BUCKET_NAME}")

# Probe the bucket and preview at most five keys. Any failure is reported and
# re-raised so later cells never run against an unreachable bucket.
try:
    response = s3.head_bucket(Bucket=BUCKET_NAME)
    print(f"✅ Successfully connected to bucket: {BUCKET_NAME}")

    objects = s3.list_objects_v2(Bucket=BUCKET_NAME, MaxKeys=5)
    if 'Contents' not in objects:
        print("📁 Bucket is empty")
    else:
        print(f"📁 Bucket contains {len(objects['Contents'])} objects (showing max 5):")
        for obj in objects['Contents'][:5]:
            print(f"  - {obj['Key']} ({obj['Size']} bytes)")

except Exception as e:
    print(f"❌ Error accessing bucket: {e}")
    print("Please check if the bucket exists and you have proper permissions")
    raise

# Insurance data science concepts knowledge base
#
# Each entry is one concept document with the following shape:
#   concept_id            - slug identifying the concept (also used to name the uploaded JSON file)
#   title                 - human-readable concept name
#   content               - dict with "definition", "technical_details",
#                           "insurance_context" and "limitations" texts
#   audience_explanations - dict keyed by audience ("underwriter", "actuary", "executive")
#   examples              - list of {"context", "explanation"} dicts
#   related_concepts      - list of related concept names (free text)
concepts = [
    # --- Concept: R-squared ---
    {
        "concept_id": "r-squared",
        "title": "R-squared",
        "content": {
            "definition": "R-squared (R²) is a statistical measure that represents the proportion of the variance for a dependent variable that's explained by independent variables in a regression model.",
            "technical_details": "R-squared values range from 0 to 1, where 0 indicates that the model explains none of the variability, and 1 indicates perfect prediction. It is calculated as 1 minus the ratio of the residual sum of squares to the total sum of squares.",
            "insurance_context": "In insurance pricing, R-squared helps actuaries understand how well factors like age, location, or claim history explain premium variations. A high R-squared indicates that the selected rating factors are good predictors of risk.",
            "limitations": "R-squared will always increase as more variables are added to a model, even if those variables are not significant. Adjusted R-squared addresses this limitation by penalizing the addition of variables that don't improve the model."
        },
        "audience_explanations": {
            "underwriter": "As an underwriter, you can think of R-squared as a measure of how well your pricing model captures risk factors. If your pricing model has an R-squared of 0.75, it means that 75% of the premium variation is explained by the factors in your model, while 25% remains unexplained. This unexplained portion might represent risk factors you're not capturing, which could lead to adverse selection if competitors have better models.",
            "actuary": "When comparing generalized linear models (GLMs) for pricing, the model with higher R-squared (all else being equal) is explaining more of the variance in loss ratios across segments. However, be cautious of overfitting - a model with too many parameters might have a high R-squared on training data but perform poorly on new data. Cross-validation and consideration of information criteria like AIC and BIC are essential complements to R-squared evaluation.",
            "executive": "R-squared provides a simple measure of how well our predictive models are working. An R-squared of 0.8 means our pricing model captures 80% of what drives premium differences, indicating a strong predictive model. The remaining 20% represents potential opportunity for competitive advantage if we can identify additional predictive factors that our competitors haven't discovered yet."
        },
        "examples": [
            {
                "context": "Auto Insurance Pricing",
                "explanation": "In an auto insurance pricing model, an R-squared of 0.72 indicates that factors like driver age, vehicle type, and prior claims explain 72% of the variation in claim costs across policyholders."
            },
            {
                "context": "Policy Renewal Prediction",
                "explanation": "A customer retention model with an R-squared of 0.35 suggests that while you have some predictive power, much of what drives customers to renew or leave remains unexplained by your current variables."
            }
        ],
        "related_concepts": ["predictive modeling", "statistical significance", "p-value", "adjusted r-squared"]
    },
    # --- Concept: Loss Ratio ---
    {
        "concept_id": "loss-ratio",
        "title": "Loss Ratio",
        "content": {
            "definition": "Loss ratio is a key insurance metric that measures the relationship between incurred losses and earned premiums, expressed as a percentage.",
            "technical_details": "The basic formula is: Loss Ratio = (Incurred Losses + Loss Adjustment Expenses) / Earned Premiums × 100%. A combined ratio additionally includes underwriting expenses and is calculated as Loss Ratio + Expense Ratio.",
            "insurance_context": "Loss ratio is one of the most important profitability metrics in insurance. Generally, a loss ratio below 100% indicates underwriting profit (before considering investment income), while a ratio above 100% indicates an underwriting loss.",
            "limitations": "Loss ratios can be volatile in the short term, especially for low-frequency, high-severity lines of business or for small portfolios. Loss ratios also don't account for the time value of money or investment income."
        },
        "audience_explanations": {
            "underwriter": "If you're seeing a loss ratio of 85% in a particular segment, it means that for every $100 in premium, $85 is being paid out in claims and claim expenses. This leaves only $15 for operational expenses, commissions, and profit. If your company's expense ratio is 20%, this segment is operating at a 5% loss. You may need to consider rate adjustments or tighter underwriting guidelines for this segment.",
            "actuary": "When modeling loss ratios, we need to consider both frequency and severity trends, as well as large claim volatility and development patterns. A pure loss ratio that excludes IBNR and case reserve development can give a misleading picture of profitability. For long-tail lines, analyzing loss ratios by accident year versus calendar year can reveal important trends in ultimate loss expectations.",
            "executive": "A loss ratio trend that increases from 60% to 70% over three quarters may signal emerging profitability challenges that require attention. With an expense ratio of 25%, this change would reduce your combined ratio from a profitable 85% to a borderline 95%, significantly impacting your underwriting margin. This trend could be due to inflation, changing risk profiles, competitor pricing actions, or claims handling efficiency."
        },
        "examples": [
            {
                "context": "Property Insurance Performance",
                "explanation": "A homeowners insurance portfolio with a 65% loss ratio and 30% expense ratio yields a combined ratio of 95%, indicating a 5% underwriting profit margin."
            },
            {
                "context": "Line of Business Comparison",
                "explanation": "Commercial auto typically runs at higher loss ratios (around 75-80%) compared to commercial property (around 50-60%), which means pricing, underwriting, and reinsurance strategies need to be tailored differently for these lines."
            }
        ],
        "related_concepts": ["combined ratio", "expense ratio", "underwriting profit", "IBNR", "loss development"]
    },
    # --- Concept: Predictive Model ---
    {
        "concept_id": "predictive-model",
        "title": "Predictive Model",
        "content": {
            "definition": "A predictive model is a statistical algorithm that uses historical data to predict future outcomes or classify new data points.",
            "technical_details": "Common predictive modeling techniques include linear and logistic regression, decision trees, random forests, gradient boosting machines, neural networks, and ensemble methods. Models are evaluated using metrics like accuracy, precision, recall, F1-score, AUC-ROC, and mean squared error.",
            "insurance_context": "In insurance, predictive models help estimate the likelihood of claims, premium adequacy, customer behavior, and fraud. They are used throughout the insurance lifecycle, from marketing and underwriting to claims management and renewal.",
            "limitations": "Predictive models can only identify patterns present in historical data, may struggle with rare events, and can perpetuate historical biases if not carefully designed. They also require ongoing monitoring and retraining as conditions change."
        },
        "audience_explanations": {
            "underwriter": "The predictive model flags applications with risk scores based on patterns in historical data. For example, if an application scores in the highest risk decile, it has characteristics similar to policies that historically had 2.5 times more claims than average. These models don't replace your judgment - they provide an additional data point to complement your expertise, especially for factors that might not be obvious from traditional underwriting guidelines.",
            "actuary": "When building predictive models for insurance applications, we need to balance predictive power with interpretability and regulatory compliance. A black-box model might achieve higher accuracy but could raise regulatory concerns about explainability. Generalized linear models (GLMs) remain popular in insurance because they provide a good balance of predictive power and interpretability, with clear indications of which factors drive predictions and by how much.",
            "executive": "Our predictive models give us a competitive edge by identifying patterns that traditional approaches might miss. For example, our customer retention predictive model has improved retention by 5% by identifying at-risk policies before renewal, allowing targeted interventions. This translates to approximately $2M in saved premium that would otherwise have been lost, with minimal additional operational cost."
        },
        "examples": [
            {
                "context": "Claims Triage",
                "explanation": "A predictive model analyzes new claims and assigns each a complexity score from 1-10. Claims scoring 8+ are automatically routed to senior adjusters, while scores of 3 or below are fast-tracked for simple processing, optimizing adjuster workloads."
            },
            {
                "context": "Premium Leakage Detection",
                "explanation": "A random forest model analyzes policy characteristics and identifies applications with a high probability of misclassification or missing information, flagging them for underwriter review before binding to prevent premium leakage."
            }
        ],
        "related_concepts": ["machine learning", "artificial intelligence", "data mining", "feature engineering", "model validation"]
    }
]

# Upload concepts to existing bucket
try:
    # S3 has no real directories; the zero-byte 'concepts/' key is only a placeholder.
    # Create it only when S3 explicitly reports the key as missing -- any other failure
    # (permissions, throttling, network) must surface instead of being mistaken for "not found".
    try:
        s3.head_object(Bucket=BUCKET_NAME, Key='concepts/')
    except s3.exceptions.ClientError as head_error:
        error_code = head_error.response.get('Error', {}).get('Code')
        if error_code not in ('404', 'NoSuchKey', 'NotFound'):
            raise
        s3.put_object(Bucket=BUCKET_NAME, Key='concepts/', Body='')
        print("📁 Created concepts/ directory")

    # Upload each concept as a separate JSON file named after its concept_id
    for concept in concepts:
        concept_id = concept["concept_id"]
        file_content = json.dumps(concept, indent=2)
        key = f"concepts/{concept_id}.json"

        s3.put_object(
            Bucket=BUCKET_NAME,
            Key=key,
            Body=file_content.encode('utf-8'),  # explicit bytes; the loader decodes as UTF-8
            ContentType='application/json'
        )
        logger.info(f"Uploaded {concept_id} to s3://{BUCKET_NAME}/{key}")

    print(f"✅ Knowledge base concepts uploaded to existing bucket: {BUCKET_NAME}")

except Exception as e:
    logger.error(f"Error uploading to existing bucket: {str(e)}")
    raise

INFO:__main__:Uploaded r-squared to s3://tech-translator-s3-knowledge-base/concepts/r-squared.json


Using existing S3 bucket: tech-translator-s3-knowledge-base
✅ Successfully connected to bucket: tech-translator-s3-knowledge-base
📁 Bucket is empty
📁 Created concepts/ directory


INFO:__main__:Uploaded loss-ratio to s3://tech-translator-s3-knowledge-base/concepts/loss-ratio.json
INFO:__main__:Uploaded predictive-model to s3://tech-translator-s3-knowledge-base/concepts/predictive-model.json


✅ Knowledge base concepts uploaded to existing bucket: tech-translator-s3-knowledge-base


# Cell 3: Access to DynamoDB Vector Storage Table

In [6]:
# Initialize DynamoDB client (table-level calls) and resource (item-level calls)
dynamodb = boto3.client('dynamodb')
dynamodb_resource = boto3.resource('dynamodb')

# Use your existing table
TABLE_NAME = "tech-translator-dynamodb-vector-storage"

# Defined up front so every path below (existing table, freshly created table)
# leaves the flag set without having to probe locals() afterwards.
HAS_EXISTING_DATA = False

print(f"Using existing DynamoDB table: {TABLE_NAME}")

# Check if table exists and is accessible
try:
    response = dynamodb.describe_table(TableName=TABLE_NAME)

except dynamodb.exceptions.ResourceNotFoundException:
    # Typed exception instead of matching on the error message text
    print(f"❌ Table {TABLE_NAME} does not exist!")
    print("Creating the table now...")

    try:
        dynamodb.create_table(
            TableName=TABLE_NAME,
            KeySchema=[
                {'AttributeName': 'concept_id', 'KeyType': 'HASH'},  # Partition key
                {'AttributeName': 'vector_id', 'KeyType': 'RANGE'}   # Sort key
            ],
            AttributeDefinitions=[
                {'AttributeName': 'concept_id', 'AttributeType': 'S'},
                {'AttributeName': 'vector_id', 'AttributeType': 'S'},
            ],
            BillingMode='PAY_PER_REQUEST'  # On-demand capacity
        )

        # Wait for table to be created
        print(f"Creating table {TABLE_NAME}...")
        waiter = dynamodb.get_waiter('table_exists')
        waiter.wait(TableName=TABLE_NAME)
        print(f"✅ Table {TABLE_NAME} created successfully!")

    except Exception as create_error:
        print(f"❌ Error creating table: {str(create_error)}")
        raise

except Exception as e:
    print(f"❌ Error accessing table: {str(e)}")
    raise

else:
    table_status = response['Table']['TableStatus']
    item_count = response['Table']['ItemCount']

    print(f"✅ Successfully connected to table: {TABLE_NAME}")
    print(f"📊 Table Status: {table_status}")
    # ItemCount is only refreshed periodically by DynamoDB, so it is informational only
    print(f"📊 Current Item Count (approximate): {item_count}")

    # Check table schema
    key_schema = response['Table']['KeySchema']
    print("🔑 Table Key Schema:")
    for key in key_schema:
        print(f"  - {key['AttributeName']} ({key['KeyType']})")

    # Make sure the table is usable *before* reading from it
    if table_status == 'ACTIVE':
        print("🟢 Table is ACTIVE and ready for read/write operations")
    else:
        print(f"🟡 Table status is {table_status} - waiting for it to become ACTIVE...")
        waiter = dynamodb.get_waiter('table_exists')
        waiter.wait(TableName=TABLE_NAME)
        print("🟢 Table is now ACTIVE")

    # Decide whether data exists from an actual read rather than the stale ItemCount
    table = dynamodb_resource.Table(TABLE_NAME)
    sample_response = table.scan(Limit=3)
    sample_items = sample_response.get('Items', [])
    HAS_EXISTING_DATA = bool(sample_items)

    if HAS_EXISTING_DATA:
        print(f"⚠️  Table already contains data (approximate item count: {item_count})")
        print("📋 Sample existing items:")
        for i, item in enumerate(sample_items[:3]):
            print(f"  {i+1}. concept_id: {item.get('concept_id', 'N/A')}, vector_id: {item.get('vector_id', 'N/A')}")

        print("\n🤔 Options for existing data:")
        print("1. Keep existing data and add new embeddings alongside")
        print("2. Clear existing data and start fresh")
        print("Note: We'll proceed with option 1 (keep existing data) by default")
    else:
        print("📝 Table is empty and ready for new embeddings")

# Store these values for later use
print(f"\n📝 Configuration:")
print(f"S3 Bucket: {BUCKET_NAME}")
print(f"DynamoDB Table: {TABLE_NAME}")
print(f"Has Existing Data: {HAS_EXISTING_DATA}")

print(f"\n✅ DynamoDB table verification complete!")

Using existing DynamoDB table: tech-translator-dynamodb-vector-storage
✅ Successfully connected to table: tech-translator-dynamodb-vector-storage
📊 Table Status: ACTIVE
📊 Current Item Count: 0
🔑 Table Key Schema:
  - concept_id (HASH)
  - vector_id (RANGE)
📝 Table is empty and ready for new embeddings
🟢 Table is ACTIVE and ready for read/write operations

📝 Configuration:
S3 Bucket: tech-translator-s3-knowledge-base
DynamoDB Table: tech-translator-dynamodb-vector-storage
Has Existing Data: False

✅ DynamoDB table verification complete!


# Cell 4: Generate and Store Vector Embeddings

In [7]:
# Initialize sentence transformer model and DynamoDB resource
print("Loading sentence transformer model...")
# Embedding model shared by the functions below: it encodes both the stored
# chunks and the search queries, so the two are embedded in the same space.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Same kind of resource as created in the table-verification cell; creating it
# again here keeps this cell runnable on its own.
dynamodb_resource = boto3.resource('dynamodb')

def load_concepts_from_s3(bucket_name):
    """Load every concept document stored under ``concepts/`` in an S3 bucket.

    Listing goes through the ``list_objects_v2`` paginator, so buckets with more
    keys than fit in a single listing response are read completely instead of
    being silently truncated to the first page.

    Args:
        bucket_name: Name of the bucket holding the ``concepts/*.json`` objects.

    Returns:
        A list of concept dicts, one per ``.json`` object, in listing order.
        Empty when no concept documents exist (a warning is logged).

    Raises:
        Exception: Any S3 or JSON decoding error is logged and re-raised.
    """
    concepts = []

    try:
        logger.info(f"Loading concepts from s3://{bucket_name}/concepts/")
        paginator = s3.get_paginator('list_objects_v2')

        for page in paginator.paginate(Bucket=bucket_name, Prefix='concepts/'):
            # 'Contents' is absent from a page that lists no objects
            for obj in page.get('Contents', []):
                key = obj['Key']
                # Skips the 'concepts/' placeholder key and any non-JSON objects
                if not key.endswith('.json'):
                    continue
                logger.info(f"Loading {key}")
                obj_response = s3.get_object(Bucket=bucket_name, Key=key)
                concept = json.loads(obj_response['Body'].read().decode('utf-8'))
                concepts.append(concept)
                logger.info(f"Loaded concept: {concept['title']}")

        if not concepts:
            logger.warning("No concept documents found")

    except Exception as e:
        logger.error(f"Error loading concepts: {str(e)}")
        raise

    return concepts

def generate_chunks(concept):
    """Break a concept document into the text chunks that get embedded.

    Produces, in order: the definition, technical-details and insurance-context
    chunks, one chunk per audience explanation, then one chunk per example.
    Every chunk carries ``concept_id``, ``vector_id``, ``title``, ``text`` and
    ``type``; audience chunks add ``audience`` and example chunks add ``context``.
    """
    cid = concept["concept_id"]
    heading = concept["title"]
    body = concept["content"]

    def make_chunk(suffix, text, kind, **extra):
        # Fields shared by every chunk, plus any kind-specific extras
        chunk = {
            "concept_id": cid,
            "vector_id": f"{cid}-{suffix}",
            "title": heading,
            "text": text,
            "type": kind,
        }
        chunk.update(extra)
        return chunk

    # Fixed sections of the concept body
    pieces = [
        make_chunk("definition", body["definition"], "definition"),
        make_chunk("technical", body["technical_details"], "technical"),
        make_chunk("context", body["insurance_context"], "context"),
    ]

    # One chunk per target audience
    pieces.extend(
        make_chunk(who, explanation, "audience", audience=who)
        for who, explanation in concept["audience_explanations"].items()
    )

    # One chunk per worked example, numbered from zero
    pieces.extend(
        make_chunk(f"example-{idx}", sample["explanation"], "example", context=sample["context"])
        for idx, sample in enumerate(concept["examples"])
    )

    return pieces

def generate_and_store_embeddings(chunks, table_name):
    """Embed each chunk's text and write one item per chunk to DynamoDB.

    All texts are encoded in a single batched ``model.encode`` call with the
    progress bar disabled, rather than one call (and one progress bar) per chunk.
    Writes stay best-effort: a failed ``put_item`` is logged and the loop moves
    on, but failures are now collected and reported so the caller can tell a
    partial load from a complete one.

    Args:
        chunks: List of chunk dicts with ``concept_id``, ``vector_id``,
            ``title``, ``text`` and ``type``; ``audience`` and ``context`` are
            copied onto the item when present.
        table_name: Name of the DynamoDB table to write to.

    Returns:
        ``{"stored": <int>, "failed": [<vector_id>, ...]}``.
    """
    table = dynamodb_resource.Table(table_name)
    summary = {"stored": 0, "failed": []}

    print(f"Generating embeddings for {len(chunks)} chunks...")
    if not chunks:
        return summary

    # One batched forward pass; row i of the result belongs to chunks[i]
    embeddings = model.encode([chunk["text"] for chunk in chunks], show_progress_bar=False)

    for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
        print(f"Processing chunk {i+1}/{len(chunks)}: {chunk['vector_id']}")

        # Prepare item for DynamoDB
        item = {
            "concept_id": chunk["concept_id"],
            "vector_id": chunk["vector_id"],
            "title": chunk["title"],
            "text": chunk["text"],
            "type": chunk["type"],
            # Stored as a JSON string; readers json.loads() it back into a list
            "embedding": json.dumps(embedding.tolist())
        }

        # Add optional attributes
        for optional_key in ("audience", "context"):
            if optional_key in chunk:
                item[optional_key] = chunk[optional_key]

        # Store in DynamoDB (best-effort per item)
        try:
            table.put_item(Item=item)
            summary["stored"] += 1
            logger.info(f"Stored embedding for {chunk['vector_id']}")
        except Exception as e:
            summary["failed"].append(chunk["vector_id"])
            logger.error(f"Error storing {chunk['vector_id']}: {str(e)}")

    if summary["failed"]:
        logger.warning(
            f"{len(summary['failed'])} of {len(chunks)} chunks were not stored: {summary['failed']}"
        )

    return summary

# Main embedding generation process: S3 documents -> text chunks -> stored embeddings
print("Starting RAG embedding generation process...")

# Pull the concept documents back out of S3
concepts = load_concepts_from_s3(BUCKET_NAME)
print(f"Loaded {len(concepts)} concepts")

if not concepts:
    print("❌ No concepts loaded - check S3 bucket")
else:
    # Flatten every concept's chunks into one list
    all_chunks = []
    for concept in concepts:
        chunks = generate_chunks(concept)
        all_chunks += chunks
    print(f"Generated {len(all_chunks)} text chunks")

    # Embed the chunks and write them to the vector table
    generate_and_store_embeddings(all_chunks, TABLE_NAME)

    print("✅ RAG embedding generation completed successfully!")

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Loading sentence transformer model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

INFO:__main__:Loading concepts from s3://tech-translator-s3-knowledge-base/concepts/
INFO:__main__:Loading concepts/loss-ratio.json
INFO:__main__:Loaded concept: Loss Ratio
INFO:__main__:Loading concepts/predictive-model.json
INFO:__main__:Loaded concept: Predictive Model
INFO:__main__:Loading concepts/r-squared.json
INFO:__main__:Loaded concept: R-squared


Starting RAG embedding generation process...
Loaded 3 concepts
Generated 24 text chunks
Generating embeddings for 24 chunks...
Processing chunk 1/24: loss-ratio-definition


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__:Stored embedding for loss-ratio-definition


Processing chunk 2/24: loss-ratio-technical


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__:Stored embedding for loss-ratio-technical


Processing chunk 3/24: loss-ratio-context


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__:Stored embedding for loss-ratio-context


Processing chunk 4/24: loss-ratio-underwriter


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__:Stored embedding for loss-ratio-underwriter


Processing chunk 5/24: loss-ratio-actuary


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__:Stored embedding for loss-ratio-actuary


Processing chunk 6/24: loss-ratio-executive


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__:Stored embedding for loss-ratio-executive


Processing chunk 7/24: loss-ratio-example-0


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__:Stored embedding for loss-ratio-example-0


Processing chunk 8/24: loss-ratio-example-1


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__:Stored embedding for loss-ratio-example-1


Processing chunk 9/24: predictive-model-definition


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__:Stored embedding for predictive-model-definition


Processing chunk 10/24: predictive-model-technical


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__:Stored embedding for predictive-model-technical


Processing chunk 11/24: predictive-model-context


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__:Stored embedding for predictive-model-context


Processing chunk 12/24: predictive-model-underwriter


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__:Stored embedding for predictive-model-underwriter


Processing chunk 13/24: predictive-model-actuary


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__:Stored embedding for predictive-model-actuary


Processing chunk 14/24: predictive-model-executive


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__:Stored embedding for predictive-model-executive


Processing chunk 15/24: predictive-model-example-0


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__:Stored embedding for predictive-model-example-0


Processing chunk 16/24: predictive-model-example-1


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__:Stored embedding for predictive-model-example-1


Processing chunk 17/24: r-squared-definition


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__:Stored embedding for r-squared-definition


Processing chunk 18/24: r-squared-technical


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__:Stored embedding for r-squared-technical


Processing chunk 19/24: r-squared-context


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__:Stored embedding for r-squared-context


Processing chunk 20/24: r-squared-underwriter


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__:Stored embedding for r-squared-underwriter


Processing chunk 21/24: r-squared-actuary


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__:Stored embedding for r-squared-actuary


Processing chunk 22/24: r-squared-executive


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__:Stored embedding for r-squared-executive


Processing chunk 23/24: r-squared-example-0


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__:Stored embedding for r-squared-example-0


Processing chunk 24/24: r-squared-example-1


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__:Stored embedding for r-squared-example-1


✅ RAG embedding generation completed successfully!


# Cell 5: Test Vector Search Implementation

In [30]:
def extract_concept_and_audience(query):
    """Guess which concept and which audience a free-text query refers to.

    Matching is a case-insensitive substring check against fixed keyword lists;
    the first label (in table order) with a matching keyword wins. When nothing
    matches, the concept falls back to 'predictive-model' and the audience to
    'general'.

    Returns:
        dict with keys 'concept' and 'audience'.
    """
    text = query.lower()

    def first_label(table, fallback):
        # Walk the table in insertion order and stop at the first keyword hit
        for label, phrases in table.items():
            for phrase in phrases:
                if phrase in text:
                    return label
        return fallback

    # Concept mapping
    concept_table = {
        'r-squared': ['r squared', 'r-squared', 'r2', 'coefficient of determination'],
        'loss-ratio': ['loss ratio', 'claims ratio', 'incurred losses'],
        'predictive-model': ['predictive model', 'prediction model', 'machine learning', 'ml model']
    }

    # Audience mapping
    audience_table = {
        'underwriter': ['underwriter', 'underwriting'],
        'actuary': ['actuary', 'actuarial', 'actuaries'],
        'executive': ['executive', 'ceo', 'manager', 'leadership']
    }

    return {
        'concept': first_label(concept_table, 'predictive-model'),
        'audience': first_label(audience_table, 'general'),
    }


def vector_search(query, concept_id=None, top_k=5):
    """Rank stored chunks by cosine similarity to the query.

    Candidates are fetched with a key query when ``concept_id`` is given,
    otherwise with a full table scan. DynamoDB returns results in pages, so the
    read loop follows ``LastEvaluatedKey`` until every page has been consumed;
    a single call would silently ignore everything past the first page.

    Args:
        query: Free-text search query; embedded with the shared ``model``.
        concept_id: Optional partition key restricting the search to one concept.
        top_k: Maximum number of results to return.

    Returns:
        Up to ``top_k`` dicts ``{'item': <stored item>, 'similarity': <float>}``,
        highest similarity first. Empty list when no items were found.
    """
    # Generate query embedding
    query_embedding = model.encode(query)

    # Query DynamoDB
    table = dynamodb_resource.Table(TABLE_NAME)

    if concept_id:
        print(f"🔍 Searching in concept: {concept_id}")
        fetch_page = table.query
        request = {
            "KeyConditionExpression": "concept_id = :concept_id",
            "ExpressionAttributeValues": {":concept_id": concept_id},
        }
    else:
        print("🔍 Searching across all concepts")
        fetch_page = table.scan
        request = {}

    # Follow pagination until DynamoDB stops returning a continuation key
    items = []
    while True:
        response = fetch_page(**request)
        items.extend(response.get('Items', []))
        last_key = response.get('LastEvaluatedKey')
        if not last_key:
            break
        request['ExclusiveStartKey'] = last_key

    print(f"Found {len(items)} items to search")

    if not items:
        return []

    # Calculate similarities
    results = []
    for item in items:
        # Embeddings are stored as JSON strings
        stored_embedding = json.loads(item['embedding'])

        # scipy's cosine() is a distance; similarity is its complement
        similarity = 1 - cosine(query_embedding, stored_embedding)

        results.append({
            'item': item,
            'similarity': similarity
        })

    # Sort by similarity (highest first)
    results.sort(key=lambda x: x['similarity'], reverse=True)

    return results[:top_k]


def _pluralize_audience(label):
    """Return the title-cased plural of an audience label ('actuary' -> 'Actuaries')."""
    title = label.title()
    # consonant + 'y' pluralises to '-ies'; everything else just takes an 's'
    if len(title) > 1 and title[-1] == 'y' and title[-2].lower() not in 'aeiou':
        return title[:-1] + 'ies'
    return title + 's'


def _select_audience_chunk(audience_results, query, audience=None):
    """Pick the audience chunk to show: explicit audience, then one named in the query, then top-ranked."""
    if audience:
        wanted = str(audience).lower()
        for result in audience_results:
            if str(result['item'].get('audience', '')).lower() == wanted:
                return result

    query_lower = (query or '').lower()
    for result in audience_results:
        label = str(result['item'].get('audience', '')).lower()
        if not label:
            continue
        # Match either the singular label or its plural ("actuaries")
        if label in query_lower or _pluralize_audience(label).lower() in query_lower:
            return result

    # Nothing matched: fall back to the highest-ranked audience chunk
    return audience_results[0]


def generate_rag_response(query, search_results, audience=None):
    """Assemble a markdown answer from retrieved chunks.

    Args:
        query: The user's question. Used to prefer the audience-specific
            chunk whose audience label is mentioned in the question.
        search_results: Ranked list of ``{'item': ..., 'similarity': ...}``
            dicts; each item is read for ``type``, ``text``, ``title`` and
            optionally ``audience`` / ``context``.
        audience: Optional audience label (e.g. ``'executive'``). When given
            and a chunk with that label was retrieved, that chunk is used
            for the audience section.

    Returns:
        A markdown string with a title taken from the top result followed by
        whichever of the Definition / In Insurance / audience / Examples
        sections have a matching chunk, or a fixed apology string when
        ``search_results`` is empty.
    """
    if not search_results:
        return "I couldn't find relevant information for your query."

    def chunks_of(kind):
        # .get() so an item without a 'type' is skipped instead of raising
        return [r for r in search_results if r['item'].get('type') == kind]

    # Get the best match
    best_match = search_results[0]['item']
    concept_title = best_match['title']

    # Build response using retrieved chunks
    response_parts = [f"# {concept_title} in Insurance\n"]

    # Add definition if available
    definition_results = chunks_of('definition')
    if definition_results:
        response_parts.append(f"## Definition\n{definition_results[0]['item']['text']}\n")

    # Add insurance context
    context_results = chunks_of('context')
    if context_results:
        response_parts.append(f"## In Insurance\n{context_results[0]['item']['text']}\n")

    # Add audience-specific explanation, preferring the audience that was asked about
    audience_results = chunks_of('audience')
    if audience_results:
        chosen = _select_audience_chunk(audience_results, query, audience)['item']
        heading = _pluralize_audience(chosen.get('audience', 'professional'))
        response_parts.append(f"## For {heading}\n{chosen['text']}\n")

    # Add examples (at most two)
    example_results = chunks_of('example')[:2]
    if example_results:
        response_parts.append("## Examples")
        for example in example_results:
            response_parts.append(f"**{example['item'].get('context', 'Example')}**: {example['item']['text']}")

    return "\n".join(response_parts)

In [31]:
# Smoke-test the whole pipeline: extraction -> vector search -> response assembly
test_queries = [
    "What is R-squared for an underwriter?",
    "Explain loss ratio to an executive",
    "How do predictive models help actuaries?",
    "Tell me about R-squared in simple terms"
]

print("🧪 Testing Complete RAG System\n" + "="*50)

for query in test_queries:
    print(f"\n💬 Query: '{query}'")
    print("-" * 40)

    # Step 1: work out which concept and audience the question is about
    extracted = extract_concept_and_audience(query)
    print(f"📊 Extracted - Concept: {extracted['concept']}, Audience: {extracted['audience']}")

    # Step 2: retrieve the closest chunks within that concept
    search_results = vector_search(query, extracted['concept'])

    if not search_results:
        print("❌ No results found")
    else:
        # Show the three best hits with their scores
        print(f"🎯 Top matches (similarity scores):")
        for rank, result in enumerate(search_results[:3], start=1):
            item = result['item']
            print(f"  {rank}. [{item['type']}] {item['vector_id']} ({result['similarity']:.3f})")

        # Step 3: assemble the answer from the retrieved chunks
        response = generate_rag_response(query, search_results)
        print(f"\n🤖 Generated Response:\n{response}")

    print("\n" + "="*80)

print("\n✅ RAG System Testing Complete!")

🧪 Testing Complete RAG System

💬 Query: 'What is R-squared for an underwriter?'
----------------------------------------
📊 Extracted - Concept: r-squared, Audience: underwriter


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

🔍 Searching in concept: r-squared
Found 8 items to search
🎯 Top matches (similarity scores):
  1. [audience] r-squared-underwriter (0.727)
  2. [definition] r-squared-definition (0.609)
  3. [example] r-squared-example-0 (0.593)

🤖 Generated Response:
# R-squared in Insurance

## Definition
R-squared (R²) is a statistical measure that represents the proportion of the variance for a dependent variable that's explained by independent variables in a regression model.

## For Underwriters
As an underwriter, you can think of R-squared as a measure of how well your pricing model captures risk factors. If your pricing model has an R-squared of 0.75, it means that 75% of the premium variation is explained by the factors in your model, while 25% remains unexplained. This unexplained portion might represent risk factors you're not capturing, which could lead to adverse selection if competitors have better models.

## Examples
**Auto Insurance Pricing**: In an auto insurance pricing model, an R-s

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

🔍 Searching in concept: loss-ratio
Found 8 items to search
🎯 Top matches (similarity scores):
  1. [definition] loss-ratio-definition (0.626)
  2. [technical] loss-ratio-technical (0.619)
  3. [context] loss-ratio-context (0.615)

🤖 Generated Response:
# Loss Ratio in Insurance

## Definition
Loss ratio is a key insurance metric that measures the relationship between incurred losses and earned premiums, expressed as a percentage.

## In Insurance
Loss ratio is one of the most important profitability metrics in insurance. Generally, a loss ratio below 100% indicates underwriting profit (before considering investment income), while a ratio above 100% indicates an underwriting loss.

## For Underwriters
If you're seeing a loss ratio of 85% in a particular segment, it means that for every $100 in premium, $85 is being paid out in claims and claim expenses. This leaves only $15 for operational expenses, commissions, and profit. If your company's expense ratio is 20%, this segment is operati

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

🔍 Searching in concept: predictive-model
Found 8 items to search
🎯 Top matches (similarity scores):
  1. [context] predictive-model-context (0.602)
  2. [definition] predictive-model-definition (0.559)
  3. [audience] predictive-model-underwriter (0.526)

🤖 Generated Response:
# Predictive Model in Insurance

## Definition
A predictive model is a statistical algorithm that uses historical data to predict future outcomes or classify new data points.

## In Insurance
In insurance, predictive models help estimate the likelihood of claims, premium adequacy, customer behavior, and fraud. They are used throughout the insurance lifecycle, from marketing and underwriting to claims management and renewal.

## For Underwriters
The predictive model flags applications with risk scores based on patterns in historical data. For example, if an application scores in the highest risk decile, it has characteristics similar to policies that historically had 2.5 times more claims than average. These model

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

🔍 Searching in concept: r-squared
Found 8 items to search
🎯 Top matches (similarity scores):
  1. [definition] r-squared-definition (0.864)
  2. [audience] r-squared-executive (0.738)
  3. [technical] r-squared-technical (0.702)

🤖 Generated Response:
# R-squared in Insurance

## Definition
R-squared (R²) is a statistical measure that represents the proportion of the variance for a dependent variable that's explained by independent variables in a regression model.

## In Insurance
In insurance pricing, R-squared helps actuaries understand how well factors like age, location, or claim history explain premium variations. A high R-squared indicates that the selected rating factors are good predictors of risk.

## For Executives
R-squared provides a simple measure of how well our predictive models are working. An R-squared of 0.8 means our pricing model captures 80% of what drives premium differences, indicating a strong predictive model. The remaining 20% represents potential opportunity 

# Cell 6: Save Configuration for Lambda Implementation

In [None]:
# Snapshot of the settings used by this notebook, kept for later reference
config = {
    "bucket_name": BUCKET_NAME,
    "table_name": TABLE_NAME,
    "embedding_model": "all-MiniLM-L6-v2",
    "region": "us-east-1",
    "creation_timestamp": int(time.time()),
    "concepts": [entry["concept_id"] for entry in concepts]
}

# Serialise the snapshot and upload it to the knowledge-base bucket
config_content = json.dumps(config, indent=2)
s3.put_object(
    Bucket=BUCKET_NAME,
    Key='rag_config.json',
    Body=config_content,
    ContentType='application/json'
)

# Summary of what was stored (one argument per output line)
print(
    "📋 RAG Configuration Summary",
    "=" * 40,
    f"S3 Bucket: {BUCKET_NAME}",
    f"DynamoDB Table: {TABLE_NAME}",
    f"Embedding Model: all-MiniLM-L6-v2",
    f"Concepts: {len(concepts)}",
    f"Total Chunks: {len(all_chunks)}",
    "\n✅ Configuration saved to S3 as 'rag_config.json'",
    sep="\n",
)

# Closing banner and follow-up checklist
print(
    "\n🎉 RAG Implementation Complete!",
    "\nNext Steps:",
    "1. Deploy an open-source LLM (Phi-2 or Mistral-7B)",
    "2. Create Lambda functions for your API",
    "3. Connect the frontend to your RAG system",
    "4. Remember to STOP this SageMaker space when done to save costs!",
    sep="\n",
)