# Genie Enhancement v3 - Debug Notebook

## Three-Space Architecture with Sequential Fixes

**Safe enhancement workflow:**
1. Clone production → dev-working + dev-best
2. Score benchmarks on dev-working
3. Apply fixes ONE AT A TIME to dev-working
4. After each fix: score → keep if improved, rollback if worse
5. Promote dev-best → production (user decision)
6. Cleanup dev spaces

**Key Safety Features:**
- Production is NEVER modified directly
- Automatic rollback on score regression
- User controls final promotion

## 1️⃣ Setup

In [None]:
# IMPORTANT: Clear cached modules to ensure latest code is loaded
import sys


def clear_cached_modules(package='lib'):
    """Drop *package* and its dotted submodules from ``sys.modules``.

    Only the package itself (``lib``) and its submodules (``lib.scorer``,
    ``lib.llm``, ...) are evicted. Unrelated modules whose names merely
    begin with the same letters (``lib2to3``, ``libcst``, ...) are kept,
    so the rest of the session is not disturbed.

    Returns:
        The list of module names that were removed.
    """
    dotted_prefix = package + '.'
    # Snapshot the keys first: sys.modules must not change size mid-iteration.
    stale = [
        name for name in list(sys.modules)
        if name == package or name.startswith(dotted_prefix)
    ]
    for name in stale:
        # pop() with a default tolerates an entry that disappeared meanwhile.
        sys.modules.pop(name, None)
    return stale


modules_to_remove = clear_cached_modules('lib')

print(f"Cleared {len(modules_to_remove)} cached lib modules")

In [None]:
# Project path setup
import sys
import os
from pathlib import Path


def find_project_root(start, marker='genie_enhancer'):
    """Return the nearest directory named *marker* at or above *start*.

    When no ancestor carries that name, *start* itself is returned instead
    of walking all the way up to the filesystem root.
    """
    start = Path(start)
    for candidate in (start, *start.parents):
        if candidate.name == marker:
            return candidate
    return start


# Find project root
current_path = Path(os.getcwd())
project_root = find_project_root(current_path)
if project_root.name != 'genie_enhancer':
    # Make the fallback visible: imports of `lib.*` will likely fail from here.
    print(f"WARNING: no 'genie_enhancer' directory at or above {current_path}; "
          "using the current directory as project root")

# Put the project root first on sys.path so `lib.*` resolves to this checkout.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# Configure logging
import logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s | %(name)s | %(levelname)s | %(message)s',
    datefmt='%H:%M:%S'
)

# Verbose logging for lib modules
for module in ['lib.genie_client', 'lib.scorer', 'lib.llm', 'lib.enhancer',
               'lib.applier', 'lib.space_api', 'lib.space_cloner', 'lib.sequential_enhancer']:
    logging.getLogger(module).setLevel(logging.DEBUG)

print(f"Project root: {project_root}")
print("Logging: DEBUG mode enabled")

In [None]:
# Imports
# Standard library
import json
import time
from datetime import datetime

# Project modules, resolved through the project root added to sys.path
# by the path-setup cell.
from lib.genie_client import GenieConversationalClient
from lib.space_cloner import SpaceCloner
from lib.scorer import BenchmarkScorer
from lib.benchmark_parser import BenchmarkLoader
from lib.llm import DatabricksLLMClient
from lib.sql import SQLExecutor
from lib.sequential_enhancer import SequentialEnhancer

# Reaching this line means every import above succeeded.
print("‚úÖ All imports successful")

## 2️⃣ Configuration

In [None]:
# === UPDATE THESE VALUES ===
import os

DATABRICKS_HOST = "your-workspace.cloud.databricks.com"
DATABRICKS_TOKEN = "YOUR_TOKEN_HERE"
GENIE_SPACE_ID = "your-space-id"  # Production space to enhance
WAREHOUSE_ID = "your-warehouse-id"  # For metric views
LLM_ENDPOINT = "databricks-claude-sonnet-4"

# Keep the secret out of the notebook where possible: while the placeholder
# above is untouched, fall back to the DATABRICKS_TOKEN environment variable.
# A token typed in above always wins.
if DATABRICKS_TOKEN == "YOUR_TOKEN_HERE":
    DATABRICKS_TOKEN = os.environ.get("DATABRICKS_TOKEN", DATABRICKS_TOKEN)

# Enhancement settings
TARGET_SCORE = 0.90
INDEXING_WAIT = 60  # seconds to wait after each change

print(f"Host: {DATABRICKS_HOST}")
print(f"Production Space: {GENIE_SPACE_ID}")
print(f"Warehouse: {WAREHOUSE_ID}")
print(f"LLM: {LLM_ENDPOINT}")
print(f"Target: {TARGET_SCORE:.0%}")

# Flag settings that still hold their template value so the problem is
# visible here rather than when a client is first used.
# The token value itself is never printed.
_placeholder_checks = [
    ("DATABRICKS_HOST", DATABRICKS_HOST, "your-workspace.cloud.databricks.com"),
    ("DATABRICKS_TOKEN", DATABRICKS_TOKEN, "YOUR_TOKEN_HERE"),
    ("GENIE_SPACE_ID", GENIE_SPACE_ID, "your-space-id"),
    ("WAREHOUSE_ID", WAREHOUSE_ID, "your-warehouse-id"),
]
_unset = [name for name, value, placeholder in _placeholder_checks if value == placeholder]
if _unset:
    print(f"WARNING: still set to placeholder values: {', '.join(_unset)}")

## 3️⃣ Initialize Clients

In [None]:
# Space Cloner: the object later used to set up, promote and clean up the
# three spaces.
print("Initializing Space Cloner...")
cloner_settings = {"host": DATABRICKS_HOST, "token": DATABRICKS_TOKEN}
space_cloner = SpaceCloner(**cloner_settings)
print("‚úÖ Space Cloner initialized")

In [None]:
# LLM client, throttled to protect against endpoint rate limits
print("Initializing LLM Client...")
llm_settings = dict(
    host=DATABRICKS_HOST,
    token=DATABRICKS_TOKEN,
    endpoint_name=LLM_ENDPOINT,
    request_delay=10.0,           # 10s pause between requests
    rate_limit_base_delay=90.0,   # 90s base delay once rate-limited
)
llm_client = DatabricksLLMClient(**llm_settings)

# Report the connection check; a failure is only printed, not raised.
llm_reachable = llm_client.test_connection()
if not llm_reachable:
    print("‚ùå LLM connection failed")
else:
    print("‚úÖ LLM Client connected")
    print("   - Request delay: 10s")
    print("   - Rate limit backoff: 90s base")

In [None]:
# SQL Executor bound to the configured warehouse (used for metric views)
print("Initializing SQL Executor...")
executor_settings = {
    "host": DATABRICKS_HOST,
    "token": DATABRICKS_TOKEN,
    "warehouse_id": WAREHOUSE_ID,
}
sql_executor = SQLExecutor(**executor_settings)
print("‚úÖ SQL Executor initialized")

## 4️⃣ Load Benchmarks

In [None]:
# Read the benchmark suite from <project_root>/benchmarks/benchmarks.json
benchmark_file = project_root.joinpath("benchmarks", "benchmarks.json")
print(f"Loading from: {benchmark_file}")

loader = BenchmarkLoader(str(benchmark_file))
all_benchmarks = loader.load()
print(f"‚úÖ Loaded {len(all_benchmarks)} benchmarks")

# Preview the first three questions, each cut to 60 characters
for position, bench in enumerate(all_benchmarks[:3], start=1):
    print(f"  {position}. {bench['question'][:60]}...")

In [None]:
# Optional: run against a small subset for faster testing
USE_SUBSET = True  # Set to False for full run

# Test mode keeps only the first 5 benchmarks; full mode keeps the whole list.
benchmarks = all_benchmarks[:5] if USE_SUBSET else all_benchmarks

if not USE_SUBSET:
    print(f"FULL MODE: Using {len(benchmarks)} benchmarks")
else:
    print(f"‚ö†Ô∏è TEST MODE: Using {len(benchmarks)} benchmarks")

---
## 5️⃣ Setup Three-Space Architecture

This creates:
- **Production** - Original space (never modified)
- **Dev-Working** - Where changes are tested
- **Dev-Best** - Holds best configuration (for rollback)

In [None]:
# Clone the production space into dev-working and dev-best, then keep the
# resulting IDs and initial config for the cells that follow.
rule = "=" * 60
print(rule)
print("SETTING UP THREE-SPACE ARCHITECTURE")
print(rule)
print()
print(f"Production Space: {GENIE_SPACE_ID}")
print("Creating dev-working and dev-best clones...")
print()

setup_result = space_cloner.setup_three_spaces(production_space_id=GENIE_SPACE_ID)

# Stop the notebook here if the clones could not be created.
if not setup_result['success']:
    print(f"‚ùå Setup failed: {setup_result['error']}")
    raise RuntimeError("Three-space setup failed")

print()
print(rule)
print("THREE-SPACE ARCHITECTURE READY")
print(rule)
print(f"Production:   {setup_result['production_id']}")
print(f"Dev-Working:  {setup_result['dev_working_id']}")
print(f"Dev-Best:     {setup_result['dev_best_id']}")

# Store for later
PRODUCTION_ID = setup_result['production_id']
DEV_WORKING_ID = setup_result['dev_working_id']
DEV_BEST_ID = setup_result['dev_best_id']
INITIAL_CONFIG = setup_result['initial_config']

## 6️⃣ Initialize Scorer and Enhancer

In [None]:
# Genie client bound to the DEV-WORKING space
print("Initializing Genie Client for dev-working space...")
genie_settings = {
    "host": DATABRICKS_HOST,
    "token": DATABRICKS_TOKEN,
    "space_id": DEV_WORKING_ID,  # dev-working, NOT production
    "verbose": True,
}
genie_client = GenieConversationalClient(**genie_settings)
print(f"‚úÖ Genie Client initialized (space: {DEV_WORKING_ID[:16]}...)")

In [None]:
# Benchmark scorer wired to the Genie, LLM and SQL clients created above
print("Initializing Scorer...")
scorer_config = {
    "question_timeout": 120,
    "question_delay": 3.0,
    "error_delay": 5.0,
    "parallel_workers": 0,  # Sequential for debugging
}
scorer = BenchmarkScorer(
    genie_client=genie_client,
    llm_client=llm_client,
    sql_executor=sql_executor,
    config=scorer_config,
)
print("‚úÖ Scorer initialized (sequential mode)")

In [None]:
# Sequential Enhancer: receives the LLM client, cloner, scorer and SQL
# executor and orchestrates the rest of the flow
print("Initializing Sequential Enhancer...")
enhancer_parts = {
    "llm_client": llm_client,
    "space_cloner": space_cloner,
    "scorer": scorer,
    "sql_executor": sql_executor,
}
enhancer = SequentialEnhancer(**enhancer_parts)
print("‚úÖ Sequential Enhancer initialized")

---
## 7️⃣ Initial Scoring

In [None]:
# Baseline: score the selected benchmarks before any fix is applied.
rule = "=" * 60
print(rule)
print("INITIAL SCORING (on dev-working)")
print(rule)
print()

start_time = datetime.now()
initial_results = scorer.score(benchmarks)
elapsed = datetime.now() - start_time
duration = elapsed.total_seconds()

print()
print(rule)
print("INITIAL SCORING COMPLETE")
print(rule)
print(f"Score: {initial_results['score']:.1%}")
print(f"Passed: {initial_results['passed']}/{initial_results['total']}")
print(f"Failed: {initial_results['failed']}")
print(f"Duration: {duration:.1f}s")

# Tell the user when the baseline already meets the target
target_reached = initial_results['score'] >= TARGET_SCORE
if target_reached:
    print()
    print("üéâ Already at target score! No enhancement needed.")

In [None]:
# Collect and list the benchmarks that did not pass the initial scoring
failed_results = [
    outcome for outcome in initial_results['results'] if not outcome['passed']
]

print(f"\n‚ùå Failed Benchmarks ({len(failed_results)}):\n")
for number, failure in enumerate(failed_results, start=1):
    print(f"{number}. {failure['question'][:60]}...")
    print(f"   Category: {failure.get('failure_category', 'unknown')}")
    # The reason line appears only when a non-empty reason was recorded.
    reason = failure.get('failure_reason')
    if reason:
        print(f"   Reason: {reason[:80]}")
    print()

---
## 8️⃣ Generate Enhancement Plan

In [None]:
# Ask the enhancer to turn the initial failures into fixes grouped by category.
rule = "=" * 60
print(rule)
print("GENERATING ENHANCEMENT PLAN")
print(rule)
print()
print(f"Analyzing {len(failed_results)} failures...")
print("Categories: metric_view, metadata, sample_query, instruction")
print()

plan_start = datetime.now()
grouped_fixes = enhancer.analyze_all_failures(
    benchmark_results=initial_results,
    space_config=INITIAL_CONFIG,
    parallel_workers=1,  # Sequential to avoid rate limits
)
plan_elapsed = datetime.now() - plan_start
plan_duration = plan_elapsed.total_seconds()

# Total number of fixes across every category returned
total_fixes = sum(map(len, grouped_fixes.values()))
print()
print(rule)
print("PLAN GENERATION COMPLETE")
print(rule)
print(f"Total fixes: {total_fixes}")
print(f"Duration: {plan_duration:.1f}s")

# Per-category counts; a category missing from the plan counts as 0
for category in ('metric_view', 'metadata', 'sample_query', 'instruction'):
    category_fixes = grouped_fixes.get(category, [])
    print(f"  - {category}: {len(category_fixes)}")

In [None]:
# Preview at most three fixes per category
print("\nFix Preview:")
print("-" * 40)

for category in ('metric_view', 'metadata', 'sample_query', 'instruction'):
    fixes = grouped_fixes.get(category, [])
    if not fixes:
        continue  # nothing planned for this category

    print(f"\n{category.upper()} ({len(fixes)} fixes):")
    for i, fix in enumerate(fixes[:3], start=1):  # Show first 3
        fix_type = fix.get('type', 'unknown')
        if fix_type == 'add_column_description':
            print(f"  {i}. {fix_type}: {fix.get('table')}.{fix.get('column')}")
        elif fix_type == 'add_synonym':
            print(f"  {i}. {fix_type}: {fix.get('table')}.{fix.get('column')} ‚Üí '{fix.get('synonym')}'")
        else:
            print(f"  {i}. {fix_type}")

    hidden = len(fixes) - 3
    if hidden > 0:
        print(f"  ... and {hidden} more")

---
## 9️⃣ Run Sequential Enhancement Loop

This applies fixes one at a time:
- Apply fix to dev-working
- Wait for indexing
- Score benchmarks
- If improved: keep fix, update dev-best
- If worse: rollback from dev-best

In [None]:
# Run the one-fix-at-a-time loop and report the before/after scores.
rule = "=" * 60
print(rule)
print("SEQUENTIAL ENHANCEMENT LOOP")
print(rule)
print()
# Estimate: indexing wait plus 60 seconds per fix, shown in minutes
estimated_minutes = total_fixes * (INDEXING_WAIT + 60) / 60
print(f"Total fixes to try: {total_fixes}")
print(f"Indexing wait: {INDEXING_WAIT}s per fix")
print(f"Estimated time: ~{estimated_minutes:.0f} minutes")
print()
print("Starting...")
print()

loop_start = datetime.now()
loop_result = enhancer.run_sequential_loop(
    benchmarks=benchmarks,
    grouped_fixes=grouped_fixes,
    indexing_wait_time=INDEXING_WAIT,
    target_score=TARGET_SCORE,
)
loop_elapsed = datetime.now() - loop_start
loop_duration = loop_elapsed.total_seconds()

print()
print(rule)
print("ENHANCEMENT LOOP COMPLETE")
print(rule)
print(f"Initial Score: {loop_result['initial_score']:.1%}")
print(f"Final Score:   {loop_result['final_score']:.1%}")
score_delta = loop_result['final_score'] - loop_result['initial_score']
print(f"Improvement:   {score_delta:+.1%}")
print(f"Fixes Applied: {len(loop_result['fixes_applied'])}")
print(f"Fixes Rejected: {len(loop_result['fixes_rejected'])}")
loop_minutes = loop_duration / 60
print(f"Duration:      {loop_duration:.1f}s ({loop_minutes:.1f} min)")

In [None]:
# List the fixes the loop kept and (up to five of) the ones it rejected.
applied_fixes = loop_result['fixes_applied']
rejected_fixes = loop_result['fixes_rejected']

# Show applied fixes
if applied_fixes:
    print("\n✅ Applied Fixes:")
    for i, fix in enumerate(applied_fixes, 1):
        print(f"  {i}. {fix.get('type')}")

# Show rejected fixes
if rejected_fixes:
    print(f"\n❌ Rejected Fixes ({len(rejected_fixes)})")
    for i, fix in enumerate(rejected_fixes[:5], 1):
        # The key may be present with a None or non-string value, so
        # normalise to text before slicing instead of relying on the
        # .get() default alone.
        reason = str(fix.get('rejection_reason') or 'N/A')
        print(f"  {i}. {fix.get('type')}: {reason[:50]}")
    if len(rejected_fixes) > 5:
        print(f"  ... and {len(rejected_fixes) - 5} more")

---
## 🔟 Promotion Decision

**Your options:**
1. **Promote** - Apply dev-best config to production
2. **Keep for review** - Leave dev spaces for manual inspection
3. **Discard** - Delete dev spaces, keep production unchanged

In [None]:
# Summary of the run ahead of the promote / cleanup decision
rule = "=" * 60
print(rule)
print("ENHANCEMENT SUMMARY")
print(rule)
print()
print(f"Production Space: {PRODUCTION_ID}")
print(f"Dev-Best Space:   {DEV_BEST_ID}")
print(f"Dev-Working:      {DEV_WORKING_ID}")
print()
score_before = loop_result['initial_score']
print(f"Initial Score: {score_before:.1%}")
score_after = loop_result['final_score']
print(f"Best Score:    {score_after:.1%}")
print(f"Improvement:   {score_after - score_before:+.1%}")
print()
print("Ready for your decision:")
print("  1. Run 'Promote to Production' cell to apply changes")
print("  2. Run 'Cleanup' cell to discard and keep production unchanged")

In [None]:
# OPTION 1: Promote to Production
# WARNING: This will modify your production Genie Space!

CONFIRM_PROMOTE = False  # Set to True to enable

if not CONFIRM_PROMOTE:
    # Safety latch: nothing is touched until the flag above is flipped.
    print("Set CONFIRM_PROMOTE = True to promote dev-best to production")
else:
    print("Promoting dev-best to production...")
    promote_result = space_cloner.promote_to_production()

    if not promote_result['success']:
        print(f"‚ùå Promotion failed: {promote_result['error']}")
    else:
        print("‚úÖ Production updated with best configuration!")

        # Dev spaces are removed only after a successful promotion
        print("\nCleaning up dev spaces...")
        cleanup_result = space_cloner.cleanup_dev_spaces()
        if not cleanup_result['success']:
            print(f"‚ö†Ô∏è Cleanup warning: {cleanup_result['error']}")
        else:
            print("‚úÖ Dev spaces deleted")

In [None]:
# OPTION 2: Cleanup without promoting
# Deletes the dev spaces; the production space is not touched by this cell.

CONFIRM_CLEANUP = False  # Set to True to enable

if not CONFIRM_CLEANUP:
    # Safety latch: nothing is deleted until the flag above is flipped.
    print("Set CONFIRM_CLEANUP = True to delete dev spaces")
else:
    print("Cleaning up dev spaces (no changes to production)...")
    cleanup_result = space_cloner.cleanup_dev_spaces()

    if not cleanup_result['success']:
        print(f"‚ö†Ô∏è Cleanup warning: {cleanup_result['error']}")
    else:
        print("‚úÖ Dev spaces deleted")
        print("Production space unchanged.")

---
## Debug Utilities

In [None]:
# Write a compact run summary to enhancement_result.json in the working directory
output = dict(
    production_id=PRODUCTION_ID,
    dev_working_id=DEV_WORKING_ID,
    dev_best_id=DEV_BEST_ID,
    initial_score=loop_result['initial_score'],
    final_score=loop_result['final_score'],
    # Only the counts are exported, not the fix payloads
    fixes_applied=len(loop_result['fixes_applied']),
    fixes_rejected=len(loop_result['fixes_rejected']),
)

with open('enhancement_result.json', 'w') as result_file:
    result_file.write(json.dumps(output, indent=2))
print("‚úÖ Results saved to enhancement_result.json")

In [None]:
# Smoke test: send one question through the Genie client (dev-working space)
test_question = "What tables are available?"
print(f"Testing Genie (dev-working): {test_question}")

response = genie_client.ask(test_question, timeout=60)
print(f"Status: {response['status']}")

# Show the first 100 characters of the SQL, if any came back
generated_sql = response.get('sql')
if generated_sql:
    print(f"SQL: {generated_sql[:100]}...")