# Genie Enhancement v3 - Debug Notebook

## 4-Stage Batch Apply Flow

This notebook tests the v3 enhancement workflow:

1. **Score** - Evaluate benchmarks on Genie Space
2. **Plan** - Analyze failures, generate ALL fixes
3. **Apply** - Apply ALL fixes in ONE batch update
4. **Validate** - Re-score and check improvement

## Key Difference from v2
- v2: Apply fixes one-at-a-time with rollback
- v3: Apply ALL fixes at once (batch)

## Usage
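Run the Setup, Configuration, Initialize Clients, and Load Benchmarks cells first, in order. After that, each stage can be re-run and debugged on its own, provided the stages before it have been executed at least once in the current session.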

## 1️⃣ Setup

In [None]:
# Project path setup
import sys
import os
from pathlib import Path

# Find project root: the nearest directory named 'genie_enhancer', starting at
# the current working directory and walking up through its parents.
current_path = Path(os.getcwd())
project_root = next(
    (
        candidate
        for candidate in (current_path, *current_path.parents)
        if candidate.name == 'genie_enhancer'
    ),
    None,
)

if project_root is None:
    # Previously the upward walk silently ended at the filesystem root, which
    # was then used as the project root and inserted into sys.path. Falling
    # back to the working directory keeps later paths local and makes the
    # problem visible immediately.
    project_root = current_path
    print(
        f"WARNING: no 'genie_enhancer' directory found at or above {current_path}; "
        "using the current directory as project root"
    )

# Put the project root first on sys.path so that `lib.*` imports resolve.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

print(f"Project root: {project_root}")

In [None]:
# Test imports from lib/
# Smoke-test every lib module this notebook depends on. Each import is tried
# independently so a single broken module does not hide the status of the rest.
import importlib

for _module_name, _symbol_name in [
    ("lib.genie_client", "GenieConversationalClient"),
    ("lib.space_api", "SpaceUpdater"),
    ("lib.scorer", "BenchmarkScorer"),
    ("lib.benchmark_parser", "BenchmarkLoader"),
    ("lib.llm", "DatabricksLLMClient"),
    ("lib.sql", "SQLExecutor"),
    ("lib.enhancer", "EnhancementPlanner"),
    ("lib.applier", "BatchApplier"),
]:
    try:
        # Bind the symbol at notebook scope, as `from <module> import <symbol>` would.
        globals()[_symbol_name] = getattr(
            importlib.import_module(_module_name), _symbol_name
        )
        print(f"‚úÖ {_module_name}")
    except Exception as e:
        # Deliberately broad: this is a diagnostic cell, so any failure while
        # importing (missing dependency, syntax error, missing attribute, ...)
        # is reported and the loop moves on to the next module.
        print(f"‚ùå {_module_name}: {e}")

In [None]:
# Full imports
import json
import time
from datetime import datetime

from lib.genie_client import GenieConversationalClient
from lib.space_api import SpaceUpdater
from lib.scorer import BenchmarkScorer
from lib.benchmark_parser import BenchmarkLoader
from lib.llm import DatabricksLLMClient
from lib.sql import SQLExecutor
from lib.enhancer import EnhancementPlanner
from lib.applier import BatchApplier

print("‚úÖ All imports successful")

## 2️⃣ Configuration

In [None]:
# === UPDATE THESE VALUES ===
import os

DATABRICKS_HOST = "your-workspace.cloud.databricks.com"
# Read the token from the DATABRICKS_TOKEN environment variable when it is set,
# so a real credential never has to be saved inside the notebook. The
# placeholder is kept as the default for backward compatibility.
DATABRICKS_TOKEN = os.environ.get("DATABRICKS_TOKEN", "YOUR_TOKEN_HERE")
GENIE_SPACE_ID = "your-space-id"
WAREHOUSE_ID = "your-warehouse-id"  # For metric views
LLM_ENDPOINT = "databricks-claude-sonnet-4"

# Target score: Stage 4 compares the final benchmark score against this value.
TARGET_SCORE = 0.90

# Echo the configuration for the run log. The token is intentionally not printed.
print(f"Host: {DATABRICKS_HOST}")
print(f"Space ID: {GENIE_SPACE_ID}")
print(f"Warehouse: {WAREHOUSE_ID}")
print(f"LLM: {LLM_ENDPOINT}")
print(f"Target: {TARGET_SCORE:.0%}")

## 3️⃣ Initialize Clients

In [None]:
# Genie Client
# Conversational client bound to the configured Genie Space.
print("Initializing Genie Client...")
_genie_client_kwargs = {
    "host": DATABRICKS_HOST,
    "token": DATABRICKS_TOKEN,
    "space_id": GENIE_SPACE_ID,
}
genie_client = GenieConversationalClient(**_genie_client_kwargs)
print("‚úÖ Genie Client initialized")

In [None]:
# LLM Client
# Client for the serving endpoint named by LLM_ENDPOINT.
print("Initializing LLM Client...")
_llm_client_kwargs = {
    "host": DATABRICKS_HOST,
    "token": DATABRICKS_TOKEN,
    "endpoint_name": LLM_ENDPOINT,
}
llm_client = DatabricksLLMClient(**_llm_client_kwargs)

# Probe the endpoint once and report the outcome; a failed probe is only
# reported here, it does not stop the notebook.
_llm_status = (
    "‚úÖ LLM Client connected"
    if llm_client.test_connection()
    else "‚ùå LLM connection failed"
)
print(_llm_status)

In [None]:
# Space API (for export/import)
# Used later to export the current space configuration and by the batch applier.
print("Initializing Space API...")
_space_api_kwargs = {"host": DATABRICKS_HOST, "token": DATABRICKS_TOKEN}
space_api = SpaceUpdater(**_space_api_kwargs)
print("‚úÖ Space API initialized")

In [None]:
# SQL Executor (for metric views)
# Runs statements against the warehouse identified by WAREHOUSE_ID.
print("Initializing SQL Executor...")
_sql_executor_kwargs = {
    "host": DATABRICKS_HOST,
    "token": DATABRICKS_TOKEN,
    "warehouse_id": WAREHOUSE_ID,
}
sql_executor = SQLExecutor(**_sql_executor_kwargs)
print("‚úÖ SQL Executor initialized")

In [None]:
# Benchmark Scorer
# Wires the three clients created above into a single scorer.
print("Initializing Scorer...")
_scorer_config = {"question_timeout": 120}
scorer = BenchmarkScorer(
    genie_client=genie_client,
    llm_client=llm_client,
    sql_executor=sql_executor,
    config=_scorer_config,
)
print("‚úÖ Scorer initialized")

## 4️⃣ Load Benchmarks

In [None]:
# Load benchmarks
# The benchmark file is expected at <project_root>/benchmarks/benchmarks.json.
benchmark_file = project_root.joinpath("benchmarks", "benchmarks.json")
print(f"Loading from: {benchmark_file}")

loader = BenchmarkLoader(str(benchmark_file))
all_benchmarks = loader.load()
print(f"‚úÖ Loaded {len(all_benchmarks)} benchmarks")

# Preview the first three questions, each truncated to 60 characters.
for position, benchmark in enumerate(all_benchmarks[:3], start=1):
    preview = benchmark['question'][:60]
    print(f"  {position}. {preview}...")

In [None]:
# Optional: Filter for faster testing
USE_SUBSET = True  # Set to False for full run

# In test mode only the first 5 benchmarks are used; otherwise all of them.
benchmarks = all_benchmarks[:5] if USE_SUBSET else all_benchmarks
_mode_label = "‚ö†Ô∏è TEST MODE" if USE_SUBSET else "FULL MODE"
print(f"{_mode_label}: Using {len(benchmarks)} benchmarks")

---
# STAGE 1: SCORE
---

In [None]:
# Run scoring
_stage1_rule = "=" * 60
print(_stage1_rule)
print("STAGE 1: SCORING BENCHMARKS")
print(_stage1_rule)

# Time the scoring pass so the elapsed seconds can be reported with the score.
start_time = datetime.now()
score_results = scorer.score(benchmarks)
_elapsed = datetime.now() - start_time
duration = _elapsed.total_seconds()

print(f"\nScore: {score_results['score']:.1%}")
print(f"Passed: {score_results['passed']}/{score_results['total']}")
print(f"Duration: {duration:.1f}s")

In [None]:
# Show failed benchmarks
# Collect every result whose 'passed' flag is falsy; Stage 2 plans fixes from this list.
failed_results = []
for result in score_results['results']:
    if not result['passed']:
        failed_results.append(result)

print(f"\n‚ùå Failed Benchmarks ({len(failed_results)}):\n")
for position, failure in enumerate(failed_results, start=1):
    question_preview = failure['question'][:60]
    category_name = failure.get('failure_category', 'unknown')
    # One print per failure: question line, category line, then a blank line.
    print(f"{position}. {question_preview}...\n   Category: {category_name}\n")

---
# STAGE 2: PLAN
---

In [None]:
# Get current space config
print("Exporting current space config...")
space_config = space_api.export_space(GENIE_SPACE_ID)
print("‚úÖ Config loaded")

# Tables are read from data_sources.tables; a missing key counts as zero tables.
_configured_tables = space_config.get('data_sources', {}).get('tables', [])
print(f"   Tables: {len(_configured_tables)}")

In [None]:
# Initialize Enhancement Planner
# The planner receives the LLM client and the project's prompts directory.
print("Initializing Enhancement Planner...")
prompts_dir = project_root.joinpath("prompts")
planner = EnhancementPlanner(llm_client, prompts_dir)
print("‚úÖ Planner initialized")

In [None]:
# Generate enhancement plan
_stage2_rule = "=" * 60
print(_stage2_rule)
print("STAGE 2: GENERATING ENHANCEMENT PLAN")
print(_stage2_rule)

_plan_kwargs = {
    "failed_benchmarks": failed_results,
    "space_config": space_config,
    "parallel_workers": 2,  # Reduce for debugging
}
grouped_fixes = planner.generate_plan(**_plan_kwargs)

# Count fixes across every category returned by the planner.
total_fixes = 0
for _category_fixes in grouped_fixes.values():
    total_fixes += len(_category_fixes)
print(f"\n‚úÖ Generated {total_fixes} fixes")

In [None]:
# Show fixes by category
# Prints up to _PREVIEW_LIMIT fixes per category, followed by a "... and N more"
# line when a category holds more than that.
_PREVIEW_LIMIT = 5
_KNOWN_CATEGORIES = ["metric_view", "metadata", "sample_query", "instruction"]

# The total reported after planning counts every category in grouped_fixes.
# Listing only the four known names here would silently hide fixes filed under
# any other category, so unlisted categories are appended after the known ones.
_display_categories = _KNOWN_CATEGORIES + [
    extra for extra in grouped_fixes if extra not in _KNOWN_CATEGORIES
]

print("\nFixes by Category:")
print("-"*40)

for category in _display_categories:
    fixes = grouped_fixes.get(category, [])
    print(f"\n{str(category).upper()} ({len(fixes)} fixes)")
    for i, fix in enumerate(fixes[:_PREVIEW_LIMIT], 1):
        fix_type = fix.get('type', 'unknown')
        if fix_type == 'add_synonym':
            # Synonym fixes show the target column and the proposed synonym.
            print(f"  {i}. {fix_type}: {fix.get('table')}.{fix.get('column')} ‚Üí '{fix.get('synonym')}'")
        elif fix_type == 'add_column_description':
            print(f"  {i}. {fix_type}: {fix.get('table')}.{fix.get('column')}")
        else:
            # Any other fix type is listed by its type name only.
            print(f"  {i}. {fix_type}")
    if len(fixes) > _PREVIEW_LIMIT:
        print(f"  ... and {len(fixes)-_PREVIEW_LIMIT} more")

---
# STAGE 3: APPLY (Batch)
---

In [None]:
# Initialize Batch Applier
# The applier gets the Space API, the SQL executor, and a catalog/schema pair.
print("Initializing Batch Applier...")
_applier_config = {
    "catalog": "sandbox",
    "schema": "genie_enhancement",
}
applier = BatchApplier(
    space_api=space_api,
    sql_executor=sql_executor,
    config=_applier_config,
)
print("‚úÖ Applier initialized")

In [None]:
# DRY RUN first
DRY_RUN = True  # Set to False to actually apply

_stage3_rule = "=" * 60
_run_mode = '(DRY RUN)' if DRY_RUN else '(LIVE)'
print(_stage3_rule)
print(f"STAGE 3: APPLY ALL FIXES {_run_mode}")
print(_stage3_rule)

# A single apply_all call receives every grouped fix; DRY_RUN is forwarded as dry_run.
_apply_kwargs = {
    "space_id": GENIE_SPACE_ID,
    "grouped_fixes": grouped_fixes,
    "dry_run": DRY_RUN,
}
apply_result = applier.apply_all(**_apply_kwargs)

_applied_count = len(apply_result['applied'])
_failed_count = len(apply_result['failed'])
print(f"\nApplied: {_applied_count}")
print(f"Failed: {_failed_count}")

In [None]:
# Show applied fixes
# Only the first 10 applied fixes are listed; failed fixes are listed in full.
print("\n‚úÖ Applied Fixes:")
for position, applied_fix in enumerate(apply_result['applied'][:10], start=1):
    print(f"  {position}. {applied_fix.get('type')}")

failed_fixes = apply_result['failed']
if failed_fixes:
    print("\n‚ùå Failed Fixes:")
    for position, failed_fix in enumerate(failed_fixes, start=1):
        print(f"  {position}. {failed_fix.get('type')}: {failed_fix.get('error')}")

In [None]:
# LIVE RUN (uncomment to execute)
# WARNING: This will modify your Genie Space!
# NOTE: the Stage 4 wait cell is gated on the DRY_RUN flag, not on the dry_run
# argument passed below. If this cell is run while DRY_RUN is still True, the
# indexing wait is skipped and validation may re-score before the changes have
# been indexed. Set DRY_RUN = False as well when applying for real.

# print("Applying fixes for real...")
# apply_result = applier.apply_all(
#     space_id=GENIE_SPACE_ID,
#     grouped_fixes=grouped_fixes,
#     dry_run=False
# )
# print(f"Applied: {len(apply_result['applied'])}")

---
# STAGE 4: VALIDATE
---

In [None]:
# Wait for Genie indexing (only if not dry run)
INDEXING_WAIT = 60  # seconds

# Guard first: nothing to wait for on a dry run or when no fix was applied.
if DRY_RUN or len(apply_result['applied']) == 0:
    print("Skipping wait (dry run or no changes)")
else:
    print(f"Waiting {INDEXING_WAIT}s for Genie indexing...")
    time.sleep(INDEXING_WAIT)
    print("‚úÖ Wait complete")

In [None]:
# Re-score benchmarks
_stage4_rule = "=" * 60
print(_stage4_rule)
print("STAGE 4: VALIDATING RESULTS")
print(_stage4_rule)

# Score the same benchmark set again and compare with the Stage 1 score.
final_results = scorer.score(benchmarks)

initial_score, final_score = score_results['score'], final_results['score']
improvement = final_score - initial_score

# Single print for the summary; the trailing "\n" yields the blank separator line.
print(
    f"\nInitial Score: {initial_score:.1%}\n"
    f"Final Score:   {final_score:.1%}\n"
    f"Improvement:   {improvement:+.1%}\n"
    f"Target:        {TARGET_SCORE:.1%}\n"
)

_target_met = final_score >= TARGET_SCORE
if _target_met:
    print("üéâ TARGET REACHED!")
else:
    _remaining_gap = TARGET_SCORE - final_score
    print(f"‚ö†Ô∏è Need another loop (gap: {_remaining_gap:.1%})")

---
# Debug Utilities
---

In [None]:
# Test Genie API directly
# Sends one question with a 60-second timeout and prints the status, plus the
# first 100 characters of the SQL when the response carries any.
test_question = "What tables are available?"
print(f"Testing Genie: {test_question}")

response = genie_client.ask(test_question, timeout=60)
_status = response['status']
print(f"Status: {_status}")

_generated_sql = response.get('sql')
if _generated_sql:
    print(f"SQL: {_generated_sql[:100]}...")

In [None]:
# Test LLM directly
# Sends a fixed prompt capped at 50 tokens and prints whatever comes back.
test_prompt = "Say 'Hello, Genie Enhancement is working!'"
print("Testing LLM...")

response = llm_client.generate(test_prompt, max_tokens=50)
print("Response: {}".format(response))

In [None]:
# Export current config to JSON
# Writes the exported space config, indented, to a file in the working directory.
output_file = "debug_space_config.json"
with open(output_file, mode='w') as config_handle:
    json.dump(obj=space_config, fp=config_handle, indent=2)
print(f"‚úÖ Config saved to {output_file}")

In [None]:
# Export fixes to JSON
# default=str makes json fall back to str() for values it cannot serialize natively.
output_file = "debug_fixes.json"
with open(output_file, mode='w') as fixes_handle:
    json.dump(obj=grouped_fixes, fp=fixes_handle, indent=2, default=str)
print(f"‚úÖ Fixes saved to {output_file}")