# Module 3 - LLM Fundamentals Assessment (Instructor Version)

**INSTRUCTOR / GRADING TEMPLATE - Spark LLM Edition**

This notebook uses the Spark-hosted LLM for grading, with rubric-aligned scoring:

| Rubric Area | Weight | Method |
|------------|--------|--------|
| Exercise structure & types | 40% | Code inspection + runtime |
| JSON robustness | 30% | Injected malformed responses |
| Live LLM tolerance | 30% | Spark API calls |

In [None]:
# === HIDDEN: SCORING SETUP ===
# Per-exercise results, keyed by exercise name.
__assessment_scores = {}    # exercise -> (points, max_points)
__assessment_feedback = {}  # exercise -> feedback passed to record_score


def record_score(exercise, points, max_points, feedback=None):
    """Store the outcome of one exercise.

    The (points, max_points) pair is always written. Feedback is written only
    when it is truthy, so ``None`` or an empty list leaves any earlier
    feedback entry for the same exercise untouched.
    """
    __assessment_scores[exercise] = (points, max_points)
    if not feedback:
        return
    __assessment_feedback[exercise] = feedback

In [None]:
# === GRADING ENVIRONMENT CONFIG ===
import os
import requests
import json
import re

# Spark LLM via ngrok gateway - uses /chat/direct with API key
# Gateway base URL; the SPARK_BASE_URL environment variable overrides the default.
SPARK_BASE_URL = os.environ.get('SPARK_BASE_URL', 'https://jbchat.jonbowden.com.ngrok.app')
# None when the variable is unset; check_spark_available() then reports False.
SPARK_API_KEY = os.environ.get('SPARK_API_KEY')  # Set in GitHub Actions secrets
LLM_BASE_URL = SPARK_BASE_URL  # Alias for student code compatibility
LLM_API_KEY = None  # Student code uses local Ollama
# Model name placed in the "model" field of the request payloads built below.
DEFAULT_MODEL = "phi3:mini"

# Cached result of check_spark_available(); None means "not checked yet".
_spark_available = None

def check_spark_available():
    """Report whether the Spark gateway answers a probe on /chat/direct.

    The answer is cached in the module-level ``_spark_available``, so the
    probe request is sent at most once. When ``SPARK_API_KEY`` is not set no
    request is made and the result is False. Any exception raised by the
    probe, or any status other than 200, also yields False.
    """
    global _spark_available

    if _spark_available is None:
        if SPARK_API_KEY:
            try:
                # Smallest possible chat request, used purely as a reachability probe.
                probe_body = {
                    "model": DEFAULT_MODEL,
                    "messages": [{"role": "user", "content": "test"}],
                    "stream": False,
                }
                probe_headers = {
                    'Content-Type': 'application/json',
                    'X-API-Key': SPARK_API_KEY,
                    'ngrok-skip-browser-warning': 'true',
                }
                reply = requests.post(
                    f"{SPARK_BASE_URL}/chat/direct",
                    json=probe_body,
                    headers=probe_headers,
                    timeout=15,
                )
            except Exception as err:
                print(f"DEBUG: Spark check failed: {err}")
                _spark_available = False
            else:
                _spark_available = reply.status_code == 200
                if not _spark_available:
                    print(f"DEBUG: Spark check failed with status {reply.status_code}")
        else:
            print("DEBUG: No SPARK_API_KEY set, using mock")
            _spark_available = False

    return _spark_available

def call_spark_llm(prompt: str, temperature: float = 0.0) -> str:
    """Send a single-turn prompt to the Spark /chat/direct endpoint.

    Args:
        prompt: Text sent as the only user message.
        temperature: Value placed in the payload's "temperature" field.

    Returns:
        The ``["message"]["content"]`` entry of the JSON reply.

    Raises:
        Whatever ``raise_for_status()`` raises for a non-success status, plus
        any error from ``requests.post`` itself or from indexing the reply.
    """
    endpoint = f"{SPARK_BASE_URL}/chat/direct"
    request_headers = {
        'Content-Type': 'application/json',
        'X-API-Key': SPARK_API_KEY,
        'ngrok-skip-browser-warning': 'true',
        'Bypass-Tunnel-Reminder': 'true',
    }
    body = dict(
        model=DEFAULT_MODEL,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        stream=False,
    )

    response = requests.post(endpoint, json=body, headers=request_headers, timeout=60)
    response.raise_for_status()
    reply = response.json()
    return reply["message"]["content"]

# === MOCK LLM FALLBACK ===
# Used only when Spark gateway is unreachable or no API key
# Handle on the real requests.post so _mock_post can forward non-LLM URLs to it.
_original_post = requests.post
# Number of calls routed through _mock_post; its parity selects fenced vs. bare JSON replies.
_mock_call_count = 0

def _mock_post(url, **kwargs):
    """Stand-in for ``requests.post`` used when the Spark gateway is unavailable.

    URLs containing ``/api/chat`` or ``/chat/direct`` receive a canned reply
    shaped like a chat response (``{"message": {"content": ...}}``). Every
    other URL is forwarded unchanged to the real ``requests.post`` saved in
    ``_original_post``. The call counter is incremented for every call,
    forwarded or not.

    The reply is chosen from the content of the first message:
    - prompts mentioning "json" or containing ``{`` get a JSON object, wrapped
      in a markdown fence on even-numbered calls and bare on odd ones;
    - prompts mentioning "hello" get ``"Hello!"``;
    - prompts mentioning "color" or "fruit" get a list whose text depends on
      whether the temperature exceeds 0.5;
    - anything else echoes the first 50 characters of the prompt.

    The temperature is taken from the payload's top-level ``temperature`` key
    or, when that is absent, from an Ollama-style ``options`` mapping. A
    missing or malformed payload is treated as an empty prompt at
    temperature 0.0 rather than raising.
    """
    global _mock_call_count
    _mock_call_count += 1

    # Anything that is not a chat endpoint goes to the real requests.post.
    if '/api/chat' not in url and '/chat/direct' not in url:
        return _original_post(url, **kwargs)

    # ``json=None`` (e.g. a caller using ``data=``) must not crash the mock.
    payload = kwargs.get('json')
    if not isinstance(payload, dict):
        payload = {}

    messages = payload.get('messages')
    if not isinstance(messages, (list, tuple)):
        messages = []
    first_message = messages[0] if messages else None
    prompt = first_message.get('content') if isinstance(first_message, dict) else None
    if not isinstance(prompt, str):
        prompt = ''

    # Ollama's chat API carries sampling parameters under "options".
    temperature = payload.get('temperature')
    if temperature is None:
        options = payload.get('options')
        if isinstance(options, dict):
            temperature = options.get('temperature')
    if not isinstance(temperature, (int, float)):
        temperature = 0.0

    lowered = prompt.lower()
    if 'json' in lowered or '{' in prompt:
        # Alternate fenced and bare JSON so parsers are exercised on both.
        if _mock_call_count % 2 == 0:
            response_text = '```json\n{"test": 1, "value": 42}\n```'
        else:
            response_text = '{"test": 1, "value": 42}'
    elif 'hello' in lowered:
        response_text = "Hello!"
    elif 'color' in lowered or 'fruit' in lowered:
        if temperature > 0.5:
            response_text = "Red, Blue, Green (high temp response)"
        else:
            response_text = "Red, Blue, Green"
    else:
        response_text = f"Mock response for: {prompt[:50]}..."

    class MockResponse:
        """Minimal response object: status 200 plus the chat-style JSON body."""
        status_code = 200

        def raise_for_status(self):
            pass

        def json(self):
            return {"message": {"content": response_text}}

    return MockResponse()

# Pick the grading mode once, based on whether the Spark gateway answered.
if check_spark_available():
    _grading_mode = 'spark'
else:
    # No usable gateway: route every requests.post call through the mock.
    _grading_mode = 'mock'
    requests.post = _mock_post
print(f"DEBUG: GRADING_MODE={_grading_mode.upper()}, SPARK_URL={SPARK_BASE_URL}")

## Exercise 1 - Basic LLM Caller (4 points)

In [None]:
# === HIDDEN TEST: Exercise 1 ===
points = 0
feedback = []

# Each check awards one point. A failed assertion reports its own message;
# any other exception reports only the exception type.

# Check 1: `call_llm` is present in the notebook namespace and callable.
try:
    assert 'call_llm' in dir() or 'call_llm' in globals(), "Function 'call_llm' not defined"
    assert callable(call_llm), "'call_llm' should be a function"
except AssertionError as failure:
    feedback.append(f"✗ {failure}")
except Exception as err:
    feedback.append(f"✗ Error: {type(err).__name__}")
else:
    points += 1
    feedback.append("✓ Function 'call_llm' defined")

# Check 2: a plain call returns a str.
try:
    result = call_llm("Say hello in one word.")
    assert isinstance(result, str), f"Should return str, got {type(result).__name__}"
except AssertionError as failure:
    feedback.append(f"✗ {failure}")
except Exception as err:
    feedback.append(f"✗ Error calling function: {type(err).__name__}")
else:
    points += 1
    feedback.append("✓ Function returns a string")

# Check 3: a second call with the same prompt returns something non-empty.
try:
    result = call_llm("Say hello in one word.")
    assert len(result) > 0, "Response should not be empty"
except AssertionError as failure:
    feedback.append(f"✗ {failure}")
except Exception as err:
    feedback.append(f"✗ Error: {type(err).__name__}")
else:
    points += 1
    feedback.append(f"✓ Response contains text ({len(result)} chars)")

# Check 4: the `temperature` keyword is accepted.
try:
    result = call_llm("Say hello.", temperature=0.5)
    assert isinstance(result, str), "Should work with temperature parameter"
except AssertionError as failure:
    feedback.append(f"✗ {failure}")
except Exception as err:
    feedback.append(f"✗ Temperature param error: {type(err).__name__}")
else:
    points += 1
    feedback.append("✓ Temperature parameter works")

record_score('Exercise 1', points, 4, feedback)

## Exercise 2 - Extract Response Text (3 points)

In [None]:
# === HIDDEN TEST: Exercise 2 ===
points = 0
feedback = []

# Each check awards one point. Failed assertions report their own message so
# the student sees what was wrong instead of a bare "AssertionError".

# Check 1: `get_response_text` is defined and callable.
try:
    assert 'get_response_text' in globals(), "Function 'get_response_text' not defined"
    assert callable(get_response_text), "'get_response_text' should be a function"
    points += 1
    feedback.append("✓ Function 'get_response_text' defined")
except AssertionError as e:
    feedback.append(f"✗ {e}")
except Exception as e:
    feedback.append(f"✗ Error: {type(e).__name__}")

# Check 2: the function returns a str.
try:
    result = get_response_text("Say hello")
    assert isinstance(result, str), f"Should return str, got {type(result).__name__}"
    points += 1
    feedback.append("✓ Function returns a string")
except AssertionError as e:
    feedback.append(f"✗ {e}")
except Exception as e:
    feedback.append(f"✗ Error: {type(e).__name__}")

# Check 3: a second call returns non-empty text.
try:
    result = get_response_text("Say hello")
    assert len(result) > 0, "Response should not be empty"
    points += 1
    feedback.append(f"✓ Returns non-empty text ({len(result)} chars)")
except AssertionError as e:
    feedback.append(f"✗ {e}")
except Exception as e:
    feedback.append(f"✗ Error: {type(e).__name__}")

record_score('Exercise 2', points, 3, feedback)

## Exercise 3 - JSON Output Parser (4 points)

**Rubric focus: JSON Robustness (30% of module)**

Tests student's ability to handle malformed LLM responses.

In [None]:
# === HIDDEN TEST: Exercise 3 - JSON Robustness ===
points = 0
feedback = []

# Test 1: Function exists
try:
    assert 'parse_json_response' in globals()
    assert callable(parse_json_response)
    points += 1
    feedback.append("✓ Function 'parse_json_response' defined")
except Exception:
    feedback.append("✗ Function 'parse_json_response' not defined")

# Test 2: Returns correct tuple format
try:
    result = parse_json_response('Return JSON: {"a": 1}')
    assert isinstance(result, tuple) and len(result) == 2
    assert isinstance(result[0], bool)
    points += 1
    feedback.append("✓ Returns (bool, result) tuple")
except Exception as e:
    feedback.append(f"✗ Wrong return format: {type(e).__name__}")

# Test 3: Handles markdown-wrapped JSON (critical robustness test)
MALFORMED_TESTS = [
    ('```json\n{"test": 1}\n```', 'markdown-wrapped'),
    ('{"test": 1}', 'clean JSON'),
]

passed_malformed = 0
for test_input, desc in MALFORMED_TESTS:
    # Remember whether the student defined get_response_text at all, so the
    # injected stand-in can be removed again rather than left behind.
    _had_get = 'get_response_text' in globals()
    _orig_get = globals().get('get_response_text')

    # Accept any call signature: student code may pass extra arguments
    # (e.g. temperature) to get_response_text.
    def mock_get(*args, **kwargs):
        return test_input

    globals()['get_response_text'] = mock_get
    try:
        success, result = parse_json_response('test')
        if success and isinstance(result, dict):
            passed_malformed += 1
    except Exception:
        # A crash on this input simply counts as a format that was not handled.
        pass
    finally:
        # Always undo the injection, even when the student code raised, so the
        # live test below really talks to the student's own implementation.
        if _had_get:
            globals()['get_response_text'] = _orig_get
        else:
            globals().pop('get_response_text', None)

if passed_malformed >= 1:
    points += 1
    feedback.append(f"✓ Handles {passed_malformed}/{len(MALFORMED_TESTS)} JSON formats")
else:
    feedback.append("✗ Failed to parse JSON responses")

# Test 4: Live LLM JSON parsing
try:
    success, result = parse_json_response('Return ONLY: {"value": 42}')
    if success:
        points += 1
        feedback.append("✓ Parses live LLM JSON response")
    else:
        feedback.append(f"✗ Live JSON parse failed: {result}")
except Exception as e:
    feedback.append(f"✗ Live test error: {type(e).__name__}")

record_score('Exercise 3', points, 4, feedback)

## Exercise 4 - Temperature Comparison (4 points)

In [None]:
# === HIDDEN TEST: Exercise 4 ===
points = 0
feedback = []

# Check 1: `compare_temperatures` is defined and callable.
try:
    assert 'compare_temperatures' in globals()
    assert callable(compare_temperatures)
    points += 1
    feedback.append("✓ Function 'compare_temperatures' defined")
except Exception:
    feedback.append("✗ Function 'compare_temperatures' not defined")

# Check 2: the function returns a dict.
try:
    result = compare_temperatures("List 3 colors.")
    assert isinstance(result, dict), f"Should return dict, got {type(result).__name__}"
    points += 1
    feedback.append("✓ Returns a dictionary")
except AssertionError as e:
    feedback.append(f"✗ {e}")
except Exception as e:
    feedback.append(f"✗ Error: {type(e).__name__}")

# Check 3: all required keys are present. The assertion message lists the
# absent keys so the "Missing keys:" feedback is never empty.
try:
    result = compare_temperatures("List 3 colors.")
    required_keys = ('low_temp', 'high_temp', 'are_identical')
    missing = [key for key in required_keys if key not in result]
    assert not missing, ", ".join(missing)
    points += 1
    feedback.append("✓ Has all required keys")
except Exception as e:
    feedback.append(f"✗ Missing keys: {e}")

# Check 4: each value has the expected type; each assertion names the
# offending key.
try:
    result = compare_temperatures("List 3 colors.")
    assert isinstance(result['are_identical'], bool), "'are_identical' should be a bool"
    assert isinstance(result['low_temp'], str), "'low_temp' should be a str"
    assert isinstance(result['high_temp'], str), "'high_temp' should be a str"
    points += 1
    feedback.append("✓ All values have correct types")
except Exception as e:
    feedback.append(f"✗ Type error: {e}")

record_score('Exercise 4', points, 4, feedback)

## Exercise 5 - Structured Prompt Builder (5 points)

**Rubric focus: Prompt Discipline (35% of module)**

In [None]:
# === HIDDEN TEST: Exercise 5 - Prompt Discipline ===
points = 0
feedback = []

# Check 1: `build_structured_prompt` is defined and callable.
try:
    assert 'build_structured_prompt' in globals()
    assert callable(build_structured_prompt)
    points += 1
    feedback.append("✓ Function 'build_structured_prompt' defined")
except Exception:
    feedback.append("✗ Function 'build_structured_prompt' not defined")

# Check 2: the function returns a str.
try:
    result = build_structured_prompt("Be helpful", "Explain X", ["Short"])
    assert isinstance(result, str), f"Should return str, got {type(result).__name__}"
    points += 1
    feedback.append("✓ Returns a string")
except AssertionError as e:
    feedback.append(f"✗ {e}")
except Exception as e:
    feedback.append(f"✗ Error: {type(e).__name__}")

# Check 3: the three section labels appear in the output.
try:
    result = build_structured_prompt("Be helpful", "Explain X", ["Short"])
    assert 'SYSTEM:' in result and 'TASK:' in result and 'CONSTRAINTS:' in result
    points += 1
    feedback.append("✓ Contains all section labels")
except Exception:
    feedback.append("✗ Missing section labels")

# Check 4: every constraint is rendered as a "- " bullet (case-insensitive).
try:
    result = build_structured_prompt("Be helpful", "Explain X", ["Short", "Clear"])
    lowered = result.lower()
    assert '- short' in lowered
    assert '- clear' in lowered
    points += 1
    feedback.append("✓ Constraints formatted with '- ' prefix")
except Exception:
    feedback.append("✗ Constraints not properly formatted")

# Check 5: the system and task text are carried into the output.
try:
    result = build_structured_prompt("Test sys", "Test task", ["C1"])
    assert 'Test sys' in result and 'Test task' in result
    points += 1
    feedback.append("✓ All inputs included in output")
except Exception:
    feedback.append("✗ Inputs not in output")

record_score('Exercise 5', points, 5, feedback)

In [None]:
# === HIDDEN: WRITE RESULTS ===
import json
import datetime

# Bundle everything recorded so far, stamped with the local time and the
# grading mode chosen earlier.
result = dict(
    scores=__assessment_scores,
    feedback=__assessment_feedback,
    timestamp=datetime.datetime.now().isoformat(),
    grading_mode=_grading_mode,
)

# Persist the bundle next to the notebook.
with open('assessment_result.json', 'w') as f:
    json.dump(result, f, indent=2)

# Console summary: mode, then one line per exercise followed by its feedback.
print("Assessment Results:")
print(f"Grading mode: {_grading_mode.upper()}")
total = sum(earned for earned, _ in __assessment_scores.values())
max_total = sum(possible for _, possible in __assessment_scores.values())
for exercise, (pts, max_pts) in __assessment_scores.items():
    print(f"  {exercise}: {pts}/{max_pts}")
    for fb in __assessment_feedback.get(exercise, []):
        print(f"    {fb}")
print(f"\nTotal: {total}/{max_total}")