In [None]:
%pip install --upgrade --quiet google-genai nest-asyncio==1.5.9 pytest

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.4/45.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.7/244.7 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import datetime
import json
from inspect import cleandoc
from typing import Dict, Iterable, List, Mapping, Optional, Sequence

import pandas as pd
from IPython.display import Markdown

try:
    import vertexai
    from vertexai.generative_models import GenerativeModel, GenerationConfig
    from vertexai.evaluation import (
        MetricPromptTemplateExamples,
        EvalTask,
    )
except ImportError:  # pragma: no cover - handled in test harness
    vertexai = None  # type: ignore[assignment]
    GenerativeModel = None  # type: ignore[assignment]
    GenerationConfig = None  # type: ignore[assignment]
    MetricPromptTemplateExamples = None  # type: ignore[assignment]
    EvalTask = None  # type: ignore[assignment]

# Never truncate cell text, so long prompts and model replies stay fully
# readable when DataFrames are rendered.
pd.options.display.max_colwidth = None

In [None]:
# Closed set of labels a question may receive. classify_question rejects any
# model reply whose "category" value is not one of these exact strings.
CATEGORIES: Sequence[str] = (
    "Employment",
    "General Information",
    "Emergency Services",
    "Tax Related",
)


def _default_model(model_name: str = "gemini-2.0-flash-001"):
    """Build the fallback Gemini model used when a caller supplies none.

    Args:
        model_name: Model identifier handed to ``GenerativeModel``.

    Returns:
        A ``GenerativeModel`` whose generation config sets ``temperature=0``.

    Raises:
        RuntimeError: If the Vertex AI SDK names were bound to ``None`` because
            the import at the top of the notebook failed.
    """
    sdk_missing = GenerativeModel is None or GenerationConfig is None
    if sdk_missing:
        raise RuntimeError(
            "Google Vertex AI SDK is unavailable. Provide a configured `model` instance "
            "when calling these helper functions, or install `google-cloud-aiplatform`."
        )

    zero_temperature = GenerationConfig(temperature=0)
    return GenerativeModel(model_name, generation_config=zero_temperature)


def classify_question(
    question: str,
    model: Optional[object] = None,
) -> str:
    """Classify a user question into one of the predefined civic categories.

    Args:
        question: The resident's question. Surrounding whitespace is stripped
            before it is placed in the prompt.
        model: Any object exposing ``generate_content(prompt)`` whose result
            carries a ``text`` attribute. When ``None``, ``_default_model()``
            is used.

    Returns:
        The category reported by the model; always a member of ``CATEGORIES``.

    Raises:
        ValueError: If ``question`` is empty or blank, if the model reply is
            not a JSON object, or if its ``"category"`` value is not in
            ``CATEGORIES``.
    """
    if not question or not question.strip():
        raise ValueError("Question must be a non-empty string.")

    prompt = (
        "Classify the following user question into exactly one category. "
        "Valid categories are: Employment, General Information, Emergency Services, "
        "or Tax Related. Respond with a JSON object containing a single key "
        '"category" whose value is one of the valid categories.\n\n'
        f"Question: {question.strip()}"
    )

    # Compare against None explicitly so a caller-supplied model is never
    # discarded merely because it evaluates as falsy.
    if model is None:
        model = _default_model()
    response = model.generate_content(prompt)
    # `text` may be absent or None; treat both as an empty reply.
    raw_text = (getattr(response, "text", "") or "").strip()

    # Replies are sometimes wrapped in a Markdown fence (```json ... ```);
    # unwrap it so the JSON inside can still be parsed.
    payload = raw_text
    if payload.startswith("```") and payload.endswith("```"):
        fenced = payload[3:-3]
        first_line, newline, remainder = fenced.partition("\n")
        # Drop the opening line when it only carries a language tag such as "json".
        if newline and not first_line.lstrip().startswith(("{", "[")):
            fenced = remainder
        payload = fenced.strip()

    try:
        parsed = json.loads(payload)
    except json.JSONDecodeError as exc:
        raise ValueError(f"Unable to parse classification response: {raw_text}") from exc

    # Valid JSON that is not an object (list, string, number) has no `.get`;
    # report it as a ValueError like every other malformed reply.
    if not isinstance(parsed, dict):
        raise ValueError(f"Classification response is not a JSON object: {raw_text}")

    category = parsed.get("category")
    if category not in CATEGORIES:
        raise ValueError(f"Model returned unsupported category: {category!r}")
    return category


def generate_announcement_posts(
    topic: str,
    channels: Iterable[str],
    tone: str = "reassuring",
    key_details: Optional[Mapping[str, str]] = None,
    model: Optional[object] = None,
) -> Mapping[str, str]:
    """Generate social media posts tailored for specified communication channels.

    Args:
        topic: Subject of the announcement; must not be blank.
        channels: Channel names to write posts for. Blank entries are dropped.
            A single string is treated as one channel name rather than being
            iterated character by character.
        tone: Desired tone, inserted verbatim (stripped) into the prompt.
        key_details: Optional label/value pairs appended to the prompt as a
            bulleted "Key details" list.
        model: Any object exposing ``generate_content(prompt)`` whose result
            carries a ``text`` attribute. When ``None``, ``_default_model()``
            is used.

    Returns:
        A dict mapping each requested channel (in the order given) to the post
        text found under that key in the model reply. Extra keys in the reply
        are ignored.

    Raises:
        ValueError: If no usable channel or topic is supplied, if the model
            reply is not a JSON object, or if it lacks a requested channel.
    """
    # A bare string is itself iterable; wrap it so "Twitter" means one channel
    # instead of seven one-letter channels.
    if isinstance(channels, str):
        channels = [channels]
    channels_list: List[str] = [ch.strip() for ch in channels if ch and ch.strip()]
    if not channels_list:
        raise ValueError("At least one channel must be provided.")
    if not topic or not topic.strip():
        raise ValueError("Topic must be a non-empty string.")

    details_lines = ""
    if key_details:
        details_lines = "\n".join(f"- {label}: {value}" for label, value in key_details.items())

    prompt = (
        "You are a communications specialist for a municipal government. "
        "Create concise social media posts for each specified channel. "
        "Follow official tone guidelines: keep the language clear, accessible, "
        "and aligned with public service communication. "
        "Return your answer as JSON with each channel name as a key and the "
        "post text as the value.\n\n"
        f"Topic: {topic.strip()}\n"
        f"Tone: {tone.strip()}\n"
        f"Channels: {', '.join(channels_list)}\n"
    )

    if details_lines:
        prompt += f"Key details:\n{details_lines}\n"

    # Compare against None explicitly so a caller-supplied model is never
    # discarded merely because it evaluates as falsy.
    if model is None:
        model = _default_model()
    response = model.generate_content(prompt)
    # `text` may be absent or None; treat both as an empty reply.
    raw_text = (getattr(response, "text", "") or "").strip()

    # Replies are sometimes wrapped in a Markdown fence (```json ... ```);
    # unwrap it so the JSON inside can still be parsed.
    payload = raw_text
    if payload.startswith("```") and payload.endswith("```"):
        fenced = payload[3:-3]
        first_line, newline, remainder = fenced.partition("\n")
        # Drop the opening line when it only carries a language tag such as "json".
        if newline and not first_line.lstrip().startswith(("{", "[")):
            fenced = remainder
        payload = fenced.strip()

    try:
        parsed = json.loads(payload)
    except json.JSONDecodeError as exc:
        raise ValueError(f"Unable to parse announcement response: {raw_text}") from exc

    # Valid JSON that is not an object would otherwise trigger substring
    # matching (str), positional lookup (list) or a TypeError (number) below.
    if not isinstance(parsed, dict):
        raise ValueError(f"Announcement response is not a JSON object: {raw_text}")

    missing = [ch for ch in channels_list if ch not in parsed]
    if missing:
        raise ValueError(f"Model response missing channels: {missing}")

    return {ch: parsed[ch] for ch in channels_list}



In [None]:
class DummyModel:
    """Offline stand-in for a Gemini model that always answers with canned text.

    It records the most recent prompt so checks can inspect what the helper
    functions sent to the model.
    """

    def __init__(self, response_text: str):
        # No prompt has been seen until generate_content is called.
        self.last_prompt = None
        self.response_text = response_text

    def generate_content(self, prompt: str):
        """Remember ``prompt`` and return an object exposing the canned ``text``."""
        self.last_prompt = prompt
        canned_attrs = {"text": self.response_text}
        # A throwaway class is enough: callers only read its `text` attribute.
        return type("Response", (), canned_attrs)


def run_notebook_sanity_checks() -> str:
    """Smoke-test the helper functions offline using canned DummyModel replies.

    Returns:
        A summary string listing the label of every check that passed.

    Raises:
        AssertionError: If a helper returns an unexpected value, or fails to
            raise ``ValueError`` where one is expected.
    """
    passed = []

    def expect_value_error(call, failure_message):
        """Invoke ``call`` and insist that it raises ``ValueError``."""
        try:
            call()
        except ValueError:
            return
        raise AssertionError(failure_message)  # pragma: no cover - defensive fallback

    # Classifier: a well-formed reply yields the category and the question reaches the prompt.
    stub_classifier = DummyModel(json.dumps({"category": "Employment"}))
    label = classify_question("How do I apply for a government job?", model=stub_classifier)
    assert label == "Employment"
    assert "How do I apply" in stub_classifier.last_prompt
    passed.append("classifier_ok")

    # Classifier: a reply that is not JSON must be rejected.
    expect_value_error(
        lambda: classify_question("Is there a tax form due?", model=DummyModel("not-json")),
        "classify_question did not raise on invalid JSON",
    )
    passed.append("classifier_invalid_json_ok")

    # Generator: every requested channel comes back and is named in the prompt.
    target_channels = ["Twitter", "Facebook"]
    expected_posts = {}
    for name in target_channels:
        expected_posts[name] = f"{name} post content"
    stub_generator = DummyModel(json.dumps(expected_posts))
    produced = generate_announcement_posts(
        topic="Winter storm warning",
        channels=target_channels,
        tone="urgent, calm",
        key_details={"Shelter Hotline": "555-0100"},
        model=stub_generator,
    )
    assert produced == expected_posts
    assert all(name in stub_generator.last_prompt for name in target_channels)
    passed.append("generator_ok")

    # Generator: a reply lacking one of the requested channels must be rejected.
    expect_value_error(
        lambda: generate_announcement_posts(
            topic="Heat advisory",
            channels=["Instagram", "TikTok"],
            model=DummyModel(json.dumps({"Instagram": "Stay safe"})),
        ),
        "generate_announcement_posts did not raise when channels missing",
    )
    passed.append("generator_missing_channel_ok")

    return "Notebook sanity checks passed: " + ", ".join(passed)


# Run the offline checks right away; the returned summary string is the cell's output.
run_notebook_sanity_checks()


In [None]:
# Initialise the Vertex AI SDK only when its import succeeded earlier.
if vertexai is None:  # pragma: no cover - environment without Vertex AI SDK
    print("vertexai SDK not installed; skipping initialization.")
else:
    vertexai.init(location="us-central1")

In [None]:
# Example resident questions fed to classify_question in the classification cell.
sample_questions = [
    "How do I apply for a city maintenance job?",
    "Is the 311 hotline available for downed power lines?",
    "What documents do I need to submit my quarterly tax payment?",
    "Where can I find the latest community event calendar?",
]

# Reference facts about the flood event; interpolated into the evaluation
# prompt's <context> section. cleandoc removes the common indentation.
coastal_flood_context = cleandoc(
    """
    Event: Coastal Flood Warning
    Location: Riverfront District and South Harbor
    Effective: April 18, 2025, 6:00 PM – April 20, 2025, 6:00 AM
    Risk Level: Moderate to high flooding along low-lying streets
    Required Actions:
      - Move vehicles to higher ground
      - Avoid driving through standing water
      - Monitor official channels for evacuation updates
    Resources:
      - Sandbag pickup at Public Works Yard (1220 Harbor Ave.)
      - Emergency shelter at Civic Center opens at 6:00 PM
      - Hotline: Dial 311 for non-emergencies; call 911 for life-threatening emergencies
    """
)

# Channels and key facts passed to generate_announcement_posts for both the
# baseline and the candidate messaging.
announcement_channels = ["Twitter", "Facebook", "SMS Alerts"]
announcement_details = {
    "Sandbag Pickup": "Public Works Yard, 7 AM – 7 PM",
    "Shelter": "Civic Center Gymnasium, opens 6 PM",
    "Hotline": "311",
}

In [None]:
# Classify every sample question (one model call per question) and tabulate
# each question next to its predicted category.
classification_results = pd.DataFrame(
    {
        "question": sample_questions,
        "predicted_category": [classify_question(question) for question in sample_questions],
    }
)

# Keep the DataFrame as the cell's final expression so it is what gets
# rendered; a trailing expression after it would be displayed instead.
classification_results

In [None]:
# Baseline messaging: same topic, channels and details as the candidate, with
# a "calm and informative" tone. No `model` is passed, so the default model is used.
baseline_posts = generate_announcement_posts(
    topic="Coastal flood warning for Riverfront District",
    channels=announcement_channels,
    tone="calm and informative",
    key_details=announcement_details,
)

baseline_posts

In [None]:
# Candidate messaging: identical inputs to the baseline except for the
# "urgent and action-oriented" tone, so the comparison isolates the tone change.
candidate_posts = generate_announcement_posts(
    topic="Coastal flood warning for Riverfront District",
    channels=announcement_channels,
    tone="urgent and action-oriented",
    key_details=announcement_details,
)

candidate_posts

In [None]:
def format_posts(posts: Dict[str, str]) -> str:
    """Render posts as ``"<channel>: <message>"`` entries separated by blank lines.

    Entries follow the mapping's iteration order; an empty mapping yields ``""``.
    """
    rendered = []
    for channel, message in posts.items():
        rendered.append(f"{channel}: {message}")
    return "\n\n".join(rendered)

# Flatten each set of posts into one text blob; these two strings are the
# responses compared in the evaluation dataset.
formatted_baseline = format_posts(baseline_posts)
formatted_candidate = format_posts(candidate_posts)

# Render the baseline posts as Markdown for a quick visual check.
Markdown(f"**Baseline messaging**\n\n{formatted_baseline}")

In [None]:
# Render the candidate posts as Markdown for a quick visual check.
Markdown(f"**Candidate messaging**\n\n{formatted_candidate}")

Based on the context provided, here is the breakdown of the days required for each phase of the project.

**Writing:**
The Screenwriter requires 72 hours.
*   72 hours / 8 hours per day = **9 days**

**Pre-Production:**
The Director will work for 36 hours, and the Camera Operator will work for 24 hours in parallel. The longest duration is used.
*   36 hours / 8 hours per day = **4.5 days**

**Production Phase 1:**
This phase is explicitly stated to last for **3 days**.

**Production Phase 2:**
This phase is explicitly stated to last for **3 days**.

**Post-Production:**
The Editor will work for 64 hours, and the Director will work for 24 hours during that time. The longest duration is used.
*   64 hours / 8 hours per day = **8 days**

---

### **Total Project Duration**

*   **Writing:** 9 days
*   **Pre-Production:** 4.5 days
*   **Production Phase 1:** 3 days
*   **Production Phase 2:** 3 days
*   **Post-Production:** 8 days

**Total:** 9 + 4.5 + 3 + 3 + 8 = **27.5 days**

In [None]:
# Dedent the template BEFORE inserting the context. The context is already
# dedented multi-line text, so interpolating it first (via an f-string) puts
# column-0 lines inside the block; cleandoc then finds a common indent of zero
# and leaves the tags indented by four spaces.
evaluation_prompt = cleandoc(
    """
    <instructions>
    You are evaluating city government social media messaging. Choose the response that best informs residents about the event while maintaining clarity, accuracy, actionable guidance, and a calm civic tone.
    </instructions>
    <context>
    {context}
    </context>
    """
).format(context=coastal_flood_context)

evaluation_prompt

Here's a breakdown of the project timeline, calculated in days based on an 8-hour workday:

**Phase Breakdown:**

*   **Writing:**
    *   Screenwriter: 72 hours / 8 hours/day = 9 days

*   **Pre-Production:**
    *   Director: 36 hours / 8 hours/day = 4.5 days
    *   Camera Operator: 24 hours / 8 hours/day = 3 days
    *   *Since the Director and Camera Operator work in parallel, the longest duration determines the phase length.*
    *   **Pre-Production Length: 4.5 days**

*   **Production Phase 1:**
    *   3 days (given)

*   **Production Phase 2:**
    *   3 days (given)

*   **Post-Production:**
    *   Editor: 64 hours / 8 hours/day = 8 days
    *   Director: 24 hours / 8 hours/day = 3 days
    *   *Since the Director and Editor work in parallel, the longest duration determines the phase length.*
    *   **Post-Production Length: 8 days**

**Total Project Length:**

9 days (Writing) + 4.5 days (Pre-Production) + 3 days (Production Phase 1) + 3 days (Production Phase 2) + 8 days (Post-Production) = **27.5 days**


In [None]:
# One evaluation record: the shared prompt plus the two messaging variants.
# NOTE(review): the keys are presumably the column names the pairwise metric
# reads ("response" = candidate, "baseline_model_response" = baseline) —
# confirm against the Vertex AI evaluation docs.
eval_records = {
    "prompt": evaluation_prompt,
    "baseline_model_response": formatted_baseline,
    "response": formatted_candidate,
}

eval_records

In [None]:
# Wrap the single record in a one-row DataFrame; index=[0] is required because
# every value in the dict is a scalar.
eval_dataset = pd.DataFrame(eval_records, index=[0])

eval_dataset

'Based on the context provided, here is the breakdown of the days required for each phase of the project.\n\n**Writing:**\nThe Screenwriter requires 72 hours.\n*   72 hours / 8 hours per day = **9 days**\n\n**Pre-Production:**\nThe Director will work for 36 hours, and the Camera Operator will work for 24 hours in parallel. The longest duration is used.\n*   36 hours / 8 hours per day = **4.5 days**\n\n**Production Phase 1:**\nThis phase is explicitly stated to last for **3 days**.\n\n**Production Phase 2:**\nThis phase is explicitly stated to last for **3 days**.\n\n**Post-Production:**\nThe Editor will work for 64 hours, and the Director will work for 24 hours during that time. The longest duration is used.\n*   64 hours / 8 hours per day = **8 days**\n\n---\n\n### **Total Project Duration**\n\n*   **Writing:** 9 days\n*   **Pre-Production:** 4.5 days\n*   **Production Phase 1:** 3 days\n*   **Production Phase 2:** 3 days\n*   **Post-Production:** 8 days\n\n**Total:** 9 + 4.5 + 3 + 3 

In [None]:
# Configure a pairwise evaluation of the dataset under the
# "municipal-communications" experiment.
# NOTE(review): EvalTask and MetricPromptTemplateExamples are None when the
# Vertex AI SDK import failed, in which case this cell raises — confirm the
# SDK is installed before running.
eval_task = EvalTask(
    dataset=eval_dataset,
    metrics=[MetricPromptTemplateExamples.Pairwise.QUESTION_ANSWERING_QUALITY],
    experiment="municipal-communications",
)

In [None]:
# Timestamp the run name (local time, to the second) so repeated evaluations
# get distinct experiment run names. `datetime` is imported in the notebook's
# import cell.
run_ts = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
eval_result = eval_task.evaluate(
    experiment_run_name=f"gov-comms-{run_ts}"
)

INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 1 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 1/1 [00:05<00:00,  5.79s/it]
INFO:vertexai.evaluation._evaluation:All 1 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:5.799854666000101 seconds


In [32]:
# Run-level aggregate metrics for the evaluation.
eval_result.summary_metrics

{'row_count': 1,
 'pairwise_question_answering_quality/candidate_model_win_rate': np.float64(0.0),
 'pairwise_question_answering_quality/baseline_model_win_rate': np.float64(0.0)}

In [34]:
# Per-row results: the inputs alongside the judge's explanation and choice.
eval_result.metrics_table

Unnamed: 0,prompt,baseline_model_response,response,pairwise_question_answering_quality/explanation,pairwise_question_answering_quality/pairwise_choice
0,"<instructions>\n Prepare a document to fulfill the task based on the context provided.\n </instructions>\n<task>\n How many days will each phase require? Assume an \n 8 hour work day. If multiple people are working in parallel, \n do not add those times together, but only use the longest time. \n Also include a count of the total number of days of the entire \n project.\n </task>\n<context>\n Screenwriter: $40\nActor: $25\nDirector: $30\nCamera Operator: $35\nSound Engineer: $20\nEditor: $30\n\nPhases of Production:\n Writing:\n The Screenwriter will write the script.\n They need 72 hours to do so.\n\n\n Pre-Production:\n The Director needs time to analyze the script.\n They will work on it for 36 hours.\n The Camera Operator will join the director for 24 hours of planning.\n\n\n Production Phase 1\n The first three days of filming will require the director, 4 actors, the camera operator, and the sound engineer\n\n\n Production Phase 2\n The next three days of filming will require the director, 8 actors, the camera operator, and the sound engineer\n\n\n Post-Production\n The editor will take 64 hours to edit the film.\n The director will work with the editor for 24 hours during this phase.\n </context>\n","Based on the context provided, here is the breakdown of the days required for each phase of the project.\n\n**Writing:**\nThe Screenwriter requires 72 hours.\n* 72 hours / 8 hours per day = **9 days**\n\n**Pre-Production:**\nThe Director will work for 36 hours, and the Camera Operator will work for 24 hours in parallel. The longest duration is used.\n* 36 hours / 8 hours per day = **4.5 days**\n\n**Production Phase 1:**\nThis phase is explicitly stated to last for **3 days**.\n\n**Production Phase 2:**\nThis phase is explicitly stated to last for **3 days**.\n\n**Post-Production:**\nThe Editor will work for 64 hours, and the Director will work for 24 hours during that time. 
The longest duration is used.\n* 64 hours / 8 hours per day = **8 days**\n\n---\n\n### **Total Project Duration**\n\n* **Writing:** 9 days\n* **Pre-Production:** 4.5 days\n* **Production Phase 1:** 3 days\n* **Production Phase 2:** 3 days\n* **Post-Production:** 8 days\n\n**Total:** 9 + 4.5 + 3 + 3 + 8 = **27.5 days**","Here's a breakdown of the project timeline, calculated in days based on an 8-hour workday:\n\n**Phase Breakdown:**\n\n* **Writing:**\n * Screenwriter: 72 hours / 8 hours/day = 9 days\n\n* **Pre-Production:**\n * Director: 36 hours / 8 hours/day = 4.5 days\n * Camera Operator: 24 hours / 8 hours/day = 3 days\n * *Since the Director and Camera Operator work in parallel, the longest duration determines the phase length.*\n * **Pre-Production Length: 4.5 days**\n\n* **Production Phase 1:**\n * 3 days (given)\n\n* **Production Phase 2:**\n * 3 days (given)\n\n* **Post-Production:**\n * Editor: 64 hours / 8 hours/day = 8 days\n * Director: 24 hours / 8 hours/day = 3 days\n * *Since the Director and Editor work in parallel, the longest duration determines the phase length.*\n * **Post-Production Length: 8 days**\n\n**Total Project Length:**\n\n9 days (Writing) + 4.5 days (Pre-Production) + 3 days (Production Phase 1) + 3 days (Production Phase 2) + 8 days (Post-Production) = **27.5 days**\n","Both responses correctly calculate the days for each phase and the total project duration, accurately applying the 8-hour workday rule and correctly handling parallel tasks by taking the longest duration. Both are well-structured, complete, and grounded in the context.",TIE


In [35]:
# Which response the judge preferred for each row.
eval_result.metrics_table["pairwise_question_answering_quality/pairwise_choice"]

Unnamed: 0,pairwise_question_answering_quality/pairwise_choice
0,TIE


In [36]:
# The judge's written rationale for each row's choice.
eval_result.metrics_table["pairwise_question_answering_quality/explanation"]

Unnamed: 0,pairwise_question_answering_quality/explanation
0,"Both responses correctly calculate the days for each phase and the total project duration, accurately applying the 8-hour workday rule and correctly handling parallel tasks by taking the longest duration. Both are well-structured, complete, and grounded in the context."


In [None]:
# Run pytest in quiet mode through the shell, from the notebook's working directory.
# NOTE(review): no test files are visible from this notebook — confirm what
# pytest is expected to collect here.
!pytest -q
