Same Generate Functions as Task 1

In [13]:
from google import genai
from google.genai import types
import base64


def generate_contents(chat_history):
    """
    Turn a structured chat history into a list of Gemini `types.Content` messages.

    Args:
        chat_history (list): Dicts that each provide a 'role' and a 'content' key.

    Returns:
        list[types.Content]: One Content per history entry, in the same order,
        each holding a single text part built from the entry's 'content'.
    """
    messages = []
    for turn in chat_history:
        speaker = turn["role"]
        text_part = types.Part.from_text(text=turn["content"])
        messages.append(types.Content(role=speaker, parts=[text_part]))
    return messages


def generate(content, instructions,temp=0):
  """
  Stream a Gemini response for the given contents and return it as one string.

  A new Vertex AI client is created on every call, using the project and
  location hard-coded below.

  Args:
      content: Message sequence passed to the model as `contents`
          (for example the list built by generate_contents).
      instructions (str): Text sent as the system instruction.
      temp: Sampling temperature; defaults to 0.

  Returns:
      str: All streamed text pieces concatenated, with leading and trailing
      whitespace stripped. An empty string if no chunk carried any text.
  """
  client = genai.Client(
      vertexai=True,
      project="qwiklabs-gcp-02-c706fd6470f9",
      location="us-central1",
  )

  model = "gemini-2.0-flash-001" #Use flash because its fast

  generate_content_config = types.GenerateContentConfig(
    temperature = temp, #0 Temp is better for classification
    top_p = 1,
    seed = 0,
    max_output_tokens = 8192,
    safety_settings = [types.SafetySetting(
      category="HARM_CATEGORY_HATE_SPEECH",
      threshold="BLOCK_LOW_AND_ABOVE"
    ),types.SafetySetting(
      category="HARM_CATEGORY_DANGEROUS_CONTENT",
      threshold="BLOCK_LOW_AND_ABOVE"
    ),types.SafetySetting(
      category="HARM_CATEGORY_SEXUALLY_EXPLICIT",
      threshold="BLOCK_LOW_AND_ABOVE"
    ),types.SafetySetting(
      category="HARM_CATEGORY_HARASSMENT",
      threshold="BLOCK_LOW_AND_ABOVE"
    )],
    system_instruction=[types.Part.from_text(text=instructions)],
  )

  text_pieces = []
  for chunk in client.models.generate_content_stream(
    model = model,
    contents = content,
    config = generate_content_config,
    ):
    # chunk.text may be None when a streamed chunk has no text parts;
    # concatenating None onto a str would raise TypeError, so skip those.
    piece = chunk.text
    if piece:
      text_pieces.append(piece)
  return "".join(text_pieces).strip()

Evaluate User Question

In [2]:
# System instruction for the question classifier: names the four allowed
# categories and asks for the bare category name as the only output.
user_question_prompt1 = "\n".join([
    "You are a classification assistant for civic technology services.",
    "Classify the following question into exactly one of the following categories:",
    "- Employment",
    "- General Information",
    "- Emergency Services",
    "- Tax Related",
    "Respond ONLY with the category name.",
])

In [3]:
def classify_user_question(question: str,system_instruction=user_question_prompt1) -> str:
    """
    Ask the model to classify a user question.

    The question is sent as a single user turn; `system_instruction`
    (by default the prompt listing Employment, General Information,
    Emergency Services and Tax Related) steers the answer.

    Returns:
        str: The model's reply as returned by generate().
    """
    user_turn = {"role": "user", "content": question}
    return generate(generate_contents([user_turn]), system_instruction)


Generating the Social Media Post

In [4]:
# System instruction for the social-post generator: sets the tone and
# asks the model to stay under 280 characters.
social_post_prompt1 = " ".join([
    "You are a public information assistant.",
    "Generate a clear, informative, and professional social media post",
    "for a government agency announcement.",
    "Keep it under 280 characters.",
])

In [5]:
def generate_social_post(event_type: str, details: str,system_instruction=social_post_prompt1) -> str:
    """
    Produce a social media post for a government announcement.

    The event type and details are combined into one user message and sent
    to the model together with `system_instruction`.

    Returns:
        str: The model's reply as returned by generate().
    """
    user_turn = {
        "role": "user",
        "content": f"Event Type: {event_type}\nDetails: {details}",
    }
    return generate(generate_contents([user_turn]), system_instruction)


Tests Below

In [26]:
import pytest
def compare_contents(output1: str, output2: str) -> str:
    """
    Ask Gemini whether two pieces of text mean the same thing.

    Both texts are placed in one user message; the model is instructed to
    answer only 'yes' or 'no'.

    Returns:
        str: 'yes' when the lower-cased, stripped reply starts with "yes",
        otherwise 'no'.
    """
    judge_instruction = (
        "You are a comparison assistant. Determine if the two outputs below are semantically similar "
        "and express the same core meaning. Only respond with 'yes' or 'no'."
    )

    judge_request = f"""
Are the following two outputs similar enough in meaning?

Output 1:
{output1}

Output 2:
{output2}

Only respond with 'yes' or 'no'.
"""

    user_turn = {"role": "user", "content": judge_request}
    reply = generate(generate_contents([user_turn]), instructions=judge_instruction)
    verdict = reply.strip().lower()

    # Anything that does not begin with "yes" is treated as a "no".
    return "yes" if verdict.startswith("yes") else "no"



def test_classify_user_question():
    """
    Check the classifier on a job-application question in two ways.

    1. Static check: the reply must be one of the four allowed category names.
    2. GenAI check: compare_contents must judge the same reply to match "Employment".
    """
    category = classify_user_question("How do I apply for a government job?")
    assert category in ["Employment", "General Information", "Emergency Services", "Tax Related"] #Static Evaluation
    print("Assert passed: "+category)
    # Judge the reply that was just validated instead of calling the model a
    # second time, so both assertions are about the same value.
    comp=compare_contents(category, "Employment")
    assert 'yes'==comp #GenAI Evaluation
    print("Assert passed, category matched as expected: "+category)

def test_generate_social_post():
    """
    Generate a school-closing post and check that it is shorter than 300 characters.
    """
    event = "School Closing"
    info = "Schools will be closed due to snow on Friday."
    post = generate_social_post(event, info)
    # NOTE(review): the default prompt asks for under 280 characters while this
    # check allows up to 299 - confirm the extra slack is intentional.
    assert 300 > len(post)
    print("Assert passed: "+post)


# Run both checks directly in the notebook; each one calls the model and
# raises AssertionError if a check fails.
test_classify_user_question()
test_generate_social_post()

Assert passed: Employment
yes
Assert passed: ❄️ SCHOOL CLOSURE ALERT ❄️ All schools will be closed this Friday due to inclement weather. Stay safe and enjoy the snow! #SchoolClosure #SnowDay


Evaluation API Testing

In [28]:
import pandas as pd
from vertexai.preview.evaluation import EvalTask, MetricPromptTemplateExamples
import vertexai

# Project used for the Vertex AI evaluation calls in this section.
PROJECT_ID = "qwiklabs-gcp-02-c706fd6470f9"

# Initialise the Vertex AI SDK for that project in us-central1.
vertexai.init(
    location="us-central1",
    project=PROJECT_ID,
)


def generate_eval_data_for_classification(question: str, instructions: list) -> pd.DataFrame:
    """
    Build an evaluation DataFrame for one question across several instructions.

    Each instruction is used as the system instruction for
    classify_user_question, and the result is recorded as one row.

    Args:
        question: The user question to classify.
        instructions: System-instruction variants to try, in order.

    Returns:
        pd.DataFrame: One row per instruction with the columns
        'instruction', 'context' (the question) and 'response'.
    """
    return pd.DataFrame([
        {
            "instruction": prompt_variant,
            "context": question,
            "response": classify_user_question(question, system_instruction=prompt_variant),
        }
        for prompt_variant in instructions
    ])


# The single question every instruction variant is evaluated against.
question = "How do I apply for a government job?"

# Three phrasings of the classification instruction to compare.
instructions = [
    "Classify this question into one of: Employment, General Information, Emergency Services, or Tax Related.",
    "Look at this question and return one of: Employment, General Information, Emergency Services, or Tax Related. Please return just a value such as Employment, do not include any markdown or other info",
    "Is this Employment, General Information, Emergency Services, or Tax Related?",
]

# One row per instruction, holding the model's response to the question.
eval_dataset = generate_eval_data_for_classification(question, instructions)

# Template that combines each dataset row's instruction, question and
# response into the prompt used for evaluation.
prompt_template = (
    "Instruction: {instruction}\n"
    "Question: {context}\n"
    "Model classification: {response}"
)

# Pointwise metrics computed for every row of the dataset.
eval_task = EvalTask(
    dataset=eval_dataset,
    metrics=[
        # Did the response do what the instruction asked?
        MetricPromptTemplateExamples.Pointwise.INSTRUCTION_FOLLOWING,
        # Is the text high quality rather than random?
        MetricPromptTemplateExamples.Pointwise.TEXT_QUALITY,
        # Classification answers should not be verbose.
        MetricPromptTemplateExamples.Pointwise.VERBOSITY,
    ],
)

result = eval_task.evaluate(prompt_template=prompt_template)

# Print the aggregate metrics, one "name: value" pair per line.
print("📊 Summary Metrics:\n")
for key, value in result.summary_metrics.items():
    print(f"{key}: \t{value}")

print("\n📋 Detailed Table:\n")
# Remove pandas' column-count and column-width limits and disable frame
# wrapping so the table is not truncated. These are global pandas options.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.expand_frame_repr", False)

# Print the per-row metrics table as plain text.
print(result.metrics_table)
# NOTE(review): prompt_template is rebound to a social-post template after the
# evaluation above has already run, and nothing in this cell reads the new
# value - presumably meant for a later evaluation; confirm or remove.
prompt_template = (
    "Event type: {instruction}\n"
    "Details: {context}\n"
    "Generated Post: {response}"
)




INFO:vertexai.preview.evaluation._evaluation:Assembling prompts from the `prompt_template`. The `prompt` column in the `EvalResult.metrics_table` has the assembled prompts used for model response generation.
INFO:vertexai.preview.evaluation._evaluation:Computing metrics with a total of 9 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 9/9 [00:08<00:00,  1.04it/s]
INFO:vertexai.preview.evaluation._evaluation:All 9 metric requests are successfully computed.
INFO:vertexai.preview.evaluation._evaluation:Evaluation Took:8.69548723600019 seconds


📊 Summary Metrics:

row_count: 	3
instruction_following/mean: 	5.0
instruction_following/std: 	0.0
text_quality/mean: 	5.0
text_quality/std: 	0.0
verbosity/mean: 	0.0
verbosity/std: 	0.0

📋 Detailed Table:

                                                                                                                                                                                              instruction                               context                         response                                                                                                                                                                                                                                                                                                 prompt                                                                                     instruction_following/explanation  instruction_following/score                                                                               