In [20]:
from google import genai
from google.genai import types
import base64
import pytest
import vertexai
from vertexai.generative_models import GenerativeModel
from vertexai.evaluation import (
    MetricPromptTemplateExamples,
    EvalTask,
    PairwiseMetric,
    PairwiseMetricPromptTemplate,
    PointwiseMetric,
    PointwiseMetricPromptTemplate,
)
import datetime
import pandas as pd
import pytest

def generate_response(user_input, sys_instruction):

  client = genai.Client(
      vertexai=True,
      project="qwiklabs-gcp-03-28c3125acb2b",
      location="us-central1",
  )

  model = "gemini-2.0-flash-001"

  contents = [
    types.Content(
      role="user",
      parts=[
        types.Part.from_text(text = user_input)
      ]
    )
  ]

  generate_content_config = types.GenerateContentConfig(
    temperature = 1,
    top_p = 0.95,
    max_output_tokens = 8192,
    response_modalities = ["TEXT"],
    speech_config = types.SpeechConfig(
      voice_config = types.VoiceConfig(
        prebuilt_voice_config = types.PrebuiltVoiceConfig(
          voice_name = "zephyr"
        )
      ),
    ),
    system_instruction=[types.Part.from_text(text=sys_instruction)],
  )

  output_string = ""

  for chunk in client.models.generate_content_stream(
    model = model,
    contents = contents,
    config = generate_content_config,
    ):
    #print(chunk.text, end="")
    output_string = output_string + chunk.text

  return output_string

def classify_question(user_question):

  # Provides the classification of a question in one of the types of Employment, General Information, Emergency Services, or Tax Related

  sys_instruction = """
    You are a query classification agent that will provide an answer back on the type of user question provided to you.
    This is in support of another agent providing answers to employees. But your focus is ONLY on classifying the type of query.

    These are the four possible results you should return (and please return a single word only for the best fitting category):
    Employment, General Information, Emergency Services, or Tax Related
  """

  classification = generate_response(user_question, sys_instruction).replace('\n', ' ').replace('\r', '').rstrip()

  return classification

def generate_tweet_announcement(user_prompt):

  # Generates a professional sounding tweet to announce something for a government agency
  sys_instruction = """
    You are an agent that specializes in making short tweets for special announcements for a government agency. The tweets should professional and should relate to the context provided to you in the user prompt Pleae make sure the response is highly accurate relative to the input prompt.

    The expected output is the tweet only, less than 100 characters.
  """

  tweet = generate_response(user_prompt, sys_instruction)

  return tweet


def test_classify_question_basic():
    user_input = "I have a question about my W-2 tax form, where can I find it?"
    print(f"Testing with input: '{user_input}'")
    try:
        result = classify_question(user_input)
        print(f"Result received: '{result}'")
        # Single, super simple assertion: Check if the result string has content
        assert result == "Tax Related", "classify_question returned Tax Related as expected"
        print("classify_question test PASSED: Returned non-empty string.")
    except Exception as e:
        print(f"classify_question test FAILED: An error occurred - {e}")
    print("--- classify_question Test Finished ---")


def test_generate_tweet_announcement_basic():
    user_prompt = "There will be a public meeting on Tuesday at 10 AM about the new park."
    print(f"Testing with input: '{user_prompt}'")

    try:
        result = generate_tweet_announcement(user_prompt)
        print(f"Result received: '{result}'")
        # Single, super simple assertion: Check if the result string has content
        assert len(result) < 100, "generate_tweet_announcement returned an empty string"
        print("generate_tweet_announcement test PASSED: String length is less than 100")

    except Exception as e:
        print(f"generate_tweet_announcement test FAILED: An error occurred - {e}")

    print("--- generate_tweet_announcement Test Finished ---")

def classify_eval():

    # Classification Questions Evaluation for groundedness and coherence
  classification_questions = [
      "Who might be able to help me if I see an employee experiencing a heart attack?",
      "Who might help me understand the tax impact of purchasing a home this year?",
      "How can I provide evidence of my employment status?"
  ]

  classifications =[]

  for prompt in classification_questions:
    classification = generate_tweet_announcement(prompt)
    classifications.append(classification)

  # Eval Dataset Setup
  eval_dataset = pd.DataFrame({
    "prompt": classifications,
  })

  eval_task = EvalTask(
    dataset=eval_dataset,
    metrics=[
        MetricPromptTemplateExamples.Pointwise.GROUNDEDNESS,
        MetricPromptTemplateExamples.Pointwise.COHERENCE
    ],
    experiment="question-classification",
  )

  model = GenerativeModel(
    "gemini-2.0-flash-001",
    generation_config={
        "temperature": 0,
        "top_p": 0.4,
    },
  )

  run_ts = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
  eval_result = eval_task.evaluate(
    model=model,
    experiment_run_name=f"apt-gen-{run_ts}"
  )

  eval_result.metrics_table
  print(eval_result.metrics_table)

def tweet_eval():

  # Tweet Evaluation for groundedness and coherence
  tweet_topics = [
      "Signficant chance of snowfall after 8:00 p.m., likely to reach 6-10 inches",
      "There is going to be a community service day this Saturday in the morning",
      "Please note there is a change of policy on snow removal, priority will now be on roads close to emergency services, then residential"
  ]

  tweets =[]

  for prompt in tweet_topics:
    tweet = generate_tweet_announcement(prompt)
    tweets.append(tweet)

  # Eval Dataset Setup
  eval_dataset = pd.DataFrame({
    "prompt": tweets,
  })

  eval_task = EvalTask(
    dataset=eval_dataset,
    metrics=[
        MetricPromptTemplateExamples.Pointwise.GROUNDEDNESS,
        MetricPromptTemplateExamples.Pointwise.COHERENCE
    ],
    experiment="tweet-generation",
  )

  model = GenerativeModel(
    "gemini-2.0-flash-001",
    generation_config={
        "temperature": 0,
        "top_p": 0.4,
    },
  )

  run_ts = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
  eval_result = eval_task.evaluate(
    model=model,
    experiment_run_name=f"apt-gen-{run_ts}"
  )

  eval_result.metrics_table
  print(eval_result.metrics_table)

def main():

  # Run basic unit tests for classifying question and producing tweets
  test_classify_question_basic()
  test_generate_tweet_announcement_basic()

  # Run Evaluation Tests for classifying questions and producing tweets
  classify_eval()
  tweet_eval()

if __name__ == "__main__":
    main()

Testing with input: 'I have a question about my W-2 tax form, where can I find it?'
Result received: 'Tax Related'
classify_question test PASSED: Returned non-empty string.
--- classify_question Test Finished ---
Testing with input: 'There will be a public meeting on Tuesday at 10 AM about the new park.'
Result received: 'Public meeting Tuesday at 10 AM to discuss the new park.
'
generate_tweet_announcement test PASSED: String length is less than 100
--- generate_tweet_announcement Test Finished ---


INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/416100490526/locations/us-central1/metadataStores/default/contexts/question-classification-apt-gen-20250428-202533 to Experiment: question-classification


INFO:vertexai.evaluation.eval_task:Logging Eval Experiment metadata: {'model_name': 'publishers/google/models/gemini-2.0-flash-001', 'temperature': 0, 'top_p': 0.4}
INFO:vertexai.evaluation._evaluation:Generating a total of 3 responses from Gemini model gemini-2.0-flash-001.
100%|██████████| 3/3 [00:03<00:00,  1.07s/it]
INFO:vertexai.evaluation._evaluation:All 3 responses are successfully generated from Gemini model gemini-2.0-flash-001.
INFO:vertexai.evaluation._evaluation:Multithreaded Batch Inference took: 3.2241659620012797 seconds.
INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 6 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 6/6 [00:06<00:00,  1.02s/it]
INFO:vertexai.evaluation._evaluation:All 6 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:6.1634453179976845 seconds


                                              prompt  \
0  Call 9-1-1 immediately if you think someone is...   
1  Check IRS Publication 530, "Tax Information fo...   
2  Acceptable documents include pay stubs, W-2 fo...   

                                            response  \
0  Okay, I understand. If I think someone is havi...   
1  Okay, I understand. You're suggesting that any...   
2  This is a clear and concise statement from the...   

                            groundedness/explanation  groundedness/score  \
0  The response is fully grounded, as it only rep...                 1.0   
1  The response provides a lot of information tha...                 0.0   
2  The response only contains information present...                 1.0   

                               coherence/explanation  coherence/score  
0  The response perfectly and coherently acknowle...              5.0  
1  The response is highly coherent, providing a c...              5.0  
2  The response provides a cl

INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/416100490526/locations/us-central1/metadataStores/default/contexts/tweet-generation-apt-gen-20250428-202545 to Experiment: tweet-generation


INFO:vertexai.evaluation.eval_task:Logging Eval Experiment metadata: {'model_name': 'publishers/google/models/gemini-2.0-flash-001', 'temperature': 0, 'top_p': 0.4}
INFO:vertexai.evaluation._evaluation:Generating a total of 3 responses from Gemini model gemini-2.0-flash-001.
100%|██████████| 3/3 [00:01<00:00,  1.60it/s]
INFO:vertexai.evaluation._evaluation:All 3 responses are successfully generated from Gemini model gemini-2.0-flash-001.
INFO:vertexai.evaluation._evaluation:Multithreaded Batch Inference took: 1.884751908997714 seconds.
INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 6 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 6/6 [00:06<00:00,  1.02s/it]
INFO:vertexai.evaluation._evaluation:All 6 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:6.115599148000911 seconds


                                              prompt  \
0  Winter Weather Alert: Heavy snowfall expected ...   
1  Join us for Community Service Day this Saturda...   
2  Snow removal priority change: Emergency routes...   

                                            response  \
0  Okay, I understand. A winter weather alert is ...   
1  Okay, I'm in! To help me plan, could you tell ...   
2  Okay, I understand. The new snow removal prior...   

                            groundedness/explanation  groundedness/score  \
0  The response includes information such as "Avo...                 0.0   
1  The response only uses information provided in...                 1.0   
2  The response accurately reflects the informati...                 1.0   

                               coherence/explanation  coherence/score  
0  The response has a logical flow and is expertl...              5.0  
1  The response is completely coherent as it logi...              5.0  
2  The response is completely