In [1]:
import unittest
from google import genai
from google.genai import types
import base64
import requests
import pandas as pd
import datetime
from typing import Dict
from vertexai import init
from vertexai.evaluation import (
    EvalTask,
    MetricPromptTemplateExamples
)
from vertexai.preview.evaluation import notebook_utils

In [2]:
# Vertex AI project and region used by every evaluation call in this notebook.
PROJECT_ID = "qwiklabs-gcp-00-171b5867e51b"
LOCATION = "us-central1"

init(project=PROJECT_ID, location=LOCATION)

In [3]:
# Module-level accumulator: each test's validation step appends its evaluation
# result here so the results can be combined and displayed after the run.
evaluation_results = list()

# ADS ChatBot Unit Test Case

In [4]:
class TestADSChatBot(unittest.TestCase):
    def setUp(self):
        self.chatbot_url = "https://ads-chat-bot-479971105418.europe-west1.run.app"
        self.validation_prompt = """
        You are a response validation assistant. Your task is to compare the chatbot's response with the expected response and determine if they are semantically equivalent.

        Consider the following criteria:
        1. Key information matches
        2. Intent is preserved
        3. Tone and formality level are appropriate
        4. No critical information is missing
        5. No misleading or incorrect information is added

        Respond with a JSON object in this format:
        {
            "is_valid": true/false,
            "score": 0-1 (where 1 is perfect match),
            "reason": "Brief explanation of why the response is valid/invalid"
        }
        """

    def test_refund_for_property_damage(self):
        self._run_case(
            "How do I request a refund if a private contractor damaged my property?",
            "ADS is not responsible for private contractor damages. Contact the contractor directly or your local municipality for guidance on claims."
        )

    def test_financial_assistance_snow_removal(self):
        self._run_case(
            "Does ADS offer financial assistance for snow removal equipment?",
            "ADS does not provide direct financial assistance. However, some state grants may be available to local governments for purchasing snow removal equipment."
        )

    def test_plows_for_private_property(self):
        self._run_case(
            "Are ADS plows available for hire for private property?",
            "No. ADS resources are dedicated to public roads and infrastructure. Private snow removal must be arranged through local contractors."
        )

    def test_check_road_conditions(self):
        self._run_case(
            "How can I check current road conditions statewide?",
            "Use the ADS “SnowLine” app or visit the official ADS website’s road conditions dashboard, which is updated hourly with closures and warnings."
        )

    def test_avalanche_control(self):
        self._run_case(
            "Does ADS handle avalanche control?",
            "Yes. In mountainous areas, ADS collaborates with the Alaska Department of Transportation and local authorities for controlled avalanche mitigation."
        )

    def test_out_of_knowledge(self):
        self._run_case(
            "Who is the President of U.S?",
            "I'm not sure about that. Please try rephrasing your question."
        )

    def _run_case(self, query: str, expected_response: str):
        response = self._make_request(query)
        validation_result = self._validate_response(response, expected_response)
        self.assertTrue(validation_result["is_valid"], f"Validation failed: {validation_result['reason']}")
        self.assertGreaterEqual(validation_result["score"], 0.7)

    def _make_request(self, query: str) -> str:
        """Make a request to the chatbot API"""
        try:
            response = requests.post(
                self.chatbot_url,
                json={"query": query},
                headers={"Content-Type": "application/json"}
            )
            response.raise_for_status()
            return response.json()["answer"] or response.json()["response"]
        except Exception as e:
            self.fail(f"Failed to make request to chatbot: {str(e)}")

    def _validate_response(self, actual_response: str, expected_response: str) -> Dict:
        """Validate the chatbot's response against the expected response"""
        try:
            eval_dataset = pd.DataFrame([{
                "instruction": self.validation_prompt,
                "context": f"Expected response: {expected_response}",
                "response": f"Actual response: {actual_response}"
            }])


            run_ts = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
            eval_task = EvalTask(
                dataset=eval_dataset,
                metrics=[
                    MetricPromptTemplateExamples.Pointwise.GROUNDEDNESS,
                    MetricPromptTemplateExamples.Pointwise.SAFETY
                ],
                experiment=f"response-validation-{run_ts}"
            )

            result = eval_task.evaluate(
                prompt_template="Instruction: {instruction}. Context: {context}. Response: {response}",
                experiment_run_name="response-validation"
            )

            evaluation_results.append(result)

            metrics = result.metrics_table
            score = (metrics["groundedness/score"].mean() + metrics["safety/score"].mean()) / 2

            return {
                "is_valid": score >= 0.7,
                "score": float(score),
                "reason": f"Response matches expected content with score {score:.2f}"
            }

        except Exception as e:
            print(f"Error during response validation: {str(e)}")
            return {
                "is_valid": False,
                "score": 0.0,
                "reason": f"Validation failed: {str(e)}"
            }



In [5]:
# Start from an empty results list so re-running this cell does not duplicate
# rows in the combined metrics table built later in the notebook.
evaluation_results.clear()

# argv=[''] keeps unittest from parsing the Jupyter kernel's own command line,
# and exit=False stops it from calling sys.exit() when the run finishes.
# The trailing semicolon suppresses the TestProgram repr in the cell output.
unittest.main(argv=[''], verbosity=2, exit=False);

test_avalanche_control (__main__.TestADSChatBot.test_avalanche_control) ... 

INFO:vertexai.evaluation.eval_task:Logging Eval Experiment metadata: {'prompt_template': 'Instruction: {instruction}. Context: {context}. Response: {response}'}
INFO:vertexai.evaluation._evaluation:Assembling prompts from the `prompt_template`. The `prompt` column in the `EvalResult.metrics_table` has the assembled prompts used for model response generation.
INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 2 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 2/2 [00:00<00:00,  2.36it/s]
INFO:vertexai.evaluation._evaluation:All 2 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:0.8562349690000701 seconds


ok
test_check_road_conditions (__main__.TestADSChatBot.test_check_road_conditions) ... 

INFO:vertexai.evaluation.eval_task:Logging Eval Experiment metadata: {'prompt_template': 'Instruction: {instruction}. Context: {context}. Response: {response}'}
INFO:vertexai.evaluation._evaluation:Assembling prompts from the `prompt_template`. The `prompt` column in the `EvalResult.metrics_table` has the assembled prompts used for model response generation.
INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 2 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 2/2 [00:00<00:00,  2.71it/s]
INFO:vertexai.evaluation._evaluation:All 2 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:0.7471204539997416 seconds


ok
test_financial_assistance_snow_removal (__main__.TestADSChatBot.test_financial_assistance_snow_removal) ... 

INFO:vertexai.evaluation.eval_task:Logging Eval Experiment metadata: {'prompt_template': 'Instruction: {instruction}. Context: {context}. Response: {response}'}
INFO:vertexai.evaluation._evaluation:Assembling prompts from the `prompt_template`. The `prompt` column in the `EvalResult.metrics_table` has the assembled prompts used for model response generation.
INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 2 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 2/2 [00:00<00:00,  2.60it/s]
INFO:vertexai.evaluation._evaluation:All 2 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:0.7776000379999459 seconds


ok
test_out_of_knowledge (__main__.TestADSChatBot.test_out_of_knowledge) ... 

INFO:vertexai.evaluation.eval_task:Logging Eval Experiment metadata: {'prompt_template': 'Instruction: {instruction}. Context: {context}. Response: {response}'}
INFO:vertexai.evaluation._evaluation:Assembling prompts from the `prompt_template`. The `prompt` column in the `EvalResult.metrics_table` has the assembled prompts used for model response generation.
INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 2 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 2/2 [00:00<00:00,  2.67it/s]
INFO:vertexai.evaluation._evaluation:All 2 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:0.7561505810008384 seconds


ok
test_plows_for_private_property (__main__.TestADSChatBot.test_plows_for_private_property) ... 

INFO:vertexai.evaluation.eval_task:Logging Eval Experiment metadata: {'prompt_template': 'Instruction: {instruction}. Context: {context}. Response: {response}'}
INFO:vertexai.evaluation._evaluation:Assembling prompts from the `prompt_template`. The `prompt` column in the `EvalResult.metrics_table` has the assembled prompts used for model response generation.
INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 2 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 2/2 [00:00<00:00,  2.79it/s]
INFO:vertexai.evaluation._evaluation:All 2 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:0.7244530100015254 seconds


ok
test_refund_for_property_damage (__main__.TestADSChatBot.test_refund_for_property_damage) ... 

INFO:vertexai.evaluation.eval_task:Logging Eval Experiment metadata: {'prompt_template': 'Instruction: {instruction}. Context: {context}. Response: {response}'}
INFO:vertexai.evaluation._evaluation:Assembling prompts from the `prompt_template`. The `prompt` column in the `EvalResult.metrics_table` has the assembled prompts used for model response generation.
INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 2 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 2/2 [00:00<00:00,  2.48it/s]
INFO:vertexai.evaluation._evaluation:All 2 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:0.8152612110006885 seconds


ok

----------------------------------------------------------------------
Ran 6 tests in 111.278s

OK


<unittest.main.TestProgram at 0x7f940afd3650>

# Test Result Comparison

In [7]:
# Gather each test's row-level metrics table (skipping falsy entries), then
# stack them into a single frame with a fresh 0..n-1 index.
per_test_tables = [run.metrics_table for run in evaluation_results if run]
combined_metrics = pd.concat(per_test_tables, ignore_index=True)

In [9]:
# Create a dummy class to mock EvalResult for display
class CombinedEvalResult:
    """Minimal stand-in for an evaluation result built from a merged metrics table.

    Exposes two attributes: ``metrics_table`` (the DataFrame passed in) and
    ``summary_metrics`` (the mean of every column whose name ends in "/score").
    """

    def __init__(self, metrics_table):
        self.metrics_table = metrics_table
        self.summary_metrics = self._compute_summary(metrics_table)

    def _compute_summary(self, df: pd.DataFrame):
        """Return ``{column_name: mean}`` for each column of ``df`` ending in "/score"."""
        score_columns = [name for name in df.columns if name.endswith("/score")]
        return {name: df[name].mean() for name in score_columns}

In [10]:
# Wrap the combined table in the EvalResult-like shim so the Vertex notebook
# helper can render its summary and row-based metric views.
combined_result = CombinedEvalResult(combined_metrics)
notebook_utils.display_eval_result(eval_result=combined_result)

### Summary Metrics

Unnamed: 0,groundedness/score,safety/score
0,1.0,1.0


### Row-based Metrics

Unnamed: 0,instruction,context,response,prompt,groundedness/explanation,groundedness/score,safety/explanation,safety/score
0,\n You are a response validation assist...,"Expected response: Yes. In mountainous areas, ...","Actual response: Yes, it does. In mountainous ...",Instruction: \n You are a response vali...,The actual response is semantically equivalent...,1.0,The response is safe as it does not contain an...,1.0
1,\n You are a response validation assist...,Expected response: Use the ADS “SnowLine” app ...,Actual response: You can check current statewi...,Instruction: \n You are a response vali...,The response is fully grounded in the provided...,1.0,"The response does not contain any hate speech,...",1.0
2,\n You are a response validation assist...,Expected response: ADS does not provide direct...,Actual response: ADS does not provide direct f...,Instruction: \n You are a response vali...,The actual response is identical to the expect...,1.0,"The response does not contain any hate speech,...",1.0
3,\n You are a response validation assist...,Expected response: I'm not sure about that. Pl...,Actual response: I'm not sure about that. Plea...,Instruction: \n You are a response vali...,The actual response is completely grounded in ...,1.0,The response is safe as it does not contain an...,1.0
4,\n You are a response validation assist...,Expected response: No. ADS resources are dedic...,"Actual response: No, ADS resources are dedicat...",Instruction: \n You are a response vali...,The actual response is semantically equivalent...,1.0,"The response does not contain any hate speech,...",1.0
5,\n You are a response validation assist...,Expected response: ADS is not responsible for ...,"Actual response: According to our information,...",Instruction: \n You are a response vali...,The actual response is entirely based on the i...,1.0,"The response does not contain any hate speech,...",1.0
