In [1]:
import json
import os
import re
from pathlib import Path
from typing import *

import requests
from devtools import debug
from google import genai
from google.genai import types
from IPython.display import Markdown
from pydantic import BaseModel, Field, ConfigDict

### Config

#### Product to Research
Most important config setting! Change to the product category you wish to study.
Also specify the number of metrics and the number of competitor products to consider.

All subsequent notebooks will use PRODUCT to determine which analysis session folder
to use!

In [2]:
# Product category under study; it also names the per-product session folder.
PRODUCT = "earbuds"

# How many design metrics / competitor products the prompts ask the model for.
NUM_METRICS = 3
NUM_COMPETITORS = 3

# Working directory for this product's artefacts, created up front together
# with any missing parent folders (no error if it already exists).
DATA_DIR = Path("session", PRODUCT)
DATA_DIR.mkdir(exist_ok=True, parents=True)

#### API Keys and Model
See <https://ai.google.dev/gemini-api/docs/api-key>

In [3]:
# Read the Gemini API key from the GOOGLE_API_KEY environment variable instead
# of hardcoding it, so the secret never ends up in version control or in shared
# copies of this notebook.
# SECURITY: a literal key used to be committed in this cell. Treat that key as
# compromised and revoke it in Google AI Studio.
# If the variable is unset this is None; client construction then has to find
# credentials on its own or fail.
GOOGLE_AI_KEY = os.environ.get("GOOGLE_API_KEY")
# 2.0 Flash since free 1500 RPD and Gemma's structured output is disabled.
GOOGLE_AI_MODEL = "gemini-2.0-flash"

### Operations

#### Setup

In [4]:
# Gemini API client authenticated with GOOGLE_AI_KEY; the model calls in the
# cells below all go through this one instance.
client = genai.Client(api_key=GOOGLE_AI_KEY)

#### Determining Key Metrics
As part of the automated process, we prompt a model to brainstorm key metrics for the product category. This serves as a starting point by which to analyze competitor products.

In [5]:
# Structured reply expected from the metrics-brainstorm prompt.
# Field order is deliberate: `rationale` comes before `metrics` so the model
# has to write out its reasoning first and only then commit to the answer.
# (Documented with comments rather than a class docstring, since a docstring
# would be emitted into the JSON schema that gets embedded in the prompt.)
class MetricsResult(BaseModel):
    category: Literal[PRODUCT] = Field(
        description="Category that was analyzed.",
    )
    rationale: str = Field(
        description="Detailed rationale for picking the below metrics.",
    )
    metrics: List[str] = Field(
        min_length=NUM_METRICS,
        max_length=NUM_METRICS,
        description="Concise metrics to focus on.",
    )

In [6]:
# Prompt for the metrics-brainstorm step. It is an f-string, so the response
# schema, NUM_METRICS and PRODUCT are baked in when this cell runs. A trailing
# backslash joins a wrapped source line onto the next one, so it does not add a
# newline to the prompt text.
# NOTE(review): the schema is interpolated via str() of a dict, so it reaches
# the model as a Python literal (single quotes) rather than strict JSON.
prompt_metrics = f"""\
### Job Description
You are a market research analyst. You will be given a product category, and from \
that should determine the most relevant design metrics for it. These design \
metrics will subsequently be used by the engineering team to evaluate existing products \
and generate key design requirements for our new product to beat the competition.

For example, given the product category "laptop", the top 5 design metrics \
could be:
1. battery life
2. performance
3. portability
4. durability
5. keyboard quality

Another example, given the product category "accounting app", the top 3 design metrics \
could be:
1. ease of use
2. data security
3. integration with other tools

Note that we are focused solely on design metrics, hence other metrics like "customer support" \
and "price" are irrelevant as they cannot be met through design engineering efforts.

Also, try to avoid overlapping metrics. For example, "battery life" and "screen time" \
which are both related to battery performance should not be included together.

### Response Schema
{MetricsResult.model_json_schema()}

### Task
Use the JSON schema specified above to provide the top **{NUM_METRICS}** design metrics \
for product category "{PRODUCT}".\
"""

# Render the final prompt as Markdown so it can be reviewed before it is sent.
display(Markdown(prompt_metrics))

### Job Description
You are a market research analyst. You will be given a product category, and from that should determine the most relevant design metrics for it. These design metrics will subsequently be used by the engineering team to evaluate existing products and generate key design requirements for our new product to beat the competition.

For example, given the product category "laptop", the top 5 design metrics could be:
1. battery life
2. performance
3. portability
4. durability
5. keyboard quality

Another example, given the product category "accounting app", the top 3 design metrics could be:
1. ease of use
2. data security
3. integration with other tools

Note that we are focused solely on design metrics, hence other metrics like "customer support" and "price" are irrelevant as they cannot be met through design engineering efforts.

Also, try to avoid overlapping metrics. For example, "battery life" and "screen time" which are both related to battery performance should not be included together.

### Response Schema
{'properties': {'category': {'const': 'earbuds', 'description': 'Category that was analyzed.', 'title': 'Category', 'type': 'string'}, 'rationale': {'description': 'Detailed rationale for picking the below metrics.', 'title': 'Rationale', 'type': 'string'}, 'metrics': {'description': 'Concise metrics to focus on.', 'items': {'type': 'string'}, 'maxItems': 3, 'minItems': 3, 'title': 'Metrics', 'type': 'array'}}, 'required': ['category', 'rationale', 'metrics'], 'title': 'MetricsResult', 'type': 'object'}

### Task
Use the JSON schema specified above to provide the top **3** design metrics for product category "earbuds".

In [7]:
# Ask the model for the key metrics using structured (JSON) output, so the
# reply can be parsed directly into a MetricsResult.
resp = client.models.generate_content(
    model=GOOGLE_AI_MODEL,
    contents=prompt_metrics,
    config=types.GenerateContentConfig(
        # snake_case field names, matching `response_modalities` used in the
        # competitor-search cell.
        response_mime_type="application/json",
        response_schema=MetricsResult,
    ),
)
result_metrics = resp.parsed

# Fail here with a readable message instead of with an AttributeError in a
# later cell when the reply could not be parsed into the schema.
if not isinstance(result_metrics, MetricsResult):
    raise ValueError(
        f"Model reply could not be parsed into MetricsResult: {resp.text!r}"
    )

debug(result_metrics)

/tmp/ipykernel_2873574/269700729.py:10 <module>
    result_metrics: MetricsResult(
        category='earbuds',
        rationale=(
            'When designing earbuds, the most important factors to consider are audio quality, comfort, and battery li'
            'fe. Audio quality determines the listening experience. Comfort affects how long users can wear the earbud'
            's. Battery life dictates how long users can use the earbuds before charging.'
        ),
        metrics=[
            'audio quality',
            'comfort',
            'battery life',
        ],
    ) (MetricsResult)


MetricsResult(category='earbuds', rationale='When designing earbuds, the most important factors to consider are audio quality, comfort, and battery life. Audio quality determines the listening experience. Comfort affects how long users can wear the earbuds. Battery life dictates how long users can use the earbuds before charging.', metrics=['audio quality', 'comfort', 'battery life'])

#### Finding Competitors
The goal is to prompt the LLM to find interesting, distinct competitors in different
niches. For example, after choosing Apple iPhone 16 as a competitor to study, if the LLM has
to choose between Samsung S23 (similar to Apple) and a more niche brand like Nothing
Phone (3a), it should pick the latter to get a more diverse set of competitors.

In [8]:
# One competitor product plus the web page it was sourced from.
# (Comments instead of a class docstring, so the generated JSON schema that is
# embedded in the prompt stays unchanged.)
class CompetitorProduct(BaseModel):
    name: str = Field(
        description="Full name of the product.",
    )
    reference: str = Field(
        description="Website URL referenced for the product.",
    )
    reference_title: str = Field(
        description="Title of the reference website.",
    )
    reference_summary: str = Field(
        description="Summary of the reference website's contents.",
    )

# Structured reply expected from the competitor-search step: the category,
# the reasoning, then exactly NUM_COMPETITORS products.
# (Comments instead of a class docstring, so the generated JSON schema that is
# embedded in the prompt stays unchanged.)
class CompetitionResult(BaseModel):
    category: Literal[PRODUCT] = Field(
        description="Product category that was analyzed.",
    )
    rationale: str = Field(
        description="Detailed rationale for picking the below competitors' products.",
    )
    products: List[CompetitorProduct] = Field(
        min_length=NUM_COMPETITORS,
        max_length=NUM_COMPETITORS,
        description="Each selected product.",
    )

In [9]:
# Prompt for the competitor-search step. It is an f-string, so the response
# schema, NUM_COMPETITORS and PRODUCT are baked in when this cell runs. A
# trailing backslash joins a wrapped source line onto the next one, so it does
# not add a newline to the prompt text.
prompt_competition = f"""\
### Job Description
You are a market research analyst. You will be given a product category, and from \
that should determine the most relevant competitors' products to analyze. \
These competitors' products will subsequently be used by the engineering team to \
generate key design requirements for our new product to beat the competition.

For example, given the product category "laptop", the top 5 competitors' products \
could be:
1. Dell XPS 13 (2023)
2. MacBook Air (M3, 2024)
3. Lenovo ThinkPad X1 Carbon Gen 13
4. HP Spectre x360 (2017)
5. ASUS ZenBook 13 (2024)

Another example, given the product category "accounting app", the top 3 competitors' products \
could be:
1. QuickBooks
2. Xero
3. FreshBooks

Note that diversity of products is key here. It is especially good if the products \
fill different niches in the product category.

Also, try to avoid overlapping products. For example, "Dell XPS 13 (2013)" and "Dell XPS 15 (2013)" \
which are both from the same product line should not be included together. The only \
exception to this rule is if the product line is very different from each other, \
for example, "Lenovo Yoga 9i Gen 8" and "Lenovo Legion 7 Gen 8" are both laptops, but \
one is an ultraportable and the other is a gaming laptop, so they can be included together.

Where possible, include the full product name like (Vendor), (Product Name), (Generation or Year).

To ensure currency, please do a web search to find recent products on review or comparison sites. \
For example, you can search for "best laptops this year" or "best accounting apps this year", \
though try and come up with better product-specific search terms.

### Response Schema
{CompetitionResult.model_json_schema()}

### Task
Use the JSON schema specified above to provide **{NUM_COMPETITORS}** competitor products \
that are in product category "{PRODUCT}".\
"""

# Render the final prompt as Markdown so it can be reviewed before it is sent.
display(Markdown(prompt_competition))

### Job Description
You are a market research analyst. You will be given a product category, and from that should determine the most relevant competitors' products to analyze. These competitors' products will subsequently be used by the engineering team to generate key design requirements for our new product to beat the competition.

For example, given the product category "laptop", the top 5 competitors' products could be:
1. Dell XPS 13 (2023)
2. MacBook Air (M3, 2024)
3. Lenovo ThinkPad X1 Carbon Gen 13
4. HP Spectre x360 (2017)
5. ASUS ZenBook 13 (2024)

Another example, given the product category "accounting app", the top 3 competitors' products could be:
1. QuickBooks
2. Xero
3. FreshBooks

Note that diversity of products is key here. It is especially good if the products fill different niches in the product category.

Also, try to avoid overlapping products. For example, "Dell XPS 13 (2013)" and "Dell XPS 15 (2013)" which are both from the same product line should not be included together. The only exception to this rule is if the product line is very different from each other, for example, "Lenovo Yoga 9i Gen 8" and "Lenovo Legion 7 Gen 8" are both laptops, but one is an ultraportable and the other is a gaming laptop, so they can be included together.

Where possible, include the full product name like (Vendor), (Product Name), (Generation or Year).

To ensure currency, please do a web search to find recent products on review or comparison sites. For example, you can search for "best laptops this year" or "best accounting apps this year",
though try and come up with better product-specific search terms.

### Response Schema
{'$defs': {'CompetitorProduct': {'properties': {'name': {'description': 'Full name of the product.', 'title': 'Name', 'type': 'string'}, 'reference': {'description': 'Website URL referenced for the product.', 'title': 'Reference', 'type': 'string'}, 'reference_title': {'description': 'Title of the reference website.', 'title': 'Reference Title', 'type': 'string'}, 'reference_summary': {'description': "Summary of the reference website's contents.", 'title': 'Reference Summary', 'type': 'string'}}, 'required': ['name', 'reference', 'reference_title', 'reference_summary'], 'title': 'CompetitorProduct', 'type': 'object'}}, 'properties': {'category': {'const': 'earbuds', 'description': 'Product category that was analyzed.', 'title': 'Category', 'type': 'string'}, 'rationale': {'description': "Detailed rationale for picking the below competitors' products.", 'title': 'Rationale', 'type': 'string'}, 'products': {'description': 'Each selected product.', 'items': {'$ref': '#/$defs/CompetitorProduct'}, 'maxItems': 3, 'minItems': 3, 'title': 'Products', 'type': 'array'}}, 'required': ['category', 'rationale', 'products'], 'title': 'CompetitionResult', 'type': 'object'}

### Task
Use the JSON schema specified above to provide **3** competitor products that are in product category "earbuds".

In [11]:
google_search_tool = types.Tool(
    google_search=types.GoogleSearch(),
)

# Upper bound on search + extraction round trips. The model sometimes gets
# confused and echoes the JSON schema instead of data, so retrying is needed,
# but an unbounded loop would keep spending API quota on a persistent failure.
MAX_COMPETITION_ATTEMPTS = 5

for attempt in range(1, MAX_COMPETITION_ATTEMPTS + 1):
    # Step 1: grounded free-text answer. The API rejects controlled generation
    # (a response schema) combined with the google_search tool, so the
    # structure is imposed by a second request below.
    resp1 = client.models.generate_content(
        model=GOOGLE_AI_MODEL,
        contents=prompt_competition,
        config=types.GenerateContentConfig(
            tools=[google_search_tool],
            response_modalities=["TEXT"],
        ),
    )
    if not resp1.text:
        # Nothing to extract from; do not feed the literal "None" to step 2.
        print("Invalid response, retrying...")
        continue
    display(Markdown(resp1.text))

    # Step 2: have the model re-emit the text as schema-conforming JSON.
    resp2 = client.models.generate_content(
        model=GOOGLE_AI_MODEL,
        contents=f"Extract JSON from this text without modifying the contents: {resp1.text}",
        config=types.GenerateContentConfig(
            response_mime_type="application/json",
            response_schema=CompetitionResult,
        ),
    )

    result_competition = resp2.parsed
    debug(result_competition)

    # Accept the reply only if it parsed into the model and every product
    # carries an http(s) reference URL.
    if isinstance(result_competition, CompetitionResult) and all(
        re.match(r"^https?://", product.reference)
        for product in result_competition.products
    ):
        break
    print("Invalid response, retrying...")
else:
    # Loop ended without `break`: every attempt produced an unusable reply.
    raise RuntimeError(
        f"No valid competitor list after {MAX_COMPETITION_ATTEMPTS} attempts."
    )

```json
{
  "category": "earbuds",
  "rationale": "To provide a diverse set of options for the engineering team, I have chosen three earbuds that represent different price points, features, and target audiences. The products that I selected are very well received and are the most recent models available on the market.",
  "products": [
    {
      "name": "Sony WF-1000XM5",
      "reference": "https://vertexaisearch.cloud.google.com/grounding-api-redirect/AWQVqALtUIPVQ2Q81ASPfWnMYDTsBFUqT_4wwrq2dhT8qhRdzPo6bH33IF3TVgmtwiAC6VNNAssoq1yMagJdutG0zU-UbsBssqilDER1RSbU7kIgn8jkwuInl0LJj8iHnbNs89oyRf4fKoo8R6xnHN2TNoHdTtE=",
      "reference_title": "Best wireless earbuds in 2025: top pairs tested by our reviewers - What Hi-Fi?",
      "reference_summary": "The Sony WF-1000XM5 stands out as a top-tier choice in the competitive world of noise canceling earbuds. With a comfortable design, outstanding ANC, and fantastic battery life, the WF-1000XM5 are one of the best wireless earbuds on the market."
    },
    {
      "name": "Bose QuietComfort Ultra Earbuds",
      "reference": "https://vertexaisearch.cloud.google.com/grounding-api-redirect/AWQVqAL26hGp8CGRIaeHFOzcLgdMCAE-TZ8rljndAkn09hLxnG3L6BWY9NeSO2cCjm2ScHJ-MmT8jvDUgTj17biGYE74DFg17RsuNMwklz6xgBcTDj19ZPNETIdEj9rfslm5y0icDan0l5qgNCtcKp-QOK0BgwvMv3s_N0jqkTkVJg==",
      "reference_title": "The best earbuds 2025: buds for every budget, all TechRadar tested and recommended",
      "reference_summary": "The Bose QuietComfort Ultra Earbuds are our pick for the best noise-cancelling earbuds in this guide, and for good reason. They take everything great about the QuietComfort Earbuds 2 that came before them and add some major improvements, like excellent device-agnostic head-tracked immersive audio and even better ANC."
    },
    {
      "name": "Technics EAH-AZ100",
      "reference": "https://vertexaisearch.cloud.google.com/grounding-api-redirect/AWQVqAL26hGp8CGRIaeHFOzcLgdMCAE-TZ8rljndAkn09hLxnG3L6BWY9NeSO2cCjm2ScHJ-MmT8jvDUgTj17biGYE74DFg17RsuNMwklz6xgBcTDj19ZPNETIdEj9rfslm5y0icDan0l5qgNCtcKp-QOK0BgwvMv3s_N0jqkTkVJg==",
      "reference_title": "The best earbuds 2025: buds for every budget, all TechRadar tested and recommended",
      "reference_summary": "The Technics EAZ-AH100 Wireless Earbuds is the top winner: They offer a great blend of battery life, audio quality and ANC, and the design feels very sturdy as well."
    }
  ]
}
```

/tmp/ipykernel_2873574/508870191.py:31 <module>
    result_competition: CompetitionResult(
        category='earbuds',
        rationale=(
            'To provide a diverse set of options for the engineering team, I have chosen three earbuds that represent '
            'different price points, features, and target audiences. The products that I selected are very well receiv'
            'ed and are the most recent models available on the market.'
        ),
        products=[
            CompetitorProduct(
                name='Sony WF-1000XM5',
                reference=(
                    'https://vertexaisearch.cloud.google.com/grounding-api-redirect/AWQVqALtUIPVQ2Q81ASPfWnMYDTsBFUqT_'
                    '4wwrq2dhT8qhRdzPo6bH33IF3TVgmtwiAC6VNNAssoq1yMagJdutG0zU-UbsBssqilDER1RSbU7kIgn8jkwuInl0LJj8iHnbN'
                    's89oyRf4fKoo8R6xnHN2TNoHdTtE='
                ),
                reference_title='Best wireless earbuds in 2025: top pairs tested by our reviewers - What Hi-

#### Post Processing
Google hides the URL from the model, so we need to extract it by resolving the redirect.
Finally, save everything to a JSON file for use in the later stages of the pipeline.

In [12]:
def resolve_redirect(url: str, timeout: float = 10.0) -> str:
    """Return the target of a single HTTP redirect, or ``url`` if there is none.

    Issues one GET without following redirects. For a 3xx reply the value of
    its ``Location`` header is returned; in every other case the input URL is
    returned unchanged.

    Args:
        url: The (possibly redirecting) URL to resolve.
        timeout: Seconds to wait for the server before giving up, so that one
            unresponsive host cannot hang the notebook indefinitely.

    Returns:
        The redirect target, or ``url`` itself when the reply is not a
        redirect or the redirect carries no ``Location`` header.

    Raises:
        Whatever ``requests.get`` raises (connection errors, timeouts); these
        are not caught here.
    """
    resp = requests.get(url, allow_redirects=False, timeout=timeout)
    if 300 <= resp.status_code < 400:
        # A 3xx without a Location header gives nothing to follow; keep the
        # original URL rather than handing None to the caller.
        return resp.headers.get("Location") or url
    return url

In [13]:
# Both model replies must describe the configured product before being merged.
assert result_competition.category == result_metrics.category == PRODUCT

# Stage-1 summary: the chosen metrics plus each competitor product, with its
# reference passed through resolve_redirect to replace the redirect link.
output = {
    "category": PRODUCT,
    "metrics_rationale": result_metrics.rationale,
    "metrics": result_metrics.metrics,
    "competition_rationale": result_competition.rationale,
    "competition_products": [
        {
            "name": entry.name,
            "reference": resolve_redirect(entry.reference),
            "reference_title": entry.reference_title,
            "reference_summary": entry.reference_summary,
        }
        for entry in result_competition.products
    ],
}

output

{'category': 'earbuds',
 'metrics_rationale': 'When designing earbuds, the most important factors to consider are audio quality, comfort, and battery life. Audio quality determines the listening experience. Comfort affects how long users can wear the earbuds. Battery life dictates how long users can use the earbuds before charging.',
 'metrics': ['audio quality', 'comfort', 'battery life'],
 'competition_rationale': 'To provide a diverse set of options for the engineering team, I have chosen three earbuds that represent different price points, features, and target audiences. The products that I selected are very well received and are the most recent models available on the market.',
 'competition_products': [{'name': 'Sony WF-1000XM5',
   'reference': 'https://vertexaisearch.cloud.google.com/grounding-api-redirect/AWQVqALtUIPVQ2Q81ASPfWnMYDTsBFUqT_4wwrq2dhT8qhRdzPo6bH33IF3TVgmtwiAC6VNNAssoq1yMagJdutG0zU-UbsBssqilDER1RSbU7kIgn8jkwuInl0LJj8iHnbNs89oyRf4fKoo8R6xnHN2TNoHdTtE=',
   'referen

In [14]:
# Persist the stage-1 results as JSON in the product's session folder.
# `json` is imported with the other imports at the top of the notebook. The
# encoding is set explicitly so the file does not depend on the OS locale.
with open(DATA_DIR / "stage_1.json", "w", encoding="utf-8") as f:
    json.dump(output, f, indent=2)