Starting point and first notebook

In [1]:
import json
import os
import re
from pathlib import Path
from typing import *

import requests
from devtools import debug
from google import genai
from google.genai import types
from IPython.display import Markdown
from pydantic import BaseModel, Field

### Config

#### Product to Research
Most important config setting! Change to the product category you wish to study.
Also specify the number of metrics to consider.

All subsequent notebooks will use PRODUCT to determine which analysis session folder
to use!

In [6]:
# Product category under study; it also names the session folder created below.
PRODUCT = "earbuds"

# Sizing knobs interpolated into the prompts and schemas further down.
NUM_METRICS = 3  # design metrics requested from the model
NUM_COMPETITORS = 3  # competitor products requested from the model
MAX_SPECS = 10  # upper bound on technical specifications

# Everything produced for this product is stored under session/<PRODUCT>.
DATA_DIR = Path("session", PRODUCT)
DATA_DIR.mkdir(exist_ok=True, parents=True)

#### API Keys and Model
See <https://ai.google.dev/gemini-api/docs/api-key>

In [3]:
# Google AI Studio key, read from the GOOGLE_API_KEY environment variable so the secret
# never lives in the notebook or its version history. Export it before starting Jupyter.
# NOTE: the key that used to be hard-coded on this line has been exposed and should be
# revoked in Google AI Studio.
GOOGLE_AI_KEY = os.environ.get("GOOGLE_API_KEY")
# 2.0 Flash since free 1500 RPD and Gemma's structured output is disabled.
GOOGLE_AI_MODEL = "gemini-2.0-flash"

### Operations

#### Setup

In [4]:
# Single API client reused by every generate_content request in this notebook.
client = genai.Client(api_key=GOOGLE_AI_KEY)

#### Determining Key Metrics
As part of the automated process, we prompt a model to brainstorm key metrics for the product category. This serves as a starting point by which to analyze competitor products.

In [None]:
# Structured response for the key-metrics prompt.
# Field order is deliberate: putting rationale before metrics is a way to force the
# model to think first.
# NOTE(review): documented with comments instead of a class docstring, because pydantic
# is expected to copy a docstring into model_json_schema(), which is embedded verbatim
# in the prompt -- confirm before adding one.
class MetricsResult(BaseModel):
    # Pinned to the configured PRODUCT (rendered as a `const` in the JSON schema).
    category: Literal[PRODUCT] = Field(description="Category that was analyzed.")
    # Free-text reasoning, produced before the metrics themselves.
    rationale: str = Field(
        description="Detailed rationale for picking the below metrics."
    )
    # min_length == max_length, so exactly NUM_METRICS entries are accepted.
    metrics: List[str] = Field(
        min_length=NUM_METRICS,
        max_length=NUM_METRICS,
        description="Concise metrics to focus on.",
    )

In [6]:
# Prompt for the key-metrics step: role description, two worked examples, the
# MetricsResult schema, and the concrete task for PRODUCT / NUM_METRICS.
# A trailing backslash inside the literal joins the next source line, so wrapped
# source lines render as a single paragraph.
# NOTE(review): the schema is interpolated as a Python dict repr (single quotes), not
# strict JSON, although the prompt calls it a "JSON schema" -- consider json.dumps().
prompt_metrics = f"""\
### Job Description
You are a market research analyst. You will be given a product category, and from \
that should determine the most relevant design metrics for it. These design \
metrics will subsequently be used by the engineering team to evaluate existing products \
and generate key design requirements for our new product to beat the competition.

For example, given the product category "laptop", the top 5 design metrics \
could be:
1. battery life
2. performance
3. portability
4. durability
5. keyboard quality

Another example, given the product category "accounting app", the top 3 design metrics \
could be:
1. ease of use
2. data security
3. integration with other tools

Note that we are focused solely on design metrics, hence other metrics like "customer support" \
and "price" are irrelevant as they cannot be met through design engineering efforts.

Also, try to avoid overlapping metrics. For example, "battery life" and "screen time" \
which are both related to battery performance should not be included together.

### Response Schema
{MetricsResult.model_json_schema()}

### Task
Use the JSON schema specified above to provide the top **{NUM_METRICS}** design metrics \
for product category "{PRODUCT}".\
"""

# Render the final prompt so it can be checked by eye before it is sent.
display(Markdown(prompt_metrics))

### Job Description
You are a market research analyst. You will be given a product category, and from that should determine the most relevant design metrics for it. These design metrics will subsequently be used by the engineering team to evaluate existing products and generate key design requirements for our new product to beat the competition.

For example, given the product category "laptop", the top 5 design metrics could be:
1. battery life
2. performance
3. portability
4. durability
5. keyboard quality

Another example, given the product category "accounting app", the top 3 design metrics could be:
1. ease of use
2. data security
3. integration with other tools

Note that we are focused solely on design metrics, hence other metrics like "customer support" and "price" are irrelevant as they cannot be met through design engineering efforts.

Also, try to avoid overlapping metrics. For example, "battery life" and "screen time" which are both related to battery performance should not be included together.

### Response Schema
{'properties': {'category': {'const': 'earbuds', 'description': 'Category that was analyzed.', 'title': 'Category', 'type': 'string'}, 'rationale': {'description': 'Detailed rationale for picking the below metrics.', 'title': 'Rationale', 'type': 'string'}, 'metrics': {'description': 'Concise metrics to focus on.', 'items': {'type': 'string'}, 'maxItems': 3, 'minItems': 3, 'title': 'Metrics', 'type': 'array'}}, 'required': ['category', 'rationale', 'metrics'], 'title': 'MetricsResult', 'type': 'object'}

### Task
Use the JSON schema specified above to provide the top **3** design metrics for product category "earbuds".

In [7]:
# Request the metrics with controlled generation so the reply is parsed directly into
# a MetricsResult instance.
resp = client.models.generate_content(
    model=GOOGLE_AI_MODEL,
    contents=prompt_metrics,
    config=types.GenerateContentConfig(
        response_mime_type="application/json",
        response_schema=MetricsResult,
    ),
)
result_metrics = resp.parsed
# `parsed` is None when the reply could not be validated against the schema. Fail here
# with a readable message instead of an AttributeError in a later cell.
if result_metrics is None:
    raise ValueError(
        f"Could not parse a MetricsResult from the model reply: {resp.text!r}"
    )
debug(result_metrics)

/tmp/ipykernel_27473/269700729.py:10 <module>
    result_metrics: MetricsResult(
        category='earbuds',
        rationale=(
            'When designing earbuds, focusing on sound quality ensures a great listening experience. Comfort is essent'
            'ial for prolonged use without discomfort. Battery life is crucial for uninterrupted enjoyment throughout '
            'the day.'
        ),
        metrics=[
            'sound quality',
            'comfort',
            'battery life',
        ],
    ) (MetricsResult)


MetricsResult(category='earbuds', rationale='When designing earbuds, focusing on sound quality ensures a great listening experience. Comfort is essential for prolonged use without discomfort. Battery life is crucial for uninterrupted enjoyment throughout the day.', metrics=['sound quality', 'comfort', 'battery life'])

#### Finding Competitors
The goal is to prompt the LLM to find interesting, distinct competitors in different
niches. For example, after choosing Apple iPhone 16 as a competitor to study, if the LLM has
to choose between Samsung S23 (similar to Apple) and a more niche brand like Nothing
Phone (3a), it should pick the latter to get a more diverse set of competitors.

In [None]:
# One competitor product together with the web page it was sourced from.
# NOTE(review): documented with comments instead of a class docstring, because pydantic
# is expected to copy a docstring into the JSON schema embedded in the prompt -- confirm
# before adding one.
class CompetitorProduct(BaseModel):
    name: str = Field(description="Full name of the product.")
    # Checked against ^https?:// by the retry loop; a grounding-redirect link is
    # resolved to its target during post-processing.
    reference: str = Field(description="Website URL referenced for the product.")
    reference_title: str = Field(description="Title of the reference website.")
    reference_summary: str = Field(
        description="Summary of the reference website's contents."
    )


# Structured response for the competitor-selection step.
# `rationale` is declared before `products`, mirroring MetricsResult, so the reasoning
# is produced ahead of the final answer.
class CompetitionResult(BaseModel):
    # Pinned to the configured PRODUCT (rendered as a `const` in the JSON schema).
    category: Literal[PRODUCT] = Field(
        description="Product category that was analyzed."
    )
    rationale: str = Field(
        description="Detailed rationale for picking the below competitors' products."
    )
    # min_length == max_length, so exactly NUM_COMPETITORS products are accepted.
    products: List[CompetitorProduct] = Field(
        min_length=NUM_COMPETITORS,
        max_length=NUM_COMPETITORS,
        description="Each selected product.",
    )

In [11]:
# Prompt for the competitor-selection step: role description, two worked examples,
# diversity / no-overlap guidance, a request to web-search for recent products, the
# CompetitionResult schema, and the concrete task for PRODUCT / NUM_COMPETITORS.
# A trailing backslash inside the literal joins the next source line; lines without
# one keep their newline in the prompt.
# NOTE(review): the schema is interpolated as a Python dict repr (single quotes), not
# strict JSON, although the prompt calls it a "JSON schema" -- consider json.dumps().
prompt_competition = f"""\
### Job Description
You are a market research analyst. You will be given a product category, and from \
that should determine the most relevant competitors' products to analyze. \
These competitors' products will subsequently be used by the engineering team to \
generate key design requirements for our new product to beat the competition.

For example, given the product category "laptop", the top 5 competitors' products \
could be:
1. Dell XPS 13 (2023)
2. MacBook Air (M3, 2024)
3. Lenovo ThinkPad X1 Carbon Gen 13
4. HP Spectre x360 (2017)
5. ASUS ZenBook 13 (2024)

Another example, given the product category "accounting app", the top 3 competitors' products \
could be:
1. QuickBooks
2. Xero
3. FreshBooks

Note that diversity of products is key here. It is especially good if the products \
fill different niches in the product category.

Also, try to avoid overlapping products. For example, "Dell XPS 13 (2013)" and "Dell XPS 15 (2013)" \
which are both from the same product line should not be included together. The only \
exception to this rule is if the product line is very different from each other, \
for example, "Lenovo Yoga 9i Gen 8" and "Lenovo Legion 7 Gen 8" are both laptops, but \
one is an ultraportable and the other is a gaming laptop, so they can be included together.

Where possible, include the full product name like (Vendor), (Product Name), (Generation or Year).

To ensure currency, please do a web search to find recent products on review or comparison sites. \
For example, you can search for "best laptops this year" or "best accounting apps this year",
though try and come up with better product-specific search terms.

### Response Schema
{CompetitionResult.model_json_schema()}

### Task
Use the JSON schema specified above to provide **{NUM_COMPETITORS}** competitor products \
that are in product category "{PRODUCT}".\
"""

# Render the final prompt so it can be checked by eye before it is sent.
display(Markdown(prompt_competition))

### Job Description
You are a market research analyst. You will be given a product category, and from that should determine the most relevant competitors' products to analyze. These competitors' products will subsequently be used by the engineering team to generate key design requirements for our new product to beat the competition.

For example, given the product category "laptop", the top 5 competitors' products could be:
1. Dell XPS 13 (2023)
2. MacBook Air (M3, 2024)
3. Lenovo ThinkPad X1 Carbon Gen 13
4. HP Spectre x360 (2017)
5. ASUS ZenBook 13 (2024)

Another example, given the product category "accounting app", the top 3 competitors' products could be:
1. QuickBooks
2. Xero
3. FreshBooks

Note that diversity of products is key here. It is especially good if the products fill different niches in the product category.

Also, try to avoid overlapping products. For example, "Dell XPS 13 (2013)" and "Dell XPS 15 (2013)" which are both from the same product line should not be included together. The only exception to this rule is if the product line is very different from each other, for example, "Lenovo Yoga 9i Gen 8" and "Lenovo Legion 7 Gen 8" are both laptops, but one is an ultraportable and the other is a gaming laptop, so they can be included together.

Where possible, include the full product name like (Vendor), (Product Name), (Generation or Year).

To ensure currency, please do a web search to find recent products on review or comparison sites. For example, you can search for "best laptops this year" or "best accounting apps this year",
though try and come up with better product-specific search terms.

### Response Schema
{'$defs': {'CompetitorProduct': {'properties': {'name': {'description': 'Full name of the product.', 'title': 'Name', 'type': 'string'}, 'reference': {'description': 'Website URL referenced for the product.', 'title': 'Reference', 'type': 'string'}, 'reference_title': {'description': 'Title of the reference website.', 'title': 'Reference Title', 'type': 'string'}, 'reference_summary': {'description': "Summary of the reference website's contents.", 'title': 'Reference Summary', 'type': 'string'}}, 'required': ['name', 'reference', 'reference_title', 'reference_summary'], 'title': 'CompetitorProduct', 'type': 'object'}}, 'properties': {'category': {'const': 'earbuds', 'description': 'Product category that was analyzed.', 'title': 'Category', 'type': 'string'}, 'rationale': {'description': "Detailed rationale for picking the below competitors' products.", 'title': 'Rationale', 'type': 'string'}, 'products': {'description': 'Each selected product.', 'items': {'$ref': '#/$defs/CompetitorProduct'}, 'maxItems': 3, 'minItems': 3, 'title': 'Products', 'type': 'array'}}, 'required': ['category', 'rationale', 'products'], 'title': 'CompetitionResult', 'type': 'object'}

### Task
Use the JSON schema specified above to provide **3** competitor products that are in product category "earbuds".

In [None]:
google_search_tool = types.Tool(
    google_search=types.GoogleSearch(),
)

# Cap on search + extraction round trips, so a model that keeps answering badly cannot
# loop forever and drain the API quota.
MAX_COMPETITION_ATTEMPTS = 5

# Sometimes, the model gets confused and outputs the JSON schema instead of the data.
# A reply is accepted only if it parsed and every product has an http(s) reference URL.
for attempt in range(1, MAX_COMPETITION_ATTEMPTS + 1):
    # Step 1: grounded free-text answer. Controlled generation (a response schema) is
    # rejected by the API when the google_search tool is attached, so no schema here.
    resp1 = client.models.generate_content(
        model=GOOGLE_AI_MODEL,
        contents=prompt_competition,
        config=types.GenerateContentConfig(
            tools=[google_search_tool],
            response_modalities=["TEXT"],
        ),
    )
    if not resp1.text:
        # Nothing to extract from; spend the next attempt on a fresh search.
        print(f"Empty response on attempt {attempt}/{MAX_COMPETITION_ATTEMPTS}.")
        continue
    display(Markdown(resp1.text))

    # Step 2: second call without tools, this time with controlled generation, to turn
    # the text into a CompetitionResult in case step 1 strayed from the schema.
    resp2 = client.models.generate_content(
        model=GOOGLE_AI_MODEL,
        contents=f"Extract JSON from this text without modifying the contents: {resp1.text}",
        config=types.GenerateContentConfig(
            response_mime_type="application/json",
            response_schema=CompetitionResult,
        ),
    )

    result_competition = resp2.parsed
    debug(result_competition)

    # `parsed` is None when the reply did not validate; treat that as one more invalid
    # answer rather than crashing on `.products`.
    if result_competition is not None and all(
        re.match(r"^https?://", product.reference)
        for product in result_competition.products
    ):
        break
    print(f"Invalid response on attempt {attempt}/{MAX_COMPETITION_ATTEMPTS}.")
else:
    raise RuntimeError(
        f"No valid competitor list after {MAX_COMPETITION_ATTEMPTS} attempts."
    )

```json
{'category': 'earbuds', 'rationale': 'These earbuds represent a range of options available, from high-end audiophile experiences to more budget-friendly choices. The selected products cover different use cases, such as noise cancellation, running, and overall value, ensuring a comprehensive comparison.', 'products': [{'name': 'Sony WF-1000XM5', 'reference': 'https://www.cnet.com/tech/computing/best-wireless-earbuds/', 'reference_title': 'Best Wireless and Bluetooth Earbuds for Premium Sound in 2025 - CNET', 'reference_summary': 'CNET discusses the best wireless earbuds of 2025, highlighting the Sony WF-1000XM5 as the best Sony earbuds.'}, {'name': 'Bose QuietComfort Ultra Earbuds', 'reference': 'https://www.rtings.com/headphones/reviews/bose/quietcomfort-ultra-earbuds-truly-wireless', 'reference_title': 'The 7 Best Wireless Earbuds For Running And Working Out of 2025 - RTINGS.com', 'reference_summary': 'RTINGS.com identifies the Bose QuietComfort Ultra Earbuds Truly Wireless as the best wireless Bluetooth earbuds for running, emphasizing their comfortable and stable fit.'}, {'name': 'OnePlus Buds 3', 'reference': 'https://www.forbes.com/sites/forbes-personal-shopper/article/best-wireless-earbuds/?sh=60763a9a74c8', 'reference_title': 'The Best Wireless Earbuds, Tested By Our Tech Editor - Forbes', 'reference_summary': 'Forbes identifies the OnePlus Buds 3 as the best budget wireless earbuds, noting their sleek design and comfortable fit.'}]}
```

/tmp/ipykernel_27473/508870191.py:31 <module>
    result_competition: CompetitionResult(
        category='earbuds',
        rationale=(
            'These earbuds represent a range of options available, from high-end audiophile experiences to more budget'
            '-friendly choices. The selected products cover different use cases, such as noise cancellation, running, '
            'and overall value, ensuring a comprehensive comparison.'
        ),
        products=[
            CompetitorProduct(
                name='Sony WF-1000XM5',
                reference='https://www.cnet.com/tech/computing/best-wireless-earbuds/',
                reference_title='Best Wireless and Bluetooth Earbuds for Premium Sound in 2025 - CNET',
                reference_summary=(
                    'CNET discusses the best wireless earbuds of 2025, highlighting the Sony WF-1000XM5 as the best So'
                    'ny earbuds.'
                ),
            ),
            CompetitorProduct(
     

#### Get Common Technical Specifications to Look Out For
The resulting list is later given to the extraction model as examples of the technical
specifications it should look for.

In [7]:
# Structured response for the technical-specifications step.
# NOTE(review): documented with comments instead of a class docstring, because pydantic
# is expected to copy a docstring into the JSON schema embedded in the prompt -- confirm
# before adding one.
class SpecsResult(BaseModel):
    # Pinned to the configured PRODUCT (rendered as a `const` in the JSON schema).
    category: Literal[PRODUCT] = Field(description="Category that was analyzed")
    # Declared before `specs` so the reasoning is produced ahead of the list.
    rationale: str = Field(
        description="Detailed rationale for naming the below technical specifications"
    )
    # Only an upper bound is set: the model may return fewer than MAX_SPECS entries.
    specs: List[str] = Field(
        max_length=MAX_SPECS,
        description="Frequently compared technical specifications",
    )

In [None]:
# Bullet list of the competitor names selected earlier. It is built outside the
# f-string because a backslash inside an f-string expression is a SyntaxError before
# Python 3.12; the rendered prompt is unchanged.
competitor_bullets = "\n".join(f"- {o.name}" for o in result_competition.products)

# Prompt for the technical-specifications step: role description, the SpecsResult
# schema, the competitor list, and the concrete task for PRODUCT / MAX_SPECS.
prompt_tech_specs = f"""\
### Job Description
You are a market research analyst. You will be given a product category and a list of \
competitors' products within the category. From that, you shall determine a list \
of frequently compared technical specifications, which will be used by your junior \
analyst to compare different products. Carefully think about how "{PRODUCT}" is used \
(and if applicable, transported), to determine what consumers care about.

### Response Schema
{SpecsResult.model_json_schema()}

### Task
The products are:
{competitor_bullets}

Use the JSON schema to provide at most **{MAX_SPECS}** technical specifications \
that are frequently compared for product category "{PRODUCT}".\
"""
# Render the final prompt so it can be checked by eye before it is sent.
display(Markdown(prompt_tech_specs))

### Job Description
You are a market research analyst. You will be given a product category and a list of competitors' products within the category. From that, you shall determine a list of frequently compared technical specifications, which will be used by your junior analyst to compare different products. Carefully think about how "earbuds" is used (and if applicable, transported), to determine what consumers care about.

### Response Schema
{'properties': {'category': {'const': 'earbuds', 'description': 'Category that was analyzed', 'title': 'Category', 'type': 'string'}, 'rationale': {'description': 'Detailed rationale for naming the below technical specifications', 'title': 'Rationale', 'type': 'string'}, 'specs': {'description': 'Frequently compared technical specifications', 'items': {'type': 'string'}, 'maxItems': 10, 'title': 'Specs', 'type': 'array'}}, 'required': ['category', 'rationale', 'specs'], 'title': 'SpecsResult', 'type': 'object'}

### Task
The products are:
- Sony WF-1000XM5
- Bose QuietComfort Ultra Earbuds
- OnePlus Buds 3

Use the JSON schema to provide at most **10** technical specifications that are frequently compared for product category "earbuds".

In [15]:
# Request the specifications with controlled generation so the reply is parsed
# directly into a SpecsResult instance.
resp = client.models.generate_content(
    model=GOOGLE_AI_MODEL,
    contents=prompt_tech_specs,
    config=types.GenerateContentConfig(
        response_mime_type="application/json",
        response_schema=SpecsResult,
    ),
)
result_specs = resp.parsed
# `parsed` is None when the reply could not be validated against the schema. Fail here
# with a readable message instead of an AttributeError in a later cell.
if result_specs is None:
    raise ValueError(
        f"Could not parse a SpecsResult from the model reply: {resp.text!r}"
    )
debug(result_specs)

/tmp/ipykernel_3915/339006114.py:10 <module>
    result_specs: SpecsResult(
        category='earbuds',
        rationale=(
            'When evaluating earbuds, consumers frequently compare specifications related to audio quality, noise canc'
            'ellation effectiveness, battery life (as earbuds are portable), comfort and fit (since they are worn insi'
            'de the ear), connectivity options (Bluetooth version), water resistance (for workouts or outdoor use), ch'
            'arging case features, and the presence of a microphone (for calls). Price is considered, but is not a tec'
            'hnical specification, and therefore excluded.'
        ),
        specs=[
            'Noise Cancellation',
            'Battery Life',
            'Water Resistance',
            'Bluetooth Version',
            'Charging Case Features',
            'Microphone Quality',
            'Comfort and Fit',
            'Audio Codecs',
            'Driver Size',
            'Impedance',

SpecsResult(category='earbuds', rationale='When evaluating earbuds, consumers frequently compare specifications related to audio quality, noise cancellation effectiveness, battery life (as earbuds are portable), comfort and fit (since they are worn inside the ear), connectivity options (Bluetooth version), water resistance (for workouts or outdoor use), charging case features, and the presence of a microphone (for calls). Price is considered, but is not a technical specification, and therefore excluded.', specs=['Noise Cancellation', 'Battery Life', 'Water Resistance', 'Bluetooth Version', 'Charging Case Features', 'Microphone Quality', 'Comfort and Fit', 'Audio Codecs', 'Driver Size', 'Impedance'])

#### Post Processing
Google Search grounding may hand the model a `vertexaisearch.cloud.google.com` redirect link instead of the original page URL, so we recover the real URL by resolving the redirect.
Finally, we save everything to a JSON file for use in the later stages of the pipeline.

In [15]:
def resolve_redirect(url: str, timeout: float = 10.0) -> str:
    """Return the target of a Google grounding redirect link.

    URLs that do not start with the grounding-redirect prefix are returned untouched
    and no network request is made.

    Args:
        url: Reference URL reported by the model.
        timeout: Seconds to wait for the redirect endpoint before giving up.

    Returns:
        The value of the ``Location`` header when the endpoint answers with a 3xx
        status and provides one; otherwise ``url`` itself.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    redirect_prefix = "https://vertexaisearch.cloud.google.com/grounding-api-redirect"
    if not url.startswith(redirect_prefix):
        return url

    # Do not follow the redirect: only its target is needed, not the page itself.
    # The timeout keeps an unresponsive endpoint from hanging the notebook.
    resp = requests.get(url, allow_redirects=False, timeout=timeout)
    if 300 <= resp.status_code < 400:
        # A 3xx without a Location header would otherwise yield None as the reference.
        return resp.headers.get("Location") or url
    return url

In [None]:
# Both model answers must report the category that was configured for this session.
assert result_competition.category == result_metrics.category
assert result_competition.category == PRODUCT

# Stage-1 record: metrics, competitors (with redirect links resolved) and specs.
output = {
    "category": PRODUCT,
    "metrics_rationale": result_metrics.rationale,
    "metrics": result_metrics.metrics,
    "competition_rationale": result_competition.rationale,
    "competition_products": [
        {
            "name": competitor.name,
            "reference": resolve_redirect(competitor.reference),
            "reference_title": competitor.reference_title,
            "reference_summary": competitor.reference_summary,
        }
        for competitor in result_competition.products
    ],
    "specs_rationale": result_specs.rationale,
    "specs": result_specs.specs,
}

output

{'category': 'earbuds',
 'metrics_rationale': 'When designing earbuds, focusing on sound quality ensures a great listening experience. Comfort is essential for prolonged use without discomfort. Battery life is crucial for uninterrupted enjoyment throughout the day.',
 'metrics': ['sound quality', 'comfort', 'battery life'],
 'competition_rationale': 'These earbuds represent a range of options available, from high-end audiophile experiences to more budget-friendly choices. The selected products cover different use cases, such as noise cancellation, running, and overall value, ensuring a comprehensive comparison.',
 'competition_products': [{'name': 'Sony WF-1000XM5',
   'reference': 'https://www.cnet.com/tech/computing/best-wireless-earbuds/',
   'reference_title': 'Best Wireless and Bluetooth Earbuds for Premium Sound in 2025 - CNET',
   'reference_summary': 'CNET discusses the best wireless earbuds of 2025, highlighting the Sony WF-1000XM5 as the best Sony earbuds.'},
  {'name': 'Bose

In [None]:
# Persist the stage-1 record inside the session folder.
stage_1_path = DATA_DIR / "stage_1.json"
with stage_1_path.open("w") as stage_1_file:
    json.dump(output, stage_1_file, indent=2)