Continues from 3_gemini_video_analysis.ipynb.

In [1]:
import json
import os
from pathlib import Path
from typing import *

import pandas as pd
from devtools import debug
from google import genai
from google.genai import types
from IPython.display import YouTubeVideo, Markdown
from pydantic import BaseModel, Field, create_model
from tqdm.auto import tqdm

### Config

In [2]:
# Product category under analysis. It names the session data directory and is
# interpolated into the analysis prompt.
PRODUCT = "wireless over-ear headphones"

In [3]:
# Google AI Studio API key. It is read from the environment so the secret is
# never stored in the notebook. NOTE: the key that used to be hardcoded here
# has been exposed and should be revoked in AI Studio.
GOOGLE_AI_KEY = os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY")
if not GOOGLE_AI_KEY:
    print(
        "WARNING: no API key found. Set the GOOGLE_API_KEY (or GEMINI_API_KEY) "
        "environment variable before running the cells below."
    )
# 2.0 Flash: the free tier allows 1500 requests per day (RPD), and Gemma's
# structured output is disabled.
GOOGLE_AI_MODEL = "gemini-2.0-flash"

In [24]:
# All inputs and outputs for this product live under session/<PRODUCT>/.
DATA_DIR = Path("session", PRODUCT)
META_PATH = DATA_DIR.joinpath("stage_1.json")
REDDIT_FILEMAP_PATH = DATA_DIR.joinpath("reddit", "competitor_map.json")
YOUTUBE_FILEMAP_PATH = DATA_DIR.joinpath("youtube", "comment_filemap.json")

# Fail early, naming the notebook that produces each missing input.
for _required_path, _hint in (
    (META_PATH, "Run 1_describe_product.ipynb first!"),
    (REDDIT_FILEMAP_PATH, "Run 2_scrape_reddit_reviews.ipynb first!"),
    (YOUTUBE_FILEMAP_PATH, "Run 2_scrape_youtube_comments.ipynb first!"),
):
    assert _required_path.exists(), _hint

# Destination for the per-competitor analysis results.
OUT_DIR = DATA_DIR.joinpath("comment_analysis")
OUT_DIR.mkdir(parents=True, exist_ok=True)

### Operations

#### Setup

In [5]:
# Gemini API client, authenticated with the key from the config section.
# analyze_comments uses it to call the model.
client = genai.Client(api_key=GOOGLE_AI_KEY)

In [16]:
# Decode the JSON inputs explicitly as UTF-8 instead of relying on the
# platform's locale encoding.

# competitor -> stem of its processed Reddit CSV
reddit_filemap = json.loads(REDDIT_FILEMAP_PATH.read_text(encoding="utf-8"))

# competitor -> file name of its processed YouTube CSV
youtube_filemap = json.loads(YOUTUBE_FILEMAP_PATH.read_text(encoding="utf-8"))

# Stage-1 metadata; its "metrics" entry drives which metrics get analyzed.
stage1_meta = json.loads(META_PATH.read_text(encoding="utf-8"))

#### Load Reddit Comments

In [None]:
# r_comments[competitor][metric] -> list of Reddit comment bodies.
r_comments = {}

for competitor, filename in reddit_filemap.items():
    csv_path = DATA_DIR / "reddit" / "processed_comments" / f"{filename}.csv"
    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        # Best effort: report the competitor whose CSV cannot be read and move on.
        print(f"Error reading {csv_path}: {e}")
        continue

    r_comments[competitor] = {
        # Drop rows without a body; they would otherwise show up as NaN
        # ("nan") in the prompt built from these lists.
        metric: gdf["comment_body"].dropna().tolist()
        for metric, gdf in df.groupby("features")
    }

# Preview only a few comments, and tolerate the sample competitor/metric being
# absent (e.g. because its CSV failed to load) instead of raising KeyError.
display(r_comments.get("Focal Bathys", {}).get("Sound Quality", [])[:3])

Error reading session/wireless over-ear headphones/reddit/processed_comments/Apple_AirPods_Max_reddit_review.csv: [Errno 2] No such file or directory: 'session/wireless over-ear headphones/reddit/processed_comments/Apple_AirPods_Max_reddit_review.csv'


['[TIMESTAMP](https://imgur.com/a/Zzp9rI1)\n\nSelling off a good chunk of my headphone and IEM collection as well as audio equipment. Everything is in excellent condition.\n\n|Item|Price (includes shipping)|Available/Sold|\n|:-|:-|:-|\n|~~AKG K712 Pro~~|~~$135.00~~|**SOLD**|\n|~~Focal Bathys~~|~~$400.00~~|**SOLD**|\n|Rode NTH-100M|$100.00|Available|\n|~~Koss Porta Pro~~|~~$35.00~~|**SOLD**|\n|~~Dan Clark Aeon 2 Noire~~|~~$480.00~~|**SOLD**|\n|~~HarmonicDyne Eris (Z Reviews Collab)~~|~~$150.00~~|**SOLD**|\n|~~Sennheiser HD 620 S~~|~~$220.00~~|**SOLD**|\n|~~Sennheiser HD 560 S~~|~~$90.00~~|**SOLD**|\n|~~Sennheiser HD 598 SE~~|~~$60.00~~|**SOLD**|\n|Sennheiser HD 599 SE|$70.00|Available|\n|~~Sennheiser HD 58X Jubilee~~|~~$80.00~~|**SOLD**|\n|~~Sennheiser HD 6XX~~|~~$125.00~~|**SOLD**|\n|~~Sennheiser HD 600~~|~~$180.00~~|**SOLD**|\n|~~Sennheiser HD 650~~|~~$180.00~~|**SOLD**|\n|Sennheiser HD 660 S|$215.00|Available|\n|~~Sennheiser HD 660 S2~~|~~$300.00~~|**SOLD**|\n|~~Meze 99 Classics~~|~~

#### Load YouTube Comments

In [None]:
# y_comments[competitor][metric] -> list of YouTube comment texts.
y_comments = {}

for competitor, filename in youtube_filemap.items():
    # Unlike the Reddit map, the file name is used as-is (no ".csv" appended).
    csv_path = DATA_DIR / "youtube" / "processed_comments" / filename
    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        # Best effort: report the competitor whose CSV cannot be read and move on.
        print(f"Error reading {csv_path}: {e}")
        continue

    y_comments[competitor] = {
        # Drop rows without text; they would otherwise show up as NaN
        # ("nan") in the prompt built from these lists.
        metric: gdf["text"].dropna().tolist()
        for metric, gdf in df.groupby("features")
    }

# Preview only a few comments, and tolerate the sample competitor/metric being
# absent (e.g. because its CSV failed to load) instead of raising KeyError.
display(y_comments.get("Focal Bathys", {}).get("Sound Quality", [])[:3])

["I recently took my AirPod Pros into a Tokyo headphone shop and got a chance to try a Mark Levinson and a Focal Bathys.  I am a professional double bass player.  I play a lot of acoustic music but listen to all kinds of music on any given day.  I listened to my AirPod Pros all the way to the shop.  I was so excited to put the Bathys on and be blown away.  But I have to say, I was shocked that I didn't find them better than my AirPod Pros.  i know I know, that is very controversial.  I did find that the Bathys were better at bringing all the instruments into a clear comprehensive sound.  I could hear a secondary guitar track on Radiohead's Weird Fishes in addition to an organ I never heard.  That is exciting.  When I put the AirPod Pros back on I did miss some of that clarity.  I could hear those details that i heard in the Focal but not as clearly defined and not as pronounced.  But here's the deal.  I felt more moved by the music, more like I was in a beautiful sound space, more imme

#### Extract Feedback From Comments

In [27]:
# Upper bounds passed as `max_length` to the list fields of the response schema
# built inside analyze_comments.
EVIDENCE_LIMIT = 3  # max evidence strings per feedback item
FEEDBACK_LIMIT = 3  # max feedback items per evaluation result


def analyze_comments(competitor: str, metric: str, comments: List[str]) -> BaseModel:
    """Ask Gemini to extract structured feedback on one metric of one competitor.

    Args:
        competitor: Name of the competing product whose comments are analyzed.
        metric: Metric the feedback should focus on.
        comments: Comment texts (Reddit and YouTube) for this competitor/metric.

    Returns:
        The parsed ``EvalResult`` instance: ``product``, ``metric`` and up to
        ``FEEDBACK_LIMIT`` feedback items, each with a rationale, a rating in
        [-3, 3], the responsible component and up to ``EVIDENCE_LIMIT``
        evidence strings.

    Raises:
        ValueError: If ``comments`` is empty, or if the model response could
            not be parsed into the schema.
    """
    if not comments:
        # Without any comment the model could only invent feedback.
        raise ValueError(f"No comments to analyze for {competitor} / {metric}")

    class Feedback(BaseModel):
        rationale: str = Field(description="Reasoning for the given evaluation")
        rating: int = Field(ge=-3, le=3, description="Rating from -3 to 3")
        component: str = Field(
            description="Specific component responsible for the evaluation"
        )
        evidence: List[str] = Field(
            max_length=EVIDENCE_LIMIT,
            description="List of evidence to support the evaluation",
        )

    # Defined per call because the field description embeds the metric.
    class EvalResult(BaseModel):
        product: str = Field(description="Product being evaluated")
        metric: str = Field(description="Metric being evaluated")
        feedbacks: List[Feedback] = Field(
            max_length=FEEDBACK_LIMIT, description=f"Feedback on {metric}"
        )

    # Built outside the f-string: a backslash inside a replacement field is a
    # SyntaxError on Python versions before 3.12.
    comment_section = "\n\n".join(
        f"### Comment {i + 1}\n{comment}" for i, comment in enumerate(comments)
    )
    # Serialize as real JSON; interpolating the dict directly would put a
    # Python repr (single quotes, True/False) into the prompt.
    schema_json = json.dumps(EvalResult.model_json_schema(), indent=2)

    prompt = f"""\
## Job Description
You are a product analyst. Your job is to analyze the comments given to the \
{competitor} {PRODUCT} on the basis of {metric} from Reddit and YouTube and extract useful feedback.

When writing feedback on {metric}, extract as much specific information as possible. \
Write down which exact component of \
{competitor} is responsible for the feedback, and quantitative or qualitative values \
for why it is good or bad. For example:
    
```json
{{
    "rationale": "...",
    "rating": 3,
    "component": "carbon fiber reinforced hinges",
    "evidence": [
        "1,000,000 open/close cycles"
    ]
}}
```

## Comments

The comments to analyze are given below:

{comment_section}

## Response Schema

When giving feedback on the {metric} of {competitor} {PRODUCT}, use the following JSON schema:

{schema_json}\
"""

    resp = client.models.generate_content(
        model=GOOGLE_AI_MODEL,
        contents=prompt,
        config=types.GenerateContentConfig(
            response_mime_type="application/json",
            response_schema=EvalResult,
        ),
    )
    if resp.parsed is None:
        # Surface the failure here instead of handing None to the caller.
        raise ValueError(
            f"Gemini returned no parsable result for {competitor} / {metric}"
        )
    return resp.parsed


In [28]:
# Combine all comments from Reddit and YouTube
competitors = set(r_comments.keys()).union(set(y_comments.keys()))

all_comments = {}
for competitor in competitors:
    all_comments[competitor] = {}
    for metric in stage1_meta["metrics"]:
        all_comments[competitor][metric] = r_comments.get(competitor, {}).get(
            metric, []
        ) + y_comments.get(competitor, {}).get(metric, [])

tasks = []
for competitor, cdata in all_comments.items():
    for metric, comments in cdata.items():
        if len(comments) == 0:
            continue
        tasks.append((competitor, metric, comments))

In [None]:
# Analyze every task and write the result to
# OUT_DIR/<competitor>/<metric>.json. Failures are reported and skipped so one
# bad request does not abort the whole run.
for competitor, metric, comments in tqdm(tasks):
    try:
        result = analyze_comments(competitor, metric, comments)
        if result is None:
            raise ValueError("model response could not be parsed")

        # Serialize before touching the file: opening it in "w" mode first
        # would truncate an existing result even when serialization fails,
        # leaving an empty JSON file behind.
        payload = json.dumps(result.model_dump(), indent=2)

        path = OUT_DIR / competitor / f"{metric}.json"
        path.parent.mkdir(exist_ok=True, parents=True)
        path.write_text(payload, encoding="utf-8")

    except Exception as e:
        print(f"Error analyzing {competitor} {metric}: {e}")

  0%|          | 0/25 [00:00<?, ?it/s]