In [1]:
import json
import os
from pathlib import Path
from typing import *

import pandas as pd
from devtools import debug
from google import genai
from google.genai import types
from IPython.display import YouTubeVideo, Markdown
from pydantic import BaseModel, Field

### Config
Remember to change PRODUCT to match the current analysis session!

In [2]:
# Product under analysis; also the name of this session's sub-directory under "session/".
PRODUCT: str = "earbuds"
# Key looked up under every competitor entry in vid_meta.json to pick its videos.
TOPIC: str = "review"  # Needs to match TOPICS from 2_youtube_search.ipynb!

In [3]:
# Read the API key from the environment instead of hardcoding it: a key pasted
# into a notebook is exposed to everyone who can read the file or its history.
# NOTE: the key that was previously committed in this cell should be treated as
# compromised and revoked.
GOOGLE_AI_KEY = os.environ.get("GOOGLE_AI_KEY", "").strip()
if not GOOGLE_AI_KEY:
    # Fail here with a clear message rather than later inside an API call.
    raise RuntimeError(
        "Set the GOOGLE_AI_KEY environment variable before running this notebook."
    )

# 2.5 Flash is a good tradeoff between better video understanding and the free 500 RPD.
GOOGLE_AI_MODEL = "gemini-2.5-flash-preview-04-17"
# GOOGLE_AI_MODEL = "gemini-2.5-pro-exp-03-25"

In [4]:
# All files for one analysis session live under session/<PRODUCT>/.
DATA_DIR = Path("session", PRODUCT)
META_PATH = DATA_DIR.joinpath("stage_1.json")
assert META_PATH.exists(), "Run 1_describe_product.ipynb first!"

# YouTube-related files for this session, including the video metadata read below.
YOUTUBE_DIR = DATA_DIR.joinpath("youtube")
VID_META_PATH = YOUTUBE_DIR.joinpath("vid_meta.json")
assert VID_META_PATH.exists(), "Run 2_youtube_search.ipynb first!"

# CSV file the transcripts are loaded from and saved to; it may not exist yet.
TRANSCRIPT_PATH = YOUTUBE_DIR.joinpath("transcripts.csv")

### Operations

#### Setup

In [5]:
# Gemini API client authenticated with GOOGLE_AI_KEY; used by transcribe_video below.
client = genai.Client(api_key=GOOGLE_AI_KEY)

In [6]:
# Parse the video metadata JSON (the file 2_youtube_search.ipynb is expected to produce).
metadata = json.loads(VID_META_PATH.read_text())

# For every competitor keep only the video ids stored under the selected TOPIC.
videos = {competitor: topics[TOPIC] for competitor, topics in metadata.items()}

display(videos)

{'Sony WF-1000XM5': ['wTjM-na6ydU'],
 'Bose QuietComfort Ultra Earbuds': ['8vsE8xVN6rE'],
 'Technics EAH-AZ100': ['QgOnME2rRYY']}

#### Generate Full Transcript
Getting the full transcript first is useful for manual analysis (if needed). More
importantly, it helps the model avoid missing details and reduces hallucination.

In [7]:
# Plain (non-f) string: the prompt contains no placeholders, so nothing needs to
# be interpolated, and literal braces added later will not be read as format fields.
prompt_transcript = """\
Provide the full section by section transcript of this video. Include all possible
details such as:
- who said what to whom
- on-screen text and charts
- music and sound effects
- change in object states

Give the transcript in neatly formatted markdown sections with headers.
"""

# Render the prompt as Markdown in the cell output.
display(Markdown(prompt_transcript))

Provide the full section by section transcript of this video. Include all possible
details such as:
- who said what to whom
- on-screen text and charts
- music and sound effects
- change in object states

Give the transcript in neatly formatted markdown sections with headers.


In [8]:
def transcribe_video(vid):
    """Ask the configured Gemini model for a transcript of one YouTube video.

    Sends a single user message made of the video (referenced by its
    ``https://youtu.be/<vid>`` URL) followed by ``prompt_transcript`` to
    ``GOOGLE_AI_MODEL`` through the module-level ``client``.

    Args:
        vid: YouTube video id, interpolated into the youtu.be URL.

    Returns:
        The ``text`` attribute of the model response.
    """
    video_part = types.Part.from_uri(file_uri=f"https://youtu.be/{vid}", mime_type="video/*")
    prompt_part = types.Part.from_text(text=prompt_transcript)
    user_message = types.Content(role="user", parts=[video_part, prompt_part])

    response = client.models.generate_content(
        model=GOOGLE_AI_MODEL,
        contents=[user_message],
    )
    return response.text

In [9]:
# Resume from the transcripts saved by a previous run, or start with an empty table.
if TRANSCRIPT_PATH.exists():
    # Read every column as text. With dtype inference, a `transcript` column whose
    # cells are all empty loads as float64 and the `.str` filter below raises
    # AttributeError; a video id made only of digits would load as a number and
    # never compare equal to the string ids from the metadata.
    df = pd.read_csv(TRANSCRIPT_PATH, dtype=str)
else:
    df = pd.DataFrame([], columns=["competitor", "video_id", "transcript"])

# Drop invalid transcript rows: missing values first, then whitespace-only text.
df = df[df["transcript"].notna()]
df = df[df["transcript"].str.strip() != ""]

display(df)

Unnamed: 0,competitor,video_id,transcript
0,Sony WF-1000XM5,wTjM-na6ydU,"Okay, here is the detailed section-by-section ..."
1,Technics EAH-AZ100,QgOnME2rRYY,"Okay, here is the section-by-section transcrip..."
2,Bose QuietComfort Ultra Earbuds,8vsE8xVN6rE,Here is the full section-by-section transcript...


In [10]:
# Will take approx one minute per new video!

# A stored transcript must be longer than this many characters to count as done.
# NOTE(review): 42 is carried over from the original check - confirm it is the
# intended cut-off for "too short to be a real transcript".
MIN_TRANSCRIPT_CHARS = 42


def _is_usable_transcript(text) -> bool:
    """Return True when `text` is a string longer than MIN_TRANSCRIPT_CHARS."""
    return isinstance(text, str) and len(text) > MIN_TRANSCRIPT_CHARS


new_entries = []
try:
    for competitor, vids in videos.items():
        for vid in vids:
            # Check every stored row for this video, not only the first one:
            # otherwise a too-short first row makes each run transcribe the
            # video again and append yet another duplicate row.
            stored = df.loc[df["video_id"] == vid, "transcript"]
            if any(_is_usable_transcript(item) for item in stored):
                print(f"Already transcribed {vid}")
                continue

            print(f"Transcribing {vid}...")
            text = transcribe_video(vid)
            print(f"Transcribed {vid}")
            new_entries.append({
                "competitor": competitor,
                "video_id": vid,
                "transcript": text,
            })
finally:
    # Runs on success and on failure, so transcripts finished before an error
    # in a later API call are written to disk instead of being lost.
    if len(new_entries) == 0:
        print("No new entries to add.")
    else:
        df = pd.concat([df, pd.DataFrame(new_entries)], ignore_index=True)
        df.to_csv(TRANSCRIPT_PATH, index=False)

display(df)

Already transcribed wTjM-na6ydU
Already transcribed 8vsE8xVN6rE
Already transcribed QgOnME2rRYY
No new entries to add.


Unnamed: 0,competitor,video_id,transcript
0,Sony WF-1000XM5,wTjM-na6ydU,"Okay, here is the detailed section-by-section ..."
1,Technics EAH-AZ100,QgOnME2rRYY,"Okay, here is the section-by-section transcrip..."
2,Bose QuietComfort Ultra Earbuds,8vsE8xVN6rE,Here is the full section-by-section transcript...
