Continues from 2_youtube_search.ipynb

In [1]:
import json
import os
import re
from pathlib import Path

import pandas as pd
from googleapiclient.discovery import build
from IPython.display import YouTubeVideo
from tqdm.auto import tqdm

### Config

In [2]:
# Name of the product under study; also used as the session sub-folder
# (session/<PRODUCT>) from which earlier stages' files are read.
PRODUCT = "wireless over-ear headphones"
# Per-video comment threshold: paging through a video's comment threads stops
# once at least this many comments (top-level plus replies) were collected.
MAX_COMMENTS = 100

In [3]:
# The API key is read from the environment so that it never ends up in the
# saved notebook or in version control. Ensure the key has the YouTube Data API
# enabled in Google Cloud Console.
# NOTE(review): any key that was previously hardcoded in this notebook should be
# revoked and replaced, since it has been stored in plain text.
GOOGLE_YOUTUBE_KEY = os.environ.get("GOOGLE_YOUTUBE_KEY", "").strip()
if not GOOGLE_YOUTUBE_KEY:
    # Fail here with a clear message rather than later with an opaque API error.
    raise RuntimeError(
        "Set the GOOGLE_YOUTUBE_KEY environment variable to a YouTube Data API key "
        "before running this notebook."
    )

In [4]:
# Session folder for this product; the stage-1 description must already exist.
DATA_DIR = Path("session", PRODUCT)
META_PATH = DATA_DIR.joinpath("stage_1.json")
assert META_PATH.exists(), "Run 1_describe_product.ipynb first!"

# Video ids found by the previous notebook live under the "youtube" sub-folder.
YOUTUBE_DIR = DATA_DIR.joinpath("youtube")
VID_META_PATH = YOUTUBE_DIR.joinpath("vid_meta.json")
assert VID_META_PATH.exists(), "Run 2_youtube_search.ipynb first!"

# Destination for the per-competitor comment CSVs; created if missing.
OUT_DIR = YOUTUBE_DIR.joinpath("raw_comments")
OUT_DIR.mkdir(parents=True, exist_ok=True)

### Operations

#### Setup

In [None]:
# Client for version "v3" of the "youtube" service, authenticated with the developer key.
youtube = build(
    serviceName="youtube",
    version="v3",
    developerKey=GOOGLE_YOUTUBE_KEY,
)

In [None]:
# Mapping of competitor name -> {category -> list of video ids}, as written to
# vid_meta.json by the previous notebook.
vid_meta = json.loads(VID_META_PATH.read_text())

display(vid_meta)

{'Bose QuietComfort Ultra Headphones': {'review': ['wjRaEc3QTIA'],
  'comparison': ['Ii-GvoQr4lc'],
  'unboxing': ['Jo3kgR_48qo']},
 'Sony WH-1000XM5': {'review': ['6CsJZxfZsL0'],
  'comparison': ['BdvOuKTi3CE'],
  'unboxing': ['D8wG67oko7E']},
 'Focal Bathys': {'review': ['-pRUVj3KRYw'],
  'comparison': ['qe7tea9Og14'],
  'unboxing': ['S3miH54g1Wk']},
 'Anker Soundcore Space One': {'review': ['M-p0BRhlugs'],
  'comparison': ['OHLKuEUg86M'],
  'unboxing': ['HkBnSY8XpJQ']},
 'Apple AirPods Max': {'review': ['59uTE7pLfKA'],
  'comparison': ['x4NCWc1aqw4'],
  'unboxing': ['UdfSrJvqY_E']}}

#### Retrieve Comments on Each Competitor

In [7]:
# Column order of every per-competitor CSV. It is passed to the DataFrame
# explicitly so that a competitor with zero retrieved comments still gets a
# header row instead of an empty, unparseable file.
COMMENT_COLUMNS = [
    "product",
    "category",
    "id",
    "videoId",
    "parentId",
    "isReply",
    "authorName",
    "authorId",
    "text",
    "likes",
    "updatedAt",
]


def _thread_to_rows(thread, competitor, category, video_id):
    """Flatten one commentThreads item into one row dict per comment.

    Args:
        thread: A single entry of ``res["items"]`` from commentThreads().list.
        competitor: Product name stored in the "product" column.
        category: Video category stored in the "category" column.
        video_id: Id of the video the thread belongs to.

    Returns:
        A list of dicts keyed by COMMENT_COLUMNS: the top-level comment first,
        followed by the replies embedded in the thread (if any).
    """
    comments = [thread["snippet"]["topLevelComment"]]
    if "replies" in thread:
        # NOTE(review): the replies embedded in a thread may be only a subset of
        # all replies -- confirm against the commentThreads API documentation.
        comments.extend(thread["replies"]["comments"])

    rows = []
    for com in comments:
        snippet = com["snippet"]
        # The API may omit authorChannelId for some comments; indexing it
        # directly raised KeyError and aborted the whole run. Missing ids are
        # stored as None (an empty cell in the CSV).
        author_id = (snippet.get("authorChannelId") or {}).get("value")
        rows.append(
            {
                "product": competitor,
                "category": category,
                "id": com["id"],
                "videoId": video_id,
                "parentId": thread["id"],
                # A top-level comment shares its id with the thread.
                "isReply": thread["id"] != com["id"],
                "authorName": snippet["authorDisplayName"],
                "authorId": author_id,
                "text": snippet["textDisplay"],
                "likes": snippet["likeCount"],
                "updatedAt": snippet["updatedAt"],
            }
        )
    return rows


# Maps each competitor name to the file name of its CSV inside OUT_DIR.
competitor_filemap = {}

for competitor, viddata in vid_meta.items():
    print(f"Retrieving comments for {competitor}.")
    # Collapse every run of non-alphanumeric characters into "_" for a safe file name.
    name = re.sub("[^0-9a-zA-Z]+", "_", competitor)
    competitor_csv = OUT_DIR / f"{name}.csv"
    competitor_filemap[competitor] = competitor_csv.name

    # Flatten {category: [video ids]} into (category, video id) pairs.
    videos = [(category, vid) for category, vids in viddata.items() for vid in vids]

    data = []
    for cat, vid in tqdm(videos):
        req = youtube.commentThreads().list(
            part="snippet,replies",
            videoId=vid,
            maxResults=100,
            textFormat="plainText",
        )

        this_data = []
        # MAX_COMMENTS is a soft, per-video threshold: it is only checked between
        # pages, so the last fetched page (including replies) is kept in full.
        while req and len(this_data) < MAX_COMMENTS:
            res = req.execute()
            # list_next yields a falsy value once there are no more pages.
            req = youtube.commentThreads().list_next(req, res)

            for thread in res.get("items", []):
                this_data.extend(_thread_to_rows(thread, competitor, cat, vid))

        data.extend(this_data)

    df = pd.DataFrame(data, columns=COMMENT_COLUMNS)
    df.to_csv(competitor_csv, index=False)
    print(f"Saved {len(df)} comments to {competitor_csv}.")

Retrieving comments for Bose QuietComfort Ultra Headphones.


  0%|          | 0/3 [00:00<?, ?it/s]

Saved 365 comments to session/wireless over-ear headphones/youtube/raw_comments/Bose_QuietComfort_Ultra_Headphones.csv.
Retrieving comments for Sony WH-1000XM5.


  0%|          | 0/3 [00:00<?, ?it/s]

Saved 350 comments to session/wireless over-ear headphones/youtube/raw_comments/Sony_WH_1000XM5.csv.
Retrieving comments for Focal Bathys.


  0%|          | 0/3 [00:00<?, ?it/s]

Saved 493 comments to session/wireless over-ear headphones/youtube/raw_comments/Focal_Bathys.csv.
Retrieving comments for Anker Soundcore Space One.


  0%|          | 0/3 [00:00<?, ?it/s]

Saved 429 comments to session/wireless over-ear headphones/youtube/raw_comments/Anker_Soundcore_Space_One.csv.
Retrieving comments for Apple AirPods Max.


  0%|          | 0/3 [00:00<?, ?it/s]

Saved 353 comments to session/wireless over-ear headphones/youtube/raw_comments/Apple_AirPods_Max.csv.


In [8]:
# Persist the competitor -> CSV file name mapping next to the raw comments folder.
filemap_path = YOUTUBE_DIR / "comment_filemap.json"
with filemap_path.open("w") as out_file:
    out_file.write(json.dumps(competitor_filemap, indent=2))