In [None]:
import os
import re
import time
import requests
import pandas as pd

from datetime import datetime, timezone, date
from pathlib import Path
from dotenv import load_dotenv

# Let pandas render up to 200 columns so wide frames are not elided in cell output.
pd.set_option("display.max_columns", 200)


In [None]:
# =========================
# 프로젝트 루트 자동 탐색 + 표준 폴더 세팅
# =========================

def find_project_root() -> Path:
    """
    Locate the project root by walking upward from the current working directory.

    The first directory (starting with the working directory itself, then each
    ancestor in turn) that contains both a ``data`` and a ``notebooks`` entry
    is returned. When no such directory exists, the working directory is
    returned unchanged.
    """
    start = Path.cwd()
    markers = ("data", "notebooks")
    candidates = (start, *start.parents)

    return next(
        (
            folder
            for folder in candidates
            if all((folder / marker).exists() for marker in markers)
        ),
        start,
    )

# Resolve the project root once; every path below is derived from it.
PROJECT_ROOT = find_project_root()

DATA_DIR = PROJECT_ROOT / "data"
INTERIM_DIR = DATA_DIR / "interim"

# Folder for the accumulated daily files; created (with parents) if it does not exist yet.
ACCUM_DIR = INTERIM_DIR / "01_daily_accumulated"
ACCUM_DIR.mkdir(parents=True, exist_ok=True)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("ACCUM_DIR:", ACCUM_DIR)


In [None]:
# =========================
# 실행 설정
# =========================

# Read the .env file located at the project root (python-dotenv).
load_dotenv(PROJECT_ROOT / ".env")

# Fail fast when the API key is missing or blank instead of failing on the first request.
API_KEY = os.getenv("YOUTUBE_API_KEY", "").strip()
if not API_KEY:
    raise ValueError("환경변수 YOUTUBE_API_KEY가 설정되어 있지 않습니다.")

# NOTE(review): RUN_DATE is the machine-local calendar date while RUN_TS_UTC is in UTC,
# so the two can fall on different days around midnight — confirm this is intended.
RUN_DATE = date.today().isoformat()
RUN_TS_UTC = datetime.now(timezone.utc).isoformat()

print("RUN_DATE:", RUN_DATE)
print("RUN_TS_UTC:", RUN_TS_UTC)

def now_utc_iso():
    """Return the current moment as a timezone-aware (UTC) ISO-8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()

def run_stamp():
    """Return the current local time formatted as ``YYYYMMDD_HHMMSS``."""
    return f"{datetime.now():%Y%m%d_%H%M%S}"


In [None]:
# =========================
# 누적 저장 유틸 (Path 기반)
# =========================

# CSV inside ACCUM_DIR that the save step passes to append_and_dedup_csv as the accumulation target.
COMMENTS_FILE = ACCUM_DIR / "youtube_comments_raw_kr.csv"

print("COMMENTS_FILE =", COMMENTS_FILE)

def append_and_dedup_csv(df: pd.DataFrame, path: Path, key_cols: list[str]) -> pd.DataFrame:
    """
    Append ``df`` to the CSV at ``path``, de-duplicate on ``key_cols`` and save.

    Rows already stored in the file come first and the rows of ``df`` are
    appended after them; when several rows share the same key, the last one
    (i.e. the newest) is kept.

    Args:
        df: Rows collected in the current run.
        path: Target CSV. Its parent directory is created when missing.
        key_cols: Columns that together identify a row.

    Returns:
        The merged, de-duplicated frame exactly as it was written to ``path``.

    Raises:
        ValueError: If any of ``key_cols`` is absent from the merged frame.
    """
    path.parent.mkdir(parents=True, exist_ok=True)

    # A zero-byte file (e.g. left behind by an interrupted run) has no header and
    # would make read_csv raise EmptyDataError; treat it as "nothing stored yet".
    if path.exists() and path.stat().st_size > 0:
        old = pd.read_csv(path, low_memory=False)
        merged = pd.concat([old, df], ignore_index=True)
    else:
        merged = df.copy()

    # Guard: every de-duplication key must exist before we touch the file.
    missing_keys = [k for k in key_cols if k not in merged.columns]
    if missing_keys:
        raise ValueError(f"dedup key 컬럼이 없습니다: {missing_keys} (path={path})")

    merged = merged.drop_duplicates(subset=key_cols, keep="last").reset_index(drop=True)

    # Write to a sibling temp file and swap it in afterwards, so a failure in the
    # middle of the write cannot truncate the history accumulated so far.
    tmp_path = path.with_name(path.name + ".tmp")
    try:
        merged.to_csv(tmp_path, index=False, encoding="utf-8-sig")
        tmp_path.replace(path)
    except BaseException:
        tmp_path.unlink(missing_ok=True)
        raise

    print("✅ saved:", str(path), "| rows:", len(merged))

    return merged


In [None]:
def load_video_ids_from_trending(trending_daily_path: str, max_videos: int = 50) -> list[str]:
    """
    Read the trending daily CSV and return its unique video IDs.

    Args:
        trending_daily_path: Path of the trending CSV; it must contain a
            ``video_id`` column.
        max_videos: Maximum number of IDs to return.

    Returns:
        Unique video IDs as strings, in order of first appearance in the file,
        truncated to ``max_videos``. Rows with a missing ``video_id`` are skipped.

    Raises:
        FileNotFoundError: If ``trending_daily_path`` does not exist.
        ValueError: If the file has no ``video_id`` column.
    """
    if not os.path.exists(trending_daily_path):
        raise FileNotFoundError(trending_daily_path)

    t = pd.read_csv(trending_daily_path, low_memory=False)
    if "video_id" not in t.columns:
        raise ValueError("trending 파일에 video_id 컬럼이 없습니다.")

    # Drop missing values BEFORE casting: astype(str) turns NaN into the literal
    # string "nan", which dropna() can no longer remove and which would then be
    # sent to the API as if it were a real video ID.
    vids = (
        t["video_id"]
        .dropna()
        .astype(str)
        .drop_duplicates()
        .tolist()
    )

    return vids[:max_videos]


In [None]:
YOUTUBE_COMMENTS_URL = "https://www.googleapis.com/youtube/v3/commentThreads"

def fetch_comments_for_video(video_id: str,
                             api_key: str,
                             max_comments: int = 200,
                             sleep_sec: float = 0.05):
    '''
    - 한 video_id에 대해 top-level 댓글(commentThreads) 수집
    - reply(대댓글)는 미수집
    '''
    rows = []
    page_token = None

    while True:
        remaining = max_comments - len(rows)

        if remaining <= 0:
            break

        params = {
            "part": "snippet",
            "videoId": video_id,
            "maxResults": min(100, remaining),
            "textFormat": "plainText",
            "key": api_key,
        }

        if page_token:
            params["pageToken"] = page_token

        res = requests.get(YOUTUBE_COMMENTS_URL, params=params, timeout=30)
        data = res.json()

        # API 에러 처리
        if "error" in data:
            err_reason = None

            try:
                err_reason = data["error"]["errors"][0].get("reason")
            except Exception:
                pass
            
            return rows, {"video_id": video_id, "status": "error", "reason": err_reason, "raw": data}

        for item in data.get("items", []):
            sn = item.get("snippet", {}).get("topLevelComment", {}).get("snippet", {})
            comment_id = item.get("snippet", {}).get("topLevelComment", {}).get("id")

            rows.append({
                "run_ts_utc": now_utc_iso(),
                "video_id": video_id,
                "comment_id": comment_id,
                "author_channel_id": (sn.get("authorChannelId") or {}).get("value"),
                "author_display_name": sn.get("authorDisplayName"),
                "published_at": sn.get("publishedAt"),
                "updated_at": sn.get("updatedAt"),
                "like_count": sn.get("likeCount"),
                "text": sn.get("textDisplay"),
            })

        page_token = data.get("nextPageToken")
        time.sleep(sleep_sec)

        if not page_token:
            break

    return rows, {"video_id": video_id, "status": "ok", "reason": None}


In [None]:
# =========================
# 실행 파라미터
# =========================

TRENDING_DAILY_PATH = str(ACCUM_DIR / "trending_videos_daily_kr.csv")

MAX_VIDEOS = 50                  # number of videos to collect per run (mind quota / runtime)
MAX_COMMENTS_PER_VIDEO = 200     # cap on top-level comments per video
SLEEP_SEC = 0.05                 # pause between API calls (too low may cause failures / quota issues)

# =========================
# Prepare the video_id list
# =========================

video_ids = load_video_ids_from_trending(TRENDING_DAILY_PATH, max_videos=MAX_VIDEOS)
print("video_ids:", len(video_ids), "| sample:", video_ids[:5])

# =========================
# Collect comments
# =========================

# all_rows gathers comment rows across all videos; logs keeps one status entry per video.
all_rows = []
logs = []

for idx, vid in enumerate(video_ids, 1):
    rows, log = fetch_comments_for_video(
        vid,
        api_key=API_KEY,
        max_comments=MAX_COMMENTS_PER_VIDEO,
        sleep_sec=SLEEP_SEC,
    )

    # Rows are kept even when the log reports an error for this video.
    all_rows.extend(rows)
    logs.append(log)

    # Progress line every 10 videos.
    if idx % 10 == 0:
        print(f"[{idx}/{len(video_ids)}] collected rows:", len(all_rows))

comments_df = pd.DataFrame(all_rows)
log_df = pd.DataFrame(logs)

# Add the collection-date column (for downstream pipeline / traceability).
# Skipped when nothing was collected, in which case comments_df has no columns at all.
if len(comments_df) > 0 and "collected_date" not in comments_df.columns:
    comments_df["collected_date"] = RUN_DATE

print("comments_df:", comments_df.shape)
display(comments_df.head(3))
print("log_df:", log_df.shape)
display(log_df.head(10))


In [None]:
# =========================
# 댓글 저장
# =========================

# Accumulation key: rows sharing a comment_id are treated as the same comment.
KEY_COLS = ["comment_id"]

# Note: comments_df is rebound to the frame returned by the helper, i.e. the full
# accumulated data (previously saved rows plus this run's), not just this run's rows.
comments_df = append_and_dedup_csv(
    df=comments_df,
    path=COMMENTS_FILE,
    key_cols=KEY_COLS
)

# Confirm the accumulated file exists on disk.
print("✅", COMMENTS_FILE.exists())
