In [None]:
import os, time, re, json, requests
import pandas as pd

from datetime import datetime, timezone, date
from pathlib import Path
from dotenv import load_dotenv

# Raise the display limit to 200 columns so wide DataFrames are not truncated when shown.
pd.set_option("display.max_columns", 200)

# =========================
# Project root auto-discovery + standard folder setup
# =========================
def find_project_root() -> Path:
    """Locate the project root by walking upward from the current working directory.

    The first directory (starting with the cwd itself, then each ancestor in
    order) that contains both a ``data`` and a ``notebooks`` entry is returned.
    If no such directory is found, the current working directory is returned.
    """
    start = Path.cwd()
    candidates = (start, *start.parents)
    return next(
        (
            folder
            for folder in candidates
            if (folder / "data").exists() and (folder / "notebooks").exists()
        ),
        start,
    )

# Standard folder layout, all derived from the discovered project root.
PROJECT_ROOT = find_project_root()
DATA_DIR = PROJECT_ROOT / "data"
PROCESSED_DIR = DATA_DIR / "processed"

ACCUM_DIR = PROCESSED_DIR / "01_daily_accumulated"      # accumulated (append-style) outputs; v1 layout is fixed
# Create the folder (and any missing parents) on import so later cells can write into it.
ACCUM_DIR.mkdir(parents=True, exist_ok=True)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("ACCUM_DIR:", ACCUM_DIR)

# =========================
# Run metadata
# =========================
# NOTE(review): RUN_DATE is the machine's local calendar date, while RUN_TS_UTC is in UTC,
# so the two can fall on different days (e.g. shortly after local midnight in a UTC+ zone).
# Confirm that a local-date RUN_DATE is what downstream consumers expect.
RUN_DATE = date.today().isoformat()
RUN_TS_UTC = datetime.now(timezone.utc).isoformat()
print("RUN_DATE:", RUN_DATE)
print("RUN_TS_UTC:", RUN_TS_UTC)

def now_utc_iso():
    """Return the current time in UTC as an ISO-8601 string (with a +00:00 offset)."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()

def run_stamp():
    """Return the current local time formatted as ``YYYYMMDD_HHMMSS``."""
    stamp_format = "%Y%m%d_%H%M%S"
    local_now = datetime.now()
    return local_now.strftime(stamp_format)

# =========================
# API key (taken from the environment only)
# =========================
# Pull variables from the project's .env file into the process environment first.
load_dotenv(PROJECT_ROOT / ".env")

# A missing variable and a whitespace-only value are both treated as "not configured".
API_KEY = os.environ.get("YOUTUBE_API_KEY", "").strip()
if len(API_KEY) == 0:
    raise ValueError("환경변수 YOUTUBE_API_KEY가 설정되어 있지 않습니다. (코드에 키 하드코딩 X)")

# =========================
# Accumulating save utility (Path based)
# =========================
def append_and_dedup_csv(df: pd.DataFrame, path: Path, key_cols: list[str]) -> pd.DataFrame:
    """Append ``df`` to the CSV at ``path``, de-duplicate on ``key_cols`` and save.

    If the file already exists, its rows are loaded and ``df`` is appended after
    them; duplicates on ``key_cols`` are then removed keeping the LAST occurrence,
    so freshly collected rows win over older copies of the same key.

    Parameters
    ----------
    df : pd.DataFrame
        Newly collected rows.
    path : Path
        Target CSV file (a ``str`` is accepted as well). Parent folders are created.
    key_cols : list[str]
        Columns that identify a row for de-duplication.

    Returns
    -------
    pd.DataFrame
        The merged, de-duplicated frame that was written to ``path``.

    Raises
    ------
    ValueError
        If any key column is missing from a non-empty ``df`` or from the merged frame.
    """
    path = Path(path)

    # Validate the incoming rows on their own. Checking only the merged frame is not
    # enough: when the existing file already has the key column, new rows lacking it
    # would get NaN keys and drop_duplicates would silently collapse them into one row.
    if len(df) > 0:
        missing_in_new = [k for k in key_cols if k not in df.columns]
        if missing_in_new:
            raise ValueError(f"dedup key 컬럼이 없습니다: {missing_in_new} (path={path})")

    path.parent.mkdir(parents=True, exist_ok=True)

    if path.exists():
        old = pd.read_csv(path, low_memory=False)
        merged = pd.concat([old, df], ignore_index=True)
    else:
        merged = df.copy()

    # Defensive check on the merged frame (covers an empty ``df`` with no existing file).
    missing_keys = [k for k in key_cols if k not in merged.columns]
    if missing_keys:
        raise ValueError(f"dedup key 컬럼이 없습니다: {missing_keys} (path={path})")

    merged = merged.drop_duplicates(subset=key_cols, keep="last").reset_index(drop=True)

    # Write to a sibling temp file and swap it in, so an interrupted write cannot
    # leave the accumulated history truncated or half-written.
    tmp_path = path.with_name(path.name + ".tmp")
    merged.to_csv(tmp_path, index=False, encoding="utf-8-sig")
    tmp_path.replace(path)

    print("✅ saved:", str(path), "| rows:", len(merged))
    return merged


PROJECT_ROOT: c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml
ACCUM_DIR: c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\data\processed\01_daily_accumulated
RUN_DATE: 2026-01-31
RUN_TS_UTC: 2026-01-30T19:51:16.612949+00:00


In [31]:
def load_video_ids_from_trending(trending_daily_path: str, max_videos: int = 50):
    """
    Read unique video_id values from the trending daily CSV.

    Parameters
    ----------
    trending_daily_path : str
        Path to a CSV file that contains a ``video_id`` column.
    max_videos : int
        Maximum number of ids to return (the first ``max_videos`` unique ids in file order).

    Returns
    -------
    list[str]
        Unique, non-missing video ids in order of first appearance.

    Raises
    ------
    FileNotFoundError
        If ``trending_daily_path`` does not exist.
    ValueError
        If the file has no ``video_id`` column.

    NOTE(review): ids are taken from the top of the file. If this file is appended to
    over time, the first ``max_videos`` unique ids may always be the oldest ones —
    confirm the file's row order against how it is produced.
    """
    if not os.path.exists(trending_daily_path):
        raise FileNotFoundError(trending_daily_path)

    t = pd.read_csv(trending_daily_path, low_memory=False)
    if "video_id" not in t.columns:
        raise ValueError("trending 파일에 video_id 컬럼이 없습니다.")

    vids = (
        t["video_id"]
        # Drop missing values BEFORE casting: astype(str) turns NaN into the literal
        # string "nan", which dropna() can no longer detect and which would then be
        # sent to the API as if it were a video id.
        .dropna()
        .astype(str)
        .drop_duplicates()
        .tolist()
    )
    return vids[:max_videos]


In [32]:
YOUTUBE_COMMENTS_URL = "https://www.googleapis.com/youtube/v3/commentThreads"

def fetch_comments_for_video(video_id: str,
                             api_key: str,
                             max_comments: int = 200,
                             sleep_sec: float = 0.05):
    '''
    Collect top-level comments (commentThreads) for a single video_id.

    Replies are not collected here (the function can be extended if needed).

    Parameters
    ----------
    video_id : str
        Video whose comment threads are requested.
    api_key : str
        API key sent as the ``key`` query parameter.
    max_comments : int
        Upper bound on the number of top-level comments to collect.
    sleep_sec : float
        Pause after each successfully processed page.

    Returns
    -------
    (rows, log)
        rows : list[dict] with one entry per collected comment. Rows gathered
               before a failure are still returned.
        log  : dict with ``video_id``, ``status`` ("ok" or "error") and ``reason``;
               error logs additionally carry ``raw`` with details.

    This function does not raise on request/transport or JSON-decoding failures;
    they are reported through ``log`` so one bad video cannot abort a whole run.
    '''
    rows = []
    page_token = None

    while len(rows) < max_comments:
        remaining = max_comments - len(rows)

        params = {
            "part": "snippet",
            "videoId": video_id,
            # Request at most 100 per page and never more than is still needed.
            "maxResults": min(100, remaining),
            "textFormat": "plainText",
            "key": api_key,
        }
        if page_token:
            params["pageToken"] = page_token

        try:
            res = requests.get(YOUTUBE_COMMENTS_URL, params=params, timeout=30)
            data = res.json()
        except (requests.RequestException, ValueError) as exc:
            # Timeouts, connection failures and non-JSON bodies end up here.
            # The exception text can embed the request URL (which carries the API
            # key as a query parameter), so redact the key before logging it.
            detail = str(exc)
            if api_key:
                detail = detail.replace(api_key, "***")
            return rows, {
                "video_id": video_id,
                "status": "error",
                "reason": type(exc).__name__,
                "raw": {"exception": detail},
            }

        if not isinstance(data, dict):
            return rows, {"video_id": video_id, "status": "error",
                          "reason": "unexpectedResponse", "raw": data}

        # API-level error handling (e.g. comments disabled on the video)
        if "error" in data:
            try:
                err_reason = data["error"]["errors"][0].get("reason")
            except (KeyError, IndexError, TypeError, AttributeError):
                # Error payload did not have the expected structure; keep reason unknown.
                err_reason = None
            return rows, {"video_id": video_id, "status": "error", "reason": err_reason, "raw": data}

        for item in data.get("items", []):
            # ``or {}`` guards against keys that are present but null.
            top = (item.get("snippet") or {}).get("topLevelComment") or {}
            sn = top.get("snippet") or {}
            comment_id = top.get("id")

            rows.append({
                "run_ts_utc": now_utc_iso(),
                "video_id": video_id,
                "comment_id": comment_id,
                "author_channel_id": (sn.get("authorChannelId") or {}).get("value"),
                "author_display_name": sn.get("authorDisplayName"),
                "published_at": sn.get("publishedAt"),
                "updated_at": sn.get("updatedAt"),
                "like_count": sn.get("likeCount"),
                "text": sn.get("textDisplay"),
            })

        page_token = data.get("nextPageToken")
        time.sleep(sleep_sec)

        # No token means the last page has been consumed.
        if not page_token:
            break

    return rows, {"video_id": video_id, "status": "ok", "reason": None}


In [33]:
# =========================
# Run parameters
# =========================
TRENDING_DAILY_PATH = str(ACCUM_DIR / "youtube_trending_videos_daily_kr_v1.csv")

MAX_VIDEOS = 50                  # number of videos to collect per run (mind quota / run time)
MAX_COMMENTS_PER_VIDEO = 200     # cap on top-level comments per video
SLEEP_SEC = 0.05                 # pause between API calls (too low may cause failures / quota problems)

# =========================
# Prepare the video_id list
#   - (recommended) read it from the trending daily file
# =========================
video_ids = load_video_ids_from_trending(TRENDING_DAILY_PATH, max_videos=MAX_VIDEOS)
print("video_ids:", len(video_ids), "| sample:", video_ids[:5])

# =========================
# Collect comments
# =========================
all_rows = []
logs = []

# Fetch each video in turn; comment rows are pooled into one list and one log
# entry per video records whether the fetch succeeded.
for idx, vid in enumerate(video_ids, 1):
    rows, log = fetch_comments_for_video(
        vid,
        api_key=API_KEY,
        max_comments=MAX_COMMENTS_PER_VIDEO,
        sleep_sec=SLEEP_SEC,
    )
    all_rows.extend(rows)
    logs.append(log)
    # Progress line every 10 videos.
    if idx % 10 == 0:
        print(f"[{idx}/{len(video_ids)}] collected rows:", len(all_rows))

comments_df = pd.DataFrame(all_rows)
# NOTE(review): log_df is only displayed in this cell; it is not written to disk here.
log_df = pd.DataFrame(logs)

# Add the collection-date column (for downstream pipeline / traceability).
# Skipped when nothing was collected, in which case comments_df has no columns at all.
if len(comments_df) > 0 and "collected_date" not in comments_df.columns:
    comments_df["collected_date"] = RUN_DATE

# NOTE(review): the preview below includes commenter display names and channel ids;
# consider clearing this output before sharing or committing the notebook.
print("comments_df:", comments_df.shape)
display(comments_df.head(3))
print("log_df:", log_df.shape)
display(log_df.head(10))


video_ids: 50 | sample: ['rOi49Asuue8', 'DYgE3SGPEqk', '9Md-huAu1HE', 'JAYxz3AmBVg', 'TLmRZQgXmEk']
[10/50] collected rows: 1340
[20/50] collected rows: 2251
[30/50] collected rows: 3520
[40/50] collected rows: 4383
[50/50] collected rows: 5309
comments_df: (5309, 10)


Unnamed: 0,run_ts_utc,video_id,comment_id,author_channel_id,author_display_name,published_at,updated_at,like_count,text,collected_date
0,2026-01-30T19:51:17.288522+00:00,rOi49Asuue8,UgyHqvpBPZNrGqoYLY94AaABAg,UC4P7F8qenDo1O_5NvgBb_cg,@발스타홍구,2026-01-29T03:39:25Z,2026-01-29T03:39:25Z,19,"※이 영상은 또다시 나타난 2026년 최악의 천적, 역대급 초고수 저승사자와 (1부...",2026-01-31
1,2026-01-30T19:51:17.288543+00:00,rOi49Asuue8,UgwUgUWWmMv7wPCb-U54AaABAg,UCK3VUXbj8KENE-NqglLzEqQ,@뇸뇸냠-e6w,2026-01-30T19:04:00Z,2026-01-30T19:04:00Z,0,저그도 많이 올려쥬세요 시원하게 이기는거도 많아 보고 싶어요,2026-01-31
2,2026-01-30T19:51:17.288554+00:00,rOi49Asuue8,UgzR41JafaT7H6hV5jB4AaABAg,UCrauSCcXjsiXM1mpSAsfgyA,@내꿈은낚시왕-h9i,2026-01-30T18:54:10Z,2026-01-30T18:54:10Z,0,비젼켜주고 개발리고 케리어로 장난치다 개발리고 지가 방심하면 안된다더니 본인피셜 살...,2026-01-31


log_df: (50, 4)


Unnamed: 0,video_id,status,reason,raw
0,rOi49Asuue8,ok,,
1,DYgE3SGPEqk,ok,,
2,9Md-huAu1HE,ok,,
3,JAYxz3AmBVg,ok,,
4,TLmRZQgXmEk,ok,,
5,SAHe1CuJK7k,ok,,
6,nuCzHmEgcQQ,error,commentsDisabled,"{'error': {'code': 403, 'message': 'The video ..."
7,o61n1nk5eu8,ok,,
8,zracwhm_rxU,error,commentsDisabled,"{'error': {'code': 403, 'message': 'The video ..."
9,5y1YQx1g4Mg,ok,,


In [34]:
# =========================
# Save (accumulating)
# =========================

# Accumulated table (append + dedup) -> data/processed/01_daily_accumulated/
# This exact path was specified by the user and must be used (key requirement).
ACCUM_COMMENTS_PATH = ACCUM_DIR / "youtube_comments_raw_kr_v1.csv"

# comment_id alone is used as the dedup key (the original author treats it as
# globally unique on YouTube — not verified in this notebook).
# (For a per-video safety margin, ["comment_id","video_id"] is also possible.)
KEY_COLS = ["comment_id"]

# NOTE: comments_df is rebound here to the full accumulated frame returned by the
# helper, so after this cell it no longer holds only the rows collected in this run.
comments_df = append_and_dedup_csv(
    df=comments_df,
    path=ACCUM_COMMENTS_PATH,
    key_cols=KEY_COLS
)

print("✅ accumulated comments path:", ACCUM_COMMENTS_PATH, "| exists:", ACCUM_COMMENTS_PATH.exists())


✅ saved: c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\data\processed\01_daily_accumulated\youtube_comments_raw_kr_v1.csv | rows: 5714
✅ accumulated comments path: c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\data\processed\01_daily_accumulated\youtube_comments_raw_kr_v1.csv | exists: True
