In [None]:
import os, time, re
import requests
import pandas as pd

from datetime import date, datetime, timezone
from pathlib import Path
from dotenv import load_dotenv

# Raise pandas' "display.max_columns" option to 200 for DataFrame output in this notebook.
pd.set_option("display.max_columns", 200)

# =========================================================
# ✅ 프로젝트 루트 자동 탐색 + 표준 폴더 세팅
# =========================================================
def find_project_root() -> Path:
    """Locate the project root by walking up from the current working directory.

    The root is the nearest directory (the working directory itself, then each
    of its ancestors in order) that contains both a ``data`` and a
    ``notebooks`` entry.

    Returns:
        The first matching directory, or the current working directory when
        no candidate matches.
    """
    start = Path.cwd()
    candidates = (start, *start.parents)
    return next(
        (
            folder
            for folder in candidates
            if (folder / "data").exists() and (folder / "notebooks").exists()
        ),
        start,
    )

# Resolve the project layout once; every path below hangs off PROJECT_ROOT.
PROJECT_ROOT = find_project_root()
DATA_DIR = PROJECT_ROOT.joinpath("data")
RAW_DIR = DATA_DIR.joinpath("raw")
PROCESSED_DIR = DATA_DIR.joinpath("processed")

# Folder for the accumulating (append-style) outputs; created if it is missing.
ACCUM_DIR = PROCESSED_DIR.joinpath("01_daily_accumulated")
ACCUM_DIR.mkdir(parents=True, exist_ok=True)

print("PROJECT_ROOT =", PROJECT_ROOT)
print("ACCUM_DIR    =", ACCUM_DIR)

# ---------------------------------------------------------
# Run settings
# ---------------------------------------------------------
# Load variables from the project's .env file before reading the API key.
load_dotenv(PROJECT_ROOT.joinpath(".env"))

# The key must come from the environment; it is never hard-coded here.
API_KEY = (os.getenv("YOUTUBE_API_KEY") or "").strip()
if API_KEY == "":
    raise ValueError("환경변수 YOUTUBE_API_KEY가 설정되어 있지 않습니다. (코드에 키 하드코딩 X)")

REGION_CODES = ["KR"]
MAX_RESULTS = 50

# Collection date and timestamp stamped onto every row of this run.
# NOTE(review): RUN_DATE is the machine's local date while RUN_TS_UTC is UTC,
# so the two can fall on different calendar days — confirm this is intended.
RUN_DATE = str(date.today())
RUN_TS_UTC = datetime.now(tz=timezone.utc).isoformat()

# ---------------------------------------------------------
# Storage policy: accumulating files with fixed "v1" names
# ---------------------------------------------------------
TRENDING_FILE = ACCUM_DIR.joinpath("youtube_trending_videos_daily_kr_v1.csv")
CHANNEL_FILE = ACCUM_DIR.joinpath("youtube_channels_daily_stats_kr_v1.csv")

print("RUN_DATE   =", RUN_DATE)
print("RUN_TS_UTC =", RUN_TS_UTC)
print("TRENDING_FILE =", TRENDING_FILE)
print("CHANNEL_FILE  =", CHANNEL_FILE)

def safe_save_csv(df: pd.DataFrame, path: Path):
    """Write ``df`` to ``path`` as a UTF-8 (with BOM) CSV without the index.

    The data is first written to a sibling ``<name>.tmp`` file and then moved
    over ``path`` with ``os.replace``. If the write fails part-way, the
    previously saved file is left untouched instead of being truncated.

    Args:
        df: Frame to persist.
        path: Destination file; missing parent directories are created.

    Raises:
        Whatever ``DataFrame.to_csv`` or ``os.replace`` raises; the temporary
        file is removed before the exception propagates.
    """
    path.parent.mkdir(parents=True, exist_ok=True)

    # Same directory as the target so the final move stays on one filesystem.
    tmp_path = path.with_name(path.name + ".tmp")
    try:
        df.to_csv(tmp_path, index=False, encoding="utf-8-sig")
        os.replace(tmp_path, path)
    finally:
        # After a successful replace the temp file no longer exists; after a
        # failure this removes the partial output.
        tmp_path.unlink(missing_ok=True)

    print("✅ saved:", str(path), "| rows:", len(df), "| exists:", path.exists())

def append_and_dedup_csv(df: pd.DataFrame, path: Path, key_cols: list[str]):
    """Append ``df`` to the CSV at ``path``, drop duplicate keys, and save.

    Rows already stored in ``path`` are placed first and ``df`` after them;
    for rows sharing the same ``key_cols`` values only the last one (i.e. the
    newest) is kept. The merged frame is written back with ``safe_save_csv``.

    Args:
        df: New rows to add.
        path: Accumulated CSV file. A missing or completely empty file is
            treated as "no previous data".
        key_cols: Columns that identify a row.

    Returns:
        The merged, de-duplicated frame with a fresh 0..n-1 index.
    """
    key_cols = list(key_cols)
    old = None

    if path.exists():
        # Key columns that hold text in the incoming frame are read back as
        # text too. Without this, read_csv would re-infer e.g. "007" as the
        # number 7 and the stored row would never match the incoming one.
        text_keys = {
            col: str
            for col in key_cols
            if col in df.columns
            and pd.api.types.infer_dtype(df[col], skipna=True) == "string"
        }
        try:
            old = pd.read_csv(path, dtype=text_keys or None)
        except pd.errors.EmptyDataError:
            # Zero-byte file: nothing to merge with.
            old = None

    if old is None:
        merged = df.copy()
    else:
        merged = pd.concat([old, df], ignore_index=True)

    merged = merged.drop_duplicates(subset=key_cols, keep="last").reset_index(drop=True)
    safe_save_csv(merged, path)
    return merged


PROJECT_ROOT = c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml
ACCUM_DIR    = c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\data\processed\01_daily_accumulated
RUN_DATE   = 2026-01-31
RUN_TS_UTC = 2026-01-30T19:50:41.252668+00:00
TRENDING_FILE = c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\data\processed\01_daily_accumulated\youtube_trending_videos_daily_kr_v1.csv
CHANNEL_FILE  = c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\data\processed\01_daily_accumulated\youtube_channels_daily_stats_kr_v1.csv


In [3]:
def fetch_trending(region: str):
    """Request the "mostPopular" video chart for one region from the YouTube Data API.

    Args:
        region: Value sent as the ``regionCode`` query parameter (e.g. ``"KR"``).

    Returns:
        The ``items`` list of the JSON response, or an empty list when the
        response has no ``items`` key. Only this single response is read; no
        further pages are requested.

    Raises:
        requests.HTTPError: Propagated from ``raise_for_status()``.
    """
    # NOTE(review): the API key travels as a query parameter, so it is part of
    # the request URL — check that error output containing the URL is not shared.
    query = {
        "part": "snippet,statistics",
        "chart": "mostPopular",
        "regionCode": region,
        "maxResults": MAX_RESULTS,
        "key": API_KEY
    }
    response = requests.get(
        "https://www.googleapis.com/youtube/v3/videos",
        params=query,
        timeout=60,
    )
    response.raise_for_status()
    payload = response.json()
    return payload.get("items", [])

In [4]:
# =========================
# 1) Collect trending videos
# =========================
video_rows = []
channel_ids = set()

for region in REGION_CODES:
    items = fetch_trending(region)
    time.sleep(0.2)  # brief pause after each API call (quota protection)

    for entry in items:
        snippet = entry.get("snippet", {})
        statistics = entry.get("statistics", {})

        video_id = entry.get("id")
        channel_id = snippet.get("channelId")

        # Keep only entries that carry both identifiers.
        if video_id and channel_id:
            channel_ids.add(str(channel_id))

            video_rows.append({
                # Run metadata ("date" is also part of the storage key).
                "date": RUN_DATE,
                "run_ts_utc": RUN_TS_UTC,
                "region": region,

                # Video / channel identity and snippet fields.
                "video_id": str(video_id),
                "channel_id": str(channel_id),
                "publish_date": snippet.get("publishedAt"),
                "category_id": snippet.get("categoryId"),

                # Counters as returned by the API (None when absent).
                "views": statistics.get("viewCount"),
                "likes": statistics.get("likeCount"),
                "comments": statistics.get("commentCount"),
            })

video_df = pd.DataFrame(data=video_rows)

print("today fetched trending rows:", video_df.shape)
display(video_df.head(5))
print("unique channels:", len(channel_ids))


today fetched trending rows: (50, 10)


Unnamed: 0,date,run_ts_utc,region,video_id,channel_id,publish_date,category_id,views,likes,comments
0,2026-01-31,2026-01-30T19:50:41.252668+00:00,KR,DYgE3SGPEqk,UCritGVo7pLJLUS8wEu32vow,2026-01-27T09:00:01Z,10,6574838,250704,15173
1,2026-01-31,2026-01-30T19:50:41.252668+00:00,KR,9Md-huAu1HE,UCiEEF51uRAeZeCo8CJFhGWw,2026-01-29T00:00:20Z,24,77661,1993,196
2,2026-01-31,2026-01-30T19:50:41.252668+00:00,KR,JAYxz3AmBVg,UCD2YO_A_PVMgMDN9jpRrpVA,2026-01-30T13:39:27Z,20,313165,5192,1340
3,2026-01-31,2026-01-30T19:50:41.252668+00:00,KR,TLmRZQgXmEk,UCqq-ovGE01ErlXakPihhKDA,2026-01-29T12:00:02Z,24,538469,15323,678
4,2026-01-31,2026-01-30T19:50:41.252668+00:00,KR,SAHe1CuJK7k,UCJpAwvQaZyCI5spAz7tipGA,2026-01-30T08:30:12Z,20,234228,1841,226


unique channels: 50


In [5]:
# =========================
# 2) Save trending rows (accumulating file, fixed v1 name)
# =========================

# Fail early when a column needed for the merge is absent.
required_cols = ["video_id", "date"]
missing = list(filter(lambda col: col not in video_df.columns, required_cols))
if len(missing) > 0:
    raise ValueError(f"❌ video_df에 필수 컬럼이 없습니다: {missing}")

# Accumulation key: one row per video per collection date (the "date" column).
KEY_COLS = ["video_id", "date"]

# From here on video_df holds the full accumulated history, not just today's rows.
video_df = append_and_dedup_csv(video_df, TRENDING_FILE, KEY_COLS)

print("✅ accumulated trending rows:", video_df.shape)


✅ saved: c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\data\processed\01_daily_accumulated\youtube_trending_videos_daily_kr_v1.csv | rows: 51 | exists: True
✅ accumulated trending rows: (51, 10)


In [6]:
def fetch_channel_stats(channel_ids: set[str], min_subs: int | None = None, max_subs: int | None = None):
    url = "https://www.googleapis.com/youtube/v3/channels"
    rows = []
    channel_ids = list(channel_ids)

    for i in range(0, len(channel_ids), 50):
        batch = ",".join(channel_ids[i:i+50])

        params = {
            "part": "snippet,statistics",
            "id": batch,
            "key": API_KEY
        }

        resp = requests.get(url, params=params, timeout=60)
        resp.raise_for_status()
        res = resp.json()

        for item in res.get("items", []):
            snippet = item.get("snippet", {})
            stats = item.get("statistics", {})

            subs_raw = stats.get("subscriberCount")
            try:
                subs = int(subs_raw) if subs_raw is not None else None
            except Exception:
                subs = None

            # 필터(선택)
            if subs is not None:
                if min_subs is not None and subs < min_subs:
                    continue
                if max_subs is not None and subs > max_subs:
                    continue

            rows.append({
                "date": RUN_DATE,
                "run_ts_utc": RUN_TS_UTC,
                "channel_id": str(item.get("id")),
                "channel_name": snippet.get("title"),
                "created_date": snippet.get("publishedAt"),
                "subscriber_count": subs,
                "views_total": stats.get("viewCount"),
                "video_count_total": stats.get("videoCount"),
                "country": snippet.get("country"),
            })

        time.sleep(0.2)

    return pd.DataFrame(rows)

In [7]:
# =========================
# 3) Collect channel statistics
# =========================
# Optional: min_subs / max_subs restrict the result to a subscriber range.
channel_df = fetch_channel_stats(channel_ids=channel_ids, min_subs=5000, max_subs=20000)

print("today fetched channel rows:", channel_df.shape)
display(channel_df.head(5))

today fetched channel rows: (4, 9)


Unnamed: 0,date,run_ts_utc,channel_id,channel_name,created_date,subscriber_count,views_total,video_count_total,country
0,2026-01-31,2026-01-30T19:50:41.252668+00:00,UCrpcd5WtOrdCsx5cufc4JRQ,ZUTOMAYO - Topic,2018-08-29T11:15:36Z,8070,475280828,311,
1,2026-01-31,2026-01-30T19:50:41.252668+00:00,UC-FCFEgK0kGOV5CCA68xP7w,이준호 LEE JUNHO - Topic,2013-12-23T05:35:32Z,15800,20926123,274,
2,2026-01-31,2026-01-30T19:50:41.252668+00:00,UCxSjpAHSqTXKaFoAazV09TA,ROCOBERRY - Topic,2015-12-15T14:41:26Z,5130,20810908,118,
3,2026-01-31,2026-01-30T19:50:41.252668+00:00,UCMjeoedkGftL8SQh1iO5k9w,Yoo Hwe-seung - Topic,2023-06-26T05:33:12.896775Z,10700,46073857,28,


In [8]:
# =========================
# 4) Save channel rows (accumulating file, fixed v1 name)
# =========================

# Fail early when a column needed for the merge is absent.
required_cols = ["channel_id", "date"]
missing = list(filter(lambda col: col not in channel_df.columns, required_cols))
if len(missing) > 0:
    raise ValueError(f"❌ channel_df에 필수 컬럼이 없습니다: {missing}")

# Make sure the key column is text before merging with the stored file.
channel_df["channel_id"] = channel_df["channel_id"].astype(str)

# From here on channel_df holds the full accumulated history, not just today's rows.
channel_df = append_and_dedup_csv(channel_df, CHANNEL_FILE, ["channel_id", "date"])

print("✅ accumulated channel rows:", channel_df.shape)
display(channel_df.head(5))


✅ saved: c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\data\processed\01_daily_accumulated\youtube_channels_daily_stats_kr_v1.csv | rows: 4 | exists: True
✅ accumulated channel rows: (4, 9)


Unnamed: 0,date,run_ts_utc,channel_id,channel_name,created_date,subscriber_count,views_total,video_count_total,country
0,2026-01-31,2026-01-30T19:50:41.252668+00:00,UCrpcd5WtOrdCsx5cufc4JRQ,ZUTOMAYO - Topic,2018-08-29T11:15:36Z,8070,475280828,311,
1,2026-01-31,2026-01-30T19:50:41.252668+00:00,UC-FCFEgK0kGOV5CCA68xP7w,이준호 LEE JUNHO - Topic,2013-12-23T05:35:32Z,15800,20926123,274,
2,2026-01-31,2026-01-30T19:50:41.252668+00:00,UCxSjpAHSqTXKaFoAazV09TA,ROCOBERRY - Topic,2015-12-15T14:41:26Z,5130,20810908,118,
3,2026-01-31,2026-01-30T19:50:41.252668+00:00,UCMjeoedkGftL8SQh1iO5k9w,Yoo Hwe-seung - Topic,2023-06-26T05:33:12.896775Z,10700,46073857,28,
