In [21]:
import os
import re
import time
import requests
import pandas as pd

from datetime import date, datetime, timezone
from pathlib import Path
from dotenv import load_dotenv

# Raise pandas' column display limit to 200 so wide frames are shown without elided columns.
pd.set_option("display.max_columns", 200)


In [22]:
# =========================================================
# 프로젝트 루트 자동 탐색 + 표준 폴더 세팅
# =========================================================

def find_project_root() -> Path:
    """Locate the project root by walking upward from the current working directory.

    The working directory itself is checked first, then each ancestor in turn.
    The first directory that contains both a ``data`` and a ``notebooks`` entry
    is returned.

    Returns:
        The matching directory, or the current working directory when neither
        it nor any ancestor contains both entries.
    """
    start = Path.cwd()
    markers = ("data", "notebooks")
    candidates = (start, *start.parents)

    return next(
        (folder for folder in candidates if all((folder / name).exists() for name in markers)),
        start,
    )

# Resolve the project root once, then derive the standard data folders from it.
PROJECT_ROOT = find_project_root()

DATA_DIR = PROJECT_ROOT.joinpath("data")
INTERIM_DIR = DATA_DIR.joinpath("interim")
ACCUM_DIR = INTERIM_DIR.joinpath("01_daily_accumulated")

# Create the accumulation folder together with any missing parents;
# nothing happens if it already exists.
ACCUM_DIR.mkdir(exist_ok=True, parents=True)

print("PROJECT_ROOT =", PROJECT_ROOT)
print("ACCUM_DIR    =", ACCUM_DIR)


PROJECT_ROOT = c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml
ACCUM_DIR    = c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\data\interim\01_daily_accumulated


In [23]:
# =========================================================
# Run configuration
# =========================================================

# Load the project's .env file before reading the API key from the environment.
load_dotenv(PROJECT_ROOT / ".env")

API_KEY = os.getenv("YOUTUBE_API_KEY", "").strip()
if not API_KEY:
    # Fail fast: every API call below needs the key.
    raise ValueError("환경변수 YOUTUBE_API_KEY가 설정되어 있지 않습니다.")

REGION_CODES = ["KR"]
MAX_RESULTS = 50  # sent as `maxResults` by fetch_trending

# Collection date/time of this run (used for retention-period calculation and
# for tracking the accumulated snapshots).
# Both values are derived from ONE clock reading so that the date key and the
# timestamp of a run can never disagree when the cell executes around midnight.
_run_started_utc = datetime.now(timezone.utc)
RUN_TS_UTC = _run_started_utc.isoformat()
# Local calendar date of that instant (the same value date.today() would give).
RUN_DATE = _run_started_utc.astimezone().date().isoformat()

print("RUN_DATE   =", RUN_DATE)
print("RUN_TS_UTC =", RUN_TS_UTC)


RUN_DATE   = 2026-02-02
RUN_TS_UTC = 2026-02-01T18:40:29.745546+00:00


In [24]:
# =========================================================
# Accumulated-output helpers (Path based)
# =========================================================

# CSV files inside ACCUM_DIR that the save cells append to.
TRENDING_FILE = ACCUM_DIR.joinpath("trending_videos_daily_kr.csv")
CHANNEL_FILE = ACCUM_DIR.joinpath("channels_daily_stats_kr.csv")

print("TRENDING_FILE =", TRENDING_FILE)
print("CHANNEL_FILE  =", CHANNEL_FILE)

def safe_save_csv(df: pd.DataFrame, path: Path):
    """Write ``df`` to ``path`` as a CSV (UTF-8 with BOM, no index column).

    The data is first written to a sibling ``<name>.tmp`` file which is then
    moved over ``path``. If writing fails part-way, the previous contents of
    ``path`` are left untouched instead of being truncated.

    Args:
        df: Frame to save.
        path: Destination CSV file; missing parent directories are created.

    Raises:
        Any exception raised by ``DataFrame.to_csv`` or by the final move is
        propagated; the temporary file is removed in that case.
    """
    path.parent.mkdir(parents=True, exist_ok=True)

    tmp_path = path.with_name(path.name + ".tmp")
    try:
        df.to_csv(tmp_path, index=False, encoding="utf-8-sig")
        tmp_path.replace(path)
    finally:
        # After a successful replace the temp file no longer exists; after a
        # failure this removes the partially written file.
        tmp_path.unlink(missing_ok=True)

    print("✅ saved:", str(path), "| rows:", len(df), "| exists:", path.exists())

def append_and_dedup_csv(df: pd.DataFrame, path: Path, key_cols: list[str]):
    """Append ``df`` to the CSV at ``path``, keep one row per key, and save it.

    If ``path`` exists, its rows are loaded and ``df`` is concatenated after
    them; otherwise a copy of ``df`` is used. For every combination of
    ``key_cols`` only the last row is kept, so a row in ``df`` replaces a
    stored row with the same key. The result is written back with
    ``safe_save_csv``.

    The stored file is read as text (``dtype=str``) for two reasons:

    * Values round-trip unchanged. With type inference, an integer column that
      contains blanks is read as float and written back as ``920.0``.
    * Keys compare reliably. A numeric-looking key read back as a number would
      never equal the string key of a freshly fetched row, so duplicates would
      survive.

    Args:
        df: New rows to add.
        path: CSV file holding the accumulated rows.
        key_cols: Columns that together identify a row.

    Returns:
        The merged, de-duplicated frame with a fresh 0..n-1 index. Columns
        loaded from the existing file hold strings (or NaN for blanks).

    Raises:
        KeyError: If a column in ``key_cols`` is missing from the merged frame.
    """
    key_cols = list(key_cols)

    if path.exists():
        old = pd.read_csv(path, dtype=str, encoding="utf-8-sig")
        merged = pd.concat([old, df], ignore_index=True)
    else:
        merged = df.copy()

    # Compare keys by their text form so that e.g. 10 and "10" count as the
    # same key; missing values (None / NaN) are all treated as one empty key.
    key_frame = merged[key_cols]
    dedup_keys = key_frame.astype(object).where(key_frame.notna(), "").astype(str)
    is_latest = ~dedup_keys.duplicated(keep="last")

    merged = merged.loc[is_latest].reset_index(drop=True)
    safe_save_csv(merged, path)

    return merged


TRENDING_FILE = c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\data\interim\01_daily_accumulated\trending_videos_daily_kr.csv
CHANNEL_FILE  = c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\data\interim\01_daily_accumulated\channels_daily_stats_kr.csv


# 1. 트렌딩 비디오 데이터 수집 (trending_videos_daily_kr.csv)

In [25]:
def fetch_trending(region: str) -> list[dict]:
    """Fetch the "mostPopular" video chart for one region.

    Sends a single GET request to the YouTube Data API ``videos`` endpoint,
    asking for the ``snippet`` and ``statistics`` parts and at most
    ``MAX_RESULTS`` items. Only this one page is requested; no page token is
    followed.

    Args:
        region: Value sent as ``regionCode`` (e.g. ``"KR"``).

    Returns:
        The ``items`` list of the JSON response, or an empty list when the
        response has no ``items`` key.

    Raises:
        requests.RequestException: On connection errors, timeouts (60 s) or an
            HTTP error status. The exception keeps its original type, but the
            API key is masked in its message.
    """
    url = "https://www.googleapis.com/youtube/v3/videos"

    params = {
        "part": "snippet,statistics",
        "chart": "mostPopular",
        "regionCode": region,
        "maxResults": MAX_RESULTS,
        "key": API_KEY
    }

    try:
        r = requests.get(url, params=params, timeout=60)
        r.raise_for_status()
    except requests.RequestException as exc:
        # requests puts the full URL (including "key=<API_KEY>") into its error
        # text, which would end up in the saved notebook output. Re-raise the
        # same exception type with the key masked; `from None` keeps the
        # unmasked original out of the traceback.
        message = str(exc)
        if API_KEY:
            message = message.replace(API_KEY, "***")
        raise type(exc)(message, request=exc.request, response=exc.response) from None

    return r.json().get("items", [])


In [26]:
# =========================
# Collect trending videos
# =========================

video_rows = []
channel_ids = set()

for region in REGION_CODES:
    items = fetch_trending(region)
    time.sleep(0.2)  # short pause after each request to protect the quota

    for item in items:
        snippet = item.get("snippet", {})
        statistics = item.get("statistics", {})

        video_id, channel_id = item.get("id"), snippet.get("channelId")

        # Skip entries that lack either identifier.
        if not (video_id and channel_id):
            continue

        row = {
            "date": RUN_DATE,            # collection date; also part of the save key
            "run_ts_utc": RUN_TS_UTC,    # collection timestamp
            "region": region,

            "video_id": str(video_id),
            "channel_id": str(channel_id),
            "publish_date": snippet.get("publishedAt"),
            "category_id": snippet.get("categoryId"),

            "views": statistics.get("viewCount"),
            "likes": statistics.get("likeCount"),
            "comments": statistics.get("commentCount"),
        }
        video_rows.append(row)

        # Remember the channel so its statistics can be fetched afterwards.
        channel_ids.add(row["channel_id"])

video_df = pd.DataFrame(video_rows)

print("today fetched trending rows:", video_df.shape)
display(video_df.head())
print("unique channels:", len(channel_ids))


today fetched trending rows: (50, 10)


Unnamed: 0,date,run_ts_utc,region,video_id,channel_id,publish_date,category_id,views,likes,comments
0,2026-02-02,2026-02-01T18:40:29.745546+00:00,KR,CCgmRhn7Z2A,UCvolP1xNN2maB52Tb1PkXzg,2026-01-31T09:00:15Z,10,65983,920,79
1,2026-02-02,2026-02-01T18:40:29.745546+00:00,KR,JAYxz3AmBVg,UCD2YO_A_PVMgMDN9jpRrpVA,2026-01-30T13:39:27Z,20,1058258,9525,2094
2,2026-02-02,2026-02-01T18:40:29.745546+00:00,KR,nuCzHmEgcQQ,UCMRvw9TUJB5m32YPrxLu7ag,2026-01-29T09:04:42Z,10,194772,3455,51
3,2026-02-02,2026-02-01T18:40:29.745546+00:00,KR,2xJqsBjDPwo,UCYJ0Ucu9jPX5kn6SeDcNaIQ,2026-01-31T21:51:03Z,20,253970,3244,190
4,2026-02-02,2026-02-01T18:40:29.745546+00:00,KR,OSUHqAvuSRI,UC9rMiEjNaCSsebs31MRDCRA,2026-01-31T15:00:00Z,10,877860,215399,16423


unique channels: 50


In [27]:
# =========================
# Save trending snapshot
# =========================

# Accumulation key: one row per video, per collection date, per region.
# "region" is part of the key because rows are collected per region: without
# it, a video trending in several regions on the same date would be collapsed
# into a single row as soon as REGION_CODES holds more than one entry.
KEY_COLS = ["video_id", "date", "region"]

# Required-column check: every key column must be present before saving.
required_cols = list(KEY_COLS)
missing = [c for c in required_cols if c not in video_df.columns]

if missing:
    raise ValueError(f"video_df에 필수 컬럼이 없습니다: {missing}")

# From here on `video_df` holds the accumulated rows, not only today's.
video_df = append_and_dedup_csv(
    df=video_df,
    path=TRENDING_FILE,
    key_cols=KEY_COLS
)

print("✅ accumulated trending rows:", video_df.shape)


✅ saved: c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\data\interim\01_daily_accumulated\trending_videos_daily_kr.csv | rows: 156 | exists: True
✅ accumulated trending rows: (156, 10)


In [28]:
def fetch_channel_stats(channel_ids: set[str], min_subs: int | None = None, max_subs: int | None = None):
    url = "https://www.googleapis.com/youtube/v3/channels"
    rows = []
    channel_ids = list(channel_ids)

    for i in range(0, len(channel_ids), 50):
        batch = ",".join(channel_ids[i:i+50])

        params = {
            "part": "snippet,statistics",
            "id": batch,
            "key": API_KEY
        }

        resp = requests.get(url, params=params, timeout=60)
        resp.raise_for_status()
        res = resp.json()

        for item in res.get("items", []):
            snippet = item.get("snippet", {})
            stats = item.get("statistics", {})

            subs_raw = stats.get("subscriberCount")
            try:
                subs = int(subs_raw) if subs_raw is not None else None
            except Exception:
                subs = None

            # 필터(선택)
            if subs is not None:
                if min_subs is not None and subs < min_subs:
                    continue
                if max_subs is not None and subs > max_subs:
                    continue

            rows.append({
                "date": RUN_DATE,
                "run_ts_utc": RUN_TS_UTC,
                "channel_id": str(item.get("id")),
                "channel_name": snippet.get("title"),
                "created_date": snippet.get("publishedAt"),
                "subscriber_count": subs,
                "views_total": stats.get("viewCount"),
                "video_count_total": stats.get("videoCount"),
                "country": snippet.get("country"),
            })

        time.sleep(0.2)

    return pd.DataFrame(rows)

# 2. 채널 데이터 수집 (channels_daily_stats_kr.csv)

In [29]:
# =========================
# Collect channel statistics
# =========================

# Subscriber range to keep; adjust the two bounds to widen or narrow it.
subscriber_range = {"min_subs": 5000, "max_subs": 20000}
channel_df = fetch_channel_stats(channel_ids, **subscriber_range)

print("today fetched channel rows:", channel_df.shape)
display(channel_df.head())

today fetched channel rows: (3, 9)


Unnamed: 0,date,run_ts_utc,channel_id,channel_name,created_date,subscriber_count,views_total,video_count_total,country
0,2026-02-02,2026-02-01T18:40:29.745546+00:00,UCxSjpAHSqTXKaFoAazV09TA,ROCOBERRY - Topic,2015-12-15T14:41:26Z,5130,21055089,118,
1,2026-02-02,2026-02-01T18:40:29.745546+00:00,UCDSyIOCB7gRAbVLbE9c8wUg,Kim Minseok (MeloMance) - Topic,2016-08-11T21:59:46Z,12700,141575101,64,
2,2026-02-02,2026-02-01T18:40:29.745546+00:00,UCMjeoedkGftL8SQh1iO5k9w,Yoo Hwe-seung - Topic,2023-06-26T05:33:12.896775Z,10700,46212756,28,


In [30]:
# =========================
# Save channel snapshot
# =========================

# Required-column check: both key columns must exist before saving.
required_cols = ["channel_id", "date"]
available_cols = set(channel_df.columns)
missing = [name for name in required_cols if name not in available_cols]

if len(missing) > 0:
    raise ValueError(f"❌ channel_df에 필수 컬럼이 없습니다: {missing}")

# Make sure the ID column holds strings before it is used as a key.
channel_df["channel_id"] = channel_df["channel_id"].astype(str)

# From here on `channel_df` holds the accumulated rows, not only today's.
channel_df = append_and_dedup_csv(
    channel_df,
    CHANNEL_FILE,
    key_cols=["channel_id", "date"],
)

print("✅ accumulated channel rows:", channel_df.shape)
display(channel_df.head())


✅ saved: c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\data\interim\01_daily_accumulated\channels_daily_stats_kr.csv | rows: 10 | exists: True
✅ accumulated channel rows: (10, 9)


Unnamed: 0,date,run_ts_utc,channel_id,channel_name,created_date,subscriber_count,views_total,video_count_total,country
0,2026-01-31,2026-01-30T19:50:41.252668+00:00,UCrpcd5WtOrdCsx5cufc4JRQ,ZUTOMAYO - Topic,2018-08-29T11:15:36Z,8070,475280828,311,
1,2026-01-31,2026-01-30T19:50:41.252668+00:00,UC-FCFEgK0kGOV5CCA68xP7w,이준호 LEE JUNHO - Topic,2013-12-23T05:35:32Z,15800,20926123,274,
2,2026-01-31,2026-01-30T19:50:41.252668+00:00,UCxSjpAHSqTXKaFoAazV09TA,ROCOBERRY - Topic,2015-12-15T14:41:26Z,5130,20810908,118,
3,2026-01-31,2026-01-30T19:50:41.252668+00:00,UCMjeoedkGftL8SQh1iO5k9w,Yoo Hwe-seung - Topic,2023-06-26T05:33:12.896775Z,10700,46073857,28,
4,2026-02-01,2026-02-01T11:21:08.803708+00:00,UCMjeoedkGftL8SQh1iO5k9w,Yoo Hwe-seung - Topic,2023-06-26T05:33:12.896775Z,10700,46180674,28,
