In [1]:
import os, re
import numpy as np
import pandas as pd
from pathlib import Path

# Allow up to 200 columns to be shown when a DataFrame is rendered.
pd.set_option("display.max_columns", 200)
# Seed NumPy's legacy global random generator with a fixed value.
np.random.seed(42)

def find_project_root() -> Path:
    """Locate the project root by walking upward from the current working directory.

    The first directory (the CWD itself, then each ancestor in turn) that
    contains both a ``data`` and a ``notebooks`` entry is returned. When no
    such directory exists, the current working directory is returned instead.
    """
    start = Path.cwd()
    candidates = (start, *start.parents)
    return next(
        (
            folder
            for folder in candidates
            if (folder / "data").exists() and (folder / "notebooks").exists()
        ),
        start,  # fallback: nothing above us looks like the project root
    )

# Resolve the project root once and print it next to the working directory.
PROJECT_ROOT = find_project_root()
print("CWD:", os.getcwd())
print("PROJECT_ROOT:", PROJECT_ROOT)

# Location of the accumulated API data (fixed). The directory is created if it is missing.
ACCUM_DIR = PROJECT_ROOT / "data" / "processed" / "01_daily_accumulated"
ACCUM_DIR.mkdir(parents=True, exist_ok=True)
print("ACCUM_DIR:", ACCUM_DIR)

# Where the channel-target outputs are saved (a dedicated folder is recommended).
OUT_DIR = PROJECT_ROOT / "data" / "processed" / "04_channel_targets"
OUT_DIR.mkdir(parents=True, exist_ok=True)
print("OUT_DIR:", OUT_DIR)


CWD: c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\notebooks
PROJECT_ROOT: c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml
ACCUM_DIR: c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\data\processed\01_daily_accumulated
OUT_DIR: c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\data\processed\04_channel_targets


In [2]:
# Input file (accumulated data, pinned to v1).
CHANNEL_DAILY_PATH = ACCUM_DIR / "youtube_channel_daily_stats_kr_v1.csv"
print("CHANNEL_DAILY_PATH:", CHANNEL_DAILY_PATH)

# Stop here with an explicit message when the accumulated channel file is absent.
if not CHANNEL_DAILY_PATH.exists():
    raise FileNotFoundError(f"채널 누적 파일이 없습니다: {CHANNEL_DAILY_PATH}")

# low_memory=False: parse the file in one pass instead of in chunks, which avoids
# per-chunk mixed-type inference.
df = pd.read_csv(CHANNEL_DAILY_PATH, low_memory=False)
print("raw shape:", df.shape)
display(df.head(3))


CHANNEL_DAILY_PATH: c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\data\processed\01_daily_accumulated\youtube_channel_daily_stats_kr_v1.csv


FileNotFoundError: 채널 누적 파일이 없습니다: c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\data\processed\01_daily_accumulated\youtube_channel_daily_stats_kr_v1.csv

In [None]:
# ✅ 전처리: 타입/정렬/중복 제거

def clean_channel_daily(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize types, drop unusable rows, de-duplicate and sort the channel daily stats.

    Steps:
      1. Verify the required ``date`` and ``channel_id`` columns exist.
      2. Parse ``date`` as UTC and strip the timezone (unparseable values become NaT).
      3. Drop rows with a missing ``channel_id`` or an unparseable ``date``; such rows
         cannot be placed on any channel's timeline.
      4. Cast ``channel_id`` to ``str`` and the known count columns to numeric
         (non-numeric values become NaN; absent columns are skipped).
      5. Keep the last row, in input order, for every ``(channel_id, date)`` pair.
      6. Sort by ``channel_id`` then ``date``.

    Parameters
    ----------
    df : pd.DataFrame
        Raw channel daily stats. It is not modified.

    Returns
    -------
    pd.DataFrame
        Cleaned copy, sorted by ``channel_id`` and ``date``; the original index
        labels of the surviving rows are kept.

    Raises
    ------
    ValueError
        If ``date`` or ``channel_id`` is missing from ``df``.
    """
    # Check the schema before copying so a bad input fails cheaply.
    missing = [c for c in ("date", "channel_id") if c not in df.columns]
    if missing:
        raise ValueError(f"필수 컬럼 누락: {', '.join(missing)}")

    out = df.copy()

    # Read everything as UTC (safe even if tz-aware and naive values are mixed),
    # then drop the timezone so downstream code sees naive timestamps.
    out["date"] = pd.to_datetime(out["date"], errors="coerce", utc=True).dt.tz_localize(None)

    # Must run BEFORE astype(str): casting first would turn a missing channel_id
    # into a literal "nan"/"None" string that then looks like a real channel.
    usable = out["channel_id"].notna() & out["date"].notna()
    n_unusable = int((~usable).sum())
    if n_unusable:
        print(f"⚠️ channel_id 누락 또는 날짜 파싱 실패 행 {n_unusable}개 제거")
        out = out.loc[usable].copy()

    out["channel_id"] = out["channel_id"].astype(str)

    # Numeric conversion of the main count columns (works even if some are absent).
    for c in ["subscriber_count", "views_total", "video_count_total"]:
        if c in out.columns:
            out[c] = pd.to_numeric(out[c], errors="coerce")

    # De-duplicate while rows are still in input order, so "last" means the most
    # recently appended record rather than whatever order a sort leaves ties in.
    out = out.drop_duplicates(subset=["channel_id", "date"], keep="last")

    return out.sort_values(["channel_id", "date"])

# Clean the raw frame, then report its shape and date coverage as a sanity check.
df_clean = clean_channel_daily(df)
print("clean shape:", df_clean.shape)
print("date range:", df_clean["date"].min(), "→", df_clean["date"].max())
# Number of distinct calendar days present in the cleaned data.
print("n unique days:", df_clean["date"].dt.date.nunique())
display(df_clean.head(3))


In [None]:
# ✅ 타깃 생성 함수

def build_channel_growth_targets(
    df_daily: pd.DataFrame,
    horizon_days: int = 7,
    require_min_days: int = 2,
) -> pd.DataFrame:
    """Build per-channel delta features and the subscriber-growth target.

    The target ``subscriber_future`` is the channel's ``subscriber_count`` on the
    calendar day exactly ``horizon_days`` after the row's own day, and
    ``subscriber_growth_h`` is that value minus the row's ``subscriber_count``.
    The future value is looked up by date rather than by row offset, so a gap in
    a channel's daily rows cannot label a row with a value from the wrong day.
    Rows with no observation on the target day are dropped.

    Parameters
    ----------
    df_daily : pd.DataFrame
        Daily channel stats with ``channel_id``, a datetime ``date`` column and
        ``subscriber_count``. ``views_total`` and ``video_count_total`` are
        optional. The input is not modified.
    horizon_days : int, default 7
        Number of calendar days ahead for the target; must be at least 1.
    require_min_days : int, default 2
        Minimum number of distinct calendar days needed to attempt target creation.

    Returns
    -------
    pd.DataFrame
        Rows that have a target, sorted by ``channel_id`` and ``date``, with the
        added columns ``*_delta_1d``, ``*_delta_7d_mean`` (for the count columns
        that exist), ``subscriber_future``, ``subscriber_growth_h`` and
        ``dayofweek``. When fewer than ``require_min_days`` days are present, an
        empty frame carrying only the two target columns is returned.

    Raises
    ------
    ValueError
        If ``horizon_days`` is smaller than 1, or ``subscriber_count`` is missing.
    """
    if horizon_days < 1:
        raise ValueError(f"horizon_days must be >= 1, got {horizon_days}")

    # sort_values returns a new frame, so the caller's data is left untouched.
    df = df_daily.sort_values(["channel_id", "date"]).reset_index(drop=True)

    n_days = df["date"].dt.date.nunique()
    if n_days < require_min_days:
        print(f"⚠️ 누적 일수 부족: {n_days}일 (최소 {require_min_days}일 필요). 결과가 0행일 수 있어요.")
        # Return an empty frame instead of raising so the pipeline keeps running.
        return df.iloc[0:0].assign(subscriber_future=np.nan, subscriber_growth_h=np.nan)

    if "subscriber_count" not in df.columns:
        raise ValueError("subscriber_count 컬럼이 없어 타깃을 만들 수 없습니다.")

    # Change versus the channel's previous row (features).
    # NOTE(review): diff(1) and the rolling window below are row-based; they equal
    # 1-day / 7-day quantities only when a channel has one row per consecutive day.
    for src, delta_col in [
        ("subscriber_count", "subs_delta_1d"),
        ("views_total", "views_delta_1d"),
        ("video_count_total", "video_count_delta_1d"),
    ]:
        if src in df.columns:
            df[delta_col] = df.groupby("channel_id")[src].diff(1)

    # Rolling mean over up to 7 rows; NaN until at least 2 values are available.
    for c, outc in [
        ("subs_delta_1d", "subs_delta_7d_mean"),
        ("views_delta_1d", "views_delta_7d_mean"),
        ("video_count_delta_1d", "video_count_delta_7d_mean"),
    ]:
        if c in df.columns:
            df[outc] = (
                df.groupby("channel_id")[c]
                  .rolling(window=7, min_periods=2)
                  .mean()
                  .reset_index(level=0, drop=True)
            )

    # Target: subscriber_count observed exactly horizon_days calendar days later.
    key = "__target_day__"
    day = df["date"].dt.normalize()
    future = (
        pd.DataFrame(
            {
                "channel_id": df["channel_id"],
                # Re-key each observation by the day it serves as "the future" for.
                key: day - pd.Timedelta(days=horizon_days),
                "subscriber_future": df["subscriber_count"],
            }
        )
        # merge() matches missing keys to each other, so NaT must not reach the lookup.
        .dropna(subset=[key])
        # One value per channel/day; rows are date-sorted, so this keeps the latest one.
        .drop_duplicates(subset=["channel_id", key], keep="last")
    )
    # A stale column from the input would otherwise collide with the merged one.
    df = df.drop(columns=["subscriber_future"], errors="ignore")
    df[key] = day
    # The lookup is unique per key, so a left merge keeps row order and row count.
    df = df.merge(future, on=["channel_id", key], how="left").drop(columns=key)

    df["subscriber_growth_h"] = df["subscriber_future"] - df["subscriber_count"]

    # Calendar feature that is convenient for models.
    df["dayofweek"] = df["date"].dt.dayofweek

    # Rows without a target have no known future value, so they are removed.
    return df.dropna(subset=["subscriber_growth_h"]).reset_index(drop=True)

# Build the training table with a 7-day horizon and preview the first rows.
df_targets = build_channel_growth_targets(df_clean, horizon_days=7)
print("targets shape:", df_targets.shape)
display(df_targets.head(5))


In [None]:
# ✅ 저장: v1/v2/... 자동 버전 업 (덮어쓰기 방지) - 전용 폴더(OUT_DIR) 사용

def get_next_versioned_path(base_dir: Path, base_name: str, ext: str = ".csv") -> Path:
    """Return the next unused ``{base_name}_v{N}{ext}`` path inside ``base_dir``.

    ``base_dir`` is created when missing. Regular files named
    ``{base_name}_v<digits>{ext}`` are scanned and ``N`` is one more than the
    highest version found, or 1 when there is none. The file itself is not created.
    """
    base_dir.mkdir(parents=True, exist_ok=True)
    version_re = re.compile(rf"^{re.escape(base_name)}_v(\d+){re.escape(ext)}$")

    # Only regular files count; a directory with a matching name is ignored.
    hits = (version_re.match(entry.name) for entry in base_dir.iterdir() if entry.is_file())
    latest = max((int(hit.group(1)) for hit in hits if hit), default=0)

    return base_dir / f"{base_name}_v{latest + 1}{ext}"

def safe_save_csv(df: pd.DataFrame, path: Path):
    """Write ``df`` to ``path`` as CSV and print a one-line confirmation.

    Missing parent directories are created. The file is written without the
    index, encoded as ``utf-8-sig``. The confirmation line reports the path,
    the row count, whether the file exists and its size in bytes.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    df.to_csv(path, index=False, encoding="utf-8-sig")

    n_rows = len(df)
    print("✅ saved:", path, "| rows:", n_rows, "| exists:", path.exists(), "| size:", path.stat().st_size)

# Choose the next free version number in OUT_DIR so earlier outputs are never overwritten.
OUT_PATH = get_next_versioned_path(OUT_DIR, "youtube_channel_growth_dataset_kr", ext=".csv")
# NOTE(review): the save is unconditional, so an empty df_targets still creates a new versioned file.
safe_save_csv(df_targets, OUT_PATH)
