In [1]:
import os, re, json
import numpy as np
import pandas as pd
import joblib

from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from datetime import datetime


In [2]:
pd.set_option("display.max_columns", 200)
np.random.seed(42)

def find_project_root() -> Path:
    p = Path.cwd()
    for parent in [p] + list(p.parents):
        if (parent / "data").exists() and (parent / "notebooks").exists():
            return parent
    return p

def safe_read_csv(path: Path | None):
    """파일이 없거나 path가 None이면 None 반환"""
    if path is None:
        return None
    path = Path(path)
    if not path.exists():
        print("⚠️ not found:", path)
        return None
    return pd.read_csv(path, low_memory=False)

def latest_versioned_csv(folder: Path, base_name: str) -> Path | None:
    """
    folder 안에서 base_name_v{n}.csv 중 가장 큰 n 파일 Path 반환
    없으면 None
    """
    pattern = re.compile(rf"^{re.escape(base_name)}_v(\d+)\.csv$")
    best_v, best_path = None, None

    for f in folder.glob(f"{base_name}_v*.csv"):
        m = pattern.match(f.name)
        if m:
            v = int(m.group(1))
            if best_v is None or v > best_v:
                best_v, best_path = v, f

    return best_path

def next_versioned_file(folder: Path, base_name: str, ext: str = ".csv") -> Path:
    """
    folder 안에서 base_name_v{n}{ext} 다음 버전 경로 반환 (파일 저장용)
    """
    folder.mkdir(parents=True, exist_ok=True)
    pattern = re.compile(rf"^{re.escape(base_name)}_v(\d+){re.escape(ext)}$")

    versions = []
    for f in folder.glob(f"{base_name}_v*{ext}"):
        m = pattern.match(f.name)
        if m:
            versions.append(int(m.group(1)))

    v = (max(versions) + 1) if versions else 1
    return folder / f"{base_name}_v{v}{ext}"



PROJECT_ROOT = find_project_root()
print("CWD:", os.getcwd())
print("PROJECT_ROOT:", PROJECT_ROOT)

# ✅ 모델 폴더
MODEL_DIR = PROJECT_ROOT / "models"
MODEL_DIR.mkdir(parents=True, exist_ok=True)
print("MODEL_DIR:", MODEL_DIR)

# ✅ 13번은 API 누적 데이터만 사용 (사용자 조건 반영)
ACCUM_DIR = PROJECT_ROOT / "data" / "processed" / "01_daily_accumulated"
FEAT_DIR  = PROJECT_ROOT / "data" / "processed" / "02_comment_features"

PATH_TRENDING_DAILY = ACCUM_DIR / "youtube_trending_videos_daily_kr_v1.csv"
PATH_CHANNEL_DAILY  = ACCUM_DIR / "youtube_channels_daily_stats_kr_v1.csv"
PATH_COMMENTS_RAW   = ACCUM_DIR / "youtube_comments_raw_kr_v1.csv"

# ✅ 댓글 기반 비디오 레벨 feature (04번 산출물)
PATH_COMMENT_VIDEO_FEATS = FEAT_DIR / "youtube_comment_features_video_level_kr_v1.csv"

# ✅ 기존 코드 호환(중요): build_trending_duration_dataset()에서 쓰는 이름을 맞춰줌
PATH_VCF_BY_VIDEO = PATH_COMMENT_VIDEO_FEATS
PATH_VCF = None  # 두 번째 merge는 스킵되도록

print("PATH_TRENDING_DAILY:", PATH_TRENDING_DAILY)
print("PATH_CHANNEL_DAILY :", PATH_CHANNEL_DAILY)
print("PATH_COMMENTS_RAW  :", PATH_COMMENTS_RAW)
print("PATH_COMMENT_VIDEO_FEATS:", PATH_COMMENT_VIDEO_FEATS)

# 존재 체크
for p in [PATH_TRENDING_DAILY, PATH_CHANNEL_DAILY, PATH_COMMENTS_RAW]:
    if not p.exists():
        print("⚠️ not found:", p)


CWD: c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\notebooks
PROJECT_ROOT: c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml
MODEL_DIR: c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\models
PATH_TRENDING_DAILY: c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\data\processed\01_daily_accumulated\youtube_trending_videos_daily_kr_v1.csv
PATH_CHANNEL_DAILY : c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\data\processed\01_daily_accumulated\youtube_channels_daily_stats_kr_v1.csv
PATH_COMMENTS_RAW  : c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\data\processed\01_daily_accumulated\youtube_comments_raw_kr_v1.csv
PATH_COMMENT_VIDEO_FEATS: c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\data\processed\02_comment_features\youtube_comment_features_video

## 1) 영상 트렌딩 유지기간 예측 (video_id 단위 회귀)

In [3]:
def build_trending_duration_dataset():
    trending = pd.read_csv(PATH_TRENDING_DAILY)

    # ✅ merge 안정성: video_id 타입 통일
    trending["video_id"] = trending["video_id"].astype(str)

    # ✅ 호환: 예전 누적 파일이 collected_date를 썼다면 date로 변환
    if "date" not in trending.columns and "collected_date" in trending.columns:
        trending = trending.rename(columns={"collected_date": "date"})

    # 날짜 tz 통일(UTC로 파싱 후 tz 제거 -> naive)
    trending["date"] = pd.to_datetime(trending["date"], errors="coerce", utc=True).dt.tz_localize(None)
    if "publish_date" in trending.columns:
        trending["publish_date"] = pd.to_datetime(trending["publish_date"], errors="coerce", utc=True).dt.tz_localize(None)

    # target: video_id별 트렌딩 유지기간(일)
    y = (trending.groupby("video_id")["date"]
                .nunique()
                .rename("trending_duration_days")
                .reset_index())

    # first day snapshot
    first_day = (trending.sort_values(["video_id", "date"])
                         .groupby("video_id", as_index=False)
                         .first())

    cols = ["video_id"]
    for c in ["channel_id", "region", "category_id", "publish_date", "date", "views", "likes", "comments"]:
        if c in first_day.columns:
            cols.append(c)

    first_day = first_day[cols].rename(columns={
        "views": "views_day1",
        "likes": "likes_day1",
        "comments": "comments_day1",
        "date": "first_trending_date"
    })

    # aggregates
    agg = (trending.groupby("video_id")
                  .agg(
                      views_mean=("views", "mean"),
                      views_max=("views", "max"),
                      likes_mean=("likes", "mean"),
                      likes_max=("likes", "max"),
                      comments_mean=("comments", "mean"),
                      comments_max=("comments", "max"),
                      trending_days=("date", "nunique"),
                  )
                  .reset_index())

    df = y.merge(first_day, on="video_id", how="left").merge(agg, on="video_id", how="left")

    # merge 이후 dtype 꼬임 방지: 다시 datetime 강제
    if "publish_date" in df.columns:
        df["publish_date"] = pd.to_datetime(df["publish_date"], errors="coerce", utc=True).dt.tz_localize(None)
    if "first_trending_date" in df.columns:
        df["first_trending_date"] = pd.to_datetime(df["first_trending_date"], errors="coerce", utc=True).dt.tz_localize(None)

    # days_to_first_trending
    df["days_to_first_trending"] = (
        (df["first_trending_date"] - df["publish_date"]).dt.total_seconds() / 86400.0
        if ("publish_date" in df.columns and "first_trending_date" in df.columns)
        else np.nan
    )

    # merge comment features (video-level)
    vcfv = safe_read_csv(PATH_VCF_BY_VIDEO)
    if vcfv is not None and "video_id" in vcfv.columns:
        vcfv = vcfv.copy()
        vcfv["video_id"] = vcfv["video_id"].astype(str)
        vcfv = vcfv[~vcfv["video_id"].str.contains("#NAME", na=False)]
        vcfv = vcfv.drop_duplicates("video_id", keep="first")
        df = df.merge(vcfv.drop(columns=["category_name"], errors="ignore"), on="video_id", how="left")

    vcf = safe_read_csv(PATH_VCF)
    if vcf is not None and "video_id" in vcf.columns:
        vcf = vcf.copy()
        vcf["video_id"] = vcf["video_id"].astype(str)
        vcf = vcf[~vcf["video_id"].str.contains("#NAME", na=False)]
        vcf = vcf.drop_duplicates("video_id", keep="first")
        df = df.merge(vcf, on="video_id", how="left", suffixes=("", "_vcf"))

    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    # leakage 제거
    df.drop(columns=["trending_days"], errors="ignore", inplace=True)

    # ✅ raw/ID/원문 컬럼 제거(혹시 섞여 들어온 경우)
    DROP_COLS = ["comment_id", "comment_publishedAt", "text", "run_id", "category_name", "country", "likeCount"]
    df.drop(columns=[c for c in DROP_COLS if c in df.columns], inplace=True, errors="ignore")

    return df


df_video = build_trending_duration_dataset()
print("video dataset:", df_video.shape)
display(df_video.head(3))


video dataset: (51, 26)


Unnamed: 0,video_id,trending_duration_days,channel_id,region,category_id,publish_date,first_trending_date,views_day1,likes_day1,comments_day1,views_mean,views_max,likes_mean,likes_max,comments_mean,comments_max,days_to_first_trending,comment_count,unique_authors,mean_like_count,mean_text_len,url_ratio,hashtag_ratio,mention_ratio,korean_comment_ratio,mean_hangul_ratio
0,-WGFbInX6JI,1,UC1q4Ihlv_YhLELw-ijE0Diw,KR,20,2026-01-30 12:30:00,2026-01-31,89876,2820,379,89876.0,89876,2820.0,2820,379.0,379,0.479167,181.0,169.0,3.944751,36.679558,0.0,0.0,0.0,1.0,0.730608
1,0HXwT4gefnQ,1,UCpqyr6h4RCXCEswHlkSjykA,KR,20,2026-01-30 09:00:02,2026-01-31,296006,16525,1296,296006.0,296006,16525.0,16525,1296.0,1296,0.624977,52.0,52.0,0.096154,25.230769,0.0,0.0,0.0,1.0,0.829411
2,0ZrO8_AKVcc,1,UC28EaHd6V5EFqgaKZG33pIQ,KR,10,2026-01-27 09:00:01,2026-01-31,106814,1571,62,106814.0,106814,1571.0,1571,62.0,62,3.624988,,,,,,,,,


In [4]:
def train_trending_duration_model(df, test_size=0.2, random_state=42, prefix="trending_duration"):
    import os

    target_col = "trending_duration_days"
    if target_col not in df.columns:
        raise ValueError(f"'{target_col}' 컬럼이 df에 없습니다.")

    print("DEBUG MODEL_DIR =", MODEL_DIR)

    y = pd.to_numeric(df[target_col], errors="coerce")
    X = df.drop(columns=["video_id", target_col], errors="ignore").copy()
    X = X.replace([np.inf, -np.inf], np.nan)

    # 타깃 NaN 제거
    valid = y.notna()
    X = X.loc[valid].reset_index(drop=True)
    y = y.loc[valid].reset_index(drop=True)

    # 1) datetime 컬럼 -> 숫자화(ts/dow)
    datetime_cols = list(X.select_dtypes(include=["datetime64[ns]", "datetime64[ns, UTC]"]).columns)
    for c in ["publish_date", "first_trending_date"]:
        if c in X.columns and c not in datetime_cols:
            X[c] = pd.to_datetime(X[c], errors="coerce", utc=True).dt.tz_localize(None)
            datetime_cols.append(c)

    datetime_cols = list(dict.fromkeys(datetime_cols))
    for c in datetime_cols:
        dt = pd.to_datetime(X[c], errors="coerce")
        ts = dt.astype("int64")
        ts = pd.Series(ts, index=X.index).where(dt.notna(), np.nan) / 1e9
        X[c + "_ts"] = ts
        X[c + "_dow"] = dt.dt.dayofweek

    if datetime_cols:
        X = X.drop(columns=datetime_cols, errors="ignore")

    # 2) object지만 숫자열이면 numeric 변환
    for c in X.columns:
        if X[c].dtype == "object":
            tmp = pd.to_numeric(X[c], errors="coerce")
            if tmp.notna().mean() >= 0.9:
                X[c] = tmp

    # 3) 전부 NaN 컬럼 제거
    all_nan_cols = X.columns[X.isna().all()].tolist()
    if all_nan_cols:
        print("Drop all-NaN cols:", all_nan_cols[:20], "..." if len(all_nan_cols) > 20 else "")
        X = X.drop(columns=all_nan_cols)

    # 4) 상수 컬럼 제거
    nunique = X.nunique(dropna=True)
    const_cols = nunique[nunique <= 1].index.tolist()
    if const_cols:
        print("Drop constant cols:", const_cols[:20], "..." if len(const_cols) > 20 else "")
        X = X.drop(columns=const_cols)

    # 5) cat/num 분리
    cat_cols = [c for c in X.columns if X[c].dtype == "object"]
    num_cols = [c for c in X.columns if c not in cat_cols]

    # 6) 전처리 파이프라인
    num_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])
    cat_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ])

    pre = ColumnTransformer(
        transformers=[
            ("num", num_pipe, num_cols),
            ("cat", cat_pipe, cat_cols),
        ],
        remainder="drop"
    )

    model = RandomForestRegressor(
        n_estimators=600,
        random_state=random_state,
        n_jobs=-1,
        min_samples_leaf=2
    )

    pipe = Pipeline([("preprocess", pre), ("model", model)])

    # 7) split & fit
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    print("DEBUG: fit start | X_train:", X_train.shape, "X_test:", X_test.shape)
    pipe.fit(X_train, y_train)
    print("DEBUG: fit done")

    pred = pipe.predict(X_test)

    mae = mean_absolute_error(y_test, pred)
    rmse = mean_squared_error(y_test, pred) ** 0.5
    r2 = r2_score(y_test, pred)

    print(f"[Trending Duration] MAE: {mae:.4f}  RMSE: {rmse:.4f}  R2: {r2:.4f}")
    print("Features used:", X.shape[1])

    # ✅ v1/v2 자동 저장 + 저장 확인
    model_path = next_versioned_file(MODEL_DIR, f"{prefix}_model", ext=".joblib")
    cols_path  = next_versioned_file(MODEL_DIR, f"{prefix}_feature_columns", ext=".joblib")


    print("DEBUG: about to save")
    print("DEBUG model_path =", model_path)
    print("DEBUG cols_path  =", cols_path)

    joblib.dump(pipe, model_path)
    joblib.dump(list(X.columns), cols_path)

    assert model_path.exists(), f"모델 저장 실패: {model_path}"
    assert cols_path.exists(),  f"컬럼 저장 실패: {cols_path}"



    print("saved:")
    print(" -", model_path)
    print(" -", cols_path)

    return pipe, list(X.columns), model_path, cols_path


In [5]:
print("RUN TRAIN START")
video_model, video_feature_cols, video_model_path, video_cols_path = train_trending_duration_model(df_video)
print("RUN TRAIN DONE")
print("RETURNED:", video_model_path, video_cols_path)


RUN TRAIN START
DEBUG MODEL_DIR = c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\models
Drop constant cols: ['region', 'korean_comment_ratio', 'first_trending_date_ts', 'first_trending_date_dow'] 
DEBUG: fit start | X_train: (40, 22) X_test: (11, 22)
DEBUG: fit done
[Trending Duration] MAE: 0.0000  RMSE: 0.0000  R2: 1.0000
Features used: 22
DEBUG: about to save
DEBUG model_path = c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\models\trending_duration_model_v1.joblib
DEBUG cols_path  = c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\models\trending_duration_feature_columns_v1.joblib
saved:
 - c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\models\trending_duration_model_v1.joblib
 - c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\models\trending_duration_feature_columns_v1.joblib
RUN TRAIN DONE
RETURNED: c:\Users

In [6]:
def _latest_versioned_file(base_dir: str, base_name: str, ext: str = ".joblib") -> str:
    """base_name_vN.joblib 중 가장 큰 N을 반환"""
    pattern = re.compile(rf"{re.escape(base_name)}_v(\d+){re.escape(ext)}$")
    best_v = None
    best_path = None
    for f in os.listdir(base_dir):
        m = pattern.match(f)
        if m:
            v = int(m.group(1))
            if best_v is None or v > best_v:
                best_v = v
                best_path = os.path.join(base_dir, f)
    if best_path is None:
        raise FileNotFoundError(f"{base_dir} 에 '{base_name}_v*.joblib' 파일이 없습니다.")
    return best_path


def _load_latest_model_and_cols(model_base_name: str, cols_base_name: str):
    model_path = _latest_versioned_file(MODEL_DIR, model_base_name)
    cols_path  = _latest_versioned_file(MODEL_DIR, cols_base_name)
    pipe = joblib.load(model_path)
    feature_cols = joblib.load(cols_path)
    return pipe, feature_cols, model_path, cols_path


def predict_trending_duration(input_dict: dict) -> float:
    pipe, feature_cols, model_path, cols_path = _load_latest_model_and_cols(
        "trending_duration_model",
        "trending_duration_feature_columns",
    )

    X_new = pd.DataFrame([input_dict]).copy()

    # (선택) 날짜 입력을 줬다면 학습 때와 동일하게 파생 생성
    for c in ["publish_date", "first_trending_date"]:
        if c in X_new.columns:
            dt = pd.to_datetime(X_new[c], errors="coerce", utc=True).dt.tz_localize(None)
            X_new[c + "_ts"] = dt.astype("int64") / 1e9
            X_new[c + "_dow"] = dt.dt.dayofweek

    # 학습 피처 컬럼에 맞추기(없는 컬럼은 NaN)
    for c in feature_cols:
        if c not in X_new.columns:
            X_new[c] = np.nan
    X_new = X_new[feature_cols]

    pred = pipe.predict(X_new)[0]
    return float(pred)


def trending_duration_input_template():
    _, cols, _, _ = _load_latest_model_and_cols(
        "trending_duration_model",
        "trending_duration_feature_columns",
    )
    return {c: None for c in cols}


tpl = trending_duration_input_template()
print("template columns:", len(tpl))
print("loaded from:", _latest_versioned_file(MODEL_DIR, "trending_duration_model"))
list(tpl.keys())[:25]

template columns: 22
loaded from: c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\models\trending_duration_model_v1.joblib


['channel_id',
 'category_id',
 'views_day1',
 'likes_day1',
 'comments_day1',
 'views_mean',
 'views_max',
 'likes_mean',
 'likes_max',
 'comments_mean',
 'comments_max',
 'days_to_first_trending',
 'comment_count',
 'unique_authors',
 'mean_like_count',
 'mean_text_len',
 'url_ratio',
 'hashtag_ratio',
 'mention_ratio',
 'mean_hangul_ratio',
 'publish_date_ts',
 'publish_date_dow']

In [7]:

example_video_input = {
    "region": "KR",
    "category_id": 24,
    "views_day1": 120000,
    "likes_day1": 8000,
    "comments_day1": 1500,
    "views_mean": 180000,
    "views_max": 350000,
    "likes_mean": 12000,
    "likes_max": 22000,
    "comments_mean": 2000,
    "comments_max": 4200,
    "days_to_first_trending": 2.0,
}
print("pred trending duration(days):", predict_trending_duration(example_video_input))


pred trending duration(days): 1.0


## 2) 채널 성장 예측 (channel_id×date 단위 회귀)
타깃: subscriber_growth_h = (h일 뒤 구독자수 - 오늘 구독자수). 기본 h=7

In [8]:
def build_channel_growth_dataset(horizon_days=7):
    ch = pd.read_csv(PATH_CHANNEL_DAILY)

    # 날짜 파싱
    ch["date"] = pd.to_datetime(ch.get("date"), errors="coerce", utc=True).dt.tz_localize(None)
    ch = ch.sort_values(["channel_id", "date"]).reset_index(drop=True)

    # 숫자형 강제
    for c in ["subscriber_count", "views_total", "video_count_total"]:
        if c in ch.columns:
            ch[c] = pd.to_numeric(ch[c], errors="coerce")

    # 타깃: h일 뒤 구독자 증가량
    ch["subscriber_future"] = ch.groupby("channel_id")["subscriber_count"].shift(-horizon_days)
    ch["subscriber_growth_h"] = ch["subscriber_future"] - ch["subscriber_count"]

    # 변화량 피처
    ch["subs_delta_1d"] = ch.groupby("channel_id")["subscriber_count"].diff(1)
    ch["subs_delta_1d"] = ch["subs_delta_1d"].fillna(0)

    if "views_total" in ch.columns:
        ch["views_delta_1d"] = ch.groupby("channel_id")["views_total"].diff(1).fillna(0)

    if "video_count_total" in ch.columns:
        ch["video_count_delta_1d"] = ch.groupby("channel_id")["video_count_total"].diff(1).fillna(0)

    # 7일 평균 추세(초기 NaN 최소화)
    ch["subs_delta_7d_mean"] = (
        ch.groupby("channel_id")["subs_delta_1d"]
          .rolling(7, min_periods=1)
          .mean()
          .reset_index(level=0, drop=True)
    )
    if "views_delta_1d" in ch.columns:
        ch["views_delta_7d_mean"] = (
            ch.groupby("channel_id")["views_delta_1d"]
              .rolling(7, min_periods=1)
              .mean()
              .reset_index(level=0, drop=True)
        )

    # 트렌딩(채널-일자 집계) merge
    tr = pd.read_csv(PATH_TRENDING_DAILY)
    tr["date"] = pd.to_datetime(tr.get("date"), errors="coerce", utc=True).dt.tz_localize(None)
    if "channel_id" in tr.columns:
        tr["channel_id"] = tr["channel_id"].astype(str)
    if "video_id" in tr.columns:
        tr["video_id"] = tr["video_id"].astype(str)

    # 기본 집계
    agg_dict = {"video_id": "nunique"}
    for c in ["views", "likes", "comments"]:
        if c in tr.columns:
            tr[c] = pd.to_numeric(tr[c], errors="coerce")
            agg_dict[c] = "sum"

    tr_agg = (tr.groupby(["channel_id", "date"], as_index=False)
                .agg(agg_dict)
                .rename(columns={
                    "video_id": "trending_video_cnt",
                    "views": "trending_views_sum",
                    "likes": "trending_likes_sum",
                    "comments": "trending_comments_sum",
                }))

    df = ch.merge(tr_agg, on=["channel_id", "date"], how="left")

    # 트렌딩 관련 결측은 0 처리 (그 날 트렌딩 영향 없음)
    for c in df.columns:
        if c.startswith("trending_"):
            df[c] = df[c].fillna(0)

    # inf 처리
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    # 타깃 NaN (끝부분 h일치) 제거
    df = df[df["subscriber_growth_h"].notna()].reset_index(drop=True)

    return df


df_channel = build_channel_growth_dataset(horizon_days=7)
print("channel dataset:", df_channel.shape)
display(df_channel.head(3))

channel dataset: (0, 20)


Unnamed: 0,date,run_ts_utc,channel_id,channel_name,created_date,subscriber_count,views_total,video_count_total,country,subscriber_future,subscriber_growth_h,subs_delta_1d,views_delta_1d,video_count_delta_1d,subs_delta_7d_mean,views_delta_7d_mean,trending_video_cnt,trending_views_sum,trending_likes_sum,trending_comments_sum


In [9]:
def train_channel_growth_model(df, test_size=0.2, random_state=42, prefix="channel_growth"):
    target_col = "subscriber_growth_h"
    if target_col not in df.columns:
        raise ValueError(f"'{target_col}' 컬럼이 df에 없습니다.")

    y = pd.to_numeric(df[target_col], errors="coerce")

    X = df.drop(columns=[target_col, "subscriber_future"], errors="ignore").copy()

    # ID 제거(식별자 과적합 방지)
    X = X.drop(columns=["channel_id"], errors="ignore")

    # date -> dayofweek
    if "date" in X.columns:
        X["date"] = pd.to_datetime(X["date"], errors="coerce", utc=True).dt.tz_localize(None)
        X["dayofweek"] = X["date"].dt.dayofweek
        X = X.drop(columns=["date"], errors="ignore")

    # inf 제거
    X = X.replace([np.inf, -np.inf], np.nan)

    # 타깃 NaN 제거
    valid = y.notna()
    X = X.loc[valid].reset_index(drop=True)
    y = y.loc[valid].reset_index(drop=True)

    # object인데 숫자열이면 numeric으로 바꾸기(보수적)
    for c in X.columns:
        if X[c].dtype == "object":
            tmp = pd.to_numeric(X[c], errors="coerce")
            if tmp.notna().mean() >= 0.9:
                X[c] = tmp

    # 전부 NaN 컬럼 제거
    all_nan_cols = X.columns[X.isna().all()].tolist()
    if all_nan_cols:
        print("Drop all-NaN cols:", all_nan_cols[:20], "..." if len(all_nan_cols) > 20 else "")
        X = X.drop(columns=all_nan_cols)

    # 상수 컬럼 제거
    nunique = X.nunique(dropna=True)
    const_cols = nunique[nunique <= 1].index.tolist()
    if const_cols:
        X = X.drop(columns=const_cols)

    cat_cols = [c for c in X.columns if X[c].dtype == "object"]
    num_cols = [c for c in X.columns if c not in cat_cols]

    num_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])
    cat_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ])

    pre = ColumnTransformer([
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols),
    ])

    model = RandomForestRegressor(
        n_estimators=900, random_state=random_state, n_jobs=-1, min_samples_leaf=2
    )

    pipe = Pipeline([("preprocess", pre), ("model", model)])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_test)

    mae = mean_absolute_error(y_test, pred)
    rmse = mean_squared_error(y_test, pred) ** 0.5
    r2 = r2_score(y_test, pred)

    print(f"[Channel Growth] MAE: {mae:.4f}  RMSE: {rmse:.4f}  R2: {r2:.4f}")
    print("Features used:", X.shape[1])

    # ✅ v1/v2 자동 저장 (프로젝트 루트/models)
    model_path = next_versioned_file(MODEL_DIR, f"{prefix}_model", ext=".joblib")
    cols_path  = next_versioned_file(MODEL_DIR, f"{prefix}_feature_columns", ext=".joblib")

    joblib.dump(pipe, model_path)
    joblib.dump(list(X.columns), cols_path)

    print("saved:")
    print(" -", model_path)
    print(" -", cols_path)

    return pipe, list(X.columns), model_path, cols_path


channel_model, channel_feature_cols, channel_model_path, channel_cols_path = train_channel_growth_model(df_channel)
len(channel_feature_cols), channel_feature_cols[:25], channel_model_path, channel_cols_path

Drop all-NaN cols: ['run_ts_utc', 'channel_name', 'created_date', 'subscriber_count', 'views_total', 'video_count_total', 'country', 'subs_delta_1d', 'views_delta_1d', 'video_count_delta_1d', 'subs_delta_7d_mean', 'views_delta_7d_mean', 'trending_video_cnt', 'trending_views_sum', 'trending_likes_sum', 'trending_comments_sum', 'dayofweek'] 


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [None]:
def predict_channel_growth(input_dict: dict) -> float:
    pipe, feature_cols, model_path, cols_path = _load_latest_model_and_cols(
        "channel_growth_model",
        "channel_growth_feature_columns",
    )

    X_new = pd.DataFrame([input_dict]).copy()

    # date 지원(주면 dayofweek로 변환)
    if "date" in X_new.columns:
        dt = pd.to_datetime(X_new["date"], errors="coerce", utc=True).dt.tz_localize(None)
        X_new["dayofweek"] = dt.dt.dayofweek
        X_new = X_new.drop(columns=["date"], errors="ignore")

    for c in feature_cols:
        if c not in X_new.columns:
            X_new[c] = np.nan
    X_new = X_new[feature_cols]

    pred = pipe.predict(X_new)[0]
    return float(pred)


def channel_growth_input_template():
    _, cols, _, _ = _load_latest_model_and_cols(
        "channel_growth_model",
        "channel_growth_feature_columns",
    )
    return {c: None for c in cols}

tpl = channel_growth_input_template()
print("template columns:", len(tpl))
print("loaded from:", _latest_versioned_file(MODEL_DIR, "channel_growth_model"))
list(tpl.keys())[:25]

In [None]:

example_channel_input = {
    "subscriber_count": 1500000,
    "views_total": 450000000,
    "video_count_total": 520,
    "subs_delta_1d": 1200,
    "views_delta_1d": 800000,
    "video_count_delta_1d": 0,
    "subs_delta_7d_mean": 950,
    "views_delta_7d_mean": 700000,
    "trending_video_cnt": 1,
    "trending_views_sum": 2000000,
    "trending_likes_sum": 120000,
    "trending_comments_sum": 9000,
    "dayofweek": 3,
}
print("pred subscriber growth in 7 days:", predict_channel_growth(example_channel_input))


In [None]:
# ==========================================
# ✅ 예측 실행 메타 로그 저장 (버전/입력파일 추적)
# ==========================================

PROJECT_ROOT = Path.cwd()
for parent in [PROJECT_ROOT] + list(PROJECT_ROOT.parents):
    if (parent / "data").exists() and (parent / "notebooks").exists():
        PROJECT_ROOT = parent
        break

REPORTS_DIR = PROJECT_ROOT / "reports"
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

meta = {
    "timestamp_local": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "inputs": {
        "trending_daily_path": str(PATH_TRENDING_DAILY) if "PATH_TRENDING_DAILY" in globals() else None,
        "channel_daily_path": str(PATH_CHANNEL_DAILY) if "PATH_CHANNEL_DAILY" in globals() else None,
        "comments_features_path": str(PATH_COMMENT_VIDEO_FEAT) if "PATH_COMMENT_VIDEO_FEAT" in globals() else None,
        "channel_clean_path": str(PATH_CHANNEL_CLEAN) if "PATH_CHANNEL_CLEAN" in globals() else None,
    },
    "outputs": {
        # 네 13에서 최종 저장하는 파일이 있으면 여기에 추가
    }
}

out_path = REPORTS_DIR / "predict_run_meta_latest.json"
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)

print("✅ saved:", out_path)
print(json.dumps(meta, ensure_ascii=False, indent=2))
