In [None]:
import re
import os
import sys
import warnings
import pandas as pd
import numpy as np

from pathlib import Path
from typing import Optional
from typing import Tuple
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)

warnings.filterwarnings("ignore")
RANDOM_STATE = 42


In [None]:
# ----------------------------
# 데이터 파일 경로 (프로젝트 상대경로)
# ----------------------------

def find_project_root() -> Path:
    """Locate the project root by walking upward from the current working directory.

    A directory qualifies as the root when it contains both a ``data`` and a
    ``notebooks`` entry. The working directory itself is checked first, then
    each ancestor from nearest to farthest.

    Returns:
        The first qualifying directory, or the current working directory when
        no directory on the path qualifies.
    """
    start = Path.cwd()
    candidates = (start, *start.parents)

    # next() returns the nearest match; its default provides the cwd fallback.
    return next(
        (
            directory
            for directory in candidates
            if (directory / "data").exists() and (directory / "notebooks").exists()
        ),
        start,
    )

def latest_versioned_csv(folder: Path, base_name: str) -> Optional[Path]:
    """Return the highest-versioned ``<base_name>_v<N>.csv`` file inside ``folder``.

    Args:
        folder: Directory to scan (non-recursive).
        base_name: File-name stem that precedes the ``_v<N>`` suffix.

    Returns:
        Path of the file with the largest integer ``N``, or ``None`` when no
        file name matches the pattern.
    """
    version_re = re.compile(rf"^{re.escape(base_name)}_v(\d+)\.csv$")

    # The glob is only a coarse pre-filter; the regex enforces a purely numeric version.
    versioned = []
    for candidate in folder.glob(f"{base_name}_v*.csv"):
        hit = version_re.match(candidate.name)
        if hit is not None:
            versioned.append((int(hit.group(1)), candidate))

    if not versioned:
        return None

    # Compare on the version number only; max() keeps the first entry among ties.
    return max(versioned, key=lambda pair: pair[0])[1]

def next_versioned_file(folder: Path, base_name: str, ext: str = ".csv") -> Path:
    """Build the path for the next unused ``<base_name>_v<N><ext>`` file in ``folder``.

    The folder is created (including parents) when it does not exist yet. The
    returned path itself is not created.

    Args:
        folder: Directory that holds the versioned files.
        base_name: File-name stem that precedes the ``_v<N>`` suffix.
        ext: File extension including the leading dot.

    Returns:
        ``folder / "<base_name>_v<N><ext>"`` where ``N`` is one more than the
        highest existing version, or ``1`` when no versioned file exists.
    """
    folder.mkdir(parents=True, exist_ok=True)

    version_re = re.compile(rf"^{re.escape(base_name)}_v(\d+){re.escape(ext)}$")
    hits = (version_re.match(entry.name) for entry in folder.glob(f"{base_name}_v*{ext}"))

    # default=0 makes the first generated version number 1.
    highest = max((int(hit.group(1)) for hit in hits if hit), default=0)

    return folder / f"{base_name}_v{highest + 1}{ext}"

# Resolve the processed-data folder relative to the detected project root.
PROJECT_ROOT = find_project_root()
CLEAN_DIR = PROJECT_ROOT.joinpath("data", "processed")

# Use the newest versioned export; when none is found, point at v1 so the
# existence check below reports a concrete path.
csv_path = latest_versioned_csv(CLEAN_DIR, "trending_videos_clean")
csv_path = CLEAN_DIR / "trending_videos_clean_v1.csv" if csv_path is None else csv_path

print("PROJECT_ROOT:", PROJECT_ROOT)
print("csv_path:", csv_path)

# Stop with an explicit message before pandas attempts to read a missing file.
if not csv_path.exists():
    raise FileNotFoundError(f"트렌딩 clean 파일이 없습니다: {csv_path}")

df = pd.read_csv(csv_path, low_memory=False)

# Quick look at what was loaded: dimensions, column names, first rows.
print("데이터 로드 완료, shape:", df.shape)
print("컬럼 목록:", df.columns.tolist())
display(df.head(3))


In [None]:
# =====================================================
# 공통: 독립변수(feature) 구성
# =====================================================

# Candidate predictor columns shared by every model in this notebook.
base_feature_cols = [
    "view_count",
    "likes",
    "comment_count",
    "categoryId",
    "publish_dayofweek",
    "tags_count",
]

# Keep only the candidates that exist in the loaded frame, preserving the order above.
feature_cols = [c for c in base_feature_cols if c in df.columns]

# Report dropped candidates so the models are not silently trained on fewer features.
missing_feature_cols = [c for c in base_feature_cols if c not in df.columns]
if missing_feature_cols:
    print("\n누락된 독립변수 (missing_feature_cols):", missing_feature_cols)

# With no usable feature every later fit() would fail; stop here with a clear message.
if not feature_cols:
    raise ValueError("사용 가능한 독립변수가 없습니다. 데이터 컬럼을 확인해주세요.")

print("\n사용 독립변수 (feature_cols):", feature_cols)


In [None]:
# -----------------------------------------------------
# 함수: 회귀 평가 출력
# -----------------------------------------------------

def eval_regression(name, y_true, y_pred):
    """Print and return RMSE, MAE and R² for one set of regression predictions.

    Args:
        name: Label printed in square brackets above the metrics.
        y_true: Observed target values.
        y_pred: Predicted target values.

    Returns:
        Tuple ``(rmse, mae, r2)``.
    """
    # RMSE is the square root of the mean squared error.
    scores = (
        mean_squared_error(y_true, y_pred) ** 0.5,
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )

    print(f"\n[{name}]")
    for label, value in zip(("RMSE:", "MAE :", "R²  :"), scores):
        print(label, value)

    return scores


In [None]:
# =====================================================
# 영상 분석 모델링
# =====================================================

# Target: trending_days (regression).
if "trending_days" not in df.columns:
    raise ValueError("trending_days 컬럼이 없습니다. v2 파일을 확인해주세요.")

# Rows without a target value cannot be used for supervised training.
df_trend = df.dropna(subset=["trending_days"]).copy()

# Missing feature values are replaced with 0.
X1 = df_trend[feature_cols].fillna(0)
# The dropna above already removed every missing target, so no fillna is needed here.
y1 = df_trend["trending_days"]

# 80/20 hold-out split, seeded with the notebook-wide RANDOM_STATE constant.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, test_size=0.2, random_state=RANDOM_STATE
)

print("\n[1번] trending_days 모델링용 데이터 분할 완료")
print("Train:", X1_train.shape, "/ Test:", X1_test.shape)


In [None]:
# =====================================================
# Decision Tree Regressor
# =====================================================

# Single decision tree for trending_days; depth and leaf-size limits restrict tree growth.
tree = DecisionTreeRegressor(
    max_depth=8,
    min_samples_leaf=50,
    random_state=RANDOM_STATE,  # notebook-wide seed instead of a repeated literal
)

tree.fit(X1_train, y1_train)
y1_pred_tree = tree.predict(X1_test)
eval_regression("Decision Tree Regressor (trending_days)", y1_test, y1_pred_tree)

# Pair each feature with the fitted model's feature_importances_, largest first.
tree_importance = pd.DataFrame(
    {"feature": X1.columns, "importance": tree.feature_importances_}
).sort_values(by="importance", ascending=False)

print("\n[Decision Tree - 변수 중요도]")
print(tree_importance)


In [None]:
# =====================================================
# Random Forest Regressor
# =====================================================

# Random forest for trending_days, trained on all available cores (n_jobs=-1).
rf1 = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=50,
    random_state=RANDOM_STATE,  # notebook-wide seed instead of a repeated literal
    n_jobs=-1,
)

rf1.fit(X1_train, y1_train)
y1_pred_rf = rf1.predict(X1_test)
eval_regression("Random Forest Regressor (trending_days)", y1_test, y1_pred_rf)

# Pair each feature with the fitted model's feature_importances_, largest first.
rf1_importance = pd.DataFrame(
    {"feature": X1.columns, "importance": rf1.feature_importances_}
).sort_values(by="importance", ascending=False)

print("\n[Random Forest - 변수 중요도]")
print(rf1_importance)


In [None]:
# =====================================================
# XGBoost Regressor
# =====================================================

# Gradient-boosted trees for trending_days, using row (subsample) and column
# (colsample_bytree) sampling and the histogram tree method.
xgb1 = XGBRegressor(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,  # notebook-wide seed instead of a repeated literal
    tree_method="hist",
)

xgb1.fit(X1_train, y1_train)
y1_pred_xgb = xgb1.predict(X1_test)
eval_regression("XGBoost Regressor (trending_days)", y1_test, y1_pred_xgb)

# Pair each feature with the fitted model's feature_importances_, largest first.
xgb1_importance = pd.DataFrame(
    {"feature": X1.columns, "importance": xgb1.feature_importances_}
).sort_values(by="importance", ascending=False)

print("\n[XGBoost - 변수 중요도]")
print(xgb1_importance)


In [None]:
# =====================================================
# [Table 1] trending_days 회귀 모델 성능 비교
# =====================================================

def regression_metrics(y_true, y_pred):
    """Compute MAE, RMSE and R² for one set of regression predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values.

    Returns:
        Tuple ``(mae, rmse, r2)`` — MAE comes first.
    """
    return (
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mean_squared_error(y_true, y_pred)),
        r2_score(y_true, y_pred),
    )

# Score each fitted model on the shared trending_days test split.
mae_tree, rmse_tree, r2_tree = regression_metrics(y1_test, y1_pred_tree)
mae_rf, rmse_rf, r2_rf = regression_metrics(y1_test, y1_pred_rf)
mae_xgb, rmse_xgb, r2_xgb = regression_metrics(y1_test, y1_pred_xgb)

# One row per model, one column per metric.
table1_results = pd.DataFrame(
    [
        ("Decision Tree", mae_tree, rmse_tree, r2_tree),
        ("Random Forest", mae_rf, rmse_rf, r2_rf),
        ("XGBoost", mae_xgb, rmse_xgb, r2_xgb),
    ],
    columns=["Model", "MAE", "RMSE", "R²"],
)

# Round only the numeric metric columns to four decimals for display.
metric_cols = ["MAE", "RMSE", "R²"]
table1_results[metric_cols] = table1_results[metric_cols].round(4)
table1_results


In [None]:
# =====================================================
# 회귀: engagement_score
# =====================================================

if "engagement_score" not in df.columns:
    raise ValueError("engagement_score 컬럼이 없습니다. v2 파일을 확인해주세요.")

# Rows without a target value cannot be used for supervised training.
df_eng = df.dropna(subset=["engagement_score"]).copy()

# NOTE(review): feature_cols may contain view_count / likes / comment_count. If
# engagement_score is computed from those columns, the scores below are inflated
# by target leakage — confirm how engagement_score is derived.
X2 = df_eng[feature_cols].fillna(0)
# The dropna above already removed every missing target, so no fillna is needed here.
y2 = df_eng["engagement_score"]

# 80/20 hold-out split, seeded with the notebook-wide RANDOM_STATE constant.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.2, random_state=RANDOM_STATE
)

print("\n[2번-회귀] engagement_score 회귀용 데이터 분할 완료")
print("Train:", X2_train.shape, "/ Test:", X2_test.shape)

# Decision Tree: single-tree model for engagement_score.
tree2 = DecisionTreeRegressor(
    max_depth=8,
    min_samples_leaf=50,
    random_state=RANDOM_STATE,  # notebook-wide seed instead of a repeated literal
)

tree2.fit(X2_train, y2_train)
y2_pred_tree = tree2.predict(X2_test)
eval_regression("Decision Tree Regressor (engagement_score)", y2_test, y2_pred_tree)

# Pair each feature with the fitted model's feature_importances_, largest first.
tree2_importance = pd.DataFrame(
    {"feature": X2.columns, "importance": tree2.feature_importances_}
).sort_values(by="importance", ascending=False)

print("\n[Decision Tree(engagement_score) - 변수 중요도]")
print(tree2_importance)

# Random Forest: ensemble model for engagement_score, trained on all cores (n_jobs=-1).
rf2 = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=50,
    random_state=RANDOM_STATE,  # notebook-wide seed instead of a repeated literal
    n_jobs=-1,
)

rf2.fit(X2_train, y2_train)
y2_pred_rf = rf2.predict(X2_test)
eval_regression("Random Forest Regressor (engagement_score)", y2_test, y2_pred_rf)

# Pair each feature with the fitted model's feature_importances_, largest first.
rf2_importance = pd.DataFrame(
    {"feature": X2.columns, "importance": rf2.feature_importances_}
).sort_values(by="importance", ascending=False)

print("\n[Random Forest(engagement_score) - 변수 중요도]")
print(rf2_importance)

# XGBoost: gradient-boosted trees for engagement_score, using row (subsample) and
# column (colsample_bytree) sampling and the histogram tree method.
xgb2 = XGBRegressor(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,  # notebook-wide seed instead of a repeated literal
    tree_method="hist",
)

xgb2.fit(X2_train, y2_train)
y2_pred_xgb = xgb2.predict(X2_test)
eval_regression("XGBoost Regressor (engagement_score)", y2_test, y2_pred_xgb)

# Pair each feature with the fitted model's feature_importances_, largest first.
xgb2_importance = pd.DataFrame(
    {"feature": X2.columns, "importance": xgb2.feature_importances_}
).sort_values(by="importance", ascending=False)

print("\n[XGBoost(engagement_score) - 변수 중요도]")
print(xgb2_importance)


In [None]:
# =====================================================
# 분류: high_engagement
# =====================================================

# Label rows at or above the 80th percentile of engagement_score as high_engagement (1),
# all others as 0.
threshold = df_eng["engagement_score"].quantile(0.8)
df_eng["high_engagement"] = (df_eng["engagement_score"] >= threshold).astype(int)

print("\nhigh_engagement threshold (상위 20%):", threshold)
print(df_eng["high_engagement"].value_counts())

# Missing feature values are replaced with 0.
X3 = df_eng[feature_cols].fillna(0)
y3 = df_eng["high_engagement"]

# Stratified 80/20 split keeps the class ratio equal in train and test; seeded with
# the notebook-wide RANDOM_STATE constant.
X3_train, X3_test, y3_train, y3_test = train_test_split(
    X3, y3, test_size=0.2, random_state=RANDOM_STATE, stratify=y3
)

print("\n[2번-분류] high_engagement 분류용 데이터 분할 완료")
print("Train:", X3_train.shape, "/ Test:", X3_test.shape)

# RandomForest Classifier for the binary high_engagement label, trained on all cores.
rf_clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    min_samples_leaf=50,
    random_state=RANDOM_STATE,  # notebook-wide seed instead of a repeated literal
    n_jobs=-1,
)

rf_clf.fit(X3_train, y3_train)
y3_pred_rf = rf_clf.predict(X3_test)

print("\n[RandomForest Classifier 결과 (high_engagement)]")
print(classification_report(y3_test, y3_pred_rf))

print("\n[Confusion Matrix]")
print(confusion_matrix(y3_test, y3_pred_rf))

# Pair each feature with the fitted model's feature_importances_, largest first.
rf_clf_importance = pd.DataFrame(
    {"feature": X3.columns, "importance": rf_clf.feature_importances_}
).sort_values(by="importance", ascending=False)

print("\n[RandomForest Classifier - 변수 중요도]")
print(rf_clf_importance)

# XGBoost Classifier for the binary high_engagement label, evaluated with log loss.
xgb_clf = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,  # notebook-wide seed instead of a repeated literal
    tree_method="hist",
    eval_metric="logloss",
)

xgb_clf.fit(X3_train, y3_train)
y3_pred_xgb = xgb_clf.predict(X3_test)

print("\n[XGBoost Classifier 결과 (high_engagement)]")
print(classification_report(y3_test, y3_pred_xgb))

# Same confusion-matrix report as the RandomForest classifier, so both models
# can be compared on identical outputs.
print("\n[Confusion Matrix]")
print(confusion_matrix(y3_test, y3_pred_xgb))

# Pair each feature with the fitted model's feature_importances_, largest first.
xgb_clf_importance = pd.DataFrame(
    {"feature": X3.columns, "importance": xgb_clf.feature_importances_}
).sort_values(by="importance", ascending=False)

print("\n[XGBoost Classifier - 변수 중요도]")
print(xgb_clf_importance)

print("\n1번(트렌딩 유지기간) + 2번(참여도 회귀/분류) 핵심 모델링 완료!")
