In [18]:
import re, os, sys, warnings
import pandas as pd
import numpy as np

from pathlib import Path
from typing import Optional
from typing import Tuple
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)

warnings.filterwarnings("ignore")
RANDOM_STATE = 42

In [4]:
# ----------------------------
# 설정: 데이터 파일 경로 (프로젝트 상대경로)
# ----------------------------

def find_project_root() -> Path:
    """Locate the project root by walking upward from the working directory.

    The current working directory is checked first, then each of its ancestors
    in order. The first directory containing both a ``data`` entry and a
    ``notebooks`` entry is returned.

    Returns:
        The first matching directory, or the current working directory itself
        when no directory on the way up contains both entries.
    """
    start = Path.cwd()
    candidates = (start, *start.parents)
    return next(
        (
            directory
            for directory in candidates
            if (directory / "data").exists() and (directory / "notebooks").exists()
        ),
        start,
    )

def latest_versioned_csv(folder: Path, base_name: str) -> Optional[Path]:
    """Return the highest-versioned ``{base_name}_v{n}.csv`` file in ``folder``.

    Args:
        folder: Directory to search (not searched recursively).
        base_name: File-name stem that precedes the ``_v{n}.csv`` suffix.

    Returns:
        Path of the file whose integer version ``n`` is largest, or ``None``
        when no file name matches the pattern. If several names parse to the
        same largest version, the first one yielded by the glob is returned.
    """
    name_re = re.compile(rf"^{re.escape(base_name)}_v(\d+)\.csv$")

    # The glob is only a coarse pre-filter; the regex decides which names
    # really carry a purely numeric version.
    versioned = []
    for candidate in folder.glob(f"{base_name}_v*.csv"):
        hit = name_re.match(candidate.name)
        if hit is not None:
            versioned.append((int(hit.group(1)), candidate))

    if not versioned:
        return None
    # max() returns the first maximal element, so earlier glob results win ties.
    return max(versioned, key=lambda pair: pair[0])[1]

def next_versioned_file(folder: Path, base_name: str, ext: str = ".csv") -> Path:
    """Return the path for the next ``{base_name}_v{n}{ext}`` file to write.

    ``folder`` is created (including parents) if it does not exist yet. The
    returned file itself is not created.

    Args:
        folder: Directory that holds the versioned files.
        base_name: File-name stem that precedes the ``_v{n}`` suffix.
        ext: File extension including the leading dot.

    Returns:
        ``folder / "{base_name}_v{n}{ext}"`` where ``n`` is one more than the
        largest existing version, or 1 when no versioned file exists.
    """
    folder.mkdir(parents=True, exist_ok=True)
    name_re = re.compile(rf"^{re.escape(base_name)}_v(\d+){re.escape(ext)}$")

    hits = (name_re.match(entry.name) for entry in folder.glob(f"{base_name}_v*{ext}"))
    # default=0 makes the first file written come out as version 1.
    newest = max((int(hit.group(1)) for hit in hits if hit is not None), default=0)
    return folder / f"{base_name}_v{newest + 1}{ext}"

# Resolve the project layout and the directory that holds the cleaned data.
PROJECT_ROOT = find_project_root()
CLEAN_DIR = PROJECT_ROOT.joinpath("data", "processed", "03_kaggle_clean")

# Use the newest versioned clean file; when none is found, point at v1 so the
# existence check below reports a concrete path in its error message.
_newest_clean = latest_versioned_csv(CLEAN_DIR, "youtube_trending_videos_clean")
csv_path = (
    _newest_clean
    if _newest_clean is not None
    else CLEAN_DIR / "youtube_trending_videos_clean_v1.csv"
)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("csv_path:", csv_path)

if not csv_path.exists():
    raise FileNotFoundError(f"트렌딩 clean 파일이 없습니다: {csv_path}")

# Load the file and show its shape, column names and first three rows.
df = pd.read_csv(csv_path, low_memory=False)
print("데이터 로드 완료, shape:", df.shape)
print("컬럼 목록:", df.columns.tolist())
display(df.head(3))


PROJECT_ROOT: c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml
csv_path: c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\data\processed\03_kaggle_clean\youtube_trending_videos_clean_v1.csv
데이터 로드 완료, shape: (2905678, 21)
컬럼 목록: ['channelId', 'video_id', 'title', 'publishedAt', 'trending_date', 'categoryId', 'tags', 'view_count', 'likes', 'comment_count', 'country', 'tags_list', 'tags_count', 'trending_days', 'publish_month', 'publish_dayofweek', 'days_since_publish', 'like_ratio', 'comment_ratio', 'engagement_score', 'category_name']


Unnamed: 0,channelId,video_id,title,publishedAt,trending_date,categoryId,tags,view_count,likes,comment_count,...,tags_list,tags_count,trending_days,publish_month,publish_dayofweek,days_since_publish,like_ratio,comment_ratio,engagement_score,category_name
0,UCGfBwrCoi9ZJjKiUK8MmJNw,s9FH4rDMvds,LEVEI UM FORA? FINGI ESTAR APAIXONADO POR ELA!,2020-08-11 22:21:49+00:00,2020-08-12 00:00:00+00:00,22,pietro|guedes|ingrid|ohara|pingrid|vlog|amigos...,263835,85095,4500,...,"['pietro', 'guedes', 'ingrid', 'ohara', 'pingr...",14,7,8,1,0,0.322531,0.017056,0.339587,Unknown
1,UCaO6TYtlC8U5ttz62hTrZgg,jbGRowa5tIk,ITZY âNot Shyâ M/V TEASER,2020-08-11 15:00:13+00:00,2020-08-12 00:00:00+00:00,10,JYP Entertainment|JYP|ITZY|ìì§|ITZY Video|I...,6000070,714310,31040,...,"['JYP Entertainment', 'JYP', 'ITZY', 'ì\x9e\x8...",47,60,8,1,0,0.11905,0.005173,0.124224,Unknown
2,UCoXZmVma073v5G1cW82UKkA,3EfkCrXKZNs,Oh Juliana PARÃDIA - MC Niack,2020-08-10 14:59:00+00:00,2020-08-12 00:00:00+00:00,22,OH JULIANA PARÃDIA|MC Niack PARÃDIA|PARÃDIA...,2296748,39761,0,...,"['OH JULIANA PARÃ\x93DIA', 'MC Niack PARÃ\x93D...",7,6,8,0,1,0.017312,0.0,0.017312,Unknown


In [5]:
# =====================================================
# Shared: independent-variable (feature) selection
# =====================================================

base_feature_cols = [
    "view_count",
    "likes",
    "comment_count",
    "categoryId",
    "publish_dayofweek",
    "tags_count",
]

# Keep only the columns that actually exist in the loaded frame, but report
# any that are missing rather than dropping them silently: every model below
# is trained on whatever ends up in feature_cols.
missing_feature_cols = [c for c in base_feature_cols if c not in df.columns]
feature_cols = [c for c in base_feature_cols if c in df.columns]

if missing_feature_cols:
    print("누락되어 제외된 feature 컬럼:", missing_feature_cols)

# Fail here with a clear message instead of letting an empty feature matrix
# reach the estimators.
if not feature_cols:
    raise ValueError("사용 가능한 feature 컬럼이 없습니다. 입력 파일의 컬럼을 확인해주세요.")

print("\n사용 독립변수(feature_cols):", feature_cols)



사용 독립변수(feature_cols): ['view_count', 'likes', 'comment_count', 'categoryId', 'publish_dayofweek', 'tags_count']


In [6]:
# -----------------------------------------------------
# 함수: 회귀 평가 출력
# -----------------------------------------------------

def eval_regression(name, y_true, y_pred):
    """Print and return RMSE and MAE for a set of regression predictions.

    Args:
        name: Label printed in square brackets above the metrics.
        y_true: Ground-truth target values.
        y_pred: Predicted target values.

    Returns:
        Tuple ``(rmse, mae)``.
    """
    # RMSE is taken as the square root of sklearn's mean squared error.
    squared_error = mean_squared_error(y_true, y_pred)
    rmse = squared_error ** 0.5
    mae = mean_absolute_error(y_true, y_pred)

    print(f"\n[{name}]")
    for label, value in (("RMSE:", rmse), ("MAE :", mae)):
        print(label, value)
    return rmse, mae

In [7]:
# =====================================================
# 1. Video analysis modelling
#    - target: trending_days (regression)
# =====================================================

if "trending_days" not in df.columns:
    # Report the file that was actually loaded; the loader picks the version
    # dynamically, so a hard-coded version number in the message can be wrong.
    raise ValueError(f"trending_days 컬럼이 없습니다. 로드한 파일을 확인해주세요: {csv_path}")

df_trend = df.dropna(subset=["trending_days"]).copy()

X1 = df_trend[feature_cols].fillna(0)
# Rows with a missing target were dropped above, so no fill is needed here.
y1 = df_trend["trending_days"]

# NOTE(review): the frame has video_id and trending_date columns, so the same
# video may appear in several rows. If trending_days is a per-video value, a
# random row split can put one video in both train and test and inflate the
# scores. Consider a group-wise split on video_id (e.g. GroupShuffleSplit)
# - TODO confirm against how the clean file is built.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, test_size=0.2, random_state=RANDOM_STATE
)

print("\n[1번] trending_days 모델링용 데이터 분할 완료")
print("Train:", X1_train.shape, "/ Test:", X1_test.shape)



[1번] trending_days 모델링용 데이터 분할 완료
Train: (2324542, 6) / Test: (581136, 6)


In [10]:
# ----------------- 1-1. Decision Tree Regressor -----------------

# Seed comes from the notebook-wide RANDOM_STATE constant so that every model
# and split is controlled from one place.
tree = DecisionTreeRegressor(
    max_depth=8,
    min_samples_leaf=50,
    random_state=RANDOM_STATE,
)

tree.fit(X1_train, y1_train)
y1_pred_tree = tree.predict(X1_test)
eval_regression("Decision Tree Regressor (trending_days)", y1_test, y1_pred_tree)

# Feature importances of the fitted tree, largest first.
tree_importance = pd.DataFrame(
    {"feature": X1.columns, "importance": tree.feature_importances_}
).sort_values(by="importance", ascending=False)

print("\n[Decision Tree - 변수 중요도]")
print(tree_importance)



[Decision Tree Regressor (trending_days)]
RMSE: 9.092233800713078
MAE : 5.7497725169106735

[Decision Tree - 변수 중요도]
             feature  importance
2      comment_count    0.508471
1              likes    0.218381
0         view_count    0.175053
3         categoryId    0.066770
5         tags_count    0.023977
4  publish_dayofweek    0.007347


In [11]:
# ----------------- 1-2. Random Forest Regressor -----------------

# Seed comes from the notebook-wide RANDOM_STATE constant so that every model
# and split is controlled from one place.
rf1 = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=50,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

rf1.fit(X1_train, y1_train)
y1_pred_rf = rf1.predict(X1_test)
eval_regression("Random Forest Regressor (trending_days)", y1_test, y1_pred_rf)

# Feature importances of the fitted forest, largest first.
rf1_importance = pd.DataFrame(
    {"feature": X1.columns, "importance": rf1.feature_importances_}
).sort_values(by="importance", ascending=False)

print("\n[Random Forest - 변수 중요도]")
print(rf1_importance)



[Random Forest Regressor (trending_days)]
RMSE: 8.526084465850735
MAE : 5.458778949819815

[Random Forest - 변수 중요도]
             feature  importance
2      comment_count    0.489288
1              likes    0.213680
0         view_count    0.174767
3         categoryId    0.071507
5         tags_count    0.037572
4  publish_dayofweek    0.013185


In [14]:
# ----------------- 1-3. XGBoost Regressor -----------------

# Seed comes from the notebook-wide RANDOM_STATE constant so that every model
# and split is controlled from one place.
xgb1 = XGBRegressor(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,
    tree_method="hist",
)

xgb1.fit(X1_train, y1_train)
y1_pred_xgb = xgb1.predict(X1_test)
eval_regression("XGBoost Regressor (trending_days)", y1_test, y1_pred_xgb)

# Feature importances of the fitted booster, largest first.
xgb1_importance = pd.DataFrame(
    {"feature": X1.columns, "importance": xgb1.feature_importances_}
).sort_values(by="importance", ascending=False)

print("\n[XGBoost - 변수 중요도]")
print(xgb1_importance)



[XGBoost Regressor (trending_days)]
RMSE: 8.307835697229933
MAE : 5.359653949737549

[XGBoost - 변수 중요도]
             feature  importance
0         view_count    0.363864
2      comment_count    0.305823
1              likes    0.146522
3         categoryId    0.098155
5         tags_count    0.044375
4  publish_dayofweek    0.041261


In [15]:
# =====================================================
# 2. Engagement-score modelling
#    (1) regression: engagement_score
#    (2) classification: high_engagement (top 20%)
# =====================================================

if "engagement_score" not in df.columns:
    # Report the file that was actually loaded; the loader picks the version
    # dynamically, so a hard-coded version number in the message can be wrong.
    raise ValueError(f"engagement_score 컬럼이 없습니다. 로드한 파일을 확인해주세요: {csv_path}")

# Work on a copy restricted to rows that have a target value.
df_eng = df.dropna(subset=["engagement_score"]).copy()


In [16]:
# ----------------- 2-1. Regression: engagement_score -----------------

# NOTE(review): in the rows displayed when the data was loaded,
# engagement_score equals like_ratio + comment_ratio, and those ratios match
# likes / view_count and comment_count / view_count. Since view_count, likes
# and comment_count are all in feature_cols, the target appears to be directly
# computable from the features, so these models and their importances say
# little about what drives engagement. Confirm the definition of
# engagement_score and consider removing the leaking features.
X2 = df_eng[feature_cols].fillna(0)
# Rows with a missing target were dropped when df_eng was built, so no fill
# is needed here.
y2 = df_eng["engagement_score"]

X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.2, random_state=RANDOM_STATE
)

print("\n[2번-회귀] engagement_score 회귀용 데이터 분할 완료")
print("Train:", X2_train.shape, "/ Test:", X2_test.shape)

# Decision Tree
tree2 = DecisionTreeRegressor(
    max_depth=8, min_samples_leaf=50, random_state=RANDOM_STATE
)

tree2.fit(X2_train, y2_train)
y2_pred_tree = tree2.predict(X2_test)
eval_regression("Decision Tree Regressor (engagement_score)", y2_test, y2_pred_tree)

tree2_importance = pd.DataFrame(
    {"feature": X2.columns, "importance": tree2.feature_importances_}
).sort_values(by="importance", ascending=False)

print("\n[Decision Tree(engagement_score) - 변수 중요도]")
print(tree2_importance)

# Random Forest
rf2 = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=50,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

rf2.fit(X2_train, y2_train)
y2_pred_rf = rf2.predict(X2_test)
eval_regression("Random Forest Regressor (engagement_score)", y2_test, y2_pred_rf)

rf2_importance = pd.DataFrame(
    {"feature": X2.columns, "importance": rf2.feature_importances_}
).sort_values(by="importance", ascending=False)

print("\n[Random Forest(engagement_score) - 변수 중요도]")
print(rf2_importance)

# XGBoost
xgb2 = XGBRegressor(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,
    tree_method="hist",
)

xgb2.fit(X2_train, y2_train)
y2_pred_xgb = xgb2.predict(X2_test)
eval_regression("XGBoost Regressor (engagement_score)", y2_test, y2_pred_xgb)

xgb2_importance = pd.DataFrame(
    {"feature": X2.columns, "importance": xgb2.feature_importances_}
).sort_values(by="importance", ascending=False)

print("\n[XGBoost(engagement_score) - 변수 중요도]")
print(xgb2_importance)



[2번-회귀] engagement_score 회귀용 데이터 분할 완료
Train: (2324542, 6) / Test: (581136, 6)

[Decision Tree Regressor (engagement_score)]
RMSE: 104.0925559198174
MAE : 0.3789432849545896

[Decision Tree(engagement_score) - 변수 중요도]
             feature  importance
1              likes    0.852058
0         view_count    0.147344
2      comment_count    0.000596
3         categoryId    0.000003
4  publish_dayofweek    0.000000
5         tags_count    0.000000

[Random Forest Regressor (engagement_score)]
RMSE: 388.42816841802147
MAE : 2.826719420596413

[Random Forest(engagement_score) - 변수 중요도]
             feature  importance
3         categoryId    0.403510
0         view_count    0.386045
2      comment_count    0.165444
1              likes    0.023179
5         tags_count    0.012290
4  publish_dayofweek    0.009532

[XGBoost Regressor (engagement_score)]
RMSE: 27.545830444125116
MAE : 1.1912416774781645

[XGBoost(engagement_score) - 변수 중요도]
             feature  importance
1              like

In [19]:
# ----------------- 2-2. Classification: high_engagement -----------------

# Label the top 20% of engagement_score as high_engagement (1), the rest as 0.
# NOTE(review): the threshold is computed on all of df_eng, i.e. before the
# train/test split, so the label definition uses test rows too.
threshold = df_eng["engagement_score"].quantile(0.8)
df_eng["high_engagement"] = (df_eng["engagement_score"] >= threshold).astype(int)

print("\nhigh_engagement threshold (상위 20%):", threshold)
print(df_eng["high_engagement"].value_counts())

# NOTE(review): in the rows displayed when the data was loaded,
# engagement_score matches likes / view_count + comment_count / view_count,
# and all three of those columns are in feature_cols. The label therefore
# appears to be a deterministic function of the features, which would explain
# near-perfect scores on its own. Confirm and consider removing the leaking
# features.
X3 = df_eng[feature_cols].fillna(0)
y3 = df_eng["high_engagement"]

# Stratify so both splits keep the same share of positive labels.
X3_train, X3_test, y3_train, y3_test = train_test_split(
    X3, y3, test_size=0.2, random_state=RANDOM_STATE, stratify=y3
)

print("\n[2번-분류] high_engagement 분류용 데이터 분할 완료")
print("Train:", X3_train.shape, "/ Test:", X3_test.shape)

# RandomForest Classifier
rf_clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    min_samples_leaf=50,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

rf_clf.fit(X3_train, y3_train)
y3_pred_rf = rf_clf.predict(X3_test)

print("\n[RandomForest Classifier 결과 (high_engagement)]")
print(classification_report(y3_test, y3_pred_rf))

print("\n[Confusion Matrix]")
print(confusion_matrix(y3_test, y3_pred_rf))

rf_clf_importance = pd.DataFrame(
    {"feature": X3.columns, "importance": rf_clf.feature_importances_}
).sort_values(by="importance", ascending=False)

print("\n[RandomForest Classifier - 변수 중요도]")
print(rf_clf_importance)

# XGBoost Classifier
xgb_clf = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,
    tree_method="hist",
    eval_metric="logloss",
)

xgb_clf.fit(X3_train, y3_train)
y3_pred_xgb = xgb_clf.predict(X3_test)

print("\n[XGBoost Classifier 결과 (high_engagement)]")
print(classification_report(y3_test, y3_pred_xgb))

xgb_clf_importance = pd.DataFrame(
    {"feature": X3.columns, "importance": xgb_clf.feature_importances_}
).sort_values(by="importance", ascending=False)

print("\n[XGBoost Classifier - 변수 중요도]")
print(xgb_clf_importance)

print("\n1번(트렌딩 유지기간) + 2번(참여도 회귀/분류) 핵심 모델링 완료!")



high_engagement threshold (상위 20%): 0.0887262101975618
high_engagement
0    2324541
1     581137
Name: count, dtype: int64

[2번-분류] high_engagement 분류용 데이터 분할 완료
Train: (2324542, 6) / Test: (581136, 6)

[RandomForest Classifier 결과 (high_engagement)]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99    464909
           1       1.00      0.95      0.97    116227

    accuracy                           0.99    581136
   macro avg       0.99      0.97      0.98    581136
weighted avg       0.99      0.99      0.99    581136


[Confusion Matrix]
[[464507    402]
 [  5902 110325]]

[RandomForest Classifier - 변수 중요도]
             feature  importance
0         view_count    0.495395
1              likes    0.417203
2      comment_count    0.065054
3         categoryId    0.014295
5         tags_count    0.006414
4  publish_dayofweek    0.001640

[XGBoost Classifier 결과 (high_engagement)]
              precision    recall  f1-score   support

  