In [1]:
import pandas as pd
import numpy as np
import re

from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    classification_report,
    confusion_matrix
)


In [2]:
# ============================
# 1. 데이터 로드 (프로젝트 상대경로)
# ============================

def find_project_root() -> Path:
    p = Path.cwd()
    for parent in [p] + list(p.parents):
        if (parent / "data").exists() and (parent / "notebooks").exists():
            return parent
    return p

def latest_versioned_csv(folder: Path, base_name: str) -> Path | None:
    """
    folder 안에서 base_name_v{n}.csv 중 가장 큰 n 파일 Path 반환
    없으면 None
    """
    pattern = re.compile(rf"^{re.escape(base_name)}_v(\d+)\.csv$")
    best_v, best_path = None, None

    for f in folder.glob(f"{base_name}_v*.csv"):
        m = pattern.match(f.name)
        if m:
            v = int(m.group(1))
            if best_v is None or v > best_v:
                best_v, best_path = v, f

    return best_path

def next_versioned_file(folder: Path, base_name: str, ext: str = ".csv") -> Path:
    """
    folder 안에서 base_name_v{n}{ext} 다음 버전 경로 반환 (파일 저장용)
    """
    folder.mkdir(parents=True, exist_ok=True)
    pattern = re.compile(rf"^{re.escape(base_name)}_v(\d+){re.escape(ext)}$")

    versions = []
    for f in folder.glob(f"{base_name}_v*{ext}"):
        m = pattern.match(f.name)
        if m:
            versions.append(int(m.group(1)))

    v = (max(versions) + 1) if versions else 1
    return folder / f"{base_name}_v{v}{ext}"

PROJECT_ROOT = find_project_root()
csv_path = PROJECT_ROOT / "data" / "processed" / "03_kaggle_clean" / "youtube_channels_clean_v1.csv"

print("PROJECT_ROOT:", PROJECT_ROOT)
print("csv_path:", csv_path)

if not csv_path.exists():
    raise FileNotFoundError(f"채널 clean 파일이 없습니다: {csv_path}")

df = pd.read_csv(csv_path, low_memory=False)
print("채널 데이터 로드 완료, shape:", df.shape)
print("컬럼 목록:", list(df.columns))
display(df.head(3))


PROJECT_ROOT: c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml
csv_path: c:\Users\73bib\Desktop\유혜원\제주한라대학교\[2025] 프로젝트\bigdata_project\youtube_trending_ml\data\processed\03_kaggle_clean\youtube_channels_clean_v1.csv
채널 데이터 로드 완료, shape: (15830, 17)
컬럼 목록: ['channel_id', 'channel_name', 'subscriber_count', 'view_count', 'video_count', 'created_date', 'category', 'country', 'videos_last_30_days', 'views_last_30_days', 'channel_age_days', 'upload_frequency', 'subscriber_per_view', 'views_per_video', 'uploads_per_subscriber', 'category_encoded', 'country_encoded']


Unnamed: 0,channel_id,channel_name,subscriber_count,view_count,video_count,created_date,category,country,videos_last_30_days,views_last_30_days,channel_age_days,upload_frequency,subscriber_per_view,views_per_video,uploads_per_subscriber,category_encoded,country_encoded
0,UCOmHUn--16B90oW2L6FRR3A,BLACKPINK,99000000,39962585446,636,2016-06-29 03:15:23+00:00,"Music of Asia, Pop music, Music, Electronic music",KR,1,3256869,3450.0,0.184348,0.002477,62834250.0,6e-06,1916,56
1,UC3IZKseVpdzPSBaWxBxundA,HYBE LABELS,78700000,41604896923,2817,2008-06-04 08:23:22+00:00,"Hip hop music, Pop music, Music, Music of Asia",KR,79,46074833,6397.0,0.440363,0.001892,14769220.0,3.6e-05,1122,56
2,UCVNE660NcgYzi18LwwUZb7Q,BILLIE EILISH,82300,14316364,1,2019-01-18 05:14:32+00:00,,,0,0,2517.0,0.000397,0.005749,14316360.0,1.2e-05,3443,110


In [3]:
# ============================
# 2. 타깃 정의: 채널 성장 proxy
#    → 최근 30일 조회수(views_last_30_days)를 성장 정도로 사용
# ============================

if "views_last_30_days" not in df.columns:
    raise ValueError("views_last_30_days 없음")

df = df.dropna(subset=["views_last_30_days"])
df = df[df["views_last_30_days"] >= 0]

# 로그 변환 → 분포 안정화
df["views_last_30_days_log"] = np.log1p(df["views_last_30_days"])


In [4]:
# ============================
# 3. Feature 구성
# ============================

candidate_features = [
    "upload_frequency",      # 업로드 빈도
    "views_per_video",       # 평균 영상당 조회수
    "subscriber_per_view",   # 조회수 대비 구독자
    "video_count",           # 전체 업로드 수
    "channel_age_days",      # 채널 나이
    "category_encoded",      # 카테고리(인코딩)
    "country_encoded",       # 국가(인코딩)
]

feature_cols = [c for c in candidate_features if c in df.columns]
print("\n사용 독립변수(feature_cols):", feature_cols)

X = df[feature_cols].fillna(0)
y_reg = df["views_last_30_days_log"]



사용 독립변수(feature_cols): ['upload_frequency', 'views_per_video', 'subscriber_per_view', 'video_count', 'channel_age_days', 'category_encoded', 'country_encoded']


In [5]:
# ============================
# 4. Train/Test 분할
# ============================

X_train, X_test, y_train_reg, y_test_reg = train_test_split(
    X,
    y_reg,
    test_size=0.2,
    random_state=42,
)

print("\nTrain size:", X_train.shape, " / Test size:", X_test.shape)



Train size: (12664, 7)  / Test size: (3166, 7)


In [6]:
# ============================
# 5. 회귀 모델: RandomForest Regressor
# ============================

rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=12,
    min_samples_leaf=10,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train_reg)
pred = rf.predict(X_test)

rmse = mean_squared_error(y_test_reg, pred)**0.5
mae = mean_absolute_error(y_test_reg, pred)
r2 = r2_score(y_test_reg, pred)
mape = np.mean(np.abs((np.expm1(y_test_reg) - np.expm1(pred)) / np.expm1(y_test_reg)))

print("\n===== [회귀 성능] 로그 변환 버전 =====")
print("RMSE :", rmse)
print("MAE  :", mae)
print("R2   :", r2)
print("MAPE :", mape)

# 변수 중요도
rf_importance = pd.DataFrame({
    "feature": X.columns,
    "importance": rf.feature_importances_
}).sort_values(by="importance", ascending=False)

print("\n[변수 중요도]")
print(rf_importance)



===== [회귀 성능] 로그 변환 버전 =====
RMSE : 4.978264408023472
MAE  : 3.828712639178084
R2   : 0.446033001318324
MAPE : inf

[변수 중요도]
               feature  importance
3          video_count    0.687098
1      views_per_video    0.109070
2  subscriber_per_view    0.055256
4     channel_age_days    0.052559
0     upload_frequency    0.039945
5     category_encoded    0.038082
6      country_encoded    0.017989


In [7]:
# ============================
# 6. 분류용 타깃 생성: 성장 빠른 채널 vs 느린 채널
#    → 최근 30일 조회수가 상위 20% 이상이면 '성장 빠른 채널(1)'
# ============================

threshold = np.quantile(y_reg, 0.8)
df["growth_fast"] = (df["views_last_30_days"] >= threshold).astype(int)

print("\ngrowth_fast 기준 (상위 20% threshold):", threshold)
print(df["growth_fast"].value_counts())

# 분류용 데이터 다시 구성
X_cls = df[feature_cols].fillna(0)
y_cls = df["growth_fast"]

X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(
    X_cls,
    y_cls,
    test_size=0.2,
    random_state=42,
    stratify=y_cls,
)

print("\n분류 Train size:", X_train_cls.shape, " / Test size:", X_test_cls.shape)



growth_fast 기준 (상위 20% threshold): 14.500769565633874
growth_fast
1    9829
0    6001
Name: count, dtype: int64

분류 Train size: (12664, 7)  / Test size: (3166, 7)


In [8]:
# ============================
# 7. 분류 모델: RandomForest Classifier
# ============================

rf_cls = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_leaf=20,
    random_state=42,
    n_jobs=-1,
)

rf_cls.fit(X_train_cls, y_train_cls)
y_pred_cls = rf_cls.predict(X_test_cls)

print("\n===== [성장 채널 분류 성능] =====")
print(classification_report(y_test_cls, y_pred_cls))

print("Confusion Matrix:")
print(confusion_matrix(y_test_cls, y_pred_cls))

rf_cls_importance = pd.DataFrame({
    "feature": X.columns,
    "importance": rf_cls.feature_importances_
}).sort_values(by="importance", ascending=False)

print("\n[분류 중요도]")
print(rf_cls_importance)

print("\n>>> 모델링 + 성능 개선 완료!")



===== [성장 채널 분류 성능] =====
              precision    recall  f1-score   support

           0       0.75      0.62      0.68      1200
           1       0.79      0.87      0.83      1966

    accuracy                           0.78      3166
   macro avg       0.77      0.74      0.75      3166
weighted avg       0.77      0.78      0.77      3166

Confusion Matrix:
[[ 740  460]
 [ 250 1716]]

[분류 중요도]
               feature  importance
3          video_count    0.437204
0     upload_frequency    0.348616
2  subscriber_per_view    0.061462
1      views_per_video    0.051222
4     channel_age_days    0.050412
5     category_encoded    0.025806
6      country_encoded    0.025278

>>> 모델링 + 성능 개선 완료!
