In [None]:
import re
import pandas as pd
import numpy as np

from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    classification_report,
    confusion_matrix
)


In [None]:
# ============================
# 데이터 파일 경로 (프로젝트 상대경로)
# ============================

def find_project_root() -> Path:
    """Locate the project root by walking upward from the working directory.

    A directory qualifies as the root when it contains both a ``data`` entry
    and a ``notebooks`` entry. The current working directory is checked
    first, followed by each of its ancestors from nearest to farthest.

    Returns:
        The first qualifying directory, or the current working directory
        itself when no directory on the path qualifies.
    """
    start = Path.cwd()
    required_entries = ("data", "notebooks")

    search_path = (start, *start.parents)
    return next(
        (
            directory
            for directory in search_path
            if all((directory / entry).exists() for entry in required_entries)
        ),
        start,  # nothing matched: fall back to where we started
    )

def latest_versioned_csv(folder: Path, base_name: str) -> Path | None:
    pattern = re.compile(rf"^{re.escape(base_name)}_v(\d+)\.csv$")
    best_v, best_path = None, None

    for f in folder.glob(f"{base_name}_v*.csv"):
        m = pattern.match(f.name)

        if m:
            v = int(m.group(1))

            if best_v is None or v > best_v:
                best_v, best_path = v, f

    return best_path

def next_versioned_file(folder: Path, base_name: str, ext: str = ".csv") -> Path:
    """Build the path for the next unused ``<base_name>_v<N><ext>`` file.

    The folder is created (including parents) if it does not exist yet. The
    returned path itself is not created.

    Args:
        folder: Directory that holds the versioned files.
        base_name: File name stem that precedes the ``_v<N>`` suffix.
        ext: File extension including the leading dot.

    Returns:
        ``folder / "<base_name>_v<N><ext>"`` where ``N`` is one more than the
        largest existing version, or 1 when no versioned file exists.
    """
    folder.mkdir(parents=True, exist_ok=True)

    version_re = re.compile(rf"^{re.escape(base_name)}_v(\d+){re.escape(ext)}$")
    hits = (version_re.match(entry.name) for entry in folder.glob(f"{base_name}_v*{ext}"))

    # default=0 makes the first file ever written come out as v1.
    highest = max((int(hit.group(1)) for hit in hits if hit), default=0)

    return folder / f"{base_name}_v{highest + 1}{ext}"

# Resolve the project layout and the directory holding cleaned exports.
PROJECT_ROOT = find_project_root()
CLEAN_DIR = PROJECT_ROOT / "data" / "processed"

# Prefer the newest versioned export; when none is found, fall back to the v1
# file name so the existence check below reports a concrete path.
newest_clean = latest_versioned_csv(CLEAN_DIR, "channels_clean")
csv_path = CLEAN_DIR / "channels_clean_v1.csv" if newest_clean is None else newest_clean

print("PROJECT_ROOT:", PROJECT_ROOT)
print("csv_path:", csv_path)

# Fail early with the resolved path instead of letting read_csv raise.
if not csv_path.exists():
    raise FileNotFoundError(f"채널 clean 파일이 없습니다: {csv_path}")

df = pd.read_csv(csv_path, low_memory=False)

# Quick sanity output: shape, column names, and the first three rows.
print("채널 데이터 로드 완료, shape:", df.shape)
print("컬럼 목록:", list(df.columns))
display(df.head(3))


In [None]:
# ============================
# Target definition: channel growth proxy
# ============================

# Views over the last 30 days (views_last_30_days) are used as the growth measure.
if "views_last_30_days" not in df.columns:
    raise ValueError("views_last_30_days 없음")

# Keep only rows with a known, non-negative target. The explicit .copy() makes
# the filtered frame independent of the one it was sliced from, so adding a
# column below does not raise SettingWithCopyWarning.
df = df.dropna(subset=["views_last_30_days"])
df = df[df["views_last_30_days"] >= 0].copy()

# Log transform to stabilise the distribution; log1p keeps zero-view rows
# valid because log1p(0) == 0.
df["views_last_30_days_log"] = np.log1p(df["views_last_30_days"])


In [None]:
# ============================
# Feature construction
# ============================

candidate_features = [
    "upload_frequency",      # upload frequency
    "views_per_video",       # average views per video
    "subscriber_per_view",   # subscribers relative to views
    "video_count",           # total number of uploads
    "channel_age_days",      # channel age in days
    "category_encoded",      # category (encoded)
    "country_encoded",       # country (encoded)
]

# Keep only the candidates that exist in the loaded frame, preserving list order.
feature_cols = []
for name in candidate_features:
    if name in df.columns:
        feature_cols.append(name)
print("\n사용 독립변수(feature_cols):", feature_cols)

# Missing feature values are filled with 0; the regression target is the
# log-transformed 30-day view count.
y_reg = df["views_last_30_days_log"]
X = df[feature_cols].fillna(0)


In [None]:
# ============================
# Train/test split
# ============================

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y_reg, test_size=0.2, random_state=42)
X_train, X_test, y_train_reg, y_test_reg = split_parts

print("\nTrain size:", X_train.shape, " / Test size:", X_test.shape)


In [None]:
# ============================
# Regression model: RandomForest Regressor
# ============================

rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=12,
    min_samples_leaf=10,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train_reg)
pred = rf.predict(X_test)

# RMSE, MAE and R2 are measured on the log1p scale the model was trained on.
rmse = mean_squared_error(y_test_reg, pred)**0.5
mae = mean_absolute_error(y_test_reg, pred)
r2 = r2_score(y_test_reg, pred)

# MAPE is reported on the original view-count scale. Rows whose actual view
# count is zero are excluded: they survive the ">= 0" target filter, and
# dividing by them would turn the whole metric into inf/nan.
actual_views = np.expm1(np.asarray(y_test_reg, dtype=float))
predicted_views = np.expm1(pred)
has_views = actual_views > 0
if has_views.any():
    mape = np.mean(
        np.abs(
            (actual_views[has_views] - predicted_views[has_views])
            / actual_views[has_views]
        )
    )
else:
    # No test row has a positive view count, so MAPE is undefined.
    mape = np.nan

print("\n===== [회귀 성능] 로그 변환 버전 =====")
print("RMSE :", rmse)
print("MAE  :", mae)
print("R2   :", r2)
print("MAPE :", mape)

# Feature importances, labelled with the columns the model was actually fitted on.
rf_importance = pd.DataFrame({
    "feature": X_train.columns,
    "importance": rf.feature_importances_
}).sort_values(by="importance", ascending=False)

print("\n[변수 중요도]")
print(rf_importance)


In [None]:
# ============================
# Classification target: fast-growing vs. slow-growing channels
# ============================

# A channel counts as 'fast-growing' when its last-30-day views are in the top 20%.
# The threshold is taken on the raw view counts because the comparison below
# is also on raw counts; taking it from the log-scale target (y_reg) would
# give a far smaller number and label almost every channel as fast.
threshold = np.quantile(df["views_last_30_days"], 0.8)
df["growth_fast"] = (df["views_last_30_days"] >= threshold).astype(int)

print("\ngrowth_fast 기준 (상위 20% threshold):", threshold)
print(df["growth_fast"].value_counts())

# Rebuild the feature matrix and label vector for classification.
X_cls = df[feature_cols].fillna(0)
y_cls = df["growth_fast"]

# Stratify on the label so the class ratio is the same in train and test.
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(
    X_cls,
    y_cls,
    test_size=0.2,
    random_state=42,
    stratify=y_cls,
)

print("\n분류 Train size:", X_train_cls.shape, " / Test size:", X_test_cls.shape)


In [None]:
# ============================
# Classification model: RandomForest Classifier
# ============================

rf_cls = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_leaf=20,
    random_state=42,
    n_jobs=-1,
)

rf_cls.fit(X_train_cls, y_train_cls)
y_pred_cls = rf_cls.predict(X_test_cls)

print("\n===== [성장 채널 분류 성능] =====")
print(classification_report(y_test_cls, y_pred_cls))
print("Confusion Matrix:")
print(confusion_matrix(y_test_cls, y_pred_cls))

# Label importances with the columns this classifier was fitted on, rather
# than with the regression feature matrix defined in an earlier cell.
rf_cls_importance = pd.DataFrame({
    "feature": X_train_cls.columns,
    "importance": rf_cls.feature_importances_
}).sort_values(by="importance", ascending=False)

print("\n[분류 중요도]")
print(rf_cls_importance)
print("\n>>> 모델링 + 성능 개선 완료!")
