# ML 기반 베스트셀러 예측 분석

## 데이터 설명

### 피처 구성
| 그룹 | 피처 | 설명 |
|------|------|------|
| 카테고리 | category_1 ~ category_10 | 도서 카테고리 비율 |
| 바이럴 | viral_index, viral_index_smoothed | 뉴스 기반 관심도 |
| 카테고리×바이럴 | category_X_x_viral_index | 카테고리별 바이럴 영향 |
| Prophet 예측 | prophet_forecast_X | 카테고리별 판매 예측값 (시차, 트렌드, 계절성 반영) × 카테고리 비율 |
| 경제지표 | kospi, usd_krw, brent_oil 등 | 거시경제 변수 |

### Prophet 예측값 특징
- 각 카테고리의 최적 시차(lag)가 적용된 예측값
- 해당 책의 카테고리 비율로 가중치 적용됨
- 예: 주식투자 60% 책 → prophet_forecast_stock × 0.6

---

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import (f1_score, roc_auc_score, r2_score, mean_absolute_error,
                             confusion_matrix, ConfusionMatrixDisplay, roc_curve)
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.svm import SVC
from scipy import stats
import lightgbm as lgb
import xgboost as xgb
import shap
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Choose the first Korean-capable font that is actually installed.
# 'AppleGothic' ships only with macOS, so hard-coding it breaks Korean labels
# (rendered as empty boxes) when the notebook runs on Windows or Linux.
from matplotlib import font_manager

_installed_fonts = {font.name for font in font_manager.fontManager.ttflist}
_korean_font_candidates = ('AppleGothic', 'Malgun Gothic', 'NanumGothic', 'Noto Sans CJK KR')
for _font_name in _korean_font_candidates:
    if _font_name in _installed_fonts:
        plt.rcParams['font.family'] = _font_name
        break
else:
    # Warnings are suppressed above, so report the missing font explicitly
    # and keep matplotlib's default font.
    print('한글 폰트를 찾지 못했습니다. 그래프의 한글이 깨질 수 있습니다.')
# Draw the minus sign as an ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

# Directory where every figure of this notebook is saved
IMG_PATH = 'ml_image_v4'
os.makedirs(IMG_PATH, exist_ok=True)

## 1. 데이터 로드

In [None]:
# Read the dataset, keep 'ymw' as a string key and order the rows by
# (product_code, ymw) with a fresh 0..n-1 index.
df = (
    pd.read_csv('books_ml_dataset_v4.csv')
    .assign(ymw=lambda frame: frame['ymw'].astype(str))
    .sort_values(['product_code', 'ymw'])
    .reset_index(drop=True)
)

# Quick summary of what was loaded
n_rows = len(df)
first_ymw, last_ymw = df['ymw'].min(), df['ymw'].max()
n_books = df['product_code'].nunique()
print(f'데이터: {n_rows:,}개')
print(f'기간: {first_ymw} ~ {last_ymw}')
print(f'고유 책 수: {n_books}권')

In [None]:
# Feature definition: every column except the identifiers and the target.
_non_feature_cols = {'product_code', 'ymw', 'y_sales_score'}
feature_cols = [col for col in df.columns if col not in _non_feature_cols]

# Count the features of each group; groups are recognised by column name only.
n_category = sum(
    1 for col in feature_cols if col.startswith("category_") and "x_viral" not in col
)
n_category_x_viral = sum(1 for col in feature_cols if "x_viral" in col)
n_prophet = sum(1 for col in feature_cols if col.startswith("prophet_"))
n_viral = sum(1 for col in feature_cols if "viral_index" in col and "x_" not in col)
n_other = sum(
    1 for col in feature_cols
    if all(token not in col for token in ["category", "prophet", "viral_index"])
)

print(f'피처 수: {len(feature_cols)}개')
print(f'\n[피처 그룹별 개수]')
print(f'  category_: {n_category}개')
print(f'  category_x_viral: {n_category_x_viral}개')
print(f'  prophet_forecast_: {n_prophet}개')
print(f'  viral_index: {n_viral}개')
print(f'  기타 (경제지표 등): {n_other}개')

## 2. Train/Test Split (Time-based)

In [None]:
# Time-based split: the data is a time series, so the model is trained on the
# earlier weeks and evaluated on the later ones.
# A stable sort keeps the incoming row order of df inside each week, which makes
# the split reproducible from run to run.
df_sorted = df.sort_values('ymw', kind='stable').reset_index(drop=True)
if df_sorted.empty:
    raise ValueError('Cannot split an empty dataset.')
split_idx = int(len(df_sorted) * 0.8)

# Cut on a week boundary instead of a raw row position. Slicing at split_idx
# would normally leave rows of the same week on both sides of the cut, so the
# test period would overlap the training period. The week that contains the
# 80% row is assigned entirely to the test set.
split_week = df_sorted['ymw'].iloc[split_idx]
is_test_row = df_sorted['ymw'] >= split_week
train_data = df_sorted[~is_test_row]
test_data = df_sorted[is_test_row]
if train_data.empty:
    raise ValueError('Time-based split produced an empty training set.')

X_train = train_data[feature_cols]
X_test = test_data[feature_cols]
y_train = train_data['y_sales_score']
y_test = test_data['y_sales_score']
# Binary target: 1 when the sales score is positive, else 0
y_train_class = (y_train > 0).astype(int)
y_test_class = (y_test > 0).astype(int)

print('[Time-based Split]')
print(f'  Train: {train_data["ymw"].min()} ~ {train_data["ymw"].max()} ({len(train_data):,}개)')
print(f'  Test:  {test_data["ymw"].min()} ~ {test_data["ymw"].max()} ({len(test_data):,}개)')
print(f'\n[타겟 분포]')
print(f'  Train - 베스트셀러: {y_train_class.sum()} ({y_train_class.mean()*100:.1f}%)')
print(f'  Test  - 베스트셀러: {y_test_class.sum()} ({y_test_class.mean()*100:.1f}%)')

---
# Part 1: 기존 피처만 사용한 예측

바이럴 지수, Prophet 예측값, 카테고리 등 **외부 변수만으로** 베스트셀러를 예측할 수 있는가?

## 3. 기존 피처 상관관계 분석

In [None]:
# Pearson correlation of every feature with the target, over the full dataset
y_full = df['y_sales_score']
corr_list = []
for col in feature_cols:
    corr, pval = stats.pearsonr(df[col], y_full)
    corr_list.append(dict(feature=col, corr=corr, abs_corr=abs(corr), pval=pval))

# Strongest absolute correlation first
corr_df = pd.DataFrame(corr_list).sort_values('abs_corr', ascending=False)


def _significance_stars(p_value):
    """Return '***', '**' or '*' for p < 0.001, 0.01, 0.05, otherwise ''."""
    for threshold, stars in ((0.001, '***'), (0.01, '**'), (0.05, '*')):
        if p_value < threshold:
            return stars
    return ''


print('[기존 피처 vs 타겟 상관관계 Top 15]')
print('='*60)
print(f'{"순위":<5}{"피처":<40}{"r":<10}{"p-value":<12}')
print('-'*60)
for rank, (_, row) in enumerate(corr_df.head(15).iterrows(), start=1):
    sig = _significance_stars(row['pval'])
    print(f'{rank:<5}{row["feature"]:<40}{row["corr"]:+.4f}    {row["pval"]:.2e} {sig}')

In [None]:
# Horizontal bar chart of the 15 features with the largest absolute
# correlation to the target (corr_df is already sorted by abs_corr).
fig, ax = plt.subplots(figsize=(10, 8))
top15 = corr_df.head(15)
# Red bars for positive correlations, blue for zero or negative ones
colors = ['#e74c3c' if c > 0 else '#3498db' for c in top15['corr']]
ax.barh(range(len(top15)), top15['corr'], color=colors)
ax.set_yticks(range(len(top15)))
ax.set_yticklabels(top15['feature'])
ax.set_xlabel('상관계수 (r)')
ax.set_title('기존 피처 vs 판매점수 상관관계 Top 15')
ax.axvline(x=0, color='black', linewidth=0.5)
# Put the strongest feature at the top of the chart
ax.invert_yaxis()
plt.tight_layout()
plt.savefig(f'{IMG_PATH}/01_feature_correlation.png', dpi=150, bbox_inches='tight')
plt.show()

## 4. 기존 피처만으로 모델 학습

In [None]:
# Regression model templates; a fresh copy of each is fitted inside the loop.
reg_models = {}
reg_models['Linear Regression'] = LinearRegression()
reg_models['Ridge'] = Ridge(alpha=1.0)
reg_models['Random Forest'] = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
reg_models['Gradient Boosting'] = GradientBoostingRegressor(n_estimators=100, random_state=42)
reg_models['LightGBM'] = lgb.LGBMRegressor(n_estimators=100, random_state=42, verbose=-1)
reg_models['XGBoost'] = xgb.XGBRegressor(n_estimators=100, random_state=42, verbosity=0)

# Only the linear models are trained on scaled inputs
needs_scaling = ['Linear Regression', 'Ridge']

# Scaling: statistics are learned on the training set and reused for the test set
scaler = RobustScaler()
scaler.fit(X_train)
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# Baseline: train with the existing features only
base_results = []

print('[기존 피처만 사용 - 회귀 모델 성능]')
print('='*50)
print(f'{"모델":<25}{"R²":<12}{"MAE":<12}')
print('-'*50)

for model_name, model in reg_models.items():
    use_scaled = model_name in needs_scaling
    X_tr = X_train_scaled if use_scaled else X_train
    X_te = X_test_scaled if use_scaled else X_test

    # Rebuild the estimator from its parameters so the template stays unfitted
    reg = type(model)(**model.get_params())
    reg.fit(X_tr, y_train)
    # Predictions are floored at zero before scoring
    y_pred = np.clip(reg.predict(X_te), 0, None)

    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    base_results.append(dict(model=model_name, r2=r2, mae=mae))
    print(f'{model_name:<25}{r2:<12.4f}{mae:<12.4f}')

base_results_df = pd.DataFrame(base_results)

In [None]:
# Classification model templates; a fresh copy of each is fitted inside the loop.
# NOTE(review): XGBoost uses a fixed scale_pos_weight=3 while the other models
# use class_weight='balanced' - confirm that 3 matches the class ratio.
clf_models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'LightGBM': lgb.LGBMClassifier(n_estimators=100, class_weight='balanced', random_state=42, verbose=-1),
    'XGBoost': xgb.XGBClassifier(n_estimators=100, scale_pos_weight=3, random_state=42, verbosity=0),
}

base_clf_results = []

print('\n[기존 피처만 사용 - 분류 모델 성능]')
print('='*60)
print(f'{"모델":<25}{"F1":<12}{"AUC":<12}{"Precision":<12}')
print('-'*60)

# Ground-truth positives as a plain array so it aligns positionally with the
# numpy predictions
actual_positive = y_test_class.to_numpy() == 1

for model_name, model in clf_models.items():
    # Only logistic regression is trained on the scaled features
    if model_name == 'Logistic Regression':
        X_tr, X_te = X_train_scaled, X_test_scaled
    else:
        X_tr, X_te = X_train, X_test

    # Rebuild the estimator from its parameters so the template stays unfitted
    clf = model.__class__(**model.get_params())
    clf.fit(X_tr, y_train_class)
    y_pred_c = clf.predict(X_te)
    y_prob = clf.predict_proba(X_te)[:, 1]

    f1 = f1_score(y_test_class, y_pred_c)
    auc = roc_auc_score(y_test_class, y_prob)
    # Precision = true positives / predicted positives. The previous code
    # averaged over the ACTUAL positives, which is recall, not precision.
    predicted_positive = y_pred_c == 1
    if predicted_positive.any():
        prec = (actual_positive & predicted_positive).sum() / predicted_positive.sum()
    else:
        prec = 0

    base_clf_results.append({'model': model_name, 'f1': f1, 'auc': auc})
    print(f'{model_name:<25}{f1:<12.4f}{auc:<12.4f}{prec:<12.4f}')

base_clf_results_df = pd.DataFrame(base_clf_results)

## 5. 기존 피처 SHAP 분석

In [None]:
# SHAP analysis with a LightGBM regressor trained on the existing features
reg_base = lgb.LGBMRegressor(n_estimators=100, random_state=42, verbose=-1)
reg_base.fit(X_train, y_train)

# SHAP values are computed on the test set
explainer_base = shap.TreeExplainer(reg_base)
shap_values_base = explainer_base.shap_values(X_test)

# SHAP summary plot (bar type, at most 20 features), saved to IMG_PATH
plt.figure(figsize=(12, 10))
shap.summary_plot(shap_values_base, X_test, plot_type="bar", show=False, max_display=20)
plt.title('기존 피처 SHAP Feature Importance')
plt.tight_layout()
plt.savefig(f'{IMG_PATH}/02_base_shap_importance.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# SHAP importance table: mean absolute SHAP value per feature, largest first
mean_abs_shap_base = np.mean(np.abs(shap_values_base), axis=0)
shap_base_importance = (
    pd.DataFrame({'feature': feature_cols, 'mean_abs_shap': mean_abs_shap_base})
    .sort_values('mean_abs_shap', ascending=False)
)

print('[기존 피처 SHAP 중요도 Top 10]')
print('='*50)
# Each feature's share is reported relative to the sum over all features
total_imp = shap_base_importance['mean_abs_shap'].sum()
for rank, (_, row) in enumerate(shap_base_importance.head(10).iterrows(), start=1):
    pct = row['mean_abs_shap'] / total_imp * 100
    print(f'{rank}. {row["feature"]:<35} {row["mean_abs_shap"]:.4f} ({pct:.1f}%)')

## 6. 기존 피처 예측 결과 시각화

In [None]:
# Predicted vs. actual for the LightGBM baseline (reg_base); predictions are
# floored at zero.
y_pred_base = np.maximum(reg_base.predict(X_test), 0)

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: scatter of actual vs. predicted with the y = x reference line
ax1 = axes[0]
ax1.scatter(y_test, y_pred_base, alpha=0.4, s=20)
ax1.plot([0, y_test.max()], [0, y_test.max()], 'r--', lw=2)
ax1.set_xlabel('실제값')
ax1.set_ylabel('예측값')
ax1.set_title(f'기존 피처: 예측 vs 실제 (R² = {r2_score(y_test, y_pred_base):.4f})')
ax1.grid(True, alpha=0.3)

# Right panel: histogram of residuals (actual - predicted)
ax2 = axes[1]
residuals = y_test - y_pred_base
ax2.hist(residuals, bins=50, edgecolor='black', alpha=0.7)
ax2.axvline(x=0, color='r', linestyle='--', lw=2)
ax2.set_xlabel('잔차')
ax2.set_ylabel('빈도')
ax2.set_title(f'잔차 분포 (MAE = {mean_absolute_error(y_test, y_pred_base):.4f})')

plt.tight_layout()
plt.savefig(f'{IMG_PATH}/03_base_prediction_result.png', dpi=150, bbox_inches='tight')
plt.show()

---
# Part 2: Lag 피처 (전주 판매점수) 추가

**y_lag1**(전주 판매점수)를 추가했을 때 예측 성능이 어떻게 변하는가?

## 7. Lag 피처 생성

In [None]:
# Build lag features: the target shifted by 1 to 4 rows within each product.
# NOTE(review): a shift of one row equals "previous week" only if every product
# has one row per consecutive week - confirm against the dataset.
lag_steps = [1, 2, 3, 4]
df_lag = df.copy()
for lag in lag_steps:
    shifted_target = df_lag.groupby('product_code')['y_sales_score'].shift(lag)
    df_lag[f'y_lag{lag}'] = shifted_target

print('[Lag 피처 결측치]')
n_rows_before_drop = len(df_lag)
for lag in lag_steps:
    na = df_lag[f'y_lag{lag}'].isna().sum()
    print(f'  y_lag{lag}: {na}개 ({na/n_rows_before_drop*100:.1f}%)')

# Drop the rows that have no y_lag1; y_lag2..y_lag4 may still contain NaN
has_lag1 = df_lag['y_lag1'].notna()
df_lag = df_lag[has_lag1].reset_index(drop=True)
print(f'\n결측치 제거 후: {len(df_lag):,}개')

In [None]:
# Correlation between each lag feature and the target
y_lag_full = df_lag['y_sales_score']

print('[Lag 피처 vs 타겟 상관관계]')
print('='*40)
lag_corrs = {}
for lag in [1, 2, 3, 4]:
    # y_lag2..y_lag4 can still hold NaN, so correlate on the valid rows only
    valid = df_lag[f'y_lag{lag}'].notna()
    corr, pval = stats.pearsonr(df_lag.loc[valid, f'y_lag{lag}'], y_lag_full[valid])
    lag_corrs[lag] = corr
    # Report the computed p-value instead of always claiming "p < 0.001"
    p_text = 'p < 0.001' if pval < 0.001 else f'p = {pval:.3f}'
    print(f'  y_lag{lag}: r = {corr:.4f} ({p_text})')

# Compare with the strongest correlation among the existing features
best_base = corr_df.iloc[0]
lag1_corr = lag_corrs[1]
best_base_abs = abs(best_base["corr"])
print(f'\n[비교]')
print(f'  기존 피처 최고: {best_base["feature"]} (r = {best_base["corr"]:.4f})')
if best_base_abs > 0:
    # Use the measured y_lag1 correlation rather than a hard-coded 0.885
    print(f'  y_lag1: r = {lag1_corr:.3f} → 기존 최고 대비 {abs(lag1_corr)/best_base_abs:.1f}배 강함')
else:
    # The ratio is undefined when the best existing correlation is 0 or NaN
    print(f'  y_lag1: r = {lag1_corr:.3f}')

In [None]:
# Scatter plot of y_lag1 against the current target, with the y = x reference
# line and the Pearson r shown in the title.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(df_lag['y_lag1'], df_lag['y_sales_score'], alpha=0.3, s=10)
ax.plot([0, df_lag['y_sales_score'].max()], [0, df_lag['y_sales_score'].max()], 'r--', lw=2)
ax.set_xlabel('전주 판매점수 (y_lag1)')
ax.set_ylabel('현재 판매점수')
corr_lag1 = stats.pearsonr(df_lag['y_lag1'], df_lag['y_sales_score'])[0]
ax.set_title(f'전주 vs 현재 판매점수 (r = {corr_lag1:.3f})')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(f'{IMG_PATH}/04_lag1_correlation.png', dpi=150, bbox_inches='tight')
plt.show()

## 8. Lag 피처 추가 후 모델 학습

In [None]:
# Time-based split of the lag dataset.
# A stable sort keeps the incoming row order of df_lag inside each week, which
# makes the split reproducible from run to run.
df_lag_sorted = df_lag.sort_values('ymw', kind='stable').reset_index(drop=True)
if df_lag_sorted.empty:
    raise ValueError('Cannot split an empty dataset.')
split_idx_lag = int(len(df_lag_sorted) * 0.8)

# Cut on a week boundary instead of a raw row position. Slicing at
# split_idx_lag would normally leave rows of the same week in both train and
# test. The week that contains the 80% row goes entirely to the test set.
split_week_lag = df_lag_sorted['ymw'].iloc[split_idx_lag]
is_test_row_lag = df_lag_sorted['ymw'] >= split_week_lag
train_lag = df_lag_sorted[~is_test_row_lag]
test_lag = df_lag_sorted[is_test_row_lag]
if train_lag.empty:
    raise ValueError('Time-based split produced an empty training set.')

# Existing features plus the one-step lag of the target
feature_cols_lag = feature_cols + ['y_lag1']

X_train_lag = train_lag[feature_cols_lag]
X_test_lag = test_lag[feature_cols_lag]
y_train_lag = train_lag['y_sales_score']
y_test_lag = test_lag['y_sales_score']
# Binary target: 1 when the sales score is positive, else 0
y_train_lag_class = (y_train_lag > 0).astype(int)
y_test_lag_class = (y_test_lag > 0).astype(int)

# Scaling: statistics are learned on the training set and reused for the test set
scaler_lag = RobustScaler()
X_train_lag_scaled = pd.DataFrame(scaler_lag.fit_transform(X_train_lag), columns=X_train_lag.columns, index=X_train_lag.index)
X_test_lag_scaled = pd.DataFrame(scaler_lag.transform(X_test_lag), columns=X_test_lag.columns, index=X_test_lag.index)

print('[Time-based Split (Lag 데이터)]')
print(f'  Train: {len(train_lag):,}개')
print(f'  Test:  {len(test_lag):,}개')

In [None]:
# Regression performance once y_lag1 is added to the feature set
lag_results = []

print('[y_lag1 추가 - 회귀 모델 성능]')
print('='*50)
print(f'{"모델":<25}{"R²":<12}{"MAE":<12}')
print('-'*50)

for model_name, model in reg_models.items():
    # Models listed in needs_scaling are trained on the scaled inputs
    use_scaled = model_name in needs_scaling
    X_tr = X_train_lag_scaled if use_scaled else X_train_lag
    X_te = X_test_lag_scaled if use_scaled else X_test_lag

    # Rebuild the estimator from its parameters so the template stays unfitted
    reg = type(model)(**model.get_params())
    reg.fit(X_tr, y_train_lag)
    # Predictions are floored at zero before scoring
    y_pred = np.clip(reg.predict(X_te), 0, None)

    r2 = r2_score(y_test_lag, y_pred)
    mae = mean_absolute_error(y_test_lag, y_pred)

    lag_results.append(dict(model=model_name, r2=r2, mae=mae))
    print(f'{model_name:<25}{r2:<12.4f}{mae:<12.4f}')

lag_results_df = pd.DataFrame(lag_results)

In [None]:
# Classification performance once y_lag1 is added to the feature set
lag_clf_results = []

print('\n[y_lag1 추가 - 분류 모델 성능]')
print('='*60)
print(f'{"모델":<25}{"F1":<12}{"AUC":<12}')
print('-'*60)

for model_name, model in clf_models.items():
    # Only logistic regression is trained on the scaled features
    use_scaled = model_name == 'Logistic Regression'
    X_tr = X_train_lag_scaled if use_scaled else X_train_lag
    X_te = X_test_lag_scaled if use_scaled else X_test_lag

    # Rebuild the estimator from its parameters so the template stays unfitted
    clf = type(model)(**model.get_params())
    clf.fit(X_tr, y_train_lag_class)
    y_pred_c = clf.predict(X_te)
    # Probability of the positive class, used for the AUC
    y_prob = clf.predict_proba(X_te)[:, 1]

    f1 = f1_score(y_test_lag_class, y_pred_c)
    auc = roc_auc_score(y_test_lag_class, y_prob)

    lag_clf_results.append(dict(model=model_name, f1=f1, auc=auc))
    print(f'{model_name:<25}{f1:<12.4f}{auc:<12.4f}')

lag_clf_results_df = pd.DataFrame(lag_clf_results)

## 9. Lag 피처 추가 후 SHAP 분석

In [None]:
# SHAP analysis with a LightGBM regressor trained on existing features + y_lag1
reg_lag = lgb.LGBMRegressor(n_estimators=100, random_state=42, verbose=-1)
reg_lag.fit(X_train_lag, y_train_lag)

# SHAP values are computed on the lag test set
explainer_lag = shap.TreeExplainer(reg_lag)
shap_values_lag = explainer_lag.shap_values(X_test_lag)

# SHAP summary plot (bar type, at most 20 features), saved to IMG_PATH
plt.figure(figsize=(12, 10))
shap.summary_plot(shap_values_lag, X_test_lag, plot_type="bar", show=False, max_display=20)
plt.title('기존 피처 + y_lag1: SHAP Feature Importance')
plt.tight_layout()
plt.savefig(f'{IMG_PATH}/05_lag_shap_importance.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# SHAP importance table: mean absolute SHAP value per feature, largest first
mean_abs_shap_lag = np.mean(np.abs(shap_values_lag), axis=0)
shap_lag_importance = (
    pd.DataFrame({'feature': feature_cols_lag, 'mean_abs_shap': mean_abs_shap_lag})
    .sort_values('mean_abs_shap', ascending=False)
)

print('[y_lag1 추가 후 SHAP 중요도 Top 10]')
print('='*55)
# Each feature's share is reported relative to the sum over all features
total_imp_lag = shap_lag_importance['mean_abs_shap'].sum()
for rank, (_, row) in enumerate(shap_lag_importance.head(10).iterrows(), start=1):
    pct = row['mean_abs_shap'] / total_imp_lag * 100
    print(f'{rank}. {row["feature"]:<35} {row["mean_abs_shap"]:.4f} ({pct:.1f}%)')

# y_lag1 against the combined importance of all 'prophet_' features
is_lag1_row = shap_lag_importance['feature'] == 'y_lag1'
is_prophet_row = shap_lag_importance['feature'].str.startswith('prophet_')
y_lag1_imp = shap_lag_importance.loc[is_lag1_row, 'mean_abs_shap'].values[0]
prophet_imp = shap_lag_importance.loc[is_prophet_row, 'mean_abs_shap'].sum()

y_lag1_share = y_lag1_imp/total_imp_lag*100
prophet_share = prophet_imp/total_imp_lag*100
print(f'\n[피처 그룹별 중요도]')
print(f'  y_lag1: {y_lag1_share:.1f}%')
print(f'  prophet_forecast (10개): {prophet_share:.1f}%')

## 10. Lag 피처 추가 후 예측 결과 시각화

In [None]:
# Predicted vs. actual for the LightGBM model with y_lag1 (reg_lag);
# predictions are floored at zero.
y_pred_lag = np.maximum(reg_lag.predict(X_test_lag), 0)

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: scatter of actual vs. predicted with the y = x reference line
ax1 = axes[0]
ax1.scatter(y_test_lag, y_pred_lag, alpha=0.4, s=20)
ax1.plot([0, y_test_lag.max()], [0, y_test_lag.max()], 'r--', lw=2)
ax1.set_xlabel('실제값')
ax1.set_ylabel('예측값')
ax1.set_title(f'기존 + y_lag1: 예측 vs 실제 (R² = {r2_score(y_test_lag, y_pred_lag):.4f})')
ax1.grid(True, alpha=0.3)

# Right panel: histogram of residuals (actual - predicted)
ax2 = axes[1]
residuals_lag = y_test_lag - y_pred_lag
ax2.hist(residuals_lag, bins=50, edgecolor='black', alpha=0.7)
ax2.axvline(x=0, color='r', linestyle='--', lw=2)
ax2.set_xlabel('잔차')
ax2.set_ylabel('빈도')
ax2.set_title(f'잔차 분포 (MAE = {mean_absolute_error(y_test_lag, y_pred_lag):.4f})')

plt.tight_layout()
plt.savefig(f'{IMG_PATH}/06_lag_prediction_result.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# ROC curve and confusion matrix for a LightGBM classifier trained on the
# existing features + y_lag1.
clf_lag = lgb.LGBMClassifier(n_estimators=100, class_weight='balanced', random_state=42, verbose=-1)
clf_lag.fit(X_train_lag, y_train_lag_class)
# Positive-class probabilities feed the ROC curve; hard labels feed the matrix
y_prob_lag = clf_lag.predict_proba(X_test_lag)[:, 1]
y_pred_lag_class = clf_lag.predict(X_test_lag)

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: ROC curve with the diagonal chance line
ax1 = axes[0]
fpr, tpr, _ = roc_curve(y_test_lag_class, y_prob_lag)
auc_score = roc_auc_score(y_test_lag_class, y_prob_lag)
ax1.plot(fpr, tpr, 'b-', lw=2, label=f'LightGBM (AUC = {auc_score:.4f})')
ax1.plot([0, 1], [0, 1], 'r--', lw=1)
ax1.fill_between(fpr, tpr, alpha=0.2)
ax1.set_xlabel('False Positive Rate')
ax1.set_ylabel('True Positive Rate')
ax1.set_title('ROC Curve - 베스트셀러 진입 분류')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Right panel: confusion matrix, with the F1 score in the title
ax2 = axes[1]
cm = confusion_matrix(y_test_lag_class, y_pred_lag_class)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['비진입', '베스트셀러'])
disp.plot(ax=ax2, cmap='Blues', values_format='d')
ax2.set_title(f'Confusion Matrix (F1 = {f1_score(y_test_lag_class, y_pred_lag_class):.4f})')

plt.tight_layout()
plt.savefig(f'{IMG_PATH}/07_lag_roc_confusion.png', dpi=150, bbox_inches='tight')
plt.show()

---
# Part 3: 성능 비교 및 인사이트

## 11. 기존 피처 vs y_lag1 추가 비교

In [None]:
# Regression comparison: join the baseline and the y_lag1 results per model
comparison = base_results_df.merge(lag_results_df, on='model', suffixes=('_base', '_lag'))
r2_gain = comparison['r2_lag'] - comparison['r2_base']
comparison['r2_improvement'] = r2_gain
# Relative gain is measured against the magnitude of the baseline R²
comparison['r2_improvement_pct'] = comparison['r2_improvement'] / comparison['r2_base'].abs() * 100

print('[회귀 모델 성능 비교]')
print('='*75)
print(f'{"모델":<25}{"기존 R²":<12}{"+ y_lag1 R²":<12}{"개선":<12}{"개선율":<12}')
print('-'*75)
for row in comparison.itertuples(index=False):
    print(f'{row.model:<25}{row.r2_base:<12.4f}{row.r2_lag:<12.4f}{row.r2_improvement:+.4f}      {row.r2_improvement_pct:+.1f}%')

In [None]:
# Classification comparison: join the baseline and the y_lag1 results per model
comparison_clf = base_clf_results_df.merge(lag_clf_results_df, on='model', suffixes=('_base', '_lag'))
f1_gain = comparison_clf['f1_lag'] - comparison_clf['f1_base']
comparison_clf['f1_improvement'] = f1_gain

print('\n[분류 모델 성능 비교]')
print('='*70)
print(f'{"모델":<25}{"기존 F1":<12}{"+ y_lag1 F1":<12}{"기존 AUC":<12}{"+ y_lag1 AUC":<12}')
print('-'*70)
for row in comparison_clf.itertuples(index=False):
    print(f'{row.model:<25}{row.f1_base:<12.4f}{row.f1_lag:<12.4f}{row.auc_base:<12.4f}{row.auc_lag:<12.4f}')

In [None]:
# Side-by-side bar charts: baseline features vs. baseline + y_lag1
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
width = 0.35

# Left panel: regression R² per model
ax1 = axes[0]
x = np.arange(len(comparison))
bars1 = ax1.bar(x - width/2, comparison['r2_base'], width, label='기존 피처만', color='#3498db')
bars2 = ax1.bar(x + width/2, comparison['r2_lag'], width, label='+ y_lag1', color='#e74c3c')
ax1.set_xticks(x)
ax1.set_xticklabels(comparison['model'], rotation=45, ha='right')
ax1.set_ylabel('R²')
ax1.set_title('회귀 모델: 기존 피처 vs + y_lag1')
ax1.legend()
# R² can be negative; extend the axis downwards so such bars are not hidden
r2_floor = min(0, comparison[['r2_base', 'r2_lag']].min().min())
ax1.set_ylim(r2_floor, 1)

# Right panel: classification F1 per model.
# The classifier table has its own set of models, so it needs its own bar
# positions; reusing the regression positions raises a shape-mismatch error
# whenever the two tables differ in length.
ax2 = axes[1]
x_clf = np.arange(len(comparison_clf))
bars1 = ax2.bar(x_clf - width/2, comparison_clf['f1_base'], width, label='기존 피처만', color='#3498db')
bars2 = ax2.bar(x_clf + width/2, comparison_clf['f1_lag'], width, label='+ y_lag1', color='#e74c3c')
ax2.set_xticks(x_clf)
ax2.set_xticklabels(comparison_clf['model'], rotation=45, ha='right')
ax2.set_ylabel('F1 Score')
ax2.set_title('분류 모델: 기존 피처 vs + y_lag1')
ax2.legend()
ax2.set_ylim(0, 1)

plt.tight_layout()
plt.savefig(f'{IMG_PATH}/08_performance_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

## 12. 최종 결론

In [None]:
def _lightgbm_metric(results_df, column):
    """Return the value of `column` on the 'LightGBM' row of `results_df`."""
    return results_df.loc[results_df['model'] == 'LightGBM', column].values[0]


# Final summary, built from the LightGBM rows of the result tables
print('='*70)
print('최종 분석 결론')
print('='*70)

print('\n[1. 기존 피처만 사용 시]')
print(f'  - 회귀 R² (LightGBM): {_lightgbm_metric(base_results_df, "r2"):.4f}')
print(f'  - 분류 F1 (LightGBM): {_lightgbm_metric(base_clf_results_df, "f1"):.4f}')
print(f'  - 해석: 바이럴 지수, Prophet 예측값으로는 예측 어려움')

print('\n[2. y_lag1 추가 시]')
print(f'  - 회귀 R² (LightGBM): {_lightgbm_metric(lag_results_df, "r2"):.4f}')
print(f'  - 분류 F1 (LightGBM): {_lightgbm_metric(lag_clf_results_df, "f1"):.4f}')
# Let the format spec emit the sign: a hard-coded '+' prints "+-12%" when the
# score actually went down.
print(f'  - 개선율: R² {_lightgbm_metric(comparison, "r2_improvement_pct"):+.0f}%')

print('\n[3. SHAP 피처 중요도]')
print(f'  - y_lag1: {y_lag1_imp/total_imp_lag*100:.1f}%')
print(f'  - prophet_forecast (10개): {prophet_imp/total_imp_lag*100:.1f}%')

print('\n[4. 핵심 인사이트]')
print('  - "과거 성과가 미래 성과를 결정한다" (베스트셀러 관성 효과)')
print('  - 바이럴 지수, Prophet 예측값은 y_lag1 대비 기여도 미미')
print('  - 전주에 베스트셀러였던 책은 이번 주에도 베스트셀러일 확률 높음')

In [None]:
# List the files found in the image directory, in sorted order
print('\n[저장된 시각화 파일]')
saved_files = os.listdir(IMG_PATH)
saved_files.sort()
for file_name in saved_files:
    print(f'  {file_name}')