In [None]:
# =============================================================================
# 1. 라이브러리 임포트 및 설정
# =============================================================================
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

# Scikit-learn 관련
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression # (비교용으로 남겨둠)
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# XGBoost 관련
from xgboost import XGBClassifier, XGBRegressor

# 경고 무시 (선택사항)
import warnings
warnings.filterwarnings('ignore')

# WMAPE (Weighted Mean Absolute Percentage Error)
# -> error-rate metric that accounts for the scale of the actual values
def wmape(y_true, y_pred):
    """Return ``sum(|y_true - y_pred|) / sum(|y_true|)``.

    Both inputs are converted to plain float arrays first, so the comparison
    is positional: lists are accepted, and two pandas Series with different
    indexes are not label-aligned (alignment would turn every difference into
    NaN and make the metric collapse to 0).

    If ``sum(|y_true|)`` is zero, the division follows NumPy floating-point
    rules and yields ``inf`` or ``nan``.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.sum(np.abs(y_true - y_pred)) / np.sum(np.abs(y_true))


# =============================================================================
# 2. 데이터 로드 및 전처리 (Pivot 생성)
# =============================================================================
print("Loading Data...")
train = pd.read_csv("dataset/train.csv")

# Monthly aggregation: total value per (item, year, month)
monthly = train.groupby(["item_id", "year", "month"], as_index=False)["value"].sum()

# Build a datetime column from a 'YYYY-MM' string (month zero-padded to 2 digits)
monthly["ym"] = pd.to_datetime(
    monthly["year"].astype(str).str.cat(
        monthly["month"].astype(str).str.zfill(2), sep="-"
    )
)

# Pivot table (rows: item_id, columns: ym, values: value); missing cells become 0.0
pivot = monthly.pivot(index="item_id", columns="ym", values="value").fillna(0.0)

print(f"Pivot Shape: {pivot.shape}")


# =============================================================================
# 3. Pair Feature Extraction (아이템 간 관계 추출)
# =============================================================================
def safe_corr(x, y):
    """Pearson correlation of x and y, or 0.0 when either input is constant.

    A zero standard deviation would make the correlation undefined, so that
    case is short-circuited instead of being passed to ``np.corrcoef``.
    """
    both_vary = np.std(x) != 0 and np.std(y) != 0
    if both_vary:
        corr_matrix = np.corrcoef(x, y)
        return float(corr_matrix[0, 1])
    return 0.0

def extract_pair_features(pivot, max_lag=6, min_nonzero=12):
    """Scan every ordered (leader, follower) item pair for lagged correlation.

    Parameters
    ----------
    pivot : DataFrame with one row per item and one column per month.
    max_lag : largest lag (in columns) to test; lags 1..max_lag are scanned.
    min_nonzero : items with fewer non-zero months than this are ignored,
        both as leader and as follower.

    Returns
    -------
    DataFrame with one row per retained pair and the columns ``leader``,
    ``follower``, ``max_corr`` (signed correlation at the lag with the largest
    absolute correlation), ``best_lag`` (0 if no lag produced a non-zero
    correlation), ``mean_abs_corr``, ``std_abs_corr``, ``nonzero_ratio_a``,
    ``nonzero_ratio_b`` and ``var_ratio``. Rows containing NaN are dropped.
    The columns are present even when no pair qualifies.
    """
    columns = [
        "leader", "follower",
        "max_corr", "best_lag",
        "mean_abs_corr", "std_abs_corr",
        "nonzero_ratio_a", "nonzero_ratio_b",
        "var_ratio",
    ]
    items = pivot.index.to_list()
    n_months = pivot.shape[1]

    # Per-item quantities do not depend on the pairing, so compute them once
    # instead of once per (leader, follower) combination. Insertion order
    # follows `items`, which keeps the output row order stable.
    series = {}
    for item in items:
        values = pivot.loc[item].values.astype(float)
        nonzero = np.count_nonzero(values)
        if nonzero < min_nonzero:
            continue
        series[item] = (values, nonzero / len(values), np.var(values))

    # A lag must leave at least one overlapping month.
    usable_lags = [lag for lag in range(1, max_lag + 1) if lag < n_months]

    rows = []

    print("Extracting Pair Features...")
    for leader in tqdm(items, desc="Scanning Leaders"):
        if leader not in series:
            continue
        x, x_nonzero_ratio, x_var = series[leader]

        for follower, (y, y_nonzero_ratio, y_var) in series.items():
            if follower == leader:
                continue

            corrs = []
            best_corr, best_lag = 0.0, 0

            # Scan correlation for lag 1 .. max_lag
            for lag in usable_lags:
                # x (leader) runs `lag` months ahead, y (follower) trails it
                c = safe_corr(x[:-lag], y[lag:])
                corrs.append(abs(c))
                if abs(c) > abs(best_corr):
                    best_corr = c
                    best_lag = lag

            rows.append({
                "leader": leader,
                "follower": follower,
                "max_corr": best_corr,
                "best_lag": best_lag,
                "mean_abs_corr": np.mean(corrs),
                "std_abs_corr": np.std(corrs),
                "nonzero_ratio_a": x_nonzero_ratio,
                "nonzero_ratio_b": y_nonzero_ratio,
                # small epsilon keeps the ratio finite for a zero-variance leader
                "var_ratio": y_var / (x_var + 1e-6),
            })

    # Passing `columns` keeps the schema intact when `rows` is empty.
    return pd.DataFrame(rows, columns=columns).dropna()

pair_df = extract_pair_features(pivot)


# =============================================================================
# 4. XGBoost Classifier (filter for meaningful relationships)
# =============================================================================
print("Classifying Significant Pairs...")

# Pseudo-labeling: a pair counts as "related" (1) when its best correlation is
# strong enough (|max_corr| >= 0.35) and a lag was actually found (best_lag > 0)
pair_df["label"] = (
    pair_df["max_corr"].abs().ge(0.35) & pair_df["best_lag"].gt(0)
).astype(int)

clf_features = [
    "max_corr",
    "best_lag",
    "mean_abs_corr",
    "std_abs_corr",
    "nonzero_ratio_a",
    "nonzero_ratio_b",
    "var_ratio",
]

X_clf = pair_df[clf_features]
y_clf = pair_df["label"]

# Fit the classifier on the pseudo-labels
clf = XGBClassifier(
    random_state=42,
    n_jobs=-1,
    eval_metric="logloss",
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
)
clf.fit(X_clf, y_clf)

# Probability-based filtering: keep only pairs whose class-1 probability is >= 0.6
pair_df["comove_prob"] = clf.predict_proba(X_clf)[:, 1]
pairs_ml = pair_df.loc[
    pair_df["comove_prob"] >= 0.6,
    ["leader", "follower", "best_lag", "max_corr"],
].rename(columns={"leader": "leading_item_id", "follower": "following_item_id"})

print(f"Selected Pairs: {len(pairs_ml)}")


# =============================================================================
# 5. Feature Engineering (회귀 분석용 데이터 생성)
# =============================================================================
print("Building Regression Dataset...")

# Pre-compute the seasonality table: mean value per (item_id, calendar month),
# reshaped to items x months with missing combinations filled with 0
seasonal_table = (
    monthly.groupby(["item_id", "month"])["value"]
    .agg("mean")
    .unstack(level="month", fill_value=0)
)

def _follower_feature_arrays(b, season_values):
    """Return per-month feature arrays (rolling stats, trend, season) for one series."""
    series = pd.Series(b)
    feats = {}
    for w in (3, 5, 7, 12):
        feats[f"roll_mean_{w}"] = series.rolling(w).mean().to_numpy()
    for w in (3, 5, 12):
        feats[f"roll_std_{w}"] = series.rolling(w).std().to_numpy()
    feats["trend"] = series.diff().to_numpy()
    feats["season_index"] = season_values
    return feats


def build_training_data_fe(pivot, pairs_ml):
    """Build the supervised regression table for the selected item pairs.

    For every pair in ``pairs_ml`` and every time step ``t`` from
    ``max(lag, 12)`` to the second-to-last column, one row is emitted whose
    features describe the follower at ``t`` (plus the leader at ``t - lag``)
    and whose ``target`` is the follower value at ``t + 1``.

    Parameters
    ----------
    pivot : DataFrame, rows = items, columns = months (convertible to datetime).
    pairs_ml : DataFrame with ``leading_item_id``, ``following_item_id``,
        ``best_lag`` and ``max_corr`` columns.

    Returns
    -------
    DataFrame with the feature columns, ``target`` and ``time_idx`` (the
    column position ``t`` the row was built from). Rows with NaN are dropped.
    The columns are present even when ``pairs_ml`` is empty.
    """
    columns = [
        "b_t", "b_t_1", "a_t_lag", "max_corr", "best_lag",
        "roll_mean_3", "roll_mean_5", "roll_mean_7", "roll_mean_12",
        "roll_std_3", "roll_std_5", "roll_std_12",
        "trend", "season_index",
        "target", "time_idx",
    ]
    n_months = pivot.shape[1]

    # Calendar month of every pivot column, and the per-item mean value for
    # each calendar month. Grouping the transposed frame avoids
    # `groupby(axis=1)`, which newer pandas versions deprecate/remove.
    # NOTE(review): this mean is taken over all columns, including the months
    # used as targets — confirm that is acceptable for validation.
    month_numbers = np.asarray(pd.to_datetime(pivot.columns).month)
    seasonal_index_map = pivot.T.groupby(month_numbers).mean().T

    # Follower features do not depend on the leader, so build them once per
    # follower instead of once per pair.
    follower_cache = {}
    rows = []

    for row in tqdm(pairs_ml.itertuples(index=False), desc="Generating Rows"):
        leader = row.leading_item_id
        follower = row.following_item_id
        lag = int(row.best_lag)
        corr = float(row.max_corr)

        a = pivot.loc[leader].values

        if follower not in follower_cache:
            b_values = pivot.loc[follower].values
            season_values = (
                seasonal_index_map.loc[follower].loc[month_numbers].to_numpy()
            )
            follower_cache[follower] = (
                b_values,
                _follower_feature_arrays(b_values, season_values),
            )
        b, feats = follower_cache[follower]

        # Training samples: features at t, target at t + 1. Starting at
        # max(lag, 12) keeps a[t - lag] in range and the 12-month windows full.
        for t in range(max(lag, 12), n_months - 1):
            sample = {
                "b_t": b[t],
                "b_t_1": b[t - 1],
                "a_t_lag": a[t - lag],  # leader value `lag` months earlier
                "max_corr": corr,
                "best_lag": lag,
            }
            for name, values in feats.items():
                sample[name] = values[t]
            sample["target"] = b[t + 1]  # label: next month's value
            sample["time_idx"] = t
            rows.append(sample)

    # Passing `columns` keeps the schema intact when `rows` is empty.
    return pd.DataFrame(rows, columns=columns).dropna()

df_train = build_training_data_fe(pivot, pairs_ml)

# Rows are generated pair by pair, but TimeSeriesSplit splits by row position.
# Order the rows chronologically (stable, so pair order is kept within a time
# step) so that every validation fold lies after its training fold in time.
df_train = df_train.sort_values("time_idx", kind="stable").reset_index(drop=True)

feature_cols = [
    'b_t','b_t_1','a_t_lag','max_corr','best_lag',
    'roll_mean_3','roll_mean_5','roll_mean_7','roll_mean_12',
    'roll_std_3','roll_std_5','roll_std_12',
    'trend','season_index'
]

X = df_train[feature_cols]
y = df_train["target"]

# =============================================================================
# 6. TimeSeries Split Cross Validation & Model Training
# =============================================================================
print("Training XGBRegressor with TimeSeriesSplit...")

# Time-series cross validation (5 folds).
# NOTE(review): TimeSeriesSplit splits by row position, so it is only a
# chronological split if the rows of X are ordered by time — confirm upstream.
tscv = TimeSeriesSplit(n_splits=5)

# 1. Model used for cross validation (early stopping enabled)
xg_reg = XGBRegressor(
    n_estimators=1000,
    max_depth=6,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:squarederror',
    random_state=42,
    n_jobs=-1,
    early_stopping_rounds=50  # kept on during cross validation
)

fold_scores = []
best_rounds = []  # number of boosting rounds early stopping selected per fold

for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Train on log1p(target); predictions are mapped back with expm1 below
    y_train_log = np.log1p(y_train)
    y_val_log = np.log1p(y_val)

    # NOTE(review): the validation fold drives early stopping and is also the
    # fold being scored, so the reported WMAPE is somewhat optimistic.
    xg_reg.fit(
        X_train, y_train_log,
        eval_set=[(X_val, y_val_log)],
        verbose=False
    )
    best_rounds.append(xg_reg.best_iteration + 1)

    pred_log = xg_reg.predict(X_val)
    pred = np.expm1(pred_log)
    pred = np.maximum(0, pred)  # clip negative predictions to zero

    score = wmape(y_val, pred)
    fold_scores.append(score)
    print(f"Fold {fold+1} WMAPE: {score:.4f}")

print(f"Average WMAPE: {np.mean(fold_scores):.4f}")

# Retrain on the full data (model used for the submission)
print("Retraining on Full Data...")

# 2. There is no validation set for the full fit, so early stopping is turned
#    off. Without it the model would run all 1000 rounds, which is not the
#    model the CV scores describe; use the average round count that early
#    stopping selected during CV instead.
final_rounds = max(1, int(round(np.mean(best_rounds))))
xg_reg.set_params(early_stopping_rounds=None, n_estimators=final_rounds)

y_log = np.log1p(y)
xg_reg.fit(X, y_log, verbose=False)

# =============================================================================
# 7. 최종 예측 (Next Month Prediction)
# =============================================================================
print("Predicting Next Month...")

def _latest_follower_features(b, t, season_value):
    """Return the follower-only features of series `b` at column position `t`."""
    series = pd.Series(b)
    feats = {
        "b_t": b[t],
        "b_t_1": b[t - 1],
        "trend": b[t] - b[t - 1],
        "season_index": season_value,
    }
    for w in (3, 5, 7, 12):
        feats[f"roll_mean_{w}"] = series.rolling(w).mean().iloc[t]
    for w in (3, 5, 12):
        feats[f"roll_std_{w}"] = series.rolling(w).std().iloc[t]
    return feats


def predict_next_month(pivot, pairs_ml, model):
    """Predict the follower's next-month value for every selected pair.

    Features are built at the latest pivot column ``t`` and passed to
    ``model`` in one batch; the model output is treated as log1p-scaled and
    mapped back with ``expm1``, rounded and clipped at zero.

    Parameters
    ----------
    pivot : DataFrame, rows = items, columns = months (convertible to datetime).
    pairs_ml : DataFrame with ``leading_item_id``, ``following_item_id``,
        ``best_lag`` and ``max_corr`` columns.
    model : fitted regressor exposing ``predict``; it receives the columns
        listed in the module-level ``feature_cols``, in that order.

    Returns
    -------
    DataFrame with ``leading_item_id``, ``following_item_id`` and integer
    ``value`` columns. Pairs whose lag reaches before the first column are
    skipped. The columns are present even when nothing is predicted.
    """
    out_columns = ["leading_item_id", "following_item_id", "value"]

    t = pivot.shape[1] - 1  # index of the most recent column
    if t < 0:
        return pd.DataFrame(columns=out_columns)

    # Season index = the item's mean over all pivot columns sharing the
    # calendar month of column t. This is the same pivot-based definition the
    # training rows use; reading a table built from the raw monthly rows
    # instead would give different values for items with zero-filled months.
    month_numbers = np.asarray(pd.to_datetime(pivot.columns).month)
    season_map = pivot.T.groupby(month_numbers).mean().T
    current_month = month_numbers[t]

    follower_cache = {}  # follower features are shared by all of its pairs
    pair_ids = []
    feature_rows = []

    for row in tqdm(pairs_ml.itertuples(index=False), desc="Forecasting"):
        leader = row.leading_item_id
        follower = row.following_item_id
        lag = int(row.best_lag)
        corr = float(row.max_corr)

        if t - lag < 0:
            continue

        if follower not in follower_cache:
            follower_cache[follower] = _latest_follower_features(
                pivot.loc[follower].values,
                t,
                season_map.loc[follower, current_month],
            )

        a = pivot.loc[leader].values
        features = dict(follower_cache[follower])
        features["a_t_lag"] = a[t - lag]
        features["max_corr"] = corr
        features["best_lag"] = lag

        pair_ids.append((leader, follower))
        feature_rows.append(features)

    if not feature_rows:
        return pd.DataFrame(columns=out_columns)

    # Same column order as during training (no scaling needed for tree models)
    X_test = pd.DataFrame(feature_rows)[feature_cols]

    # One batched predict call instead of one call per pair
    y_hat = np.expm1(model.predict(X_test))  # invert the log1p transform
    values = np.maximum(0, np.rint(y_hat)).astype(np.int64)

    result = pd.DataFrame(pair_ids, columns=out_columns[:2])
    result["value"] = values
    return result

# Forecast with the retrained model and write the result without the row index
submission = predict_next_month(pivot=pivot, pairs_ml=pairs_ml, model=xg_reg)
submission.to_csv(path_or_buf="final_submission.csv", index=False)

print("Done! Submission file saved.")

Loading Data...
Pivot Shape: (100, 43)
Extracting Pair Features...


Scanning Leaders: 100%|██████████| 100/100 [00:01<00:00, 55.03it/s]


Classifying Significant Pairs...
Selected Pairs: 2179
Building Regression Dataset...


Generating Rows: 2179it [00:03, 647.20it/s]


Training XGBRegressor with TimeSeriesSplit...
Fold 1 WMAPE: 0.1684
Fold 2 WMAPE: 0.1372
Fold 3 WMAPE: 0.1274
Fold 4 WMAPE: 0.1261
Fold 5 WMAPE: 0.1219
Average WMAPE: 0.1362
Retraining on Full Data...
Predicting Next Month...


Forecasting: 2179it [00:02, 1029.43it/s]

Done! Submission file saved.



