# preprocess_ev_battery.ipynb 기준 공행성 쌍 예측 모델

이 노트북은 `preprocess_ev_battery.ipynb`에서 생성한 전처리 결과를 사용하여 공행성 쌍을 예측합니다.

## 특징
- EV 배터리 전처리 결과(monthly_features) 사용
- 클러스터 정보 활용 (battery_materials, ev_machinery_electrical, other_industries)
- 보정된 값(value_filled, value_clip) 및 로그 변환 값 활용


In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, QuantileRegressor
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

print("라이브러리 로드 완료")


라이브러리 로드 완료


## 1. 전처리된 데이터 로드

주의: preprocess_ev_battery.ipynb를 먼저 실행하여 monthly_features.csv를 생성해야 합니다.


In [2]:
# Load the preprocessing output written by preprocess_ev_battery.ipynb.
try:
    monthly_features = pd.read_csv('../analysis/data/monthly_features.csv')
except FileNotFoundError:
    # Explain how to produce the missing file, then let the error propagate.
    print("⚠️  monthly_features.csv 파일이 없습니다.")
    print("preprocess_ev_battery.ipynb를 먼저 실행하세요.")
    raise
else:
    # Only reached when the CSV was read successfully.
    print(f"전처리 결과 로드 완료: {len(monthly_features):,}행")

# Show the available columns and a preview of the first rows.
print(f"컬럼: {list(monthly_features.columns)}")
monthly_features.head()


전처리 결과 로드 완료: 3,776행
컬럼: ['item_id', 'hs4', 'hs2', 'year', 'month', 'ym', 'cluster', 'value_sum', 'weight_sum', 'quantity_sum', 'has_quantity_info_hs4', 'has_quantity_this_row', 'weight_zero_flag', 'value_zero_flag', 'weight_missing_flag', 'value_missing_flag', 'value_extreme_flag', 'weight_extreme_flag', 'quantity_extreme_flag', 'weight_filled', 'value_filled', 'value_clip', 'weight_clip', 'quantity_clip', 'log_value', 'log_weight', 'log_quantity']


Unnamed: 0,item_id,hs4,hs2,year,month,ym,cluster,value_sum,weight_sum,quantity_sum,...,weight_extreme_flag,quantity_extreme_flag,weight_filled,value_filled,value_clip,weight_clip,quantity_clip,log_value,log_weight,log_quantity
0,AANGBULD,4810,48,2022,1,2022-01-01,other_industries,14276.0,17625.0,0.0,...,0,0,17625.0,14276.0,14276.0,17625.0,0.0,9.566405,9.77713,0.0
1,AANGBULD,4810,48,2022,2,2022-02-01,other_industries,52347.0,67983.0,0.0,...,0,0,67983.0,52347.0,52347.0,67983.0,0.0,10.865669,11.127028,0.0
2,AANGBULD,4810,48,2022,3,2022-03-01,other_industries,53549.0,69544.0,0.0,...,0,0,69544.0,53549.0,53549.0,69544.0,0.0,10.888371,11.149729,0.0
3,AANGBULD,4810,48,2022,5,2022-05-01,other_industries,26997.0,34173.0,0.0,...,0,0,34173.0,26997.0,26997.0,34173.0,0.0,10.203518,10.43922,0.0
4,AANGBULD,4810,48,2022,6,2022-06-01,other_industries,84489.0,103666.0,0.0,...,0,0,103666.0,84489.0,84489.0,103666.0,0.0,11.344388,11.548939,0.0


## 2. Pivot 테이블 생성 (보정된 값 사용)


In [3]:
# Item x month matrix built from value_filled (the imputed values, before
# outlier handling). Missing item/month combinations become 0.0.
pivot = monthly_features.pivot_table(
    values='value_filled',
    index='item_id',
    columns='ym',
    aggfunc='sum',
    fill_value=0.0,
)

print(f"Pivot table shape: {pivot.shape}")

# Per-item metadata, indexed by item_id.
item_info = (
    monthly_features
    .loc[:, ['item_id', 'hs4', 'hs2', 'cluster']]
    .drop_duplicates()
    .set_index('item_id')
)
# item_id -> hs4 code and item_id -> cluster name lookups.
item_hs4, item_cluster = (
    item_info[col].to_dict() for col in ('hs4', 'cluster')
)

print(f"item_id 수: {len(item_hs4)}")
print("클러스터 분포:")
print(monthly_features['cluster'].value_counts())


Pivot table shape: (100, 43)
item_id 수: 100
클러스터 분포:
cluster
battery_materials          1818
other_industries           1364
ev_machinery_electrical     594
Name: count, dtype: int64


## 3. 공행성쌍 탐색 (EV 배터리 클러스터 고려)


In [4]:
def safe_corr(x, y):
    """Return the Pearson correlation of ``x`` and ``y``, or 0.0 if undefined.

    Args:
        x: 1-D sequence of numbers.
        y: 1-D sequence of numbers with the same length as ``x``.

    Returns:
        The correlation coefficient as a Python float. 0.0 is returned when
        the coefficient is undefined: fewer than two observations, a series
        with zero variance, or a non-finite result (e.g. NaN in the input).
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    # Fewer than two points: np.std of an empty array is NaN, which would
    # slip past the zero-variance guard below.
    if x.size < 2 or y.size < 2:
        return 0.0
    if np.std(x) == 0 or np.std(y) == 0:
        return 0.0

    corr = float(np.corrcoef(x, y)[0, 1])
    # NaN/inf inputs make the coefficient non-finite; report "no correlation"
    # instead of leaking NaN to callers.
    return corr if np.isfinite(corr) else 0.0


def get_cluster_boost(leader_cluster, follower_cluster):
    """Return the weight applied to a leader/follower pair based on clusters.

    Same-cluster pairs get 2.0 (battery_materials), 1.8
    (ev_machinery_electrical) or 1.2 (any other cluster). A
    battery_materials leader with an ev_machinery_electrical follower gets
    1.5. Every other combination gets 1.0.
    """
    if leader_cluster != follower_cluster:
        # Cross-cluster: only the materials -> machinery direction is boosted.
        is_supply_chain = (
            leader_cluster == 'battery_materials'
            and follower_cluster == 'ev_machinery_electrical'
        )
        return 1.5 if is_supply_chain else 1.0

    # Same cluster: weight depends on which cluster it is.
    same_cluster_weight = {
        'battery_materials': 2.0,
        'ev_machinery_electrical': 1.8,
    }
    return same_cluster_weight.get(leader_cluster, 1.2)


def find_comovement_pairs_ev_battery(pivot, item_hs4, item_cluster,
                                     max_lag=6, min_nonzero=12, corr_threshold=0.4,
                                     min_abs_corr=0.25):
    """Find leader -> follower item pairs whose series co-move with a lag.

    For every ordered pair of distinct items, the leader series is correlated
    with the follower series shifted forward by ``lag`` months
    (``leader[:-lag]`` vs ``follower[lag:]``) for ``lag`` in 1..``max_lag``.
    The lag with the largest absolute correlation is kept. A pair is reported
    when both filters pass:

    * ``abs(corr) >= min_abs_corr`` (raw correlation floor), and
    * ``abs(corr) * cluster_boost >= corr_threshold`` (cluster-weighted).

    Args:
        pivot: DataFrame indexed by item id with one column per month.
        item_hs4: Mapping item id -> hs4 code (missing ids default to 0).
        item_cluster: Mapping item id -> cluster name (missing ids default to
            'other_industries').
        max_lag: Largest lag (in columns of ``pivot``) to try.
        min_nonzero: Minimum number of non-zero months a series needs to take
            part as leader or follower.
        corr_threshold: Threshold for the cluster-weighted absolute correlation.
        min_abs_corr: Floor for the unweighted absolute correlation.

    Returns:
        DataFrame with one row per accepted pair and the columns
        leading_item_id, following_item_id, best_lag, max_corr (signed),
        cluster_boost, same_cluster, same_hs4, leader_cluster,
        follower_cluster. The columns are present even when no pair is found.
    """
    result_columns = [
        "leading_item_id", "following_item_id", "best_lag", "max_corr",
        "cluster_boost", "same_cluster", "same_hs4",
        "leader_cluster", "follower_cluster",
    ]

    items = pivot.index.to_list()
    n_months = pivot.shape[1]
    series_matrix = pivot.to_numpy(dtype=float)

    # Extract each series once and drop the ones with too few non-zero months,
    # instead of re-reading the follower row inside the O(n^2) loop.
    candidates = [
        (item, series)
        for item, series in zip(items, series_matrix)
        if np.count_nonzero(series) >= min_nonzero
    ]
    # A lag is only usable when at least one overlapping month remains.
    usable_lags = [lag for lag in range(1, max_lag + 1) if lag < n_months]

    results = []

    for leader, x in tqdm(candidates, desc="Finding comovement pairs (EV battery)"):
        leader_hs4 = item_hs4.get(leader, 0)
        leader_cluster = item_cluster.get(leader, 'other_industries')

        for follower, y in candidates:
            if follower == leader:
                continue

            # Keep the lag with the strongest (absolute) correlation.
            best_lag = None
            best_corr = 0.0
            for lag in usable_lags:
                corr = safe_corr(x[:-lag], y[lag:])
                if abs(corr) > abs(best_corr):
                    best_corr = corr
                    best_lag = lag

            # Raw-correlation floor: rejects weak pairs that would otherwise
            # pass only because of a large cluster boost.
            if best_lag is None or abs(best_corr) < min_abs_corr:
                continue

            follower_cluster = item_cluster.get(follower, 'other_industries')
            cluster_boost = get_cluster_boost(leader_cluster, follower_cluster)
            if abs(best_corr) * cluster_boost < corr_threshold:
                continue

            results.append({
                "leading_item_id": leader,
                "following_item_id": follower,
                "best_lag": best_lag,
                "max_corr": best_corr,
                "cluster_boost": cluster_boost,
                "same_cluster": int(leader_cluster == follower_cluster),
                "same_hs4": int(leader_hs4 == item_hs4.get(follower, 0)),
                "leader_cluster": leader_cluster,
                "follower_cluster": follower_cluster,
            })

    # Explicit columns keep downstream groupby/column access working when
    # nothing was found.
    return pd.DataFrame(results, columns=result_columns)


# Run the pair search with its default thresholds.
pairs = find_comovement_pairs_ev_battery(pivot, item_hs4, item_cluster)
print(f"탐색된 공행성쌍 수: {pairs.shape[0]}")

# Count accepted pairs per (leader cluster, follower cluster) combination.
print("\n클러스터별 공행성쌍 분포:")
print(pairs.groupby(by=['leader_cluster', 'follower_cluster']).size())
pairs.head()


Finding comovement pairs (EV battery): 100it [00:01, 76.50it/s]

탐색된 공행성쌍 수: 2604

클러스터별 공행성쌍 분포:
leader_cluster           follower_cluster       
battery_materials        battery_materials          1111
                         ev_machinery_electrical     312
                         other_industries            232
ev_machinery_electrical  battery_materials           124
                         ev_machinery_electrical      99
                         other_industries             61
other_industries         battery_materials           278
                         ev_machinery_electrical      58
                         other_industries            329
dtype: int64





Unnamed: 0,leading_item_id,following_item_id,best_lag,max_corr,cluster_boost,same_cluster,same_hs4,leader_cluster,follower_cluster
0,AANGBULD,APQGTRMF,5,-0.443984,1.0,0,0,other_industries,battery_materials
1,AANGBULD,BEZYMBBT,1,-0.333863,1.2,1,0,other_industries,other_industries
2,AANGBULD,DEWLVASR,6,0.640221,1.2,1,0,other_industries,other_industries
3,AANGBULD,DNMPSKTB,4,-0.410635,1.0,0,0,other_industries,battery_materials
4,AANGBULD,EVBVXETX,6,0.436623,1.2,1,0,other_industries,other_industries


## 4. 학습 데이터 생성 (보정된 값 및 로그 변환 값 활용)


In [5]:
def build_training_data_ev_battery(pivot, pairs, monthly_features):
    """Build the supervised training table for next-month follower values.

    For each pair in ``pairs`` and each month index ``t`` with
    ``max(lag, 1) <= t <= n_months - 2`` one row is produced:

    * ``b_t`` / ``b_t_1``: follower value at ``t`` and ``t - 1``,
    * ``a_t_lag``: leader value at ``t - lag``,
    * pair-level features copied from ``pairs`` (max_corr, best_lag,
      cluster_boost, same_cluster, same_hs4),
    * follower_log_value / follower_value_clip / follower_has_quantity: the
      follower's ``log_value``, ``value_clip`` and ``has_quantity_this_row``
      from ``monthly_features`` for month ``t`` (0 when the row or column is
      absent),
    * ``target``: follower value at ``t + 1``.

    Args:
        pivot: DataFrame indexed by item id with one column per month.
        pairs: DataFrame with leading_item_id, following_item_id, best_lag,
            max_corr, cluster_boost, same_cluster and same_hs4 columns.
        monthly_features: Long-format DataFrame with at least ``item_id`` and
            ``ym`` columns; ``ym`` values must match the columns of ``pivot``.

    Returns:
        DataFrame with the feature columns listed above plus ``target``. The
        columns are present even when no rows are generated.
    """
    output_columns = [
        "b_t", "b_t_1", "a_t_lag", "max_corr", "best_lag",
        "cluster_boost", "same_cluster", "same_hs4",
        "follower_log_value", "follower_value_clip",
        "follower_has_quantity", "target",
    ]

    months = pivot.columns.to_list()
    n_months = len(months)

    # Item id -> value series, extracted once.
    series_by_item = dict(zip(pivot.index, pivot.to_numpy(dtype=float)))

    # (item_id, ym) -> preprocessing features, built in a single pass so the
    # inner loop does a dict lookup instead of scanning monthly_features.
    # setdefault keeps the first row when a key occurs more than once.
    extra_cols = [
        col for col in ('log_value', 'value_clip', 'has_quantity_this_row')
        if col in monthly_features.columns
    ]
    feature_lookup = {}
    keyed = monthly_features[['item_id', 'ym', *extra_cols]]
    for item_id, ym, *extras in keyed.itertuples(index=False, name=None):
        feature_lookup.setdefault((item_id, ym), dict(zip(extra_cols, extras)))

    rows = []

    for row in tqdm(pairs.itertuples(index=False), desc="Building training data", total=len(pairs)):
        follower = row.following_item_id
        a_series = series_by_item.get(row.leading_item_id)
        b_series = series_by_item.get(follower)
        if a_series is None or b_series is None:
            continue

        lag = int(row.best_lag)
        corr = float(row.max_corr)
        cluster_boost = float(row.cluster_boost)
        same_cluster = float(int(row.same_cluster))
        same_hs4 = float(int(row.same_hs4))

        # Use only months where both t - lag >= 0 and t + 1 exist.
        for t in range(max(lag, 1), n_months - 1):
            follower_feats = feature_lookup.get((follower, months[t]), {})

            rows.append({
                "b_t": b_series[t],
                "b_t_1": b_series[t - 1],
                "a_t_lag": a_series[t - lag],
                "max_corr": corr,
                "best_lag": float(lag),
                "cluster_boost": cluster_boost,
                "same_cluster": same_cluster,
                "same_hs4": same_hs4,
                "follower_log_value": follower_feats.get('log_value', 0),
                "follower_value_clip": follower_feats.get('value_clip', 0),
                "follower_has_quantity": float(follower_feats.get('has_quantity_this_row', 0)),
                "target": b_series[t + 1],
            })

    return pd.DataFrame(rows, columns=output_columns)


# Build the training table for every discovered pair and report its size.
df_train_model = build_training_data_ev_battery(
    pivot=pivot,
    pairs=pairs,
    monthly_features=monthly_features,
)
print(f"생성된 학습 데이터의 shape: {df_train_model.shape}")
df_train_model.head()


Building training data: 100%|██████████| 2604/2604 [00:52<00:00, 49.66it/s]

생성된 학습 데이터의 shape: (100125, 12)





Unnamed: 0,b_t,b_t_1,a_t_lag,max_corr,best_lag,cluster_boost,same_cluster,same_hs4,follower_log_value,follower_value_clip,follower_has_quantity,target
0,582317.0,539873.0,14276.0,-0.443984,5.0,1.0,0.0,0.0,13.274772,582317.0,0.0,759980.0
1,759980.0,582317.0,52347.0,-0.443984,5.0,1.0,0.0,0.0,13.541049,759980.0,0.0,216019.0
2,216019.0,759980.0,53549.0,-0.443984,5.0,1.0,0.0,0.0,12.283126,216019.0,0.0,537693.0
3,537693.0,216019.0,0.0,-0.443984,5.0,1.0,0.0,0.0,13.195045,537693.0,0.0,205326.0
4,205326.0,537693.0,26997.0,-0.443984,5.0,1.0,0.0,0.0,12.232359,205326.0,0.0,169440.0


In [6]:
# Model inputs, in the order the regressor sees them.
feature_cols = [
    'b_t', 'b_t_1', 'a_t_lag', 'max_corr', 'best_lag',
    'cluster_boost', 'same_cluster', 'same_hs4',
    'follower_log_value', 'follower_value_clip', 'follower_has_quantity',
]

# Replace NaN and +/-inf with 0.0 in both features and target.
train_X = np.nan_to_num(
    df_train_model[feature_cols].values, nan=0.0, posinf=0.0, neginf=0.0
)
train_y = np.nan_to_num(
    df_train_model["target"].values, nan=0.0, posinf=0.0, neginf=0.0
)

# Standardize the features (intended to improve NMAE).
scaler = StandardScaler().fit(train_X)
train_X_scaled = scaler.transform(train_X)

# Improvement 2: take the NMAE loss into account.
USE_QUANTILE = False  # Set to True to use quantile regression instead.
RIDGE_ALPHA = 1.0  # Tunable, e.g. 0.1, 1.0, 10.0.

if not USE_QUANTILE:
    # Ridge regression (regularized to limit overfitting).
    reg = Ridge(alpha=RIDGE_ALPHA)
    print(f"  - Ridge Regression 사용 (alpha={RIDGE_ALPHA})")
else:
    # Quantile regression at the 0.5 quantile, i.e. the median.
    reg = QuantileRegressor(quantile=0.5, alpha=RIDGE_ALPHA, solver='highs')
    print("  - Quantile Regression 사용 (중앙값 예측)")

reg.fit(train_X_scaled, train_y)

print("Model training completed!")
print("Feature importance (coefficients):")
# Coefficients are on the standardized feature scale.
for col, coef in zip(feature_cols, reg.coef_):
    print(f"  {col}: {coef:.6f}")


  - Ridge Regression 사용 (alpha=1.0)
Model training completed!
Feature importance (coefficients):
  b_t: 7344755.304980
  b_t_1: 6378292.282154
  a_t_lag: -33432.760672
  max_corr: -46327.403384
  best_lag: 8057.213273
  cluster_boost: 70937.713128
  same_cluster: -56122.682138
  same_hs4: 16911.122648
  follower_log_value: 89356.169224
  follower_value_clip: 654851.825265
  follower_has_quantity: -29463.253162


## 5. 예측 및 제출 파일 생성


In [7]:
def predict_ev_battery(pivot, pairs, reg, monthly_features, feature_scaler=None):
    """Predict the next-month follower value for every pair.

    Features are taken at the last month of ``pivot`` (index ``t_last``):
    follower value at ``t_last`` and ``t_last - 1``, leader value at
    ``t_last - lag``, the pair-level columns from ``pairs`` and the
    follower's ``log_value`` / ``value_clip`` / ``has_quantity_this_row``
    from ``monthly_features`` for that month (0 when unavailable). The
    feature order matches the training feature list.

    Args:
        pivot: DataFrame indexed by item id with one column per month.
        pairs: DataFrame with leading_item_id, following_item_id, best_lag,
            max_corr, cluster_boost, same_cluster and same_hs4 columns.
        reg: Fitted regressor exposing ``predict``.
        monthly_features: Long-format preprocessing output.
        feature_scaler: Fitted scaler exposing ``transform``. When None, the
            notebook-level ``scaler`` variable is used, as before.

    Returns:
        DataFrame with columns leading_item_id, following_item_id and value,
        where value is the prediction clipped at 0 and rounded to an int.
        The columns are present even when no prediction is made.
    """
    output_columns = ["leading_item_id", "following_item_id", "value"]

    if feature_scaler is None:
        # Backward compatibility: existing calls rely on the global scaler.
        feature_scaler = scaler

    months = pivot.columns.to_list()
    n_months = len(months)
    # Both the last and the previous month are needed as features.
    if n_months < 2:
        return pd.DataFrame(columns=output_columns)

    t_last = n_months - 1
    t_prev = n_months - 2

    series_by_item = dict(zip(pivot.index, pivot.to_numpy(dtype=float)))

    # Follower features for the last month, keyed by item id. Built once
    # instead of filtering monthly_features for every pair. If the key
    # columns are missing the lookup stays empty and defaults (0) are used.
    last_month_feats = {}
    if {'item_id', 'ym'} <= set(monthly_features.columns):
        last_rows = monthly_features[monthly_features['ym'] == months[t_last]]
        extra_cols = [
            col for col in ('log_value', 'value_clip', 'has_quantity_this_row')
            if col in last_rows.columns
        ]
        for item_id, *extras in last_rows[['item_id', *extra_cols]].itertuples(index=False, name=None):
            # setdefault keeps the first row when an item occurs twice.
            last_month_feats.setdefault(item_id, dict(zip(extra_cols, extras)))

    pair_keys = []
    feature_rows = []

    for row in tqdm(pairs.itertuples(index=False), desc="Making predictions", total=len(pairs)):
        leader = row.leading_item_id
        follower = row.following_item_id
        lag = int(row.best_lag)

        a_series = series_by_item.get(leader)
        b_series = series_by_item.get(follower)
        if a_series is None or b_series is None:
            continue
        # The lagged leader observation must exist.
        if t_last - lag < 0:
            continue

        feats = last_month_feats.get(follower, {})
        feature_rows.append([
            b_series[t_last],
            b_series[t_prev],
            a_series[t_last - lag],
            float(row.max_corr),
            float(lag),
            float(row.cluster_boost),
            float(int(row.same_cluster)),
            float(int(row.same_hs4)),
            feats.get('log_value', 0),
            feats.get('value_clip', 0),
            float(feats.get('has_quantity_this_row', 0)),
        ])
        pair_keys.append((leader, follower))

    if not feature_rows:
        return pd.DataFrame(columns=output_columns)

    # Same NaN/inf sanitisation as applied to the training matrix, so a
    # missing feature value cannot produce a NaN prediction.
    X_test = np.nan_to_num(
        np.asarray(feature_rows, dtype=float), nan=0.0, posinf=0.0, neginf=0.0
    )
    # Scale and predict all pairs in one call instead of one call per pair.
    y_pred = reg.predict(feature_scaler.transform(X_test))

    preds = [
        {
            "leading_item_id": leader,
            "following_item_id": follower,
            # Clip negatives to 0 and round to an integer value.
            "value": int(round(max(0.0, float(y)))),
        }
        for (leader, follower), y in zip(pair_keys, y_pred)
    ]
    return pd.DataFrame(preds, columns=output_columns)


from pathlib import Path

# Predict the next-month value for every pair and save the submission file.
submission_path = '../results/submissions/ev_battery_submit.csv'
submission = predict_ev_battery(pivot, pairs, reg, monthly_features)

# to_csv does not create missing directories; create the output folder first
# so the run does not fail at the very last step.
Path(submission_path).parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)

print(f"제출 파일 생성 완료: {submission_path}")
print(f"예측된 공행성쌍 수: {len(submission)}")
submission.head()


Making predictions: 100%|██████████| 2604/2604 [00:00<00:00, 2706.05it/s]

제출 파일 생성 완료: ../results/submissions/ev_battery_submit.csv
예측된 공행성쌍 수: 2604





Unnamed: 0,leading_item_id,following_item_id,value
0,AANGBULD,APQGTRMF,241860
1,AANGBULD,BEZYMBBT,3125675
2,AANGBULD,DEWLVASR,487311
3,AANGBULD,DNMPSKTB,5069528
4,AANGBULD,EVBVXETX,4914193
