## 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from tqdm import tqdm
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error
import warnings

warnings.filterwarnings('ignore')

## 2. Data Preprocessing (Load and Pivot)

In [2]:
train = pd.read_csv('../open/train.csv')

# Aggregate by year and month
monthly = (
    train
    .groupby(["item_id", "year", "month"], as_index=False)["value"]
    .sum()
)

# Create datetime column
monthly["ym"] = pd.to_datetime(
    monthly["year"].astype(str) + "-" + monthly["month"].astype(str).str.zfill(2)
)

# Create pivot table
pivot = (
    monthly
    .pivot(index="item_id", columns="ym", values="value")
    .fillna(0.0)
)

pivot.head()

ym,2022-01-01,2022-02-01,2022-03-01,2022-04-01,2022-05-01,2022-06-01,2022-07-01,2022-08-01,2022-09-01,2022-10-01,...,2024-10-01,2024-11-01,2024-12-01,2025-01-01,2025-02-01,2025-03-01,2025-04-01,2025-05-01,2025-06-01,2025-07-01
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AANGBULD,14276.0,52347.0,53549.0,0.0,26997.0,84489.0,0.0,0.0,0.0,0.0,...,428725.0,144248.0,26507.0,25691.0,25805.0,0.0,38441.0,0.0,441275.0,533478.0
AHMDUILJ,242705.0,120847.0,197317.0,126142.0,71730.0,149138.0,186617.0,169995.0,140547.0,89292.0,...,123085.0,143451.0,78649.0,125098.0,80404.0,157401.0,115509.0,127473.0,89479.0,101317.0
ANWUJOKX,0.0,0.0,0.0,63580.0,81670.0,26424.0,8470.0,0.0,0.0,80475.0,...,0.0,0.0,0.0,27980.0,0.0,0.0,0.0,0.0,0.0,0.0
APQGTRMF,383999.0,512813.0,217064.0,470398.0,539873.0,582317.0,759980.0,216019.0,537693.0,205326.0,...,683581.0,2147.0,0.0,25013.0,77.0,20741.0,2403.0,3543.0,32430.0,40608.0
ATLDMDBO,143097177.0,103568323.0,118403737.0,121873741.0,115024617.0,65716075.0,146216818.0,97552978.0,72341427.0,87454167.0,...,60276050.0,30160198.0,42613728.0,64451013.0,38667429.0,29354408.0,42450439.0,37136720.0,32181798.0,57090235.0


## 3. Find Comovement Pairs (Baseline Logic)

In [3]:
def safe_corr(x, y):
    if np.std(x) == 0 or np.std(y) == 0:
        return 0.0
    return float(np.corrcoef(x, y)[0, 1])

def find_comovement_pairs(pivot, max_lag=6, min_nonzero=12, corr_threshold=0.33):
    items = pivot.index.to_list()
    months = pivot.columns.to_list()
    n_months = len(months)

    results = []

    for i, leader in tqdm(enumerate(items)):
        x = pivot.loc[leader].values.astype(float)
        if np.count_nonzero(x) < min_nonzero:
            continue

        for follower in items:
            if follower == leader:
                continue

            y = pivot.loc[follower].values.astype(float)
            if np.count_nonzero(y) < min_nonzero:
                continue

            best_lag = None
            best_corr = 0.0

            for lag in range(1, max_lag + 1):
                if n_months <= lag:
                    continue
                corr = safe_corr(x[:-lag], y[lag:])
                if abs(corr) > abs(best_corr):
                    best_corr = corr
                    best_lag = lag

            if best_lag is not None and abs(best_corr) >= corr_threshold:
                results.append({
                    "leading_item_id": leader,
                    "following_item_id": follower,
                    "best_lag": best_lag,
                    "max_corr": best_corr,
                })

    pairs = pd.DataFrame(results)
    return pairs

pairs = find_comovement_pairs(pivot)
print("Comovement pairs found:", len(pairs))
pairs.head()

100it [00:01, 52.46it/s]

Comovement pairs found: 2565





Unnamed: 0,leading_item_id,following_item_id,best_lag,max_corr
0,AANGBULD,APQGTRMF,5,-0.443984
1,AANGBULD,BEZYMBBT,1,-0.333863
2,AANGBULD,DDEXPPXU,2,0.383169
3,AANGBULD,DEWLVASR,6,0.640221
4,AANGBULD,DNMPSKTB,4,-0.410635


## 4. Feature Engineering (New Features)

Create lag, rolling, and periodicity features for all items.

In [4]:
lags = [1, 3, 6, 12]
windows = [3, 6, 12]

def create_features(pivot_df):
    # Melt pivot to long format
    df_long = pivot_df.reset_index().melt(id_vars='item_id', var_name='ym', value_name='value')
    df_long['ym'] = pd.to_datetime(df_long['ym'])
    df_long = df_long.sort_values(by=['item_id', 'ym']).reset_index(drop=True)
    
    print("Creating features...")
    # 1. Periodicity
    df_long['month'] = df_long['ym'].dt.month
    
    # 2. Lag Features
    for lag in lags:
        df_long[f'lag_{lag}'] = df_long.groupby('item_id')['value'].shift(lag)
        
    # 3. Rolling Features
    # Use shift(1) to prevent target leakage in rolling features
    grouped_value = df_long.groupby('item_id')['value'].shift(1)
    for window in windows:
        df_long[f'rolling_mean_{window}'] = grouped_value.rolling(window, min_periods=1).mean().reset_index(drop=True)
        df_long[f'rolling_std_{window}'] = grouped_value.rolling(window, min_periods=1).std().reset_index(drop=True)
        
    # Set index for fast lookup later
    df_long = df_long.set_index(['item_id', 'ym'])
    print("Features created.")
    return df_long

df_features_long = create_features(pivot)

# Define feature columns
base_feature_cols = ['b_t', 'b_t_1', 'a_t_lag', 'max_corr', 'best_lag']
new_feature_cols = ['month'] + [f'lag_{lag}' for lag in lags] + [f'rolling_{metric}_{window}' for metric in ['mean', 'std'] for window in windows]
all_feature_cols = base_feature_cols + new_feature_cols

df_features_long.head()

Creating features...
Features created.


Unnamed: 0_level_0,Unnamed: 1_level_0,value,month,lag_1,lag_3,lag_6,lag_12,rolling_mean_3,rolling_std_3,rolling_mean_6,rolling_std_6,rolling_mean_12,rolling_std_12
item_id,ym,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AANGBULD,2022-01-01,14276.0,1,,,,,,,,,,
AANGBULD,2022-02-01,52347.0,2,14276.0,,,,14276.0,,14276.0,,14276.0,
AANGBULD,2022-03-01,53549.0,3,52347.0,,,,33311.5,26920.262267,33311.5,26920.262267,33311.5,26920.262267
AANGBULD,2022-04-01,0.0,4,53549.0,14276.0,,,40057.333333,22335.376924,40057.333333,22335.376924,40057.333333,22335.376924
AANGBULD,2022-05-01,26997.0,5,0.0,52347.0,,,35298.666667,30575.44934,30043.0,27087.39307,30043.0,27087.39307


## 5. Build Training Data with All Features

Modify `build_training_data` to join the new features.

In [5]:
def build_training_data(pivot, pairs, features_df):
    months = pivot.columns.to_list()
    n_months = len(months)
    rows = []
    print("Building training data...")

    for row in tqdm(pairs.itertuples(index=False)):
        leader = row.leading_item_id
        follower = row.following_item_id
        lag = int(row.best_lag)
        corr = float(row.max_corr)

        if leader not in pivot.index or follower not in pivot.index:
            continue

        a_series = pivot.loc[leader].values.astype(float)
        b_series = pivot.loc[follower].values.astype(float)

        for t in range(max(lag, 1), n_months - 1):
            b_t = b_series[t]
            b_t_1 = b_series[t - 1]
            a_t_lag = a_series[t - lag]
            b_t_plus_1 = b_series[t + 1]
            
            current_ym = months[t]
            
            new_row = {
                "b_t": b_t,
                "b_t_1": b_t_1,
                "a_t_lag": a_t_lag,
                "max_corr": corr,
                "best_lag": float(lag),
                "target": b_t_plus_1,
            }
            
            try:
                features_to_add = features_df.loc[(follower, current_ym)]
                for feat in new_feature_cols:
                    new_row[feat] = features_to_add[feat]
            except KeyError:
                for feat in new_feature_cols:
                    new_row[feat] = np.nan # Will be filled later
            
            rows.append(new_row)

    df_train = pd.DataFrame(rows)
    # Fill NaNs from lags/rolling with 0
    df_train = df_train.fillna(0)
    print("Training data built.")
    return df_train

df_train_model = build_training_data(pivot, pairs, df_features_long)
print('Shape of training data:', df_train_model.shape)
df_train_model.head()

Building training data...


2565it [00:05, 483.34it/s]


Training data built.
Shape of training data: (98571, 17)


Unnamed: 0,b_t,b_t_1,a_t_lag,max_corr,best_lag,target,month,lag_1,lag_3,lag_6,lag_12,rolling_mean_3,rolling_mean_6,rolling_mean_12,rolling_std_3,rolling_std_6,rolling_std_12
0,582317.0,539873.0,14276.0,-0.443984,5.0,759980.0,6.0,539873.0,217064.0,0.0,0.0,409111.666667,424829.4,195647.909091,169907.102884,130274.924753,234522.93115
1,759980.0,582317.0,52347.0,-0.443984,5.0,216019.0,7.0,582317.0,470398.0,383999.0,0.0,530862.666667,451077.333333,246042.181818,56500.931146,133082.564001,253668.110041
2,216019.0,759980.0,53549.0,-0.443984,5.0,537693.0,8.0,759980.0,539873.0,512813.0,0.0,627390.0,513740.833333,315131.272727,116770.948266,176587.606294,281880.13391
3,537693.0,216019.0,0.0,-0.443984,5.0,205326.0,9.0,216019.0,582317.0,217064.0,0.0,519438.666667,464275.166667,334769.363636,277378.182239,214416.524353,264733.341634
4,205326.0,537693.0,26997.0,-0.443984,5.0,169440.0,10.0,537693.0,759980.0,470398.0,0.0,504564.0,517713.333333,383650.545455,273489.561704,177209.040946,245695.304757


## 6. Grid Search with K-Fold Validation

In [6]:
print("Starting Grid Search...")
X_train = df_train_model[all_feature_cols]
y_train = df_train_model["target"]

lgb_model = lgb.LGBMRegressor(random_state=42, device='cuda', n_jobs=-1, verbose=-1)

# Define a smaller parameter grid for faster search
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'num_leaves': [31, 50]
}

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=lgb_model, 
    param_grid=param_grid, 
    cv=kfold, 
    scoring='neg_mean_squared_error', 
    verbose=2, 
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

print(f"Grid Search finished.")
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best CV RMSE: {np.sqrt(-grid_search.best_score_)}")

best_params = grid_search.best_params_

Starting Grid Search...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END learning_rate=0.05, n_estimators=100, num_leaves=31; total time=  32.0s
[CV] END learning_rate=0.05, n_estimators=100, num_leaves=31; total time=  32.2s
[CV] END learning_rate=0.05, n_estimators=100, num_leaves=31; total time=  32.4s
[CV] END learning_rate=0.05, n_estimators=100, num_leaves=31; total time=  33.3s
[CV] END learning_rate=0.05, n_estimators=100, num_leaves=31; total time=  35.9s
[CV] END learning_rate=0.05, n_estimators=100, num_leaves=50; total time=  47.3s
[CV] END learning_rate=0.05, n_estimators=100, num_leaves=50; total time=  48.9s
[CV] END learning_rate=0.05, n_estimators=100, num_leaves=50; total time=  49.9s
[CV] END learning_rate=0.05, n_estimators=100, num_leaves=50; total time=  52.2s
[CV] END learning_rate=0.05, n_estimators=100, num_leaves=50; total time=  58.2s
[CV] END learning_rate=0.05, n_estimators=200, num_leaves=31; total time= 1.0min
[CV] END learning_rate=0.

## 7. Train Final Model on Whole Data

Train the model using the best parameters found by Grid Search on the entire training set.

In [7]:
print("Training final model on whole dataset...")
# Prepare data with feature names (DataFrames)
train_X_df = df_train_model[all_feature_cols]
train_y = df_train_model["target"]

final_params = {
    'seed': 42,
    'n_jobs': -1,
    'device': 'cuda',
    'verbose': -1
}
final_params.update(best_params) # Add best params from grid search

reg = lgb.LGBMRegressor(**final_params)
reg.fit(train_X_df, train_y)

print("Final model trained.")

Training final model on whole dataset...
Final model trained.


## 8. Create Submission File

In [8]:
def predict(pivot, pairs, reg, features_df):
    months = pivot.columns.to_list()
    n_months = len(months)

    t_last = n_months - 1
    t_prev = n_months - 2
    
    last_ym = months[t_last]

    preds = []
    print("Generating predictions...")

    for row in tqdm(pairs.itertuples(index=False)):
        leader = row.leading_item_id
        follower = row.following_item_id
        lag = int(row.best_lag)
        corr = float(row.max_corr)

        if leader not in pivot.index or follower not in pivot.index:
            continue

        a_series = pivot.loc[leader].values.astype(float)
        b_series = pivot.loc[follower].values.astype(float)

        if t_last - lag < 0:
            continue

        # Base features
        b_t = b_series[t_last]
        b_t_1 = b_series[t_prev]
        a_t_lag = a_series[t_last - lag]

        X_test_list = [b_t, b_t_1, a_t_lag, corr, float(lag)]
        
        # New features
        try:
            features_to_add = features_df.loc[(follower, last_ym)]
            for feat in new_feature_cols:
                X_test_list.append(features_to_add[feat])
        except KeyError:
            for feat in new_feature_cols:
                X_test_list.append(0) # Fill with 0 if missing

        # Create DataFrame for prediction to ensure feature name consistency
        X_test_df = pd.DataFrame([X_test_list], columns=all_feature_cols)
        X_test_df = X_test_df.fillna(0)
        
        y_pred = reg.predict(X_test_df)[0]
        y_pred = max(0.0, float(y_pred))
        y_pred = int(round(y_pred))

        preds.append({
            "leading_item_id": leader,
            "following_item_id": follower,
            "value": y_pred,
        })

    df_pred = pd.DataFrame(preds)
    print("Predictions generated.")
    return df_pred

In [9]:
submission = predict(pivot, pairs, reg, df_features_long)
submission.head()

Generating predictions...


2565it [00:12, 197.33it/s]

Predictions generated.





Unnamed: 0,leading_item_id,following_item_id,value
0,AANGBULD,APQGTRMF,47237
1,AANGBULD,BEZYMBBT,3677097
2,AANGBULD,DDEXPPXU,17325
3,AANGBULD,DEWLVASR,382646
4,AANGBULD,DNMPSKTB,5355388


In [10]:
submission.to_csv('./submissions/gemini.csv', index=False)
print("Submission file saved.")

Submission file saved.


In [20]:
# 4. ⭐️ (수정) .feature_importances_ '속성'으로 값을 가져옵니다.
# (괄호()가 없고, 뒤에 밑줄(_)이 붙습니다)
feature_gain = reg.feature_importances_

# 5. 결과를 보기 좋게 DataFrame으로 만들기
feature_names = X_train.columns
importance_df = pd.DataFrame({
    'feature': feature_names,
    'gain': feature_gain
})

# 6. 중요도(gain)가 높은 순서대로 정렬
importance_df = importance_df.sort_values(by='gain', ascending=False)

# 7. 결과 출력
print("--- LGBMRegressor Feature Importance (based on Gain) ---")
print(importance_df)

--- LGBMRegressor Feature Importance (based on Gain) ---
            feature  gain
0               b_t  1061
13    rolling_std_3   975
7             lag_3   868
12  rolling_mean_12   797
15   rolling_std_12   774
8             lag_6   767
14    rolling_std_6   750
6             lag_1   703
9            lag_12   685
5             month   669
10   rolling_mean_3   666
11   rolling_mean_6   660
1             b_t_1   280
3          max_corr    76
2           a_t_lag    59
4          best_lag    10
