In [1]:

!pip install optuna_integration

Collecting optuna_integration
  Downloading optuna_integration-4.3.0-py3-none-any.whl.metadata (12 kB)
Collecting optuna (from optuna_integration)
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna->optuna_integration)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna->optuna_integration)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna_integration-4.3.0-py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.1-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.5/242.5 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloa

In [82]:
import os
import random
from tqdm import tqdm
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pandas as pd

from sklearn.metrics import make_scorer, mean_squared_error, mean_gamma_deviance
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import VotingRegressor
from sklearn.impute import SimpleImputer, KNNImputer

from scipy.optimize import minimize
import optuna

import optuna.integration.lightgbm as lgb

#from sklearn.ensemble import RandomForestRegressor
#from sklearn.experimental import enable_iterative_imputer
#from sklearn.impute import IterativeImputer

import warnings
warnings.filterwarnings('ignore')


SEED = 42

In [83]:
root = Path('/content')
df_train = pd.read_csv(root / 'sample_loss_data_shape.csv', parse_dates=["yyyymm"])
#df_test = pd.read_csv(root / 'test.csv')

In [84]:
df_train['diff'] = df_train['input_qty'] - df_train['output_qty']
df_train["month"] = pd.to_datetime(df_train["yyyymm"]).dt.to_period("M")
df_train.head()

Unnamed: 0,yyyymm,product_id,process_id,input_qty,output_qty,shape_mm,diff,month
0,2023-01-01,P-001,ProcA,9802,9367,4.2,435,2023-01
1,2023-01-01,P-001,ProcB,8909,8639,4.2,270,2023-01
2,2023-01-01,P-001,ProcC,11434,10912,4.2,522,2023-01
3,2023-01-01,P-002,ProcA,11311,10854,5.0,457,2023-01
4,2023-01-01,P-002,ProcB,10802,10515,5.0,287,2023-01


In [85]:
feature_cols = ['product_id', 'process_id', 'input_qty', 'shape_mm']
target_col = 'diff'

In [86]:
df_train = df_train.dropna(subset=[target_col])
df_train[target_col].isnull().any()

np.False_

In [87]:
# 初期訓練期間
start_test = df_train["month"].min() + 12  # 12 ヶ月後から予測開始
months = df_train["month"].unique()
results = []

train_idx = df_train["month"] < start_test

In [88]:
df_train_hy = df_train[train_idx]
df_train_hy["diff"] = df_train_hy["diff"].astype("float64")
df_train_hy.tail()

Unnamed: 0,yyyymm,product_id,process_id,input_qty,output_qty,shape_mm,diff,month
535,2023-12-01,P-004,ProcE,10285,9988,6.1,297.0,2023-12
536,2023-12-01,P-004,ProcG,8028,7637,6.1,391.0,2023-12
537,2023-12-01,P-005,ProcD,9494,9152,4.9,342.0,2023-12
538,2023-12-01,P-005,ProcE,8373,7992,4.9,381.0,2023-12
539,2023-12-01,P-005,ProcG,11334,10940,4.9,394.0,2023-12


In [89]:
df_train_hy[df_train_hy['diff']>0]

Unnamed: 0,yyyymm,product_id,process_id,input_qty,output_qty,shape_mm,diff,month
0,2023-01-01,P-001,ProcA,9802,9367,4.2,435.0,2023-01
1,2023-01-01,P-001,ProcB,8909,8639,4.2,270.0,2023-01
2,2023-01-01,P-001,ProcC,11434,10912,4.2,522.0,2023-01
3,2023-01-01,P-002,ProcA,11311,10854,5.0,457.0,2023-01
4,2023-01-01,P-002,ProcB,10802,10515,5.0,287.0,2023-01
...,...,...,...,...,...,...,...,...
535,2023-12-01,P-004,ProcE,10285,9988,6.1,297.0,2023-12
536,2023-12-01,P-004,ProcG,8028,7637,6.1,391.0,2023-12
537,2023-12-01,P-005,ProcD,9494,9152,4.9,342.0,2023-12
538,2023-12-01,P-005,ProcE,8373,7992,4.9,381.0,2023-12


In [90]:
gamma_scorer = make_scorer(mean_gamma_deviance)
rmse_scorer2 = make_scorer(mean_squared_error, squared=False)
rmse_scorer = make_scorer(mean_squared_error)

In [96]:
class CustomLGBMRegressor2(lgb.LGBMRegressor):
    def __init__(self, random_state, **kwargs):
        if 'random_state' in kwargs:
            random_state = kwargs.pop('random_state')
        super().__init__(random_state=random_state, **kwargs)
        self.enc_prod = OrdinalEncoder(
                          #categories=categories,
                          dtype=np.int32,
                          handle_unknown='use_encoded_value',
                          unknown_value=-1,
                          encoded_missing_value=-2,
                          min_frequency=2)
        self.enc_proc = OrdinalEncoder(
                          #categories=categories,
                          dtype=np.int32,
                          handle_unknown='use_encoded_value',
                          unknown_value=-1,
                          encoded_missing_value=-2,
                          min_frequency=2)

        #self.imputer = IterativeImputer(RandomForestRegressor())


#    def _encode(self, X, fit=False):
#        X = X.copy()
#        for col in ['product_id', 'process_id']:
#            X[col] = X[col].astype('category')
#        return X

    def _encode(self, X, fit=False):
        X = X.copy()
        if fit:
            X['product_id'] = self.enc_prod.fit_transform(X[['product_id']]).ravel()
            X['process_id'] = self.enc_proc.fit_transform(X[['process_id']]).ravel()
        else:
            X['product_id'] = self.enc_prod.transform(X[['product_id']]).ravel()
            X['process_id'] = self.enc_proc.transform(X[['process_id']]).ravel()
        return X

    def fit(self, X, y, **kwargs):
        X_enc = self._encode(X, fit=True)
        super().fit(X_enc, y, **kwargs)


#        super().fit(X, y, **kwargs)
        y_pred = super().predict(X_enc, **kwargs)
        #return self
#        X_enc = self._encode(X, fit=True)
#        super().fit(X, y, categorical_feature=['product_id','process_id'], **kwargs)
        #print(f'X:{X_enc.head()}')
        #print(f'y:{y}')
        #print(f'y_pred-y:{y_pred-y}')
        #print(self.random_state)
        return self

    def predict(self, X, **kwargs):
        X_enc = self._encode(X, fit=False)
        y_pred = super().predict(X_enc, **kwargs)
        return y_pred

In [100]:
#Nested CV
# ---------------- データ ---------------------
X = df_train_hy[feature_cols]
y = df_train_hy[target_col]

outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
outer_scores = []
best_params_each_fold = []

for outer_i, (train_idx, test_idx) in enumerate(outer_cv.split(X, y), 1):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    inner_cv = KFold(n_splits=3, shuffle=True, random_state=outer_i)

    # -------- Optuna objective ---------------
    def lgb_objective(trial):
      params = {
          'objective':         'gamma',
          'verbosity':         -1,
           #'n_iter':            200,
          'random_state':      SEED,
          'boosting_type':     'gbdt',
          #"lambda_l1":        trial.suggest_float("lambda_l1", 1e-3, 0.01, log=True),
          #'lambda_l2':         trial.suggest_float('lambda_l2', 1e-3, 0.01, log=True),
          'learning_rate':     trial.suggest_float('learning_rate', 1e-2, 1e-1, log=True),
          #'max_depth':         trial.suggest_int('max_depth', 2, 2),
          #'num_leaves':        trial.suggest_int('num_leaves', 2, 2),
          #'colsample_bytree':  trial.suggest_float('colsample_bytree', 0.4, 1.0),
          #'colsample_bynode':  trial.suggest_float('colsample_bynode', 0.4, 1.0),
          #'bagging_fraction':  trial.suggest_float('bagging_fraction', 0.4, 1.0),
          #'bagging_freq':      trial.suggest_int('bagging_freq', 1, 2),
          'min_data_in_leaf':  trial.suggest_int('min_data_in_leaf', 1, 2),
      }

      cv = KFold(3, shuffle=True, random_state=SEED)
      estimator = CustomLGBMRegressor2(randome_state=SEED, **params)
      #print(f'X:{X.head()}')
      #print(f'y:{y}')

      val_scores = cross_val_score(
            estimator=estimator,
            X=X_train, y=y_train,
            cv=inner_cv,
            scoring=gamma_scorer,
        )


      return np.mean(val_scores)

    study = optuna.create_study(direction='minimize', study_name='Regressor')
    study.optimize(lgb_objective, n_trials=10, show_progress_bar=True)

    best_params = study.best_trial.params
    best_params_each_fold.append(best_params)

    # -------- Outer-test 評価 ----------------
    final_model = CustomLGBMRegressor2(random_state=SEED, **best_params)

    # early-stopping を効かせる
    final_model.fit(
        X_train, y_train,
        #eval_set=[(X_test, y_test)],
        #early_stopping_rounds=50,
        #verbose=False
    )

    y_pred = final_model.predict(X_test)
    gamma = mean_gamma_deviance(y_test, y_pred)
    outer_scores.append(gamma)
    print(f"Fold {outer_i} gamma = {gamma:.4f}")

# ------------- まとめ ------------------------
print("\n=== Nested CV result ===")
print(f"Mean gamma : {np.mean(outer_scores):.4f} ± {np.std(outer_scores):.4f}")

pd.set_option("display.max_columns", None)
print(pd.DataFrame(best_params_each_fold))

[I 2025-05-25 07:16:03,651] A new study created in memory with name: Regressor


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2025-05-25 07:16:03,974] Trial 0 finished with value: 0.036072681292404075 and parameters: {'learning_rate': 0.015362547360287242, 'min_data_in_leaf': 2}. Best is trial 0 with value: 0.036072681292404075.
[I 2025-05-25 07:16:04,217] Trial 1 finished with value: 0.03791766628600738 and parameters: {'learning_rate': 0.06668229673640748, 'min_data_in_leaf': 1}. Best is trial 0 with value: 0.036072681292404075.
[I 2025-05-25 07:16:04,443] Trial 2 finished with value: 0.03791766628600738 and parameters: {'learning_rate': 0.051496125408171806, 'min_data_in_leaf': 1}. Best is trial 0 with value: 0.036072681292404075.
[I 2025-05-25 07:16:04,678] Trial 3 finished with value: 0.03791766628600738 and parameters: {'learning_rate': 0.04307856039185255, 'min_data_in_leaf': 1}. Best is trial 0 with value: 0.036072681292404075.
[I 2025-05-25 07:16:04,920] Trial 4 finished with value: 0.036072681292404075 and parameters: {'learning_rate': 0.01957723208227308, 'min_data_in_leaf': 2}. Best is trial 0 

[I 2025-05-25 07:16:06,199] A new study created in memory with name: Regressor


[I 2025-05-25 07:16:06,114] Trial 9 finished with value: 0.036072681292404075 and parameters: {'learning_rate': 0.05473207839642385, 'min_data_in_leaf': 2}. Best is trial 0 with value: 0.036072681292404075.
Fold 1 gamma = 0.0228


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2025-05-25 07:16:06,475] Trial 0 finished with value: 0.032530278834545824 and parameters: {'learning_rate': 0.014454677910889245, 'min_data_in_leaf': 2}. Best is trial 0 with value: 0.032530278834545824.
[I 2025-05-25 07:16:06,698] Trial 1 finished with value: 0.032530278834545824 and parameters: {'learning_rate': 0.024824695423250172, 'min_data_in_leaf': 2}. Best is trial 0 with value: 0.032530278834545824.
[I 2025-05-25 07:16:06,958] Trial 2 finished with value: 0.032530278834545824 and parameters: {'learning_rate': 0.02320995659151259, 'min_data_in_leaf': 2}. Best is trial 0 with value: 0.032530278834545824.
[I 2025-05-25 07:16:07,186] Trial 3 finished with value: 0.032216171487309105 and parameters: {'learning_rate': 0.019413515714490296, 'min_data_in_leaf': 1}. Best is trial 3 with value: 0.032216171487309105.
[I 2025-05-25 07:16:07,410] Trial 4 finished with value: 0.032216171487309105 and parameters: {'learning_rate': 0.026325222212459108, 'min_data_in_leaf': 1}. Best is tri

[I 2025-05-25 07:16:08,426] A new study created in memory with name: Regressor


[I 2025-05-25 07:16:08,224] Trial 8 finished with value: 0.032530278834545824 and parameters: {'learning_rate': 0.053729459073697146, 'min_data_in_leaf': 2}. Best is trial 3 with value: 0.032216171487309105.
[I 2025-05-25 07:16:08,373] Trial 9 finished with value: 0.032530278834545824 and parameters: {'learning_rate': 0.017022356213467274, 'min_data_in_leaf': 2}. Best is trial 3 with value: 0.032216171487309105.
Fold 2 gamma = 0.0171


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2025-05-25 07:16:08,587] Trial 0 finished with value: 0.030755675837042598 and parameters: {'learning_rate': 0.013682810415414642, 'min_data_in_leaf': 1}. Best is trial 0 with value: 0.030755675837042598.
[I 2025-05-25 07:16:08,729] Trial 1 finished with value: 0.029806875836709473 and parameters: {'learning_rate': 0.01818552723503213, 'min_data_in_leaf': 2}. Best is trial 1 with value: 0.029806875836709473.
[I 2025-05-25 07:16:08,892] Trial 2 finished with value: 0.030755675837042598 and parameters: {'learning_rate': 0.027199205914398623, 'min_data_in_leaf': 1}. Best is trial 1 with value: 0.029806875836709473.
[I 2025-05-25 07:16:09,028] Trial 3 finished with value: 0.029806875836709473 and parameters: {'learning_rate': 0.09212415722349612, 'min_data_in_leaf': 2}. Best is trial 1 with value: 0.029806875836709473.
[I 2025-05-25 07:16:09,176] Trial 4 finished with value: 0.030755675837042598 and parameters: {'learning_rate': 0.04426921501441371, 'min_data_in_leaf': 1}. Best is trial

[I 2025-05-25 07:16:09,960] A new study created in memory with name: Regressor


Fold 3 gamma = 0.0314


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2025-05-25 07:16:10,129] Trial 0 finished with value: 0.038057862046679894 and parameters: {'learning_rate': 0.056633888916703526, 'min_data_in_leaf': 1}. Best is trial 0 with value: 0.038057862046679894.
[I 2025-05-25 07:16:10,284] Trial 1 finished with value: 0.038057862046679894 and parameters: {'learning_rate': 0.010989419260320559, 'min_data_in_leaf': 1}. Best is trial 0 with value: 0.038057862046679894.
[I 2025-05-25 07:16:10,426] Trial 2 finished with value: 0.03680210616058191 and parameters: {'learning_rate': 0.03720770114679948, 'min_data_in_leaf': 2}. Best is trial 2 with value: 0.03680210616058191.
[I 2025-05-25 07:16:10,569] Trial 3 finished with value: 0.03680210616058191 and parameters: {'learning_rate': 0.06949968679512407, 'min_data_in_leaf': 2}. Best is trial 2 with value: 0.03680210616058191.
[I 2025-05-25 07:16:10,712] Trial 4 finished with value: 0.038057862046679894 and parameters: {'learning_rate': 0.023126334462475723, 'min_data_in_leaf': 1}. Best is trial 2 

[I 2025-05-25 07:16:11,504] A new study created in memory with name: Regressor


[I 2025-05-25 07:16:11,451] Trial 9 finished with value: 0.038057862046679894 and parameters: {'learning_rate': 0.01034214078657064, 'min_data_in_leaf': 1}. Best is trial 2 with value: 0.03680210616058191.
Fold 4 gamma = 0.0246


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2025-05-25 07:16:11,669] Trial 0 finished with value: 0.04453165535350556 and parameters: {'learning_rate': 0.027956940857999754, 'min_data_in_leaf': 1}. Best is trial 0 with value: 0.04453165535350556.
[I 2025-05-25 07:16:11,816] Trial 1 finished with value: 0.04453165535350556 and parameters: {'learning_rate': 0.011478855868618654, 'min_data_in_leaf': 1}. Best is trial 0 with value: 0.04453165535350556.
[I 2025-05-25 07:16:11,959] Trial 2 finished with value: 0.042947514798645475 and parameters: {'learning_rate': 0.09766262940864467, 'min_data_in_leaf': 2}. Best is trial 2 with value: 0.042947514798645475.
[I 2025-05-25 07:16:12,123] Trial 3 finished with value: 0.042947514798645475 and parameters: {'learning_rate': 0.015258421175880325, 'min_data_in_leaf': 2}. Best is trial 2 with value: 0.042947514798645475.
[I 2025-05-25 07:16:12,277] Trial 4 finished with value: 0.04453165535350556 and parameters: {'learning_rate': 0.09827141170062129, 'min_data_in_leaf': 1}. Best is trial 2 w

In [101]:
def lgb_objective(trial):
    params = {
        'objective':         'gamma',
        'verbosity':         -1,
        #'n_iter':            200,
        'random_state':      SEED,
        'boosting_type':     'gbdt',
        #"lambda_l1":        trial.suggest_float("lambda_l1", 1e-3, 0.01, log=True),
        #'lambda_l2':         trial.suggest_float('lambda_l2', 1e-3, 0.01, log=True),
        'learning_rate':     trial.suggest_float('learning_rate', 1e-2, 1e-1, log=True),
        #'max_depth':         trial.suggest_int('max_depth', 2, 2),
        #'num_leaves':        trial.suggest_int('num_leaves', 2, 2),
        #'colsample_bytree':  trial.suggest_float('colsample_bytree', 0.4, 1.0),
        #'colsample_bynode':  trial.suggest_float('colsample_bynode', 0.4, 1.0),
        #'bagging_fraction':  trial.suggest_float('bagging_fraction', 0.4, 1.0),
        #'bagging_freq':      trial.suggest_int('bagging_freq', 1, 2),
        'min_data_in_leaf':  trial.suggest_int('min_data_in_leaf', 1, 2),
    }

    X = df_train_hy[feature_cols]
    y = df_train_hy[target_col]
    #cv = StratifiedKFold(5, shuffle=True, random_state=SEED)
    cv = KFold(3, shuffle=True, random_state=SEED)
    estimator = CustomLGBMRegressor2(**params)
    #print(f'X:{X.head()}')
    #print(f'y:{y}')

    val_scores = cross_val_score(
        estimator=estimator,
        X=X, y=y,
        cv=cv,
        scoring=gamma_scorer,
    )

    return np.mean(val_scores)

In [102]:
params = study.best_params

model = CustomLGBMRegressor2(**params, random_state=SEED)


In [103]:
import shap

In [111]:

# 初期訓練期間
start_test = df_train["month"].min() + 12  # 12 ヶ月後から予測開始
months = df_train["month"].unique()
results = []

train_idx = df_train["month"] < start_test

# SHAP値を格納するための新しいカラムを初期化
# 各特徴量に対応するSHAP値の新しいカラムを作成
for col in feature_cols:
    df_train[f'shap_{col}'] = np.nan

for m in tqdm(months[months >= start_test]):
    # --- 学習データ / 予測データ ---
    X_train, y_train = df_train.loc[train_idx, feature_cols], df_train.loc[train_idx, target_col]
    X_pred  = df_train.loc[df_train["month"] == m, feature_cols]
    y_true  = df_train.loc[df_train["month"] == m, target_col]
    #w_pred  = df_train.loc[df_train["month"] == m, "inp"]          # 重み用

    # --- モデル再学習（必要なら 3 ヶ月ごと check） ---
    model.fit(X_train, y_train)

    # --- 予測 & 残差 ---
    y_hat = model.predict(X_pred)
    df_train.loc[df_train["month"] == m, "y_hat"] = y_hat
    df_train.loc[df_train["month"] == m, "residual"] = y_true - y_hat

    # --- KPI 集計（工程 A を例） ---
    #mask_kpi = (df["month"] == m) & (df["process_id"] == "A")
    #kpi_exp  = (df.loc[mask_kpi, "y_hat"] * w_pred[mask_kpi]).sum() / w_pred[mask_kpi].sum()
    #kpi_act  = (df.loc[mask_kpi, "loss_rate"] * w_pred[mask_kpi]).sum() / w_pred[mask_kpi].sum()
    #delta_kpi = kpi_act - kpi_exp

    # --- Δ分解 ---
    #sub = df.loc[mask_kpi].copy()
    #sub["delta_qty"] = (sub["loss_rate"] - sub["y_hat"]) * sub["inp"]
    #contrib = (sub.groupby("product_id")["delta_qty"].sum()
    #             .sort_values(key=abs, ascending=False).head(10))

    # --- SHAP（任意） ---
    X_enc = model._encode(X_pred, fit=False)
    expl = shap.TreeExplainer(model)
    shap_vals = expl.shap_values(X_enc, check_additivity=False)
    #df_train.loc[df_train["month"] == m, "shap"] = shap_vals
    for i, col in enumerate(feature_cols):
        df_train.loc[df_train["month"] == m, f'shap_{col}'] = shap_vals[:, i]
    #shap_df   = pd.DataFrame(shap_vals, columns=X_pred.columns).mul(w_pred.values, axis=0)
    #shap_prod = shap_df.filter(like="product_").sum().sort_values(key=abs, ascending=False).head(10)

    #results.append({
    #    "month": m, "kpi_exp": kpi_exp, "kpi_act": kpi_act,
    #    "delta": delta_kpi, "top_delta": contrib, "top_shap": shap_prod
    #})

    # --- 訓練セットに当月を追加して次ループへ ---
    train_idx |= (df_train["month"] == m)

100%|██████████| 12/12 [00:03<00:00,  3.40it/s]


In [112]:
df_train.to_csv(os.path.join(root, 'sample_loss_data_result.csv'), encoding='utf-8')