# モデリング テンプレート

このノートブックは、Kaggle競技におけるモデリングのテンプレートです。

## 1. ライブラリのインポート

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
import optuna
import joblib
import warnings
warnings.filterwarnings("ignore")

# Notebook-wide display settings: default figure size for matplotlib and
# no cap on the number of DataFrame columns pandas will show.
plt.rcParams.update({"figure.figsize": (12, 8)})
pd.options.display.max_columns = None

# Fixed seed for NumPy's global RNG; SEED is reused by the modeling cells
# (CV splitter and LightGBM parameters).
SEED = 42
np.random.seed(SEED)

## 2. データの読み込み

In [None]:
# Load the preprocessed feature tables.
X_train = pd.read_csv("../data/processed/X_train_processed.csv")
X_test = pd.read_csv("../data/processed/X_test_processed.csv")

# The target CSV is read as a DataFrame; only its first column is kept,
# as a Series.
_y_frame = pd.read_csv("../data/processed/y_train.csv")
y_train = _y_frame.iloc[:, 0]

# Report the shapes of everything that was loaded (one line per object).
print(
    f"訓練データ形状: {X_train.shape}",
    f"テストデータ形状: {X_test.shape}",
    f"ターゲット形状: {y_train.shape}",
    sep="\n",
)

## 3. ベースラインモデルの構築

In [None]:
# LightGBM baseline model: K-fold cross-validation that produces
# out-of-fold (OOF) predictions for the training rows and fold-averaged
# predictions for the test rows.
from sklearn.model_selection import KFold

# Cross-validation setup. The fold count is defined once; the averaging of
# test predictions below reads it back from the splitter, so changing
# N_SPLITS can no longer silently mis-scale the test predictions.
N_SPLITS = 5
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
n_folds = kf.get_n_splits()

# LightGBM parameters (regression objective, RMSE as the validation metric).
lgb_params = {
    "objective": "regression",
    "metric": "rmse",
    "boosting_type": "gbdt",
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "verbose": -1,
    "random_state": SEED
}

# Accumulators filled by the CV loop:
#   oof_predictions  - one prediction per training row, written by the fold
#                      in which that row was held out
#   test_predictions - mean of the per-fold test predictions
#   cv_scores        - validation RMSE of each fold
oof_predictions = np.zeros(len(X_train))
test_predictions = np.zeros(len(X_test))
cv_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f"Fold {fold + 1}")

    # kf.split yields positional indices, hence .iloc on both X and y.
    X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # Build LightGBM datasets; the validation set references the training
    # set so both share the same feature binning.
    train_data = lgb.Dataset(X_train_fold, label=y_train_fold)
    val_data = lgb.Dataset(X_val_fold, label=y_val_fold, reference=train_data)

    # Train for up to 1000 rounds, stopping early when the validation metric
    # has not improved for 100 rounds. log_evaluation(0) is intended to
    # silence per-iteration logging.
    model = lgb.train(
        lgb_params,
        train_data,
        valid_sets=[val_data],
        num_boost_round=1000,
        callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)]
    )

    # Predict with the best iteration found by early stopping.
    val_pred = model.predict(X_val_fold, num_iteration=model.best_iteration)
    test_pred = model.predict(X_test, num_iteration=model.best_iteration)

    oof_predictions[val_idx] = val_pred
    # Each fold contributes an equal share to the averaged test prediction.
    test_predictions += test_pred / n_folds

    # Fold-level validation RMSE.
    fold_score = np.sqrt(mean_squared_error(y_val_fold, val_pred))
    cv_scores.append(fold_score)
    print(f"Fold {fold + 1} RMSE: {fold_score:.6f}")

# Overall score: RMSE over all OOF predictions, plus mean/std of fold scores.
overall_score = np.sqrt(mean_squared_error(y_train, oof_predictions))
print(f"\nOverall CV RMSE: {overall_score:.6f}")
print(f"CV RMSE: {np.mean(cv_scores):.6f} ± {np.std(cv_scores):.6f}")

## 4. 提出ファイルの作成

In [None]:
# Build the submission file from the fold-averaged test predictions and
# write it to ../submissions/.
import os  # required by os.makedirs below; the notebook's import cell does not import it

submission = pd.DataFrame({
    "id": range(len(test_predictions)),  # placeholder IDs 0..n-1; adjust to the competition's ID column
    "target": test_predictions  # adjust the target column name as needed
})

# Create the output directory if it does not exist yet, then save the CSV
# without the DataFrame index.
os.makedirs("../submissions", exist_ok=True)
submission.to_csv("../submissions/baseline_submission.csv", index=False)
print("提出ファイルを保存しました: ../submissions/baseline_submission.csv")
print(f"提出ファイル形状: {submission.shape}")
# display() is the IPython/Jupyter rich-output helper; it is only available
# when this runs inside a notebook kernel.
display(submission.head())