# Titanic Survival Prediction - Optuna + Stratified K-Fold

このノートブックは、Optunaによるハイパーパラメータ最適化とStratified K-Foldクロスバリデーションを使用します。

## 目次
1. セットアップとデータ読み込み
2. 特徴量エンジニアリング
3. Optunaによるハイパーパラメータチューニング
4. Stratified K-Fold学習
5. アンサンブル予測と提出

## 1. セットアップとデータ読み込み

In [None]:
# ライブラリのインポート
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
import string
import pickle
from typing import Dict, List, Tuple
warnings.filterwarnings('ignore')

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
from sklearn.ensemble import RandomForestClassifier

from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import lightgbm as lgbm

# TensorFlow/Keras for CNN
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, callbacks
from tensorflow.keras.optimizers import Adam

import optuna
from optuna.samplers import TPESampler

# SHAP for model interpretation
import shap

# Weights & Biases for experiment tracking
import wandb

# Global experiment constants
RANDOM_STATE = 42
N_FOLDS = 2
N_TRIALS = 100  # maximum number of Optuna trials per study
TIMEOUT = 600  # Optuna time budget per study, in seconds

# Weights & Biases settings
WANDB_PROJECT = "titanic-classification"
# Credentials must not live in source control: take the key from the
# environment. If the variable is unset this is None, in which case
# wandb.login is expected to fall back to its own credential lookup.
WANDB_API_KEY = os.environ.get("WANDB_API_KEY")

# Fix the random seeds of NumPy and TensorFlow
np.random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)

print("ライブラリのインポート完了")

In [None]:
# Read the raw train / test tables from ../input
df_train = pd.read_csv('../input/train.csv')
df_test = pd.read_csv('../input/test.csv')

# Report the table sizes, then the class balance of the target column
print(f"Train shape: {df_train.shape}", f"Test shape: {df_test.shape}", sep="\n")
print(f"\nTarget distribution:")
print(df_train['Perished'].value_counts(normalize=True))

# Last expression of the cell: shows the first rows in a notebook
df_train.head()

In [None]:
# Initialise Weights & Biases experiment tracking
import sys
sys.path.append('../notebook')


# Log in with the key held in the WANDB_API_KEY constant rather than
# repeating the secret as a second inline literal.
wandb.login(key=WANDB_API_KEY)

# Start the wandb run and record the experiment configuration
run = wandb.init(
    project=WANDB_PROJECT,
    name="titanic-5model-ensemble",
    config={
        "n_folds": N_FOLDS,
        "n_trials": N_TRIALS,
        "random_state": RANDOM_STATE,
        "models": ["CNN", "RandomForest", "CatBoost", "XGBoost", "LightGBM"],
        # X is created by a later cell, so this is an empty list unless that
        # cell has already been executed in the current session.
        "features": X.columns.tolist() if 'X' in dir() else []
    },
    reinit=True
)

print(f"✓ Wandb initialized: {WANDB_PROJECT}")
print(f"  Run name: titanic-5model-ensemble")
print(f"  Dashboard: {run.get_url()}")

## 2. 特徴量エンジニアリング

In [None]:
def extract_surname(data):
    """Extract the family name (surname) from each passenger name.

    For every name, the text from the first ``(`` onwards is discarded, the
    part before the first comma is kept, all ASCII punctuation characters are
    removed and surrounding whitespace is stripped.

    Args:
        data: Iterable of name strings (for example a pandas Series or a
            list). Elements are visited in iteration order.

    Returns:
        list[str]: One surname per input name, in input order.
    """
    # One translation table removes every punctuation character in a single
    # pass instead of one str.replace call per character.
    drop_punctuation = str.maketrans('', '', string.punctuation)

    return [
        name.split('(')[0].split(',')[0].translate(drop_punctuation).strip()
        for name in data
    ]


def create_features(df_train, df_test):
    """Engineer features on the concatenated train and test tables.

    The two tables are stacked so that every feature is defined the same way
    for both splits, then split again using the presence of the ``Perished``
    target (rows with a non-null target are treated as training rows).

    Args:
        df_train: Training table; must contain the ``Perished`` target column.
        df_test: Test table without the target column.

    Returns:
        tuple: ``(train_processed, test_processed)`` DataFrames with the
        engineered columns added and a fresh 0..n-1 index.
    """
    train = df_train.copy()
    test = df_test.copy()

    # Stack both splits so the features are built on all rows at once
    all_data = pd.concat([train, test], axis=0, sort=False).reset_index(drop=True)

    print("特徴量エンジニアリング開始...")

    # 1. Title: the token between ", " and the following "." in the name
    all_data['Title'] = all_data['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]

    # 2. Married flag. It has to be derived from the raw title: after the
    #    grouping below 'Mrs' is merged into 'Miss/Mrs/Ms', so testing for
    #    'Mrs' afterwards would leave the flag at 0 for every passenger.
    all_data['Is_Married'] = (all_data['Title'] == 'Mrs').astype(int)

    # Collapse the titles into coarse groups
    all_data['Title'] = all_data['Title'].replace(
        ['Miss', 'Mrs', 'Ms', 'Mlle', 'Lady', 'Mme', 'the Countess', 'Dona'], 'Miss/Mrs/Ms'
    )
    all_data['Title'] = all_data['Title'].replace(
        ['Dr', 'Col', 'Major', 'Jonkheer', 'Capt', 'Sir', 'Don', 'Rev'], 'Dr/Military/Noble/Clergy'
    )

    # 3. Family (surname)
    all_data['Family'] = extract_surname(all_data['Name'])

    # 4. Family_Size: siblings/spouses + parents/children + the passenger
    all_data['Family_Size'] = all_data['SibSp'] + all_data['Parch'] + 1

    # 5. Family_Size_Grouped: binned family size. Sizes that are not keys of
    #    the map (9 and 10) become NaN.
    family_map = {
        1: 'Alone',
        2: 'Small', 3: 'Small', 4: 'Small',
        5: 'Medium', 6: 'Medium',
        7: 'Large', 8: 'Large', 11: 'Large'
    }
    all_data['Family_Size_Grouped'] = all_data['Family_Size'].map(family_map)

    # 6. Ticket_Frequency: number of rows sharing the same ticket
    all_data['Ticket_Frequency'] = all_data.groupby('Ticket')['Ticket'].transform('count')

    # 7. Deck: first character of the cabin, 'M' when the cabin is missing
    all_data['Deck'] = all_data['Cabin'].apply(lambda s: s[0] if pd.notnull(s) else 'M')

    # Merge decks into groups; deck 'T' is folded into the missing group
    all_data['Deck'] = all_data['Deck'].replace(['A', 'B', 'C'], 'ABC')
    all_data['Deck'] = all_data['Deck'].replace(['D', 'E'], 'DE')
    all_data['Deck'] = all_data['Deck'].replace(['F', 'G'], 'FG')
    all_data['Deck'] = all_data['Deck'].replace(['T'], 'M')

    # 8. Age imputation and binning
    all_data['Age'] = pd.to_numeric(all_data['Age'], errors='coerce')

    # Fill missing ages with the median of the passenger's Sex x Pclass group.
    # transform() aligns the group median to every row, so no group
    # combination has to be hard-coded (and none can raise KeyError).
    group_median_age = all_data.groupby(['Sex', 'Pclass'])['Age'].transform('median')
    all_data['Age'] = all_data['Age'].fillna(group_median_age)

    # Bin the age into labelled bands
    all_data['Age_Band'] = pd.cut(all_data['Age'], bins=[0, 12, 18, 30, 50, 80],
                                    labels=['Child', 'Teen', 'Adult', 'Middle', 'Senior'])

    # 9. Fare imputation and quartile binning
    # NOTE(review): if duplicates='drop' ever removes an edge, the four labels
    # no longer match the number of bins and pd.qcut raises - confirm that
    # the fare quartiles are distinct for the data in use.
    all_data['Fare'] = all_data['Fare'].fillna(all_data['Fare'].median())
    all_data['Fare_Band'] = pd.qcut(all_data['Fare'], q=4, labels=['Low', 'Medium', 'High', 'Very_High'], duplicates='drop')

    # 10. Embarked: missing ports are filled with 'S'
    all_data['Embarked'] = all_data['Embarked'].fillna('S')

    # 11. Family_Survival: mean of the target per surname, computed from the
    #     training rows only. Despite the name, this is the mean of 'Perished'.
    # NOTE(review): the mean includes each training passenger's own label, so
    # this feature (and Ticket_Survival below) leaks the target for training
    # rows; consider a leave-one-out or in-fold computation.
    train_idx = ~all_data['Perished'].isna()

    family_survival_dict = all_data[train_idx].groupby('Family')['Perished'].mean().to_dict()

    all_data['Family_Survival'] = all_data['Family'].map(family_survival_dict)
    all_data['Family_Survival'] = all_data['Family_Survival'].fillna(0.5)  # unseen surname -> 0.5

    # 12. Ticket_Survival: mean of the target per ticket (training rows only)
    ticket_survival_dict = all_data[train_idx].groupby('Ticket')['Perished'].mean().to_dict()
    all_data['Ticket_Survival'] = all_data['Ticket'].map(ticket_survival_dict)
    all_data['Ticket_Survival'] = all_data['Ticket_Survival'].fillna(0.5)

    # 13. Sex x Pclass interaction
    all_data['Sex_Pclass'] = all_data['Sex'] + '_' + all_data['Pclass'].astype(str)

    print(f"  ✓ 特徴量作成完了: {all_data.shape[1]} columns")

    # Split back into training rows (target present) and test rows
    train_processed = all_data[train_idx].reset_index(drop=True)
    test_processed = all_data[~train_idx].reset_index(drop=True)

    return train_processed, test_processed


# Build the engineered feature tables for both splits
train_df, test_df = create_features(df_train=df_train, df_test=df_test)

# Summarise the result: table sizes followed by the full column list
print(
    f"\nTrain shape: {train_df.shape}",
    f"Test shape: {test_df.shape}",
    f"\nCreated features: {train_df.columns.tolist()}",
    sep="\n",
)

In [None]:
def prepare_data(train_df, test_df):
    """Turn the engineered tables into model-ready matrices.

    Selects the modelling columns, label-encodes the categorical ones with an
    encoder fitted on the string values of both splits, and separates the
    target from the features.

    Args:
        train_df: Engineered training table including ``Perished``.
        test_df: Engineered test table.

    Returns:
        tuple: ``(X, y, X_test, cat_features)`` where ``X`` / ``X_test`` are
        the encoded feature tables, ``y`` is the integer target and
        ``cat_features`` lists the names of the label-encoded columns.
    """
    # Columns used as model inputs
    feature_cols = [
        'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
        'Embarked', 'Title', 'Is_Married', 'Family_Size',
        'Family_Size_Grouped', 'Ticket_Frequency', 'Deck',
        'Age_Band', 'Fare_Band', 'Family_Survival', 'Ticket_Survival',
        'Sex_Pclass'
    ]

    train = train_df[feature_cols + ['Perished']].copy()
    test = test_df[feature_cols].copy()

    # Columns that hold categories and need integer codes
    cat_features = ['Sex', 'Embarked', 'Title', 'Family_Size_Grouped', 'Deck', 'Age_Band', 'Fare_Band', 'Sex_Pclass']

    for col in cat_features:
        # Values are compared as strings, so a missing value becomes the
        # category 'nan' instead of raising.
        train_values = train[col].astype(str)
        test_values = test[col].astype(str)

        # Fit on both splits so every value seen in the test table has a code
        encoder = LabelEncoder()
        encoder.fit(pd.concat([train_values, test_values], axis=0))

        train[col] = encoder.transform(train_values)
        test[col] = encoder.transform(test_values)

    # Separate features and target. The target is cast to int because the
    # train/test concatenation upstream leaves it as float (NaN in test rows).
    X = train.drop('Perished', axis=1)
    y = train['Perished'].astype(int)
    X_test = test

    print(f"X shape: {X.shape}")
    print(f"y shape: {y.shape}")
    print(f"X_test shape: {X_test.shape}")
    print(f"\nFeatures: {X.columns.tolist()}")

    return X, y, X_test, cat_features


# Encode the engineered tables into the matrices used by every model below
X, y, X_test, cat_features = prepare_data(train_df=train_df, test_df=test_df)

# Echo which columns were label-encoded
print(f"\nCategorical features: {cat_features}")

## 3. Optunaによるハイパーパラメータチューニング

In [None]:
# Optuna用のObjective関数を定義 - 詳細なハイパーパラメータチューニング

def objective_cnn(trial, X, y, n_folds=2):
    """Optuna objective for a 1D CNN applied to tabular features.

    Samples a network architecture and training settings from ``trial``,
    trains one model per stratified fold and returns the mean validation
    accuracy across the folds.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature table; rows are selected with ``.iloc``.
        y: Target labels aligned with ``X``; rows are selected with ``.iloc``.
        n_folds: Number of stratified folds.

    Returns:
        Mean accuracy over the validation folds, with the sigmoid output
        thresholded at 0.5.
    """
    
    # Hyperparameters that are sampled for every trial
    params = {
        'n_conv_layers': trial.suggest_int('n_conv_layers', 1, 3),
        'filters_1': trial.suggest_int('filters_1', 32, 256),
        'kernel_size_1': trial.suggest_int('kernel_size_1', 2, 5),
        'dropout_conv': trial.suggest_float('dropout_conv', 0.1, 0.5),
        
        'n_dense_layers': trial.suggest_int('n_dense_layers', 1, 3),
        'dense_units_1': trial.suggest_int('dense_units_1', 32, 256),
        'dropout_dense': trial.suggest_float('dropout_dense', 0.2, 0.6),
        
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True),
        'batch_size': trial.suggest_categorical('batch_size', [16, 32, 64]),
        'epochs': trial.suggest_int('epochs', 50, 200)
    }
    
    # Parameters of the optional 2nd / 3rd convolution layers are sampled
    # only when the trial actually uses that many layers
    if params['n_conv_layers'] >= 2:
        params['filters_2'] = trial.suggest_int('filters_2', 32, 256)
        params['kernel_size_2'] = trial.suggest_int('kernel_size_2', 2, 5)
    if params['n_conv_layers'] >= 3:
        params['filters_3'] = trial.suggest_int('filters_3', 32, 256)
        params['kernel_size_3'] = trial.suggest_int('kernel_size_3', 2, 5)
    
    # Same conditional sampling for the optional dense layers
    if params['n_dense_layers'] >= 2:
        params['dense_units_2'] = trial.suggest_int('dense_units_2', 32, 256)
    if params['n_dense_layers'] >= 3:
        params['dense_units_3'] = trial.suggest_int('dense_units_3', 32, 128)
    
    # Stratified K-Fold with a fixed seed so every trial sees the same splits
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)
    scores = []
    
    # One scaler object, refitted on the training part of every fold
    scaler = StandardScaler()
    
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
        
        # Standardise using statistics of the training part only
        X_train_scaled = scaler.fit_transform(X_train)
        X_valid_scaled = scaler.transform(X_valid)
        
        # Reshape to 3D for Conv1D: (samples, n_features, 1), i.e. the feature
        # columns play the role of the sequence axis with a single channel
        X_train_cnn = X_train_scaled.reshape(X_train_scaled.shape[0], X_train_scaled.shape[1], 1)
        X_valid_cnn = X_valid_scaled.reshape(X_valid_scaled.shape[0], X_valid_scaled.shape[1], 1)
        
        # Build the model
        model = models.Sequential()
        
        # Convolution block 1 (always present)
        model.add(layers.Conv1D(
            filters=params['filters_1'],
            kernel_size=params['kernel_size_1'],
            activation='relu',
            input_shape=(X_train_cnn.shape[1], 1)
        ))
        model.add(layers.BatchNormalization())
        model.add(layers.Dropout(params['dropout_conv']))
        
        # Convolution block 2 (optional)
        if params['n_conv_layers'] >= 2:
            model.add(layers.Conv1D(
                filters=params['filters_2'],
                kernel_size=params['kernel_size_2'],
                activation='relu'
            ))
            model.add(layers.BatchNormalization())
            model.add(layers.Dropout(params['dropout_conv']))
        
        # Convolution block 3 (optional)
        if params['n_conv_layers'] >= 3:
            model.add(layers.Conv1D(
                filters=params['filters_3'],
                kernel_size=params['kernel_size_3'],
                activation='relu'
            ))
            model.add(layers.BatchNormalization())
            model.add(layers.Dropout(params['dropout_conv']))
        
        # Collapse the sequence axis before the dense head
        model.add(layers.GlobalMaxPooling1D())
        
        # Dense block 1 (always present)
        model.add(layers.Dense(params['dense_units_1'], activation='relu'))
        model.add(layers.BatchNormalization())
        model.add(layers.Dropout(params['dropout_dense']))
        
        # Dense block 2 (optional)
        if params['n_dense_layers'] >= 2:
            model.add(layers.Dense(params['dense_units_2'], activation='relu'))
            model.add(layers.BatchNormalization())
            model.add(layers.Dropout(params['dropout_dense']))
        
        # Dense block 3 (optional); unlike blocks 1-2 it has no BatchNormalization
        if params['n_dense_layers'] >= 3:
            model.add(layers.Dense(params['dense_units_3'], activation='relu'))
            model.add(layers.Dropout(params['dropout_dense']))
        
        # Output layer: single sigmoid unit for the binary target
        model.add(layers.Dense(1, activation='sigmoid'))
        
        # Compile
        model.compile(
            optimizer=Adam(learning_rate=params['learning_rate']),
            loss='binary_crossentropy',
            metrics=['accuracy']
        )
        
        # Early stopping on the validation loss, keeping the best weights.
        # NOTE: the fold monitored here is the same fold that is scored below.
        early_stop = callbacks.EarlyStopping(
            monitor='val_loss',
            patience=20,
            restore_best_weights=True,
            verbose=0
        )
        
        # Train
        model.fit(
            X_train_cnn, y_train,
            validation_data=(X_valid_cnn, y_valid),
            epochs=params['epochs'],
            batch_size=params['batch_size'],
            callbacks=[early_stop],
            verbose=0
        )
        
        # Predict probabilities, threshold at 0.5 and score the fold
        preds_proba = model.predict(X_valid_cnn, verbose=0).flatten()
        preds = (preds_proba >= 0.5).astype(int)
        score = accuracy_score(y_valid, preds)
        scores.append(score)
        
        # Release Keras state between folds to limit memory growth
        tf.keras.backend.clear_session()
        del model
    
    return np.mean(scores)


def objective_randomforest(trial, X, y, n_folds=2):
    """Optuna objective for a RandomForest classifier.

    Samples a forest configuration from ``trial``, fits one forest per
    stratified fold and returns the mean validation accuracy.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature table; rows are selected with ``.iloc``.
        y: Target labels aligned with ``X``.
        n_folds: Number of stratified folds.

    Returns:
        Mean accuracy over the validation folds.
    """
    # The suggest_* calls are issued in the same order as before so that a
    # seeded sampler produces the same sequence of values.
    params = {
        # Basic tree-size parameters
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),

        # Feature sampling
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 10, 100),
    }

    # Bootstrap and row sampling. max_samples is sampled (and passed to the
    # model) only when bootstrapping is enabled; this replaces reading
    # trial.params in the middle of a dict literal and popping a None entry.
    use_bootstrap = trial.suggest_categorical('bootstrap', [True, False])
    params['bootstrap'] = use_bootstrap
    if use_bootstrap:
        params['max_samples'] = trial.suggest_float('max_samples', 0.5, 1.0)

    params.update({
        # Impurity and split control
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'min_impurity_decrease': trial.suggest_float('min_impurity_decrease', 0.0, 0.5),

        # Miscellaneous
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', 'balanced_subsample', None]),
        'ccp_alpha': trial.suggest_float('ccp_alpha', 0.0, 0.1),

        'random_state': RANDOM_STATE,
        'n_jobs': -1
    })

    # Stratified K-Fold with a fixed seed so every trial sees the same splits
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)
    scores = []

    for train_idx, valid_idx in skf.split(X, y):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        model = RandomForestClassifier(**params)
        model.fit(X_train, y_train)

        scores.append(accuracy_score(y_valid, model.predict(X_valid)))

    return np.mean(scores)


def objective_catboost(trial, X, y, n_folds=2):
    """Optuna objective for a CatBoost classifier.

    Samples a CatBoost configuration from ``trial``, fits one model per
    stratified fold (with early stopping on the validation part) and returns
    the mean validation accuracy.

    Per the CatBoost parameter reference, several options are only valid in
    combination with a specific bootstrap type or tree growing policy, so
    they are sampled conditionally:

    * ``bagging_temperature`` only with ``bootstrap_type='Bayesian'``;
      ``subsample`` only with the other bootstrap types.
    * ``min_data_in_leaf`` only with the ``Depthwise`` / ``Lossguide``
      growing policies; ``max_leaves`` only with ``Lossguide``.

    ``bootstrap_type`` and ``grow_policy`` are sampled through the trial so
    that they are part of ``study.best_params`` and the best configuration
    can be passed to ``CatBoostClassifier`` as is.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature table; rows are selected with ``.iloc``.
        y: Target labels aligned with ``X``.
        n_folds: Number of stratified folds.

    Returns:
        Mean accuracy over the validation folds.
    """
    params = {
        # Basic boosting parameters
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'depth': trial.suggest_int('depth', 1, 10),

        # Regularisation parameters
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-2, 10.0, log=True),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_strength': trial.suggest_float('random_strength', 1e-9, 10.0, log=True),

        # Miscellaneous
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 10),
        'random_state': RANDOM_STATE,
        'verbose': 0,
        'task_type': 'CPU'
    }

    # Row sampling: the valid knob depends on the bootstrap type
    bootstrap_type = trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS'])
    params['bootstrap_type'] = bootstrap_type
    if bootstrap_type == 'Bayesian':
        params['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0.0, 1.0)
    else:
        params['subsample'] = trial.suggest_float('subsample', 0.05, 1.0)

    # Leaf constraints: only available for the non-symmetric growing policies
    grow_policy = trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide'])
    params['grow_policy'] = grow_policy
    if grow_policy in ('Depthwise', 'Lossguide'):
        params['min_data_in_leaf'] = trial.suggest_int('min_data_in_leaf', 1, 100)
    if grow_policy == 'Lossguide':
        params['max_leaves'] = trial.suggest_int('max_leaves', 16, 64)

    # Stratified K-Fold with a fixed seed so every trial sees the same splits
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)
    scores = []

    for train_idx, valid_idx in skf.split(X, y):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        model = CatBoostClassifier(**params)
        model.fit(X_train, y_train, eval_set=(X_valid, y_valid), early_stopping_rounds=50, verbose=0)

        scores.append(accuracy_score(y_valid, model.predict(X_valid)))

    return np.mean(scores)


def objective_xgboost(trial, X, y, n_folds=2):
    """Optuna objective for an XGBoost classifier.

    Samples an XGBoost configuration from ``trial``, fits one model per
    stratified fold (with early stopping on the validation part) and returns
    the mean validation accuracy.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature table; rows are selected with ``.iloc``.
        y: Target labels aligned with ``X``.
        n_folds: Number of stratified folds.

    Returns:
        Mean accuracy over the validation folds.
    """
    params = {
        # Basic boosting parameters
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3, log=True),  # eta
        'max_depth': trial.suggest_int('max_depth', 3, 10),

        # Tree-structure parameters
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 1e-9, 1.0, log=True),
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 10),

        # Sampling parameters
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.5, 1.0),

        # Regularisation parameters
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 10.0, log=True),  # L1
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 10.0, log=True),  # L2

        # Miscellaneous
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.5, 2.0),
        'max_bin': trial.suggest_int('max_bin', 128, 512),

        'random_state': RANDOM_STATE,
        'eval_metric': 'logloss',
        'tree_method': 'hist'
    }

    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)
    scores = []

    for train_idx, valid_idx in skf.split(X, y):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        # early_stopping_rounds is an estimator (constructor) parameter;
        # XGBoost 2.x no longer accepts it as an argument of fit().
        model = XGBClassifier(**params, early_stopping_rounds=50)
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=0)

        scores.append(accuracy_score(y_valid, model.predict(X_valid)))

    return np.mean(scores)


def objective_lightgbm(trial, X, y, n_folds=5):
    """Optuna objective for a LightGBM classifier.

    Draws a LightGBM configuration from ``trial``, fits one model per
    stratified fold with early stopping on the held-out part, and returns the
    mean held-out accuracy.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature table; rows are selected with ``.iloc``.
        y: Target labels aligned with ``X``.
        n_folds: Number of stratified folds.

    Returns:
        Mean accuracy over the validation folds.
    """
    # The suggest_* calls are kept in their original order: with a seeded
    # sampler the order of the calls determines the sampled values.
    lgb_params = {
        # Core boosting settings
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),

        # Leaf-level settings
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-5, 1e2, log=True),
        'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 1.0),

        # Row / column sampling
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'subsample_freq': trial.suggest_int('subsample_freq', 0, 10),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),

        # L1 / L2 penalties
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 10.0, log=True),

        # Categorical handling
        'max_cat_threshold': trial.suggest_int('max_cat_threshold', 10, 100),
        'cat_smooth': trial.suggest_float('cat_smooth', 1.0, 100.0),

        # Histogram resolution and path smoothing
        'max_bin': trial.suggest_int('max_bin', 128, 512),
        'path_smooth': trial.suggest_float('path_smooth', 0.0, 1.0),

        # Fixed settings
        'random_state': RANDOM_STATE,
        'verbose': -1,
        'boosting_type': 'gbdt'
    }

    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)
    fold_accuracies = []

    for fit_rows, holdout_rows in splitter.split(X, y):
        X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        X_holdout, y_holdout = X.iloc[holdout_rows], y.iloc[holdout_rows]

        booster = LGBMClassifier(**lgb_params)
        booster.fit(
            X_fit,
            y_fit,
            eval_set=[(X_holdout, y_holdout)],
            callbacks=[lgbm.early_stopping(50), lgbm.log_evaluation(0)],
        )

        holdout_labels = booster.predict(X_holdout)
        fold_accuracies.append(accuracy_score(y_holdout, holdout_labels))

    return np.mean(fold_accuracies)

# Confirmation message printed once all objective functions above are defined
print("Objective functions defined with extensive hyperparameters (including CNN and RandomForest).")

In [None]:
# Tune the 1D CNN with Optuna (seeded TPE sampler, accuracy is maximised)
print("="*60, "CNN (1D Convolutional Neural Network) Hyperparameter Optimization", "="*60, sep="\n")

study_cnn = optuna.create_study(
    direction='maximize',
    sampler=TPESampler(seed=RANDOM_STATE),
)
study_cnn.optimize(
    lambda trial: objective_cnn(trial, X, y, N_FOLDS),
    n_trials=N_TRIALS,
    timeout=TIMEOUT,
    show_progress_bar=True,
)

# Keep the winning configuration for the final K-fold training stage
best_params_cnn = study_cnn.best_params

print(f"\nBest CNN Score: {study_cnn.best_value:.4f}")
print(f"Best CNN Params: {best_params_cnn}")

In [None]:
# Tune CatBoost with Optuna (seeded TPE sampler, accuracy is maximised)
print("="*60, "CatBoost Hyperparameter Optimization", "="*60, sep="\n")

study_catboost = optuna.create_study(
    direction='maximize',
    sampler=TPESampler(seed=RANDOM_STATE),
)
study_catboost.optimize(
    lambda trial: objective_catboost(trial, X, y, N_FOLDS),
    n_trials=N_TRIALS,
    timeout=TIMEOUT,
    show_progress_bar=True,
)

# Keep the winning configuration for the final K-fold training stage
best_params_catboost = study_catboost.best_params

print(f"\nBest CatBoost Score: {study_catboost.best_value:.4f}")
print(f"Best CatBoost Params: {best_params_catboost}")

In [None]:
# Tune XGBoost with Optuna (seeded TPE sampler, accuracy is maximised)
print("="*60, "XGBoost Hyperparameter Optimization", "="*60, sep="\n")

study_xgboost = optuna.create_study(
    direction='maximize',
    sampler=TPESampler(seed=RANDOM_STATE),
)
study_xgboost.optimize(
    lambda trial: objective_xgboost(trial, X, y, N_FOLDS),
    n_trials=N_TRIALS,
    timeout=TIMEOUT,
    show_progress_bar=True,
)

# Keep the winning configuration for the final K-fold training stage
best_params_xgboost = study_xgboost.best_params

print(f"\nBest XGBoost Score: {study_xgboost.best_value:.4f}")
print(f"Best XGBoost Params: {best_params_xgboost}")

## 4. Stratified K-Fold学習

In [None]:
def build_cnn_model(params, input_shape):
    """Assemble and compile the 1D CNN described by ``params``.

    The network is a stack of one to three Conv1D blocks, a global max
    pooling layer, one to three dense blocks and a single sigmoid output.

    Args:
        params: Mapping with the architecture and optimiser settings
            (``n_conv_layers``, ``filters_k``, ``kernel_size_k``,
            ``dropout_conv``, ``n_dense_layers``, ``dense_units_k``,
            ``dropout_dense``, ``learning_rate``).
        input_shape: Input shape passed to the first Conv1D layer.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    # (filters key, kernel-size key) for each possible convolution block
    conv_specs = (
        ('filters_1', 'kernel_size_1'),
        ('filters_2', 'kernel_size_2'),
        ('filters_3', 'kernel_size_3'),
    )
    # (units key, followed by BatchNormalization?) for each dense block;
    # the third dense block intentionally has no BatchNormalization
    dense_specs = (
        ('dense_units_1', True),
        ('dense_units_2', True),
        ('dense_units_3', False),
    )

    net = models.Sequential()

    # Block 1 is always built; blocks 2 and 3 only when requested
    n_conv = min(max(params['n_conv_layers'], 1), len(conv_specs))
    for position, (filters_key, kernel_key) in enumerate(conv_specs[:n_conv]):
        conv_kwargs = {
            'filters': params[filters_key],
            'kernel_size': params[kernel_key],
            'activation': 'relu',
        }
        if position == 0:
            # Only the first layer declares the input shape
            conv_kwargs['input_shape'] = input_shape
        net.add(layers.Conv1D(**conv_kwargs))
        net.add(layers.BatchNormalization())
        net.add(layers.Dropout(params['dropout_conv']))

    net.add(layers.GlobalMaxPooling1D())

    # Dense head: same "first block always, others on request" rule
    n_dense = min(max(params['n_dense_layers'], 1), len(dense_specs))
    for units_key, with_batchnorm in dense_specs[:n_dense]:
        net.add(layers.Dense(params[units_key], activation='relu'))
        if with_batchnorm:
            net.add(layers.BatchNormalization())
        net.add(layers.Dropout(params['dropout_dense']))

    # Binary output
    net.add(layers.Dense(1, activation='sigmoid'))

    net.compile(
        optimizer=Adam(learning_rate=params['learning_rate']),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    return net


def train_with_stratified_kfold(X, y, X_test, best_params_cnn, best_params_randomforest, best_params_catboost, best_params_xgboost, best_params_lightgbm, n_folds=5):
    """Train five model families with Stratified K-Fold and collect predictions.

    For every fold a CNN, RandomForest, CatBoost, XGBoost and LightGBM model
    is fitted on the training part. Each model's class predictions for the
    validation part are written into its out-of-fold (OOF) array, and its
    class predictions for ``X_test`` are stored per fold and averaged at the
    end.

    Args:
        X: Training feature table; rows are selected with ``.iloc``.
        y: Training labels aligned with ``X``.
        X_test: Test feature table with the same columns as ``X``.
        best_params_cnn: Tuned CNN parameters (as expected by
            ``build_cnn_model``, plus ``epochs`` and ``batch_size``).
        best_params_randomforest: Tuned RandomForest parameters.
        best_params_catboost: Tuned CatBoost parameters.
        best_params_xgboost: Tuned XGBoost parameters.
        best_params_lightgbm: Tuned LightGBM parameters.
        n_folds: Number of stratified folds.

    Returns:
        dict: ``oof_<model>`` (OOF class predictions), ``test_<model>``
        (fold-averaged test class predictions, i.e. the fraction of folds
        voting for class 1), ``models_<model>`` (fitted models per fold) and
        ``scalers`` (the per-fold StandardScaler used for the CNN).
    """
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)

    # Out-of-fold predictions (one slot per training row)
    oof_preds_cnn = np.zeros(len(X))
    oof_preds_randomforest = np.zeros(len(X))
    oof_preds_catboost = np.zeros(len(X))
    oof_preds_xgboost = np.zeros(len(X))
    oof_preds_lightgbm = np.zeros(len(X))

    # Test predictions (one column per fold)
    test_preds_cnn = np.zeros((len(X_test), n_folds))
    test_preds_randomforest = np.zeros((len(X_test), n_folds))
    test_preds_catboost = np.zeros((len(X_test), n_folds))
    test_preds_xgboost = np.zeros((len(X_test), n_folds))
    test_preds_lightgbm = np.zeros((len(X_test), n_folds))

    models_cnn = []
    models_randomforest = []
    models_catboost = []
    models_xgboost = []
    models_lightgbm = []
    scalers = []

    print("="*60)
    print("Stratified K-Fold Training")
    print("="*60)

    for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
        print(f"\n{'='*60}")
        print(f"Fold {fold + 1}/{n_folds}")
        print(f"{'='*60}")

        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        # Standardise for the CNN using statistics of the training part only
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_valid_scaled = scaler.transform(X_valid)
        X_test_scaled = scaler.transform(X_test)

        # Conv1D expects 3D input: (samples, n_features, 1)
        X_train_cnn = X_train_scaled.reshape(X_train_scaled.shape[0], X_train_scaled.shape[1], 1)
        X_valid_cnn = X_valid_scaled.reshape(X_valid_scaled.shape[0], X_valid_scaled.shape[1], 1)
        X_test_cnn = X_test_scaled.reshape(X_test_scaled.shape[0], X_test_scaled.shape[1], 1)

        scalers.append(scaler)

        # CNN
        print("\n[1/5] Training CNN...")
        model_cnn = build_cnn_model(best_params_cnn, (X_train_cnn.shape[1], 1))

        early_stop = callbacks.EarlyStopping(
            monitor='val_loss',
            patience=20,
            restore_best_weights=True,
            verbose=0
        )

        # The training arrays must be passed explicitly; they were missing
        # from this call before, so fit() had nothing to train on.
        model_cnn.fit(
            X_train_cnn, y_train,
            validation_data=(X_valid_cnn, y_valid),
            epochs=best_params_cnn['epochs'],
            batch_size=best_params_cnn['batch_size'],
            callbacks=[early_stop],
            verbose=0
        )

        oof_preds_cnn[valid_idx] = (model_cnn.predict(X_valid_cnn, verbose=0).flatten() >= 0.5).astype(int)
        test_preds_cnn[:, fold] = (model_cnn.predict(X_test_cnn, verbose=0).flatten() >= 0.5).astype(int)
        models_cnn.append(model_cnn)

        acc_cnn = accuracy_score(y_valid, oof_preds_cnn[valid_idx])
        print(f"  Validation Accuracy: {acc_cnn:.4f}")

        # RandomForest
        print("\n[2/5] Training RandomForest...")
        params_rf = best_params_randomforest.copy()
        params_rf.update({'random_state': RANDOM_STATE, 'n_jobs': -1})

        # max_samples is only meaningful when bootstrapping is enabled
        if not params_rf.get('bootstrap', True):
            params_rf.pop('max_samples', None)

        model_rf = RandomForestClassifier(**params_rf)
        model_rf.fit(X_train, y_train)

        oof_preds_randomforest[valid_idx] = model_rf.predict(X_valid)
        test_preds_randomforest[:, fold] = model_rf.predict(X_test)
        models_randomforest.append(model_rf)

        acc_rf = accuracy_score(y_valid, oof_preds_randomforest[valid_idx])
        print(f"  Validation Accuracy: {acc_rf:.4f}")

        # CatBoost
        print("\n[3/5] Training CatBoost...")
        params_cat = best_params_catboost.copy()
        params_cat.update({'random_state': RANDOM_STATE, 'verbose': 0})

        model_cat = CatBoostClassifier(**params_cat)
        model_cat.fit(X_train, y_train, eval_set=(X_valid, y_valid), early_stopping_rounds=50, verbose=0)

        # np.ravel guards against a column-shaped prediction array
        oof_preds_catboost[valid_idx] = np.ravel(model_cat.predict(X_valid))
        test_preds_catboost[:, fold] = np.ravel(model_cat.predict(X_test))
        models_catboost.append(model_cat)

        acc_cat = accuracy_score(y_valid, oof_preds_catboost[valid_idx])
        print(f"  Validation Accuracy: {acc_cat:.4f}")

        # XGBoost
        print("\n[4/5] Training XGBoost...")
        params_xgb = best_params_xgboost.copy()
        params_xgb.update({
            'random_state': RANDOM_STATE,
            'eval_metric': 'logloss',
            # Same tree method as the tuning objective (max_bin relies on it)
            'tree_method': 'hist',
            # Estimator-level setting; XGBoost 2.x no longer accepts it in fit()
            'early_stopping_rounds': 50,
        })

        model_xgb = XGBClassifier(**params_xgb)
        model_xgb.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=0)

        oof_preds_xgboost[valid_idx] = model_xgb.predict(X_valid)
        test_preds_xgboost[:, fold] = model_xgb.predict(X_test)
        models_xgboost.append(model_xgb)

        acc_xgb = accuracy_score(y_valid, oof_preds_xgboost[valid_idx])
        print(f"  Validation Accuracy: {acc_xgb:.4f}")

        # LightGBM
        print("\n[5/5] Training LightGBM...")
        params_lgb = best_params_lightgbm.copy()
        params_lgb.update({'random_state': RANDOM_STATE, 'verbose': -1})

        model_lgb = LGBMClassifier(**params_lgb)
        model_lgb.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], callbacks=[lgbm.early_stopping(50), lgbm.log_evaluation(0)])

        oof_preds_lightgbm[valid_idx] = model_lgb.predict(X_valid)
        test_preds_lightgbm[:, fold] = model_lgb.predict(X_test)
        models_lightgbm.append(model_lgb)

        acc_lgb = accuracy_score(y_valid, oof_preds_lightgbm[valid_idx])
        print(f"  Validation Accuracy: {acc_lgb:.4f}")

    # Overall out-of-fold accuracy per model
    print(f"\n{'='*60}")
    print("Out-of-Fold Scores")
    print(f"{'='*60}")
    print(f"CNN OOF Accuracy: {accuracy_score(y, oof_preds_cnn):.4f}")
    print(f"RandomForest OOF Accuracy: {accuracy_score(y, oof_preds_randomforest):.4f}")
    print(f"CatBoost OOF Accuracy: {accuracy_score(y, oof_preds_catboost):.4f}")
    print(f"XGBoost OOF Accuracy: {accuracy_score(y, oof_preds_xgboost):.4f}")
    print(f"LightGBM OOF Accuracy: {accuracy_score(y, oof_preds_lightgbm):.4f}")

    # Average the per-fold test predictions (fraction of folds voting 1)
    test_preds_cnn_avg = test_preds_cnn.mean(axis=1)
    test_preds_randomforest_avg = test_preds_randomforest.mean(axis=1)
    test_preds_catboost_avg = test_preds_catboost.mean(axis=1)
    test_preds_xgboost_avg = test_preds_xgboost.mean(axis=1)
    test_preds_lightgbm_avg = test_preds_lightgbm.mean(axis=1)

    return {
        'oof_cnn': oof_preds_cnn,
        'oof_randomforest': oof_preds_randomforest,
        'oof_catboost': oof_preds_catboost,
        'oof_xgboost': oof_preds_xgboost,
        'oof_lightgbm': oof_preds_lightgbm,
        'test_cnn': test_preds_cnn_avg,
        'test_randomforest': test_preds_randomforest_avg,
        'test_catboost': test_preds_catboost_avg,
        'test_xgboost': test_preds_xgboost_avg,
        'test_lightgbm': test_preds_lightgbm_avg,
        'models_cnn': models_cnn,
        'models_randomforest': models_randomforest,
        'models_catboost': models_catboost,
        'models_xgboost': models_xgboost,
        'models_lightgbm': models_lightgbm,
        'scalers': scalers
    }


# LightGBM hyperparameter optimisation (not run by any earlier cell)
print("="*60)
print("LightGBM Hyperparameter Optimization")
print("="*60)

study_lightgbm = optuna.create_study(direction='maximize', sampler=TPESampler(seed=RANDOM_STATE))
study_lightgbm.optimize(lambda trial: objective_lightgbm(trial, X, y, N_FOLDS), 
                        n_trials=N_TRIALS, timeout=TIMEOUT, show_progress_bar=True)

print(f"\nBest LightGBM Score: {study_lightgbm.best_value:.4f}")
print(f"Best LightGBM Params: {study_lightgbm.best_params}")

best_params_lightgbm = study_lightgbm.best_params

# RandomForest hyperparameter optimisation. The training call below needs
# best_params_randomforest, but no earlier cell runs a RandomForest study, so
# it is tuned here unless the name already exists in the session.
if 'best_params_randomforest' not in globals():
    print("="*60)
    print("RandomForest Hyperparameter Optimization")
    print("="*60)

    study_randomforest = optuna.create_study(direction='maximize', sampler=TPESampler(seed=RANDOM_STATE))
    study_randomforest.optimize(lambda trial: objective_randomforest(trial, X, y, N_FOLDS),
                                n_trials=N_TRIALS, timeout=TIMEOUT, show_progress_bar=True)

    print(f"\nBest RandomForest Score: {study_randomforest.best_value:.4f}")
    print(f"Best RandomForest Params: {study_randomforest.best_params}")

    best_params_randomforest = study_randomforest.best_params

# Run the K-fold training with the tuned parameters of all five models
import lightgbm as lgbm

results = train_with_stratified_kfold(
    X, y, X_test,
    best_params_cnn,
    best_params_randomforest,
    best_params_catboost,
    best_params_xgboost,
    best_params_lightgbm,
    n_folds=N_FOLDS
)

## 5. アンサンブル予測と提出

In [None]:
# Ensemble prediction: hard majority vote over the five models.
# Each model's test prediction is binarised at 0.5; the 0/1 votes are summed
# and divided by 5, giving the fraction of models that voted for class 1.
test_preds_ensemble = sum(
    (results[test_key] >= 0.5).astype(int)
    for test_key in (
        'test_cnn',
        'test_randomforest',
        'test_catboost',
        'test_xgboost',
        'test_lightgbm',
    )
) / 5

# Threshold the vote fraction at 0.5: with five voters this means at least
# three votes (3/5 = 0.6 passes, 2/5 = 0.4 does not).
final_predictions = (test_preds_ensemble >= 0.5).astype(int)

# Class counts for the summary below.
n_total = len(final_predictions)
n_class0 = (final_predictions == 0).sum()
n_class1 = (final_predictions == 1).sum()

print("=" * 60)
print("Ensemble Predictions (5 Models: CNN + Tree-based)")
print("=" * 60)
print(f"Survived (0): {n_class0} ({n_class0 / n_total * 100:.1f}%)")
print(f"Perished (1): {n_class1} ({n_class1 / n_total * 100:.1f}%)")

In [None]:
# Build the submission file: one row per test passenger with the ensemble's
# 0/1 prediction in the 'Perished' column.
# NOTE(review): the data-loading cell at the top of the notebook names the raw
# test frame `df_test`; confirm that `test_df` is defined by an earlier cell.
passenger_ids = test_df['PassengerId']
submission = pd.DataFrame(
    {
        'PassengerId': passenger_ids,
        'Perished': final_predictions,
    }
)

# Create the output directory if needed, then write the CSV without the index.
output_path = '../output/submission_optuna_stratified.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
submission.to_csv(output_path, index=False)

print(f"\n提出ファイルを保存: {output_path}")
print(f"\n最初の10行:")
print(submission.head(10))

In [None]:
# Persist the trained models and the tuned hyperparameters.
model_save_path = '../output/models_optuna_stratified.pkl'

# Make this cell runnable on its own: create the output directory here instead
# of relying on another cell having created it. exist_ok=True makes the call a
# no-op when the directory is already present.
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)

# The CNN models are Keras models and are saved separately, one HDF5 file per
# fold, numbered from 1.
for fold_number, cnn_model in enumerate(results['models_cnn'], start=1):
    cnn_model.save(f'../output/cnn_model_fold_{fold_number}.h5')

# Everything else (per-fold tree models, scalers and the best parameters of
# all five models) goes into a single pickle file.
# NOTE: pickle files can execute arbitrary code when loaded; only load this
# file from a location you trust.
saved_artifacts = {
    'models_randomforest': results['models_randomforest'],
    'models_catboost': results['models_catboost'],
    'models_xgboost': results['models_xgboost'],
    'models_lightgbm': results['models_lightgbm'],
    'scalers': results['scalers'],
    'best_params_cnn': best_params_cnn,
    'best_params_randomforest': best_params_randomforest,
    'best_params_catboost': best_params_catboost,
    'best_params_xgboost': best_params_xgboost,
    'best_params_lightgbm': best_params_lightgbm,
}
with open(model_save_path, 'wb') as f:
    pickle.dump(saved_artifacts, f)

print(f"\nモデルを保存: {model_save_path}")
print(f"CNNモデルを保存: ../output/cnn_model_fold_*.h5")

In [None]:
# Final summary: out-of-fold accuracy per model and the locations of the
# files written by the notebook.
print("\n" + "=" * 60)
print("すべての処理が完了しました!")
print("=" * 60)
print(f"\n最終結果:")

# (display label, key of the out-of-fold predictions in `results`)
for model_label, oof_key in (
    ("CNN", 'oof_cnn'),
    ("RandomForest", 'oof_randomforest'),
    ("CatBoost", 'oof_catboost'),
    ("XGBoost", 'oof_xgboost'),
    ("LightGBM", 'oof_lightgbm'),
):
    print(f"  {model_label} OOF Score: {accuracy_score(y, results[oof_key]):.4f}")

print(f"\n提出ファイル: {output_path}")
print(f"モデルファイル: {model_save_path}")
print(f"CNNモデル: ../output/cnn_model_fold_*.h5")