# CatBoost + Optuna ハイパーパラメータチューニング
変数名を統一して整理したバージョン

In [1]:
# インポート
import numpy as np
import pandas as pd
import optuna
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score
from collections import Counter
import warnings
import os
warnings.filterwarnings("ignore")

# ランダムシード
RANDOM_STATE = 42

print("ライブラリのインポート完了")

ライブラリのインポート完了


In [2]:
# データ読み込み
train_raw = pd.read_csv("../input/train.csv", skipinitialspace=True)
test_raw = pd.read_csv("../input/test.csv", skipinitialspace=True)

# 文字列列を一括でstripし、空白のみをNaNに
str_cols = train_raw.select_dtypes(include=["object"]).columns
train_raw[str_cols] = (train_raw[str_cols]
                .apply(lambda s: s.str.strip())
                .replace(r"^\s*$", np.nan, regex=True))

str_cols_test = test_raw.select_dtypes(include=["object"]).columns
test_raw[str_cols_test] = (test_raw[str_cols_test]
                .apply(lambda s: s.str.strip())
                .replace(r"^\s*$", np.nan, regex=True))

print(f"Train shape: {train_raw.shape}")
print(f"Test shape: {test_raw.shape}")
print(f"\nTrain columns: {train_raw.columns.tolist()}")

Train shape: (891, 12)
Test shape: (418, 11)

Train columns: ['PassengerId', 'Perished', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [3]:
# 外れ値検出関数
def detect_outliers(df, features):
    outlier_indices = []
    for c in features:
        Q1 = np.percentile(df[c], 25)
        Q3 = np.percentile(df[c], 75)
        IQR = Q3 - Q1
        outlier_step = IQR * 1.5
        outlier_list_col = df[(df[c] < Q1 - outlier_step) | (df[c] > Q3 + outlier_step)].index
        outlier_indices.extend(outlier_list_col)
    
    outlier_indices = Counter(outlier_indices)
    multiple_outliers = list(i for i, v in outlier_indices.items() if v > 2)
    return multiple_outliers

# 外れ値削除
outlier_idx = detect_outliers(train_raw, ["Age", "SibSp", "Parch", "Fare"])
train_raw = train_raw.drop(outlier_idx, axis=0).reset_index(drop=True)

print(f"外れ値削除後のTrain shape: {train_raw.shape}")
print(f"削除された行数: {len(outlier_idx)}")

外れ値削除後のTrain shape: (881, 12)
削除された行数: 10


In [4]:
# trainとtestを結合
train_len = len(train_raw)
df_combined = pd.concat([train_raw, test_raw], axis=0).reset_index(drop=True)

print(f"結合後のデータサイズ: {df_combined.shape}")
print(f"Train length: {train_len}")

結合後のデータサイズ: (1299, 12)
Train length: 881


In [5]:
# 欠損値処理
print("欠損値処理前:")
print(df_combined.isnull().sum())

# Embarked
df_combined["Embarked"] = df_combined["Embarked"].fillna("C")

# Fare
df_combined["Fare"] = df_combined["Fare"].fillna(np.mean(df_combined[df_combined["Pclass"] == 3]["Fare"]))

# Age の欠損値補完
df_combined["Sex_num"] = [1 if i == "male" else 0 for i in df_combined["Sex"]]
index_nan_age = list(df_combined["Age"][df_combined["Age"].isnull()].index)
for i in index_nan_age:
    age_pred = df_combined["Age"][((df_combined["SibSp"] == df_combined.iloc[i]["SibSp"]) &
                              (df_combined["Parch"] == df_combined.iloc[i]["Parch"]) &
                              (df_combined["Pclass"] == df_combined.iloc[i]["Pclass"]))].median()
    age_med = df_combined["Age"].median()
    if not np.isnan(age_pred):
        df_combined["Age"].iloc[i] = age_pred
    else:
        df_combined["Age"].iloc[i] = age_med

print("\n欠損値処理後:")
print(df_combined.isnull().sum())

欠損値処理前:
PassengerId       0
Perished        418
Pclass            0
Name              0
Sex               0
Age             256
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1007
Embarked          2
dtype: int64

欠損値処理後:
PassengerId       0
Perished        418
Pclass            0
Name              0
Sex               0
Age               0
SibSp             0
Parch             0
Ticket            0
Fare              0
Cabin          1007
Embarked          0
Sex_num           0
dtype: int64


In [6]:
# Pclass を One-Hot Encoding
df_combined = pd.get_dummies(df_combined, columns=["Pclass"], prefix="Pclass")

print(f"Pclass One-Hot Encoding後のカラム数: {len(df_combined.columns)}")
print(f"追加されたカラム: {[col for col in df_combined.columns if 'Pclass' in col]}")

Pclass One-Hot Encoding後のカラム数: 15
追加されたカラム: ['Pclass_1', 'Pclass_2', 'Pclass_3']


In [7]:
# Title抽出
df_combined["Title"] = [i.split(".")[0].split(",")[-1].strip() for i in df_combined["Name"]]
df_combined["Title"] = df_combined["Title"].replace(["Lady", "the Countess", "Capt", "Col", "Don", "Dr", "Major", "Rev", "Sir", "Jonkheer", "Dona"], "other")
df_combined["Title"] = [0 if i == "Master" else 1 if i in ["Miss", "Ms", "Mlle", "Mrs"] else 2 if i == "Mr" else 3 for i in df_combined["Title"]]

# 家族サイズ
df_combined["Fsize"] = df_combined["SibSp"] + df_combined["Parch"] + 1
df_combined["family_size"] = [1 if i < 5 else 0 for i in df_combined["Fsize"]]

print("特徴量エンジニアリング完了")
print(f"Title の分布:\n{df_combined['Title'].value_counts()}")

特徴量エンジニアリング完了
Title の分布:
Title
2    753
1    456
0     60
3     30
Name: count, dtype: int64


In [8]:
# One-Hot Encoding
df_combined = pd.get_dummies(df_combined, columns=["family_size", "Embarked"])

# Ticket処理
tickets = []
for i in list(df_combined.Ticket):
    if not i.isdigit():
        tickets.append(i.replace(".", "").replace("/", "").strip().split(" ")[0])
    else:
        tickets.append("x")
df_combined["Ticket"] = tickets
df_combined = pd.get_dummies(df_combined, columns=["Ticket"], prefix="T")

# Sex One-Hot
df_combined["Sex"] = df_combined["Sex_num"]
df_combined = pd.get_dummies(df_combined, columns=["Sex"])

# Title One-Hot
df_combined = pd.get_dummies(df_combined, columns=["Title"])

print(f"One-Hot Encoding後のカラム数: {len(df_combined.columns)}")

One-Hot Encoding後のカラム数: 61


In [9]:
# 不要なカラムを削除
drop_cols = ["PassengerId", "Cabin", "Name", "Sex_num"]
df_combined = df_combined.drop(labels=drop_cols, axis=1)

print(f"カラム削除後のサイズ: {df_combined.shape}")
print(f"\n残っているカラム: {df_combined.columns.tolist()[:10]}...")

カラム削除後のサイズ: (1299, 57)

残っているカラム: ['Perished', 'Age', 'SibSp', 'Parch', 'Fare', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Fsize', 'family_size_0']...


In [10]:
# trainとtestに分割
train_processed = df_combined[:train_len].copy()
test_processed = df_combined[train_len:].copy()

# インデックスをリセット（重要！）
train_processed = train_processed.reset_index(drop=True)
test_processed = test_processed.reset_index(drop=True)

# 特徴量とターゲットに分割
X_train_full = train_processed.drop(labels=["Perished"], axis=1)
y_train_full = train_processed["Perished"]
X_test_final = test_processed.drop(labels=["Perished"], axis=1)

print(f"X_train_full shape: {X_train_full.shape}")
print(f"y_train_full shape: {y_train_full.shape}")
print(f"X_test_final shape: {X_test_final.shape}")
print(f"\nX_train_full type: {type(X_train_full)}")
print(f"y_train_full type: {type(y_train_full)}")

X_train_full shape: (881, 56)
y_train_full shape: (881,)
X_test_final shape: (418, 56)

X_train_full type: <class 'pandas.core.frame.DataFrame'>
y_train_full type: <class 'pandas.core.series.Series'>


In [11]:
# Optuna目的関数
def objective(trial, X_train_full, y_train_full, n_folds=5):
    """Optuna目的関数: CatBoostのハイパーパラメータをチューニング"""
    
    params = {
        "iterations": trial.suggest_int("iterations", 50, 200),
        "learning_rate": trial.suggest_float("learning_rate", 0.03, 0.1),
        "depth": trial.suggest_int("depth", 3, 5),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 20),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 5.0, 20.0),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 0.5),
        "random_strength": trial.suggest_float("random_strength", 0.0, 2.0),
        "rsm": trial.suggest_float("rsm", 0.7, 0.9),
        "loss_function": "Logloss",
        "eval_metric": "Logloss",
        "random_state": RANDOM_STATE,
        "verbose": 0,
        "early_stopping_rounds": trial.suggest_int("early_stopping_rounds", 30, 50)
    }
    
    # 5-Fold Cross Validation
    cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)
    scores = []
    
    for train_idx, val_idx in cv.split(X_train_full, y_train_full):
        # .ilocを使ってDataFrameからデータを取得
        X_train_fold = X_train_full.iloc[train_idx]
        X_val_fold = X_train_full.iloc[val_idx]
        y_train_fold = y_train_full.iloc[train_idx]
        y_val_fold = y_train_full.iloc[val_idx]
        
        model = CatBoostClassifier(**params)
        model.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold), early_stopping_rounds=50, verbose=0)
        
        y_pred_proba = model.predict_proba(X_val_fold)[:, 1]
        score = roc_auc_score(y_val_fold, y_pred_proba)
        scores.append(score)
    
    return np.mean(scores)

print("Optuna目的関数を定義しました")

Optuna目的関数を定義しました


In [12]:
# Optunaでハイパーパラメータチューニング
TUNE_HYPERPARAMETERS = True  # Falseにするとチューニングをスキップ
N_TRIALS = 50  # チューニングの試行回数

if TUNE_HYPERPARAMETERS:
    print("="*60)
    print("Optuna ハイパーパラメータチューニング開始")
    print("="*60)
    print(f"試行回数: {N_TRIALS}")
    print(f"CV Folds: 5")
    print("="*60)
    
    # Optunaスタディの作成
    study = optuna.create_study(
        direction="maximize",  # ROC-AUCを最大化
        sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE)
    )
    
    # 最適化実行
    study.optimize(
        lambda trial: objective(trial, X_train_full, y_train_full, n_folds=5),
        n_trials=N_TRIALS,
        show_progress_bar=True
    )
    
    # 最適なパラメータを取得
    best_params_catboost = study.best_params
    best_params_catboost.update({
        "loss_function": "Logloss",
        "eval_metric": "Logloss",
        "random_state": RANDOM_STATE,
        "verbose": 0
    })
    
    print("\n" + "="*60)
    print("チューニング完了")
    print("="*60)
    print(f"最適なROC-AUC: {study.best_value:.4f}")
    print(f"\n最適なパラメータ:")
    for key, value in study.best_params.items():
        print(f"  {key}: {value}")
    print("="*60)
    
else:
    # デフォルトパラメータを使用
    best_params_catboost = {
        "iterations": 1000,
        "learning_rate": 0.03,
        "depth": 7,
        "l2_leaf_reg": 6.0,
        "bagging_temperature": 0.5,
        "random_strength": 1.0,
        "rsm": 0.9,
        "loss_function": "Logloss",
        "eval_metric": "Logloss",
        "random_state": RANDOM_STATE,
        "verbose": 0
    }
    print("デフォルトパラメータを使用します")
    print(best_params_catboost)

[I 2025-11-11 00:31:58,662] A new study created in memory with name: no-name-538a1583-b7a7-4e81-8e6b-c60d9c8f1734


Optuna ハイパーパラメータチューニング開始
試行回数: 50
CV Folds: 5


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-11-11 00:32:00,560] Trial 0 finished with value: 0.8845240950610622 and parameters: {'iterations': 106, 'learning_rate': 0.09655000144869413, 'depth': 5, 'min_data_in_leaf': 14, 'l2_leaf_reg': 7.340279606636548, 'bagging_temperature': 0.07799726016810132, 'random_strength': 0.11616722433639892, 'rsm': 0.8732352291549871, 'early_stopping_rounds': 42}. Best is trial 0 with value: 0.8845240950610622.
[I 2025-11-11 00:32:03,803] Trial 1 finished with value: 0.8827054726069837 and parameters: {'iterations': 156, 'learning_rate': 0.03144091460070617, 'depth': 5, 'min_data_in_leaf': 18, 'l2_leaf_reg': 8.185086660174143, 'bagging_temperature': 0.09091248360355031, 'random_strength': 0.36680901970686763, 'rsm': 0.7608484485919075, 'early_stopping_rounds': 41}. Best is trial 0 with value: 0.8845240950610622.
[I 2025-11-11 00:32:05,994] Trial 2 finished with value: 0.8843595720653195 and parameters: {'iterations': 115, 'learning_rate': 0.050386039813862936, 'depth': 4, 'min_data_in_leaf':

In [13]:
# 最適なパラメータで全データを使って学習
print("\n" + "="*60)
print("最終モデルで学習開始")
print("="*60)

final_model = CatBoostClassifier(**best_params_catboost)
final_model.fit(X_train_full, y_train_full, verbose=100)

# テストデータで予測
y_pred = final_model.predict(X_test_final)

print("="*60)
print("予測完了")
print("="*60)


最終モデルで学習開始
0:	learn: 0.6473109	total: 806us	remaining: 157ms
100:	learn: 0.3309581	total: 91.9ms	remaining: 85.5ms
194:	learn: 0.2755400	total: 181ms	remaining: 0us
予測完了


In [14]:
# 最適なパラメータで5-Fold CVのスコアと標準偏差を計算
print("\n" + "="*60)
print("5-Fold CV でスコアと標準偏差を計算")
print("="*60)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
cv_scores = []
cv_accuracies = []

for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X_train_full, y_train_full), 1):
    X_train_fold = X_train_full.iloc[train_idx]
    X_val_fold = X_train_full.iloc[val_idx]
    y_train_fold = y_train_full.iloc[train_idx]
    y_val_fold = y_train_full.iloc[val_idx]
    
    model = CatBoostClassifier(**best_params_catboost)
    model.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold), early_stopping_rounds=50, verbose=0)
    
    # ROC-AUC
    y_pred_proba = model.predict_proba(X_val_fold)[:, 1]
    roc_auc = roc_auc_score(y_val_fold, y_pred_proba)
    cv_scores.append(roc_auc)
    
    # Accuracy
    y_pred_fold = model.predict(X_val_fold)
    acc = accuracy_score(y_val_fold, y_pred_fold)
    cv_accuracies.append(acc)
    
    print(f"Fold {fold_idx}: ROC-AUC = {roc_auc:.4f}, Accuracy = {acc:.4f}")

print("\n" + "="*60)
print("Cross-Validation Results")
print("="*60)
print(f"ROC-AUC Mean: {np.mean(cv_scores):.4f}")
print(f"ROC-AUC Std:  {np.std(cv_scores):.4f}")
print(f"\nAccuracy Mean: {np.mean(cv_accuracies):.4f}")
print(f"Accuracy Std:  {np.std(cv_accuracies):.4f}")
print("="*60)


5-Fold CV でスコアと標準偏差を計算
Fold 1: ROC-AUC = 0.8762, Accuracy = 0.8192
Fold 2: ROC-AUC = 0.8760, Accuracy = 0.7784
Fold 3: ROC-AUC = 0.9333, Accuracy = 0.8636
Fold 4: ROC-AUC = 0.8905, Accuracy = 0.8352
Fold 5: ROC-AUC = 0.8811, Accuracy = 0.8750

Cross-Validation Results
ROC-AUC Mean: 0.8914
ROC-AUC Std:  0.0216

Accuracy Mean: 0.8343
Accuracy Std:  0.0343


In [15]:
# 提出用DataFrameを作成
submission = pd.DataFrame({
    'PassengerId': test_raw['PassengerId'],
    'Perished': y_pred.astype(int)
})

# outputフォルダが存在しない場合は作成
output_dir = '../output'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# CSVファイルを保存
submission.to_csv(f'{output_dir}/detaicatcat.csv', index=False)

print("Submission file created successfully!")
print(f"File saved to: {output_dir}/detaicatcat.csv")
print(f"\nFirst few rows of submission:")
print(submission.head(10))
print(f"\nSubmission shape: {submission.shape}")
print(f"Perished distribution:\n{submission['Perished'].value_counts()}")

Submission file created successfully!
File saved to: ../output/detaicatcat.csv

First few rows of submission:
   PassengerId  Perished
0          892         1
1          893         1
2          894         1
3          895         1
4          896         1
5          897         1
6          898         0
7          899         1
8          900         0
9          901         1

Submission shape: (418, 2)
Perished distribution:
Perished
1    271
0    147
Name: count, dtype: int64


In [16]:
# 最適なパラメータで5-Fold CVのスコアと標準偏差を計算
print("\n" + "="*60)
print("5-Fold CV でスコアと標準偏差を計算")
print("="*60)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
cv_scores = []
cv_accuracies = []

for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X_train_full, y_train_full), 1):
    X_train_fold = X_train_full.iloc[train_idx]
    X_val_fold = X_train_full.iloc[val_idx]
    y_train_fold = y_train_full.iloc[train_idx]
    y_val_fold = y_train_full.iloc[val_idx]
    
    model = CatBoostClassifier(**best_params_catboost)
    model.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold), early_stopping_rounds=50, verbose=0)
    
    # ROC-AUC
    y_pred_proba = model.predict_proba(X_val_fold)[:, 1]
    roc_auc = roc_auc_score(y_val_fold, y_pred_proba)
    cv_scores.append(roc_auc)
    
    # Accuracy
    y_pred_fold = model.predict(X_val_fold)
    acc = accuracy_score(y_val_fold, y_pred_fold)
    cv_accuracies.append(acc)
    
    print(f"Fold {fold_idx}: ROC-AUC = {roc_auc:.4f}, Accuracy = {acc:.4f}")

print("\n" + "="*60)
print("Cross-Validation Results")
print("="*60)
print(f"ROC-AUC Mean: {np.mean(cv_scores):.4f}")
print(f"ROC-AUC Std:  {np.std(cv_scores):.4f}")
print(f"\nAccuracy Mean: {np.mean(cv_accuracies):.4f}")
print(f"Accuracy Std:  {np.std(cv_accuracies):.4f}")
print("="*60)


5-Fold CV でスコアと標準偏差を計算
Fold 1: ROC-AUC = 0.8762, Accuracy = 0.8192
Fold 2: ROC-AUC = 0.8760, Accuracy = 0.7784
Fold 3: ROC-AUC = 0.9333, Accuracy = 0.8636
Fold 4: ROC-AUC = 0.8905, Accuracy = 0.8352
Fold 5: ROC-AUC = 0.8811, Accuracy = 0.8750

Cross-Validation Results
ROC-AUC Mean: 0.8914
ROC-AUC Std:  0.0216

Accuracy Mean: 0.8343
Accuracy Std:  0.0343


In [17]:
X_train_full.columns

Index(['Age', 'SibSp', 'Parch', 'Fare', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Fsize', 'family_size_0', 'family_size_1', 'Embarked_C', 'Embarked_Q',
       'Embarked_S', 'T_A', 'T_A4', 'T_A5', 'T_AQ3', 'T_AQ4', 'T_AS', 'T_C',
       'T_CA', 'T_CASOTON', 'T_FC', 'T_FCC', 'T_Fa', 'T_LINE', 'T_LP', 'T_PC',
       'T_PP', 'T_PPP', 'T_SC', 'T_SCA3', 'T_SCA4', 'T_SCAH', 'T_SCOW',
       'T_SCPARIS', 'T_SCParis', 'T_SOC', 'T_SOP', 'T_SOPP', 'T_SOTONO2',
       'T_SOTONOQ', 'T_SP', 'T_STONO', 'T_STONO2', 'T_STONOQ', 'T_SWPP',
       'T_WC', 'T_WEP', 'T_x', 'Sex_0', 'Sex_1', 'Title_0', 'Title_1',
       'Title_2', 'Title_3'],
      dtype='object')

In [18]:
int(submission.head(10))  # 最初の10行
print(submission.shape)     # (418, 2) のような形
print(submission['Perished'].value_counts())  # 0と1の分布

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'DataFrame'