# Titanic Survival Prediction - Refactored

このノートブックは、Titanicデータセットを使用して生存予測を行います。

## 目次
1. セットアップとデータ読み込み
2. データ探索
3. データ前処理
4. モデルトレーニング
5. ハイパーパラメータチューニング
6. アンサンブルモデル
7. 予測と提出

## 1. セットアップとデータ読み込み

In [29]:
# ライブラリのインポート
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
import string 
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# 定数設定
RANDOM_STATE = 42
TEST_SIZE = 0.3
N_JOBS = -1

print("ライブラリのインポート完了")

ライブラリのインポート完了


In [30]:
# データ読み込み
df_train = pd.read_csv('../input/train.csv')
df_test = pd.read_csv('../input/test.csv')

print(f"Train shape: {df_train.shape}")
print(f"Test shape: {df_test.shape}")
df_train.head()

Train shape: (891, 12)
Test shape: (418, 11)


Unnamed: 0,PassengerId,Perished,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,1,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 2. データ探索

In [31]:
# データの基本情報
print("=" * 60)
print("Train Data Info")
print("=" * 60)
df_train.info()

print("\n" + "=" * 60)
print("Missing Values - Train")
print("=" * 60)
print(df_train.isnull().sum())

print("\n" + "=" * 60)
print("Missing Values - Test")
print("=" * 60)
print(df_test.isnull().sum())

# 生存率
survival_rate = (df_train['Perished'] == 0).sum() / len(df_train) * 100
print(f"\nSurvival rate: {survival_rate:.2f}%")
print(f"Death rate: {100 - survival_rate:.2f}%")

Train Data Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Perished     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB

Missing Values - Train
PassengerId      0
Perished         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin         

## 3. データ前処理

前処理を関数化して再利用性を向上

In [32]:
# 数値化を保証
df_train['Age'] = pd.to_numeric(df_train['Age'], errors='coerce')
df_test['Age']  = pd.to_numeric(df_test['Age'],  errors='coerce')

# ここが重要：['Age'] を先に選んでから median
age_by_pclass_sex_train = df_train.groupby(['Sex', 'Pclass'])['Age'].median()
age_by_pclass_sex_test  = df_test.groupby(['Sex', 'Pclass'])['Age'].median()

for pclass in (1, 2, 3):
    for sex in ('female', 'male'):
        print(f"Median age of Pclass {pclass} {sex}s: {age_by_pclass_sex_train.loc[(sex, pclass)]}")
        print(f"Median age of Pclass {pclass} {sex}s (test): {age_by_pclass_sex_test.loc[(sex, pclass)]}")

print(f"Median age of all passengers: {df_train['Age'].median()}")
print(f"Median age of all passengers (test): {df_test['Age'].median()}")



Median age of Pclass 1 females: 35.0
Median age of Pclass 1 females (test): 41.0
Median age of Pclass 1 males: 40.0
Median age of Pclass 1 males (test): 42.0
Median age of Pclass 2 females: 28.0
Median age of Pclass 2 females (test): 24.0
Median age of Pclass 2 males: 30.0
Median age of Pclass 2 males (test): 28.0
Median age of Pclass 3 females: 21.5
Median age of Pclass 3 females (test): 22.0
Median age of Pclass 3 males: 25.0
Median age of Pclass 3 males (test): 24.0
Median age of all passengers: 28.0
Median age of all passengers (test): 27.0


In [33]:
df_test['Age'] = pd.qcut(df_test['Age'], duplicates='drop', q=10)
df_train['Age'] = pd.qcut(df_train['Age'], duplicates='drop', q=10)

df_test['Fare'] = pd.qcut(df_test['Fare'], duplicates='drop', q=13)
df_train['Fare'] = pd.qcut(df_train['Fare'], duplicates='drop', q=13)

In [34]:
# Creating Deck column from the first letter of the Cabin column (M stands for Missing)
df_train['Deck'] = df_train['Cabin'].apply(lambda s: s[0] if pd.notnull(s) else 'M')
df_test['Deck']  = df_test['Cabin'].apply(lambda s: s[0] if pd.notnull(s) else 'M')

# Train data processing
df_train_decks = df_train.groupby(['Deck', 'Pclass']).count().drop(
    columns=['Perished', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Cabin', 'PassengerId', 'Ticket']
).rename(columns={'Name': 'Count'}).transpose()

# Test data processing - exclude 'Perished' from drop list
df_test_decks = df_test.groupby(['Deck', 'Pclass']).count().drop(
    columns=['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Cabin', 'PassengerId', 'Ticket']
).rename(columns={'Name': 'Count'}).transpose()

def get_pclass_dist(df):
    
    # Creating a dictionary for every passenger class count in every deck
    deck_counts = {'A': {}, 'B': {}, 'C': {}, 'D': {}, 'E': {}, 'F': {}, 'G': {}, 'M': {}, 'T': {}}
    decks = df.columns.levels[0]    
    
    for deck in decks:
        for pclass in range(1, 4):
            try:
                count = df[deck][pclass][0]
                deck_counts[deck][pclass] = count 
            except KeyError:
                deck_counts[deck][pclass] = 0
                
    df_decks = pd.DataFrame(deck_counts)    
    deck_percentages = {}

    # Creating a dictionary for every passenger class percentage in every deck
    for col in df_decks.columns:
        deck_percentages[col] = [(count / df_decks[col].sum()) * 100 for count in df_decks[col]]
        
    return deck_counts, deck_percentages

def display_pclass_dist(percentages):
    
    df_percentages = pd.DataFrame(percentages).transpose()
    deck_names = ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'M', 'T')
    bar_count = np.arange(len(deck_names))  
    bar_width = 0.85
    
    pclass1 = df_percentages[0]
    pclass2 = df_percentages[1]
    pclass3 = df_percentages[2]

df_train['Deck'] = df_train['Deck'].replace(['A', 'B', 'C'], 'ABC')
df_test ['Deck']  = df_test['Deck'].replace(['A', 'B', 'C'], 'ABC')

df_train['Deck'] = df_train['Deck'].replace(['D', 'E'], 'DE')
df_test ['Deck']  = df_test['Deck'].replace(['D', 'E'], 'DE')

df_train['Deck'] = df_train['Deck'].replace(['F', 'G'], 'FG')
df_test ['Deck']  = df_test['Deck'].replace(['F', 'G'], 'FG')

idx_train = df_train[df_train['Deck'] == 'T'].index
idx_test = df_test[df_test['Deck'] == 'T'].index

df_train.loc[idx_train, 'Deck'] = 'A'
df_test.loc[idx_test, 'Deck'] = 'A'

df_train['Deck'].value_counts() + df_test['Deck'].value_counts()



Deck
A         NaN
ABC     181.0
DE       87.0
FG       26.0
M      1014.0
Name: count, dtype: float64

In [35]:
# 前提: Age は数値
df_train['Age'] = pd.to_numeric(df_train['Age'], errors='coerce')
df_test['Age']  = pd.to_numeric(df_test['Age'],  errors='coerce')

# 学習データの分布で学習側を補完
df_train['Age'] = df_train['Age'].fillna(
    df_train.groupby(['Sex','Pclass'])['Age'].transform('median')
)

# 学習データで算出した中央値
age_med_train = df_train.groupby(['Sex','Pclass'])['Age'].median().rename('Age_med')

# テストへ結合して補完 → 余分列を削除
df_test = df_test.join(age_med_train, on=['Sex','Pclass'])
df_test['Age'] = df_test['Age'].fillna(df_test['Age_med'])
df_test['Age'] = df_test['Age'].fillna(df_train['Age'].median())
df_test = df_test.drop(columns=['Age_med'])

df_test.isnull().sum() 

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            418
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
Deck             0
dtype: int64

In [36]:
# Family size
df_train['Family_Size'] = df_train['SibSp'] + df_train['Parch'] + 1
df_test['Family_Size'] = df_test['SibSp'] + df_test['Parch'] + 1
# Group family size into bins
family_map = {
    1: 'Alone',
    2: 'Small', 3: 'Small', 4: 'Small',
    5: 'Medium', 6: 'Medium',
    7: 'Large', 8: 'Large', 11: 'Large'
}
df_train['Family_Size_Grouped'] = df_train['Family_Size'].map(family_map)
df_test['Family_Size_Grouped'] = df_test['Family_Size'].map(family_map)




In [37]:
df_train['Ticket_Frequency'] = df_train.groupby('Ticket')['Ticket'].transform('count')
df_test['Ticket_Frequency'] = df_test.groupby('Ticket')['Ticket'].transform('count')

df_train['Ticket_Frequency'].value_counts()

Ticket_Frequency
1    547
2    188
3     63
4     44
7     21
6     18
5     10
Name: count, dtype: int64

In [38]:
df_train['Name'].value_counts()

Name
Braund, Mr. Owen Harris                     1
Boulos, Mr. Hanna                           1
Frolicher-Stehli, Mr. Maxmillian            1
Gilinski, Mr. Eliezer                       1
Murdlin, Mr. Joseph                         1
                                           ..
Kelly, Miss. Anna Katherine "Annie Kate"    1
McCoy, Mr. Bernard                          1
Johnson, Mr. William Cahoone Jr             1
Keane, Miss. Nora A                         1
Dooley, Mr. Patrick                         1
Name: count, Length: 891, dtype: int64

In [39]:
df_train['Title'] = df_train['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
df_test ['Title'] = df_test['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]  

df_train['Is_Married'] = 0
df_test['Is_Married'] = 0

df_train['Is_Married'].loc[df_train['Title'] == 'Mrs'] = 1
df_test['Is_Married'].loc[df_test['Title'] == 'Mrs'] = 1


df_train['Title'] = df_train['Title'].replace(['Miss', 'Mrs','Ms', 'Mlle', 'Lady', 'Mme', 'the Countess', 'Dona'], 'Miss/Mrs/Ms')
df_test['Title']  = df_test['Title'].replace (['Miss', 'Mrs','Ms', 'Mlle', 'Lady', 'Mme', 'the Countess', 'Dona'], 'Miss/Mrs/Ms')

df_train['Title'] = df_train['Title'].replace(['Dr', 'Col', 'Major', 'Jonkheer', 'Capt', 'Sir', 'Don', 'Rev'], 'Dr/Military/Noble/Clergy')
df_test['Title'] = df_test['Title'].replace(['Dr', 'Col', 'Major', 'Jonkheer', 'Capt', 'Sir', 'Don', 'Rev'], 'Dr/Military/Noble/Clergy')


In [40]:
def extract_surname(data):    
    
    families = []
    
    for i in range(len(data)):        
        name = data.iloc[i]

        if '(' in name:
            name_no_bracket = name.split('(')[0] 
        else:
            name_no_bracket = name
            
        family = name_no_bracket.split(',')[0]
        title = name_no_bracket.split(',')[1].strip().split(' ')[0]
        
        for c in string.punctuation:
            family = family.replace(c, '').strip()
            
        families.append(family)
            
    return families

df_train['Family'] = extract_surname(df_train['Name'])
df_test['Family'] = extract_surname(df_test['Name'])

dfs = [df_train, df_test]

In [None]:
# Creating a list of families and tickets that are occuring in both training and test set
non_unique_families = [x for x in df_train['Family'].unique() if x in df_test['Family'].unique()]
non_unique_tickets = [x for x in df_train['Ticket'].unique() if x in df_test['Ticket'].unique()]

# 修正: グループ化のキー列は集計対象から除外する
df_family_survival_rate = df_train.groupby('Family')[['Perished', 'Family_Size']].median()
df_ticket_survival_rate = df_train.groupby('Ticket')[['Perished', 'Ticket_Frequency']].median()

family_rates = {}
ticket_rates = {}

for i in range(len(df_family_survival_rate)):
    # Checking a family exists in both training and test set, and has members more than 1
    if df_family_survival_rate.index[i] in non_unique_families and df_family_survival_rate.iloc[i, 1] > 1:
        family_rates[df_family_survival_rate.index[i]] = df_family_survival_rate.iloc[i, 0]

for i in range(len(df_ticket_survival_rate)):
    # Checking a ticket exists in both training and test set, and has members more than 1
    if df_ticket_survival_rate.index[i] in non_unique_tickets and df_ticket_survival_rate.iloc[i, 1] > 1:
        ticket_rates[df_ticket_survival_rate.index[i]] = df_ticket_survival_rate.iloc[i, 0]

print(f"Family survival rates calculated: {len(family_rates)} families")
print(f"Ticket survival rates calculated: {len(ticket_rates)} tickets")

In [None]:
def preprocess_data(df_train, df_test, verbose=True):
    """
    データの前処理を実行
    
    Args:
        df_train: トレーニングデータ
        df_test: テストデータ
        verbose: 進捗表示
    
    Returns:
        X: 特徴量（トレーニング）
        y: ターゲット
        X_test: 特徴量（テスト）
    """
    # データのコピー
    train = df_train.copy()
    test = df_test.copy()
    
    if verbose:
        print("データ前処理開始...")
    
    # 1. 欠損値補完
    # Age, Fareは全体の平均で補完
    age_combined = pd.concat([train['Age'], test['Age']])
    fare_combined = pd.concat([train['Fare'], test['Fare']])
    
    train['Age'].fillna(age_combined.mean(), inplace=True)
    test['Age'].fillna(age_combined.mean(), inplace=True)
    
    med_fare_train = df_train.groupby(['Pclass', 'Parch', 'SibSp']).Fare.median()[3][0][0]
    med_fare_test = df_test.groupby(['Pclass', 'Parch', 'SibSp']).Fare.median()[3][0][0]
# Filling the missing value in Fare with the median Fare of 3rd class alone passenger
    df_train['Fare'] = df_train['Fare'].fillna(med_fare_train)
    df_test['Fare'] = df_test['Fare'].fillna(med_fare_test)
    
    # Embarkedは最頻値'S'で補完
    train['Embarked'].fillna('S', inplace=True)
    test['Embarked'].fillna('S', inplace=True)
    
    if verbose:
        print("  ✓ 欠損値補完完了")
    
    # 2. 不要な列を削除
    drop_cols = ['Cabin', 'Name', 'Ticket']
    train.drop(drop_cols, axis=1, inplace=True)
    test.drop(drop_cols, axis=1, inplace=True)
    
    if verbose:
        print("  ✓ 不要な列を削除")
    
    # 3. Sex列のエンコーディング
    train['Sex'] = train['Sex'].map({'male': 0, 'female': 1})
    test['Sex'] = test['Sex'].map({'male': 0, 'female': 1})
    
    if verbose:
        print("  ✓ Sex列をエンコーディング")
    
    # 4. Embarked列のOne-Hot Encoding
    embarked_combined = pd.concat([train['Embarked'], test['Embarked']])
    embarked_ohe = pd.get_dummies(embarked_combined, prefix='Embarked')
    
    train = pd.concat([train, embarked_ohe[:len(train)]], axis=1)
    test = pd.concat([test, embarked_ohe[len(train):]], axis=1)
    
    train.drop('Embarked', axis=1, inplace=True)
    test.drop('Embarked', axis=1, inplace=True)
    
    if verbose:
        print("  ✓ Embarked列をOne-Hot Encoding")
    
    # 5. 特徴量とターゲットに分割
    X = train.drop(['PassengerId', 'Perished'], axis=1).values
    y = train['Perished'].values
    X_test = test.drop('PassengerId', axis=1).values
    
    if verbose:
        print(f"  ✓ 特徴量形状: X={X.shape}, X_test={X_test.shape}")
        print("データ前処理完了!")
    
    return X, y, X_test

# 前処理実行
X, y, X_test = preprocess_data(df_train, df_test)

In [None]:
# トレーニングデータと検証データに分割
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

print(f"Train set: {X_train.shape}")
print(f"Valid set: {X_valid.shape}")

## 4. モデルトレーニング

In [None]:
def train_baseline_models(X_train, y_train, X_valid, y_valid):
    """
    複数のベースラインモデルをトレーニング
    
    Returns:
        dict: トレーニング済みモデルの辞書
    """
    models = {}
    
    print("=" * 60)
    print("Baseline Models Training")
    print("=" * 60)
    
    # RandomForest
    print("\n[1/6] Random Forest...")
    rfc = RandomForestClassifier(
        max_depth=10, min_samples_leaf=1, n_estimators=100,
        n_jobs=N_JOBS, random_state=RANDOM_STATE
    )
    rfc.fit(X_train, y_train)
    models['RandomForest'] = rfc
    print(f"  Train: {rfc.score(X_train, y_train):.3f}, Valid: {rfc.score(X_valid, y_valid):.3f}")
    
    # Logistic Regression
    print("\n[2/6] Logistic Regression...")
    lr = LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)
    lr.fit(X_train, y_train)
    models['LogisticReg'] = lr
    print(f"  Train: {lr.score(X_train, y_train):.3f}, Valid: {lr.score(X_valid, y_valid):.3f}")
    
    # MLP
    print("\n[3/6] MLP Classifier...")
    mlpc = MLPClassifier(
        hidden_layer_sizes=(100, 100, 10),
        random_state=RANDOM_STATE,
        max_iter=1000
    )
    mlpc.fit(X_train, y_train)
    models['MLP'] = mlpc
    print(f"  Train: {mlpc.score(X_train, y_train):.3f}, Valid: {mlpc.score(X_valid, y_valid):.3f}")
    
    # CatBoost
    print("\n[4/6] CatBoost...")
    cbc = CatBoostClassifier(
        iterations=100, depth=6, learning_rate=0.1,
        random_state=RANDOM_STATE, verbose=0
    )
    cbc.fit(X_train, y_train)
    models['CatBoost'] = cbc
    print(f"  Train: {cbc.score(X_train, y_train):.3f}, Valid: {cbc.score(X_valid, y_valid):.3f}")
    
    # XGBoost
    print("\n[5/6] XGBoost...")
    xgb = XGBClassifier(
        n_estimators=100, max_depth=6, learning_rate=0.1,
        random_state=RANDOM_STATE, eval_metric="logloss"
    )
    xgb.fit(X_train, y_train)
    models['XGBoost'] = xgb
    print(f"  Train: {xgb.score(X_train, y_train):.3f}, Valid: {xgb.score(X_valid, y_valid):.3f}")
    
    # LightGBM
    print("\n[6/6] LightGBM...")
    lgb = LGBMClassifier(
        n_estimators=100, max_depth=6, learning_rate=0.1,
        random_state=RANDOM_STATE, verbose=-1
    )
    lgb.fit(X_train, y_train)
    models['LightGBM'] = lgb
    print(f"  Train: {lgb.score(X_train, y_train):.3f}, Valid: {lgb.score(X_valid, y_valid):.3f}")
    
    print("\nベースラインモデルのトレーニング完了!")
    return models

# ベースラインモデルのトレーニング
baseline_models = train_baseline_models(X_train, y_train, X_valid, y_valid)

## 5. ハイパーパラメータチューニング

In [None]:
def tune_models(X_train, y_train, X_valid, y_valid):
    """
    GridSearchでハイパーパラメータチューニング
    
    Returns:
        dict: チューニング済みモデルの辞書
    """
    tuned_models = {}
    
    print("=" * 60)
    print("Hyperparameter Tuning")
    print("=" * 60)
    
    # CatBoost
    print("\n[1/3] CatBoost...")
    cbc_params = {
        "iterations": [100, 200],
        "depth": [4, 6, 8],
        "learning_rate": [0.05, 0.1]
    }
    cbc_gs = GridSearchCV(
        CatBoostClassifier(random_state=RANDOM_STATE, verbose=0),
        cbc_params, cv=5, scoring="accuracy", n_jobs=N_JOBS
    )
    cbc_gs.fit(X_train, y_train)
    tuned_models['CatBoost'] = cbc_gs
    print(f"  Best params: {cbc_gs.best_params_}")
    print(f"  CV Score: {cbc_gs.best_score_:.3f}")
    print(f"  Valid Score: {cbc_gs.score(X_valid, y_valid):.3f}")
    
    # XGBoost
    print("\n[2/3] XGBoost...")
    xgb_params = {
        "n_estimators": [100, 200],
        "max_depth": [4, 6, 8],
        "learning_rate": [0.05, 0.1]
    }
    xgb_gs = GridSearchCV(
        XGBClassifier(random_state=RANDOM_STATE, eval_metric="logloss"),
        xgb_params, cv=5, scoring="accuracy", n_jobs=N_JOBS
    )
    xgb_gs.fit(X_train, y_train)
    tuned_models['XGBoost'] = xgb_gs
    print(f"  Best params: {xgb_gs.best_params_}")
    print(f"  CV Score: {xgb_gs.best_score_:.3f}")
    print(f"  Valid Score: {xgb_gs.score(X_valid, y_valid):.3f}")
    
    # LightGBM
    print("\n[3/3] LightGBM...")
    lgb_params = {
        "n_estimators": [100, 200],
        "max_depth": [4, 6, 8],
        "learning_rate": [0.05, 0.1]
    }
    lgb_gs = GridSearchCV(
        LGBMClassifier(random_state=RANDOM_STATE, verbose=-1),
        lgb_params, cv=5, scoring="accuracy", n_jobs=1
    )
    lgb_gs.fit(X_train, y_train)
    tuned_models['LightGBM'] = lgb_gs
    print(f"  Best params: {lgb_gs.best_params_}")
    print(f"  CV Score: {lgb_gs.best_score_:.3f}")
    print(f"  Valid Score: {lgb_gs.score(X_valid, y_valid):.3f}")
    
    print("\nハイパーパラメータチューニング完了!")
    return tuned_models

# ハイパーパラメータチューニング実行
tuned_models = tune_models(X_train, y_train, X_valid, y_valid)

In [None]:
# モデル性能比較
def compare_models(baseline_models, tuned_models, X_train, y_train, X_valid, y_valid):
    """
    全モデルの性能を比較
    """
    results = []
    
    # ベースラインモデル
    for name, model in baseline_models.items():
        if name not in tuned_models:  # チューニングされていないモデルのみ
            train_score = model.score(X_train, y_train)
            valid_score = model.score(X_valid, y_valid)
            results.append({
                'Model': name,
                'Train Score': train_score,
                'Valid Score': valid_score,
                'Overfit': train_score - valid_score
            })
    
    # チューニング済みモデル
    for name, model in tuned_models.items():
        train_score = model.best_estimator_.score(X_train, y_train)
        valid_score = model.score(X_valid, y_valid)
        results.append({
            'Model': f"{name} (Tuned)",
            'Train Score': train_score,
            'Valid Score': valid_score,
            'Overfit': train_score - valid_score
        })
    
    df_results = pd.DataFrame(results)
    df_results = df_results.round(3)
    df_results = df_results.sort_values('Valid Score', ascending=False)
    
    print("\n" + "=" * 60)
    print("Model Performance Comparison")
    print("=" * 60)
    print(df_results.to_string(index=False))
    
    best_model = df_results.iloc[0]
    print(f"\nBest Model: {best_model['Model']}")
    print(f"Valid Score: {best_model['Valid Score']:.3f}")
    
    return df_results

model_comparison = compare_models(
    baseline_models, tuned_models, X_train, y_train, X_valid, y_valid
)

## 6. Wandb統合（オプション）

In [None]:
# wandb統合（オプション）
USE_WANDB = False  # Trueに変更してwandbを有効化

if USE_WANDB:
    try:
        import wandb
        
        # 環境変数からAPIキーを読み込む（セキュアな方法）
        # os.environ["WANDB_API_KEY"] = "your-api-key"  # または.envファイルから読み込み
        
        wandb.login(timeout=30)
        
        run = wandb.init(
            project="titanic-classification",
            name="titanic-refactored",
            settings=wandb.Settings(start_method="fork"),
            config={
                "dataset": "Titanic",
                "test_size": TEST_SIZE,
                "random_state": RANDOM_STATE
            }
        )
        
        # モデル性能をログ
        for _, row in model_comparison.iterrows():
            wandb.log({
                f"{row['Model']}_train": row['Train Score'],
                f"{row['Model']}_valid": row['Valid Score'],
                f"{row['Model']}_overfit": row['Overfit']
            })
        
        wandb.log({"model_comparison": wandb.Table(dataframe=model_comparison)})
        
        print("wandb initialized successfully!")
        print(f"Run URL: {wandb.run.get_url()}")
        
    except Exception as e:
        print(f"wandb初期化失敗: {e}")
        print("wandbなしで続行します...")
        USE_WANDB = False
else:
    print("wandbは無効です。USE_WANDB=Trueに設定して有効化してください。")

## 7. アンサンブルモデルと予測

In [None]:
# アンサンブルモデルの作成
def create_ensemble(baseline_models, tuned_models):
    """
    Voting Classifierでアンサンブルモデルを作成
    """
    estimators = [
        ('rfc', baseline_models['RandomForest']),
        ('lr', baseline_models['LogisticReg']),
        ('mlpc', baseline_models['MLP']),
        ('cbc', tuned_models['CatBoost'].best_estimator_),
        ('xgb', tuned_models['XGBoost'].best_estimator_),
        ('lgb', tuned_models['LightGBM'].best_estimator_)
    ]
    
    ensemble = VotingClassifier(
        estimators=estimators,
        voting='soft',
        n_jobs=N_JOBS
    )
    
    return ensemble

# アンサンブルモデルのトレーニング
print("=" * 60)
print("Ensemble Model Training")
print("=" * 60)

ensemble = create_ensemble(baseline_models, tuned_models)
ensemble.fit(X_train, y_train)

train_score = ensemble.score(X_train, y_train)
valid_score = ensemble.score(X_valid, y_valid)

print(f"\nEnsemble Model (Soft Voting)")
print(f"  Train Score: {train_score:.3f}")
print(f"  Valid Score: {valid_score:.3f}")
print(f"  Overfit: {train_score - valid_score:.3f}")

if USE_WANDB:
    wandb.log({
        "ensemble_train": train_score,
        "ensemble_valid": valid_score,
        "ensemble_overfit": train_score - valid_score
    })
    wandb.run.summary["final_model"] = "Ensemble (Soft Voting)"
    wandb.run.summary["final_valid_score"] = valid_score

In [None]:
# 予測の生成
print("\n予測を生成中...")
predictions = ensemble.predict(X_test)

# 提出ファイルの作成
submission = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Perished': predictions
})

# 出力ディレクトリの作成
os.makedirs('../output', exist_ok=True)
output_path = '../output/submission_refactored.csv'
submission.to_csv(output_path, index=False)

print(f"\n提出ファイルを保存: {output_path}")
print(f"総予測数: {len(submission)}")
print(f"生存予測: {(predictions == 0).sum()} ({(predictions == 0).sum()/len(predictions)*100:.1f}%)")
print(f"死亡予測: {(predictions == 1).sum()} ({(predictions == 1).sum()/len(predictions)*100:.1f}%)")

if USE_WANDB:
    wandb.log({
        "total_predictions": len(submission),
        "survived_count": (predictions == 0).sum(),
        "died_count": (predictions == 1).sum(),
        "survival_rate": (predictions == 0).sum() / len(predictions)
    })
    
    artifact = wandb.Artifact('submission', type='predictions')
    artifact.add_file(output_path)
    wandb.log_artifact(artifact)

submission.head(10)

In [None]:
# wandbセッション終了
if USE_WANDB:
    wandb.finish()
    print("wandb run finished successfully!")
    print(f"View results at: https://wandb.ai/{wandb.run.entity}/{wandb.run.project}")

print("\n" + "=" * 60)
print("すべての処理が完了しました!")
print("=" * 60)