In [None]:
# ============================================
# 🧊 Titanic 生存予測プロジェクト
# Author: k-ohyeah
# Date: 2025-10-07
# Model: Logistic Regression + Feature Engineering
# ============================================

# 目的：
# Titanicデータセットを用いて、生存者を予測する分類モデルを作成する。
# データ理解 → 前処理 → 特徴量エンジニアリング → モデル構築 → チューニング → 評価 の流れで進める。

# 手順概要：
# 1. データの確認・可視化
# 2. 前処理（欠損補完、カテゴリ処理など）
# 3. 特徴量エンジニアリング（Title作成、FamilySizeなど）
# 4. モデル構築（ロジスティック回帰）
# 5. モデル比較とチューニング（GridSearchCV）
# 6. 最終評価と提出データ作成

In [19]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [20]:
#ライブラリ読み込み
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

# Load the Kaggle Titanic training and test sets.
train = pd.read_csv("/kaggle/input/titanic/train.csv")
test = pd.read_csv("/kaggle/input/titanic/test.csv")

# Keep an independent snapshot of the untouched test set; its PassengerId column
# is needed later when the submission file is built.
# `.copy()` matters: a plain `raw_test = test` only creates a second name for the
# same DataFrame, so later in-place column edits on `test` would also change it.
raw_test = test.copy()

# Quick look at the first rows of each frame.
print("trainデータ")
print(train.head())

print("testデータ")
print(test.head())

trainデータ
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN     

In [21]:
# Missing-value check: for train and then test, print the row count followed by
# the number of missing values in each column.
reports = [
    ("trainデータ件数:", train),
    ("testデータ件数:", test),
]

for position, (count_label, frame) in enumerate(reports):
    if position > 0:
        print()  # blank line between the two reports
    print(count_label, len(frame))
    print()  # blank line
    print("各列の欠損数:")
    print(frame.isna().sum())


trainデータ件数: 891

各列の欠損数:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

testデータ件数: 418

各列の欠損数:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [22]:
# Additional preprocessing (add Title; change how missing ages are imputed)

# Extract the title (honorific) from Name: the run of letters that follows a space
# and is immediately followed by '.', e.g. "Braund, Mr. Owen Harris" -> "Mr".
# The pattern is a raw string so that '\.' reaches the regex engine verbatim;
# in a normal string literal '\.' is an invalid escape sequence and triggers a
# warning on recent Python versions.
title_pattern = r' ([A-Za-z]+)\.'
train['Title'] = train['Name'].str.extract(title_pattern, expand=False)
test['Title'] = test['Name'].str.extract(title_pattern, expand=False)

# Check how often each title occurs
print("=== Title counts (Train) ===")
print(train['Title'].value_counts())
print("\n=== Title counts (Test) ===")
print(test['Title'].value_counts())

# Survival rate per title (displayed as the cell's output)
train.groupby('Title')['Survived'].mean().sort_values(ascending=False)

=== Title counts (Train) ===
Title
Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Mlle          2
Major         2
Col           2
Countess      1
Capt          1
Ms            1
Sir           1
Lady          1
Mme           1
Don           1
Jonkheer      1
Name: count, dtype: int64

=== Title counts (Test) ===
Title
Mr        240
Miss       78
Mrs        72
Master     21
Col         2
Rev         2
Ms          1
Dr          1
Dona        1
Name: count, dtype: int64


Title
Sir         1.000000
Countess    1.000000
Ms          1.000000
Mme         1.000000
Lady        1.000000
Mlle        1.000000
Mrs         0.792000
Miss        0.697802
Master      0.575000
Col         0.500000
Major       0.500000
Dr          0.428571
Mr          0.156673
Jonkheer    0.000000
Don         0.000000
Rev         0.000000
Capt        0.000000
Name: Survived, dtype: float64

In [23]:
# Additional preprocessing: collapse the extracted titles into five groups
# (Mr, Miss, Mrs, Master, Rare).

# Re-extract the title (honorific) from Name so the grouping below always starts
# from the raw titles, even if this cell is run again.
# Raw string: '\.' in a normal string literal is an invalid escape sequence.
train['Title'] = train['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
test['Title'] = test['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# === Title grouping ===
# The four main titles are kept as they are.
main_titles = ['Mr', 'Miss', 'Mrs', 'Master']

# Known uncommon titles that are merged into 'Rare'.
rare_titles = [
    'Dr', 'Rev', 'Col', 'Major', 'Capt', 'Countess',
    'Lady', 'Sir', 'Don', 'Jonkheer', 'Dona'
]

# Title conversion map
title_map = {t: 'Rare' for t in rare_titles}
# Variants of Miss / Mrs are folded into the corresponding main title, not 'Rare'.
title_map.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

# Apply the conversion
train['Title'] = train['Title'].replace(title_map)
test['Title'] = test['Title'].replace(title_map)

# Safety net: anything that is still not one of the main titles (a title missing
# from the lists above, or a name where no title was found) also becomes 'Rare',
# so train and test can never end up with different sets of title categories.
train['Title'] = train['Title'].where(train['Title'].isin(main_titles), 'Rare')
test['Title'] = test['Title'].where(test['Title'].isin(main_titles), 'Rare')

# Check the resulting distribution
print("--- trainデータ ---")
print(train['Title'].value_counts())
print("\n--- testデータ ---")
print(test['Title'].value_counts())

--- trainデータ ---
Title
Mr        517
Miss      185
Mrs       126
Master     40
Rare       23
Name: count, dtype: int64

--- testデータ ---
Title
Mr        240
Miss       79
Mrs        72
Master     21
Rare        6
Name: count, dtype: int64


In [24]:
# Additional preprocessing: fill missing ages with the median age of the
# passenger's title group.

# Median age per title, computed on the training set only (before any filling).
# The same values are then applied to both train and test, so the test set is
# imputed with exactly the statistics printed here rather than with its own.
title_age_median = train.groupby('Title')['Age'].median()
print(title_age_median)

# Fallback for a title group that has no known age in train.
overall_age_median = train['Age'].median()

for dataset in [train, test]:
    dataset['Age'] = (
        dataset['Age']
        .fillna(dataset['Title'].map(title_age_median))
        .fillna(overall_age_median)
    )


# Check: both counts of remaining missing ages should be 0
print(train['Age'].isnull().sum(), test['Age'].isnull().sum())

Title
Master     3.5
Miss      21.0
Mr        30.0
Mrs       35.0
Rare      48.5
Name: Age, dtype: float64
0 0


In [25]:
# Additional preprocessing: one-hot encode Title (dummy variables).

# Use the title categories found in train as the single source of truth, so that
# train and test always get the same dummy columns and the same dropped baseline,
# even if a category happens not to occur in the test set.
title_categories = sorted(train['Title'].dropna().unique())

# A title that appears only in test cannot be represented by the training
# columns; stop here with a clear message instead of failing later at predict time.
unseen_titles = set(test['Title'].dropna()) - set(title_categories)
if unseen_titles:
    raise ValueError(f"Titles present in test but not in train: {sorted(unseen_titles)}")

train['Title'] = pd.Categorical(train['Title'], categories=title_categories)
test['Title'] = pd.Categorical(test['Title'], categories=title_categories)

# drop_first=True drops the first category (in sorted order) as the baseline.
train = pd.get_dummies(train, columns=['Title'], drop_first=True, dtype=int)
test = pd.get_dummies(test, columns=['Title'], drop_first=True, dtype=int)

print("trainデータ")
print(train.head())

print("testデータ")
print(test.head())

trainデータ
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  Title_Miss  Title_Mr  \
0      0         A/5 21171   7.2500   NaN        S           0         1   
1      0          PC 17599  71.2833   C85        C           0         0   
2      0  STON/O2. 3101282   7.9250   NaN        S           1         0   
3      

In [26]:
# Preprocessing of the training set: imputation, derived features, column
# removal and categorical encoding, followed by a sanity check.

from sklearn.preprocessing import LabelEncoder

# --- Missing values ---

# Embarked is filled with its most frequent value, Fare with its median.
embarked_mode = train['Embarked'].mode().iloc[0]
fare_median = train['Fare'].median()
train['Embarked'] = train['Embarked'].fillna(embarked_mode)
train['Fare'] = train['Fare'].fillna(fare_median)

# Cabin has too many missing values to use directly, so only keep a
# "cabin is recorded" flag (1 = present, 0 = missing).
train['Cabin_flag'] = train['Cabin'].notna().astype(int)

# --- Derived features ---

# Family size = siblings/spouses aboard (SibSp) + parents/children aboard (Parch) + the passenger
family_size = train['SibSp'] + train['Parch'] + 1
train['FamilySize'] = family_size

# Interaction between age and passenger class
train['Age*Class'] = train['Age'].mul(train['Pclass'])

# Fare divided by family size.
# NOTE(review): the original note presumably meant that Fare is the total fare
# for the whole ticket, which is why it is split per person — confirm.
train['Fare_per_person'] = train['Fare'].div(family_size)

# 1 when the passenger travels alone (FamilySize == 1), else 0
train['IsAlone'] = family_size.eq(1).astype(int)

# --- Drop columns that are no longer needed ---

cols_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin']  # Cabin itself is dropped; its flag was created above
train = train.drop(cols_to_drop, axis=1)

# --- Categorical encoding ---

# Sex -> integer codes
le = LabelEncoder()
le.fit(train['Sex'])
train['Sex'] = le.transform(train['Sex'])

# Embarked -> one-hot columns (first category dropped)
train = pd.get_dummies(train, columns=['Embarked'], drop_first=True, dtype=int)

# --- Check the result ---

print("trainデータ件数:", len(train))  # number of rows (observations)
print()  # blank line

print("各列の欠損数(補完後):")
print(train.isna().sum())
print()  # blank line

print(train.head())

trainデータ件数: 891

各列の欠損数(補完後):
Survived           0
Pclass             0
Sex                0
Age                0
SibSp              0
Parch              0
Fare               0
Title_Miss         0
Title_Mr           0
Title_Mrs          0
Title_Rare         0
Cabin_flag         0
FamilySize         0
Age*Class          0
Fare_per_person    0
IsAlone            0
Embarked_Q         0
Embarked_S         0
dtype: int64

   Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Title_Miss  Title_Mr  \
0         0       3    1  22.0      1      0   7.2500           0         1   
1         1       1    0  38.0      1      0  71.2833           0         0   
2         1       3    0  26.0      0      0   7.9250           1         0   
3         1       1    0  35.0      1      0  53.1000           0         0   
4         0       3    1  35.0      0      0   8.0500           0         1   

   Title_Mrs  Title_Rare  Cabin_flag  FamilySize  Age*Class  Fare_per_person  \
0          0           

In [27]:
# Preprocessing of the test set. It mirrors the train preprocessing cell and must
# run after it: statistics and the final column layout are taken from `train`.

# --- Missing values ---

# Embarked: fill with the most frequent value. The missing-value check earlier in
# the notebook reports no missing Embarked in test, so this is only a safeguard.
test['Embarked'] = test['Embarked'].fillna(test['Embarked'].mode()[0])

# Fare: fill with the median of the TRAINING set, so that test is imputed with
# the same statistic the model's training data was prepared with.
test['Fare'] = test['Fare'].fillna(train['Fare'].median())

# Cabin has too many missing values to use directly, so only keep a
# "cabin is recorded" flag (1 = present, 0 = missing).
test['Cabin_flag'] = test['Cabin'].notnull().astype(int)

# --- Derived features ---

# Family size = siblings/spouses aboard (SibSp) + parents/children aboard (Parch) + the passenger
test['FamilySize'] = test['SibSp'] + test['Parch'] + 1

# Interaction between age and passenger class
test['Age*Class'] = test['Age'] * test['Pclass']

# Fare divided by family size.
# NOTE(review): the original note presumably meant that Fare is the total fare
# for the whole ticket, which is why it is split per person — confirm.
test['Fare_per_person'] = test['Fare'] / (test['FamilySize'])

# 1 when the passenger travels alone (FamilySize == 1), else 0
test['IsAlone'] = (test['FamilySize'] == 1).astype(int)

# --- Drop columns that are no longer needed ---

cols_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin']  # Cabin itself is dropped; its flag was created above
test = test.drop(columns=cols_to_drop)

# --- Categorical encoding ---

# Sex -> 0/1 with an explicit mapping. This is the alphabetical coding that the
# LabelEncoder fitted on train produces (female=0, male=1), but it does not
# depend on re-fitting an encoder on the test data.
sex_codes = {'female': 0, 'male': 1}
test['Sex'] = test['Sex'].map(sex_codes)

# Embarked -> one-hot columns (first category dropped)
test = pd.get_dummies(test, columns=['Embarked'], drop_first=True, dtype=int)

# --- Align with the training features ---

# Select exactly the training feature columns, in the training order. This raises
# a KeyError right here if a feature is missing from test, instead of failing
# (or silently mis-ordering features) at prediction time.
feature_columns = train.columns.drop('Survived')
test = test[feature_columns]

# --- Check the result ---

print("testデータ件数:", len(test))  # number of rows (observations)
print()  # blank line

print("各列の欠損数(補完後):")
print(test.isnull().sum())
print()  # blank line

print(test.head())

testデータ件数: 418

各列の欠損数(補完後):
Pclass             0
Sex                0
Age                0
SibSp              0
Parch              0
Fare               0
Title_Miss         0
Title_Mr           0
Title_Mrs          0
Title_Rare         0
Cabin_flag         0
FamilySize         0
Age*Class          0
Fare_per_person    0
IsAlone            0
Embarked_Q         0
Embarked_S         0
dtype: int64

   Pclass  Sex   Age  SibSp  Parch     Fare  Title_Miss  Title_Mr  Title_Mrs  \
0       3    1  34.5      0      0   7.8292           0         1          0   
1       3    0  47.0      1      0   7.0000           0         0          1   
2       2    1  62.0      0      0   9.6875           0         1          0   
3       3    1  27.0      0      0   8.6625           0         1          0   
4       3    0  22.0      1      1  12.2875           0         0          1   

   Title_Rare  Cabin_flag  FamilySize  Age*Class  Fare_per_person  IsAlone  \
0           0           0           1    

In [28]:
# Separate the target from the features, then hold out part of the data for validation.

from sklearn.model_selection import train_test_split

y = train["Survived"]                  # target
X = train.drop(columns=["Survived"])   # features

# 70% / 30% split, stratified on the target, with a fixed seed.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)


# Check shapes and the survival rate in each part
print("X shape:", X.shape)
print("X_train shape:", X_train.shape, "X_val shape:", X_val.shape)
for rate_label, target_part in (
    ("y_train mean (survival rate):", y_train),
    ("y_val mean (survival rate):", y_val),
):
    print(rate_label, target_part.mean())

# Show the dtype summary and list any columns that are still of object dtype
print("\nColumn dtypes:\n", X.dtypes.value_counts())
object_columns = [name for name, column_dtype in X.dtypes.items() if column_dtype == 'object']
print("\nObject columns:", object_columns)


X shape: (891, 17)
X_train shape: (623, 17) X_val shape: (268, 17)
y_train mean (survival rate): 0.38362760834670945
y_val mean (survival rate): 0.3843283582089552

Column dtypes:
 int64      13
float64     4
Name: count, dtype: int64

Object columns: []


In [29]:
# Logistic regression tuned with GridSearchCV

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score


# Hyperparameter candidates
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],   # inverse regularization strength
    'penalty': ['l1', 'l2'],        # type of regularization
    'solver': ['liblinear']         # liblinear supports both l1 and l2
}

# Base model. random_state is fixed because the liblinear solver shuffles the
# data internally; without a seed the fitted model may differ between runs.
log_reg = LogisticRegression(max_iter=1000, random_state=42)

# Grid search configuration
grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    scoring='roc_auc',   # select by ROC-AUC
    cv=5,                # 5-fold cross-validation
    n_jobs=-1            # run in parallel
)

# Fit on the training part only
grid_search.fit(X_train, y_train)

# Best parameters and their cross-validated score
print("Best Parameters:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)

# Evaluate the best model on the held-out validation data
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_val)
probas = best_model.predict_proba(X_val)[:,1]  # probability of the positive class

print("Validation Accuracy:", accuracy_score(y_val, y_pred))
print("Validation ROC AUC:", roc_auc_score(y_val, probas))

Best Parameters: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Best CV Score: 0.8677818291497603
Validation Accuracy: 0.8208955223880597
Validation ROC AUC: 0.8692262430126507


In [30]:
# Build the submission from the preprocessed test set (it must have gone through
# the same preprocessing as train).

# Select exactly the columns the model was trained on, in the same order.
# This fails with a KeyError if a training feature is missing from test, rather
# than passing a mismatched frame to the model.
X_test = test[X_train.columns]

# Predict
test_pred = best_model.predict(X_test)

# One prediction per original test passenger is required for the submission.
assert len(test_pred) == len(raw_test), "prediction count does not match the test set"

# Submission data frame
submission = pd.DataFrame({
    'PassengerId': raw_test['PassengerId'],
    'Survived': test_pred
})

# Save as CSV
submission.to_csv("submission_20251007_title_update.csv", index=False)