# Ageの欠損値データ予測

## パッケージの読み込み

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
import optuna
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Run-wide settings.
FOLD = 5  # number of K-fold splits
OPTUNA_N_TRIALS = 1_000  # Optuna trials run for each fold
SEED = 0  # seed shared by KFold, the Optuna sampler and the forests

## データ読み込み

In [None]:
# Locations of the Titanic input files.
train_csv_path = '../data/input/train.csv'
test_csv_path = '../data/input/test.csv'

# Titanic training data
t_train = pd.read_csv(train_csv_path)

# Titanic test data
t_test = pd.read_csv(test_csv_path)

In [None]:
# Preview the first five rows of the training data.
t_train.head(n=5)

In [None]:
# Preview the first five rows of the test data.
t_test.head(n=5)

## 特徴量エンジニアリング

In [None]:
# Stack the training and test data so that the feature engineering below is
# applied to both at once.
# ignore_index=True gives every row of the combined frame a unique label.
# Without it each input keeps its own labels (both start at 0 when read with
# read_csv defaults), so one label would refer to two different passengers and
# label-based selection/assignment on `data` would be ambiguous.
data = pd.concat([t_train, t_test], sort=False, ignore_index=True)

In [None]:
# Build Title (the honorific) from Name, then label-encode it.
# The honorific is the text between ', ' and the following '. ' in a name
# shaped like "Surname, Title. Given names".
#
# Every step assigns its result back to the column. Calling
# data['Title'].replace(..., inplace=True) instead operates on an intermediate
# Series and is not guaranteed to update `data` (it has no effect under pandas
# Copy-on-Write).

# Rare or variant honorifics merged into broader groups.
title_groups = {
    'Mlle': 'Miss',
    'Ms': 'Miss',
    'Mme': 'Miss',
    'Capt': 'Officer',
    'Col': 'Officer',
    'Major': 'Officer',
    'Dr': 'Officer',
    'Rev': 'Officer',
    'Don': 'Royalty',
    'Lady': 'Royalty',
    'Sir': 'Royalty',
    'the Countess': 'Royalty',
    'Jonkheer': 'Royalty',
    'Dona': 'Royalty',
}

# Integer code assigned to each grouped honorific.
title_codes = {'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Officer': 4, 'Royalty': 5}

data['Title'] = data['Name'].map(lambda x: x.split(', ')[1].split('. ')[0])
data['Title'] = data['Title'].replace(title_groups)
# An honorific missing from title_codes maps to NaN, which makes astype(int)
# raise instead of silently producing a bad code.
data['Title'] = data['Title'].map(title_codes).astype(int)

In [None]:
# Encode Sex as 0 (male) / 1 (female).
# The result is assigned back to the column; an in-place replace on
# data['Sex'] would act on an intermediate Series and is not guaranteed to
# update `data` (it has no effect under pandas Copy-on-Write).
# Any value other than 'male' / 'female' becomes NaN.
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})

In [None]:
# FamilySize: number of family members aboard, i.e. siblings/spouses (SibSp)
# plus parents/children (Parch) plus the passenger.
data['FamilySize'] = data['SibSp'].add(data['Parch']).add(1)

In [None]:
# IsAlone: True when the passenger has no family aboard (FamilySize is 1).
data['IsAlone'] = data['FamilySize'].eq(1)

In [None]:
# TicketCount: how many rows share the same Ticket value.
ticket_frequencies = data['Ticket'].value_counts()
ticket_count = dict(ticket_frequencies)
data['TicketCount'] = data['Ticket'].map(ticket_count)

In [None]:
# Flag the rows whose Fare is missing (recorded before Fare is imputed).
data['FareIsNull'] = data['Fare'].isna()

In [None]:
# Fill missing Fare values with the mean Fare of the passenger's Pclass.
# The per-class means are looked up for every row via Pclass, and fillna uses
# them only where Fare is missing. This replaces the earlier approach of
# editing a filtered copy of `data` and writing whole rows back, which
# modified a slice copy and touched every column of the affected rows.
fare_mean_grouped_pclass = data.groupby('Pclass')['Fare'].mean()
data['Fare'] = data['Fare'].fillna(data['Pclass'].map(fare_mean_grouped_pclass))

In [None]:
# Flag the rows whose Cabin is missing (recorded before Cabin is filled).
data['CabinIsNull'] = data['Cabin'].isna()

In [None]:
# CabinInitials: the first character of Cabin, label-encoded.
# Missing cabins are filled with 'Unknown' first, so their initial is 'U'.
cabin_initial_codes = {'U': 0, 'C': 1, 'B': 2, 'D': 3, 'E': 4, 'A': 5, 'F': 6, 'G': 7, 'T': 8}

data['Cabin'] = data['Cabin'].fillna('Unknown')
cabin_first_char = data['Cabin'].str.slice(stop=1)
data['CabinInitials'] = cabin_first_char.map(cabin_initial_codes).astype(int)

In [None]:
# Fill missing Embarked values with 'S' (hard-coded; intended as the most
# frequent value), then label-encode.
# The filled Series is assigned back to the column; fillna(..., inplace=True)
# on data['Embarked'] would act on an intermediate Series and is not
# guaranteed to update `data` (it has no effect under pandas Copy-on-Write),
# in which case the NaN would reach astype(int) and raise.
data['Embarked'] = data['Embarked'].fillna('S')
data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

In [None]:
# One-hot encode the label-encoded categorical columns.
# drop_first=True omits the first level of each column.
one_hot_columns = ['Pclass', 'Sex', 'Embarked', 'Title', 'CabinInitials']
data = pd.get_dummies(data, columns=one_hot_columns, drop_first=True)

In [None]:
# Remove the columns that are not used for training.
delete_columns = ['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin']
data = data.drop(columns=delete_columns)

## 学習・予測

In [None]:
# Split into training rows (Age known) and prediction targets (Age missing).
# Both sides are defined by Age alone so that together they cover every row.
# A bare dropna() would also discard training rows with a missing value in any
# other column, while the prediction rows are never filtered that way.
age_is_missing = data['Age'].isna()
train = data.loc[~age_is_missing, :]
test = data.loc[age_is_missing, :]

In [None]:
# Separate the features from the target (Age).
# The training parts get a fresh 0..n-1 index; the prediction features keep
# their existing index.
train_features = train.drop(columns='Age')
train_target = train['Age']
x_train_all = train_features.reset_index(drop=True)
y_train_all = train_target.reset_index(drop=True)
x_test = test.drop(columns='Age')

In [None]:
# Train RandomForest models with K-fold cross-validation and predict the
# missing Age values. For each fold: tune hyperparameters with Optuna, refit
# with the best ones, score on both splits, and predict the target rows.
kf = KFold(n_splits=FOLD, shuffle=True, random_state=SEED)
models = []
train_scores = []  # R^2 on the training split of each fold
valid_scores = []  # R^2 on the validation split of each fold
y_preds = []
studys = []


def _build_forest(max_features, max_depth, min_samples_split, min_samples_leaf):
    """Return an unfitted RandomForestRegressor.

    The seed and number of trees are fixed; the four arguments are the
    hyperparameters tuned by Optuna. Used for both the tuning trials and the
    final per-fold model so the two cannot drift apart.
    """
    return RandomForestRegressor(
        random_state=SEED,
        n_estimators=500,
        max_features=max_features,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
    )


# Training
for fold, (train_index, valid_index) in enumerate(kf.split(x_train_all)):

    # Validation (K-fold). KFold yields positional indices, so select with iloc.
    x_train = x_train_all.iloc[train_index, :]
    x_valid = x_train_all.iloc[valid_index, :]
    y_train = y_train_all.iloc[train_index]
    y_valid = y_train_all.iloc[valid_index]

    # Hyperparameter tuning
    def objective(trial):
        """Fit a forest with the trial's hyperparameters; return validation RMSE."""
        model = _build_forest(
            max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
            max_depth=trial.suggest_int('max_depth', 1, 500),
            min_samples_split=trial.suggest_int('min_samples_split', 2, 20),
            min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 20),
        )
        model.fit(x_train, y_train)

        y_pred_valid = model.predict(x_valid)
        # RMSE, taken as the square root of the MSE. This avoids the `squared`
        # keyword (squared=True returns MSE, not RMSE, and the keyword is
        # deprecated in recent scikit-learn releases).
        score = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
        return score

    # create_study minimizes the objective by default.
    study = optuna.create_study(sampler=optuna.samplers.RandomSampler(seed=SEED))
    study.optimize(objective, n_trials=OPTUNA_N_TRIALS)

    studys.append(study)

    # Build the model with the best hyperparameters found for this fold.
    # best_params holds exactly the four names suggested in objective().
    model = _build_forest(**study.best_params)
    model.fit(x_train, y_train)
    models.append(model)

    # Evaluate the model.
    # NOTE(review): the hyperparameters were selected on this same validation
    # split, so valid_score is likely optimistic.
    y_pred_train = model.predict(x_train)
    y_pred_valid = model.predict(x_valid)
    train_score = r2_score(y_train, y_pred_train)  # R^2
    valid_score = r2_score(y_valid, y_pred_valid)  # R^2
    train_scores.append(train_score)
    valid_scores.append(valid_score)

    # Predict the rows whose Age is missing.
    y_pred = model.predict(x_test)
    y_preds.append(y_pred)

# Compute the CV scores (mean R^2 over the folds).
cv_train_score = sum(train_scores) / len(train_scores)
cv_valid_score = sum(valid_scores) / len(valid_scores)
print('＝＝＝＝＝＝＝＝＝＝')
print('CV train score:{}'.format(cv_train_score))
print('CV valid score:{}'.format(cv_valid_score))
print('＝＝＝＝＝＝＝＝＝＝')

# Final prediction: the element-wise mean of the per-fold predictions.
y_test = np.mean(y_preds, axis=0)

In [None]:
# Write the averaged Age predictions to CSV.
pred_age_path = '../data/output/pred_age.csv'
np.savetxt(pred_age_path, y_test, delimiter=',')