# Kaggleスタートブック

## パッケージの読み込み

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import pandas_profiling

from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
import optuna
from sklearn.metrics import log_loss

# Number of stratified cross-validation folds.
FOLD = 5
# Number of Optuna trials run for each fold's hyperparameter search.
OPTUNA_N_TRIALS = 1000
# Seed shared by the CV splitter and the Optuna sampler.
SEED = 0

## データ読み込み

In [None]:
# Read the three input files in one pass:
#   train             - labelled training data
#   test              - test data to predict on
#   gender_submission - sample submission file
train, test, gender_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/input/train.csv',
        '../data/input/test.csv',
        '../data/input/gender_submission.csv',
    )
)

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Preview the first rows of the test data.
test.head()

In [None]:
# Preview the first rows of the sample submission file.
gender_submission.head()

## EDA

In [None]:
# Pandas Profiling report for the training data (disabled; uncomment the next line to generate it)
# train.profile_report()

In [None]:
# Plot survival counts for each Pclass value.
ax = sns.countplot(x='Pclass', hue='Survived', data=train)
ax.legend(title='Survived', loc='upper right')

In [None]:
# Plot survival counts for each honorific (Title) extracted from Name.
# Assumes Name is formatted as "<surname>, <title>. <given names>"; the title is the text
# between the first ', ' and the following '. '.
train['Title'] = train['Name'].map(lambda x: x.split(', ')[1].split('. ')[0])

# Fold rare titles into broader groups.
title_groups = {
    'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Miss',
    'Capt': 'Officer', 'Col': 'Officer', 'Major': 'Officer', 'Dr': 'Officer', 'Rev': 'Officer',
    'Don': 'Royalty', 'Lady': 'Royalty', 'Sir': 'Royalty', 'the Countess': 'Royalty', 'Jonkheer': 'Royalty',
}
# Assign the result back instead of calling replace(..., inplace=True) on train['Title']:
# an in-place call on a column selection may act on a temporary copy and leave `train` unchanged.
train['Title'] = train['Title'].replace(title_groups)

sns.countplot(data = train, x = 'Title', hue = 'Survived')
plt.legend(loc = 'upper right', title = 'Survived')

In [None]:
# Plot survival counts for each Sex value.
ax = sns.countplot(x='Sex', hue='Survived', data=train)
ax.legend(title='Survived', loc='upper right')

In [None]:
# Overlay Age histograms for non-survivors (0) and survivors (1); missing ages are skipped.
for survived in (0, 1):
    ages = train.loc[train['Survived'] == survived, 'Age'].dropna()
    plt.hist(ages, bins=50, alpha=0.5, label=str(survived))
plt.xlabel('Age')
plt.ylabel('count')
plt.legend(title='Survived', loc='upper right')

In [None]:
# Plot survival counts for each SibSp value.
ax = sns.countplot(x='SibSp', hue='Survived', data=train)
ax.legend(title='Survived', loc='upper right')

In [None]:
# Plot survival counts for each Parch value.
ax = sns.countplot(x='Parch', hue='Survived', data=train)
ax.legend(title='Survived', loc='upper right')

In [None]:
# Plot survival counts by FamilySize, defined as SibSp + Parch + 1.
train['FamilySize'] = train['SibSp'].add(train['Parch']).add(1)
ax = sns.countplot(x='FamilySize', hue='Survived', data=train)
ax.legend(title='Survived', loc='upper right')

In [None]:
# Plot survival counts by how many training rows share the same Ticket value.
ticket_count = train['Ticket'].value_counts().to_dict()
train['TicketCount'] = train['Ticket'].map(ticket_count)
ax = sns.countplot(x='TicketCount', hue='Survived', data=train)
ax.legend(title='Survived', loc='upper right')

In [None]:
# Overlay Fare histograms for non-survivors (0) and survivors (1); missing fares are skipped.
for survived in (0, 1):
    fares = train.loc[train['Survived'] == survived, 'Fare'].dropna()
    plt.hist(fares, bins=15, alpha=0.5, label=str(survived))
plt.xlabel('Fare')
plt.ylabel('count')
plt.legend(title='Survived', loc='upper right')

In [None]:
# Plot survival counts by whether Cabin is missing.
train['CabinIsNull'] = train['Cabin'].isna()
ax = sns.countplot(x='CabinIsNull', hue='Survived', data=train)
ax.legend(title='Survived', loc='upper right')

In [None]:
# Plot survival counts by the first character of Cabin.
# Missing cabins are filled with 'Unknown' only while deriving the initial (so they show up as 'U').
# train['Cabin'] itself is left unmodified: overwriting it here would hide the missing values from
# any later step that checks train['Cabin'] for nulls.
train['CabinInitials'] = train['Cabin'].fillna('Unknown').str[:1]
sns.countplot(data = train, x = 'CabinInitials', hue = 'Survived')
plt.legend(loc = 'upper right', title = 'Survived')

In [None]:
# Plot survival counts for each Embarked value.
ax = sns.countplot(x='Embarked', hue='Survived', data=train)
ax.legend(title='Survived', loc='upper right')

## 特徴量エンジニアリング

In [None]:
# Stack the training and test data so the same feature engineering is applied to both.
# ignore_index=True gives every row a unique label; without it the test rows would repeat the
# labels already used by the training rows, which makes label-based selection ambiguous.
data = pd.concat([train, test], sort=False, ignore_index=True)

In [None]:
# Pandas Profiling report for the combined data (disabled; uncomment the next line to generate it)
# data.profile_report()

In [None]:
# Build Title (the honorific) from Name, then label-encode it.
# Assumes Name is formatted as "<surname>, <title>. <given names>".
data['Title'] = data['Name'].map(lambda x: x.split(', ')[1].split('. ')[0])

# Fold rare titles into broader groups.
title_groups = {
    'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Miss',
    'Capt': 'Officer', 'Col': 'Officer', 'Major': 'Officer', 'Dr': 'Officer', 'Rev': 'Officer',
    'Don': 'Royalty', 'Lady': 'Royalty', 'Sir': 'Royalty', 'the Countess': 'Royalty',
    'Jonkheer': 'Royalty', 'Dona': 'Royalty',
}
# Integer code for each grouped title.
title_codes = {'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Officer': 4, 'Royalty': 5}

# Assign the result back instead of calling replace(..., inplace=True) on data['Title']:
# an in-place call on a column selection may act on a temporary copy and leave `data` unchanged.
# map() yields a numeric column directly; any title outside title_codes becomes NaN.
data['Title'] = data['Title'].replace(title_groups).map(title_codes)

In [None]:
# Label-encode Sex (male -> 0, female -> 1).
# Assign the mapped column back rather than calling replace(..., inplace=True) on data['Sex']:
# an in-place call on a column selection may act on a temporary copy and leave `data` unchanged.
# map() yields a numeric column directly; any value other than 'male'/'female' becomes NaN.
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})

In [None]:
# Flag rows whose Age is missing (must run before Age is imputed).
data['AgeIsNull'] = data['Age'].isna()

In [None]:
# Fill missing Age values with predictions loaded from a CSV file
# (per the original note, these were produced by a RandomForest model).
# NOTE(review): presumably the file holds one value per missing-Age row, in the row order
# of `data` - confirm against the code that writes pred_age.csv.
age_missing = data['Age'].isnull()
age_pred = np.loadtxt('../data/output/pred_age.csv', delimiter=',')
data.loc[age_missing, 'Age'] = age_pred

In [None]:
# Derive two flags from Age and the encoded Sex column:
#   IsGrownMan   - Age >= 16 and Sex == 0
#   IsLittleGirl - Age <  16 and Sex == 1
is_sex_0 = data['Sex'].eq(0)
is_sex_1 = data['Sex'].eq(1)
data['IsGrownMan'] = data['Age'].ge(16) & is_sex_0
data['IsLittleGirl'] = data['Age'].lt(16) & is_sex_1

In [None]:
# FamilySize = SibSp + Parch + 1 (the +1 counts the passenger).
data['FamilySize'] = data['SibSp'].add(data['Parch']).add(1)

In [None]:
# Bucket FamilySize into FamilyClass:
#   2-4 -> 1, 5-7 -> 2, 8 or more -> 3, anything else -> 0.
family_size = data['FamilySize']
data['FamilyClass'] = np.select(
    [family_size.between(2, 4), family_size.between(5, 7), family_size >= 8],
    [1, 2, 3],
    default=0,
)

In [None]:
# IsAlone is True exactly when FamilySize is 1 (no family members aboard).
data['IsAlone'] = data['FamilySize'].eq(1)

In [None]:
# TicketCount = number of rows in the combined data that share the same Ticket value.
ticket_count = data['Ticket'].value_counts().to_dict()
data['TicketCount'] = data['Ticket'].map(ticket_count)

In [None]:
# Flag rows whose Fare is missing (must run before Fare is imputed).
data['FareIsNull'] = data['Fare'].isna()

In [None]:
# Fill missing Fare values with the mean Fare of the passenger's Pclass.
# The values are written straight into `data` through a boolean mask. This avoids editing an
# intermediate slice of `data` (which pandas may treat as a copy) and then copying whole rows back.
fare_mean_grouped_pclass = data.groupby('Pclass')['Fare'].mean()
fare_is_null = data['Fare'].isnull()

# Look up each affected row's class mean; .to_numpy() makes the assignment positional so it
# does not depend on index-label alignment.
data.loc[fare_is_null, 'Fare'] = (
    data.loc[fare_is_null, 'Pclass'].map(fare_mean_grouped_pclass).to_numpy()
)

In [None]:
# Flag rows whose Cabin is missing (must run before Cabin is filled).
data['CabinIsNull'] = data['Cabin'].isna()

In [None]:
# Build CabinInitials (the first character of Cabin), then label-encode it.
# Missing cabins are filled with 'Unknown' first, so they get the initial 'U'.
data['Cabin'] = data['Cabin'].fillna('Unknown')

# Integer code for each cabin initial.
cabin_initial_codes = {'U': 0, 'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'T': 8}

# Assign the encoded column directly instead of calling replace(..., inplace=True) on
# data['CabinInitials']: an in-place call on a column selection may act on a temporary copy.
# map() yields a numeric column; any initial outside cabin_initial_codes becomes NaN.
data['CabinInitials'] = data['Cabin'].str[:1].map(cabin_initial_codes)

In [None]:
# Fill missing Embarked values with 'S' (the most frequent value, per the original note),
# then label-encode (S -> 0, C -> 1, Q -> 2).
# Assign the result back instead of calling fillna/replace with inplace=True on data['Embarked']:
# an in-place call on a column selection may act on a temporary copy and leave `data` unchanged.
# map() yields a numeric column; any value outside the mapping becomes NaN.
data['Embarked'] = data['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2})

In [None]:
# Drop columns that are not used for training.
# First row: identifiers / raw text columns. Second row: features removed after looking at
# feature importance - add further names here to drop more.
delete_columns = [
    'PassengerId', 'Name', 'Ticket', 'Cabin',
    'FamilyClass', 'CabinIsNull', 'FareIsNull', 'IsLittleGirl',
]
data.drop(columns=delete_columns, inplace=True)

In [None]:
# Split the engineered frame back into training and test parts by row position:
# the first len(train) rows are the training data, the rest are the test data.
n_train = len(train)
train, test = data.iloc[:n_train], data.iloc[n_train:]

## 学習・予測

In [None]:
# Separate the target (Survived) from the feature columns.
y_train_all = train['Survived']
x_train_all = train.drop(columns='Survived')
x_test = test.drop(columns='Survived')

In [None]:
# Train and predict with LightGBM.
# For each stratified CV fold: tune num_leaves / max_bin with Optuna, retrain with the best
# values, record log-loss and feature importance, and predict on the test data.
# The per-fold test predictions are averaged and thresholded at 0.5 at the end.
#
# NOTE(review): the validation fold is used for hyperparameter search, for early stopping and
# for the reported score, so 'CV valid score' is likely optimistic.
# NOTE(review): lgb.train() is called with early_stopping_rounds / verbose_eval keyword
# arguments. LightGBM 4.x expects callbacks (lgb.early_stopping / lgb.log_evaluation)
# instead - confirm against the installed version.
kf = StratifiedKFold(n_splits = FOLD, shuffle = True, random_state = SEED)
models = []
train_scores = []
valid_scores = []
cols = list(x_train_all.columns)
importance_rows = []  # one normalised importance vector per fold
y_preds = []
studys = []

# Columns LightGBM should treat as categorical
categorical_features = ['Pclass', 'Sex', 'Embarked', 'Title', 'CabinInitials']

# Training settings shared by the tuning runs and the final per-fold model.
# The final model previously used early_stopping_rounds = 1 while tuning used 10, so the final
# model was not trained under the conditions the best parameters were selected for.
NUM_BOOST_ROUND = 1000
EARLY_STOPPING_ROUNDS = 10
LOG_PERIOD = 10


def _make_params(num_leaves, max_bin):
    """Return the LightGBM parameter dict for the given tunable values."""
    return {
        'objective': 'binary',
        'learning_rate': 0.01,
        'num_leaves': num_leaves,
        'max_bin': max_bin
    }


def _fit_booster(params, train_set, valid_set):
    """Train a booster on train_set, evaluating on both sets with early stopping."""
    return lgb.train(
        params = params,
        train_set = train_set,
        valid_sets = [train_set, valid_set],
        num_boost_round = NUM_BOOST_ROUND,
        early_stopping_rounds = EARLY_STOPPING_ROUNDS,
        verbose_eval = LOG_PERIOD
    )


# Training
for fold, (train_index, valid_index) in enumerate(kf.split(x_train_all, y_train_all)):

    # Validation split (K-fold).
    # kf.split() yields positional indices, so select with .iloc rather than by label.
    x_train = x_train_all.iloc[train_index]
    x_valid = x_train_all.iloc[valid_index]
    y_train = y_train_all.iloc[train_index]
    y_valid = y_train_all.iloc[valid_index]

    # free_raw_data=False keeps the raw data attached to the Dataset; presumably needed so the
    # same Dataset can be reused while max_bin varies between trials - TODO confirm.
    lgb_train = lgb.Dataset(x_train, y_train, categorical_feature = categorical_features, free_raw_data=False)
    lgb_valid = lgb.Dataset(x_valid, y_valid, categorical_feature = categorical_features, free_raw_data=False)

    # Hyperparameter search objective: validation log-loss for one sampled parameter set
    def objective(trial):
        params = _make_params(
            num_leaves = trial.suggest_int('num_leaves', 2, 128),
            max_bin = trial.suggest_int('max_bin', 200, 500)
        )
        model = _fit_booster(params, lgb_train, lgb_valid)
        y_pred_valid = model.predict(x_valid, num_iteration = model.best_iteration)
        return log_loss(y_valid, y_pred_valid)

    study = optuna.create_study(sampler = optuna.samplers.RandomSampler(seed = SEED))
    study.optimize(objective, n_trials = OPTUNA_N_TRIALS)

    studys.append(study)

    # Final parameters for this fold: the best values found by the study
    params = _make_params(
        num_leaves = study.best_params['num_leaves'],
        max_bin = study.best_params['max_bin']
    )

    # Build the fold's model
    model = _fit_booster(params, lgb_train, lgb_valid)
    models.append(model)

    # Evaluate the model (log-loss on the training and validation parts)
    y_pred_train = model.predict(x_train, num_iteration = model.best_iteration)
    y_pred_valid = model.predict(x_valid, num_iteration = model.best_iteration)
    train_score = log_loss(y_train, y_pred_train)
    valid_score = log_loss(y_valid, y_pred_valid)
    train_scores.append(train_score)
    valid_scores.append(valid_score)

    # Feature importance, normalised so each fold's values sum to 1
    f_importance = np.array(model.feature_importance())
    f_importance = f_importance / np.sum(f_importance)
    importance_rows.append(f_importance)

    # Predict on the test data
    y_pred = model.predict(x_test, num_iteration = model.best_iteration)
    y_preds.append(y_pred)

# One row per fold (index = fold number), one column per feature.
# Built once here instead of concatenating onto a growing frame inside the loop.
importances = pd.DataFrame(importance_rows, index = range(len(importance_rows)), columns = cols)

# CV scores: mean of the per-fold scores
cv_train_score = sum(train_scores) / len(train_scores)
cv_valid_score = sum(valid_scores) / len(valid_scores)
print('＝＝＝＝＝＝＝＝＝＝')
print('CV train score:{}'.format(cv_train_score))
print('CV valid score:{}'.format(cv_valid_score))
print('＝＝＝＝＝＝＝＝＝＝')

# Predictions to submit: average the per-fold probabilities, then threshold at 0.5
y_sub = sum(y_preds) / len(y_preds)
y_sub = (y_sub > 0.5).astype(int)

In [None]:
# Sum each feature's importance over all folds and list the features from most to least
# important (use this when deciding which features to drop).
sum_importance = importances.sum().sort_values(ascending=False)
display(sum_importance)

## データ出力

In [None]:
# Write the submission file: take the sample submission as a template and
# overwrite its Survived column with the predictions.
sub = pd.read_csv('../data/input/gender_submission.csv').assign(Survived=y_sub)
sub.to_csv('../data/output/submission_lightgbm.csv', index=False)