## 0.ライブラリ・データの読み込み

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the Kaggle Titanic training and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/titanic/train.csv', '../input/titanic/test.csv')
)

## 1.データの概観

In [3]:
# Report the row count and column count of each dataset.
# DataFrame.shape is (rows, columns), so it can be unpacked straight into the template.
print('訓練データのデータ数は{}、変数は{}種類です。'.format(*train.shape))
print('テストデータのデータ数は{}、変数は{}種類です'.format(*test.shape))

In [4]:
# Preview the first rows of the training data.
train.head()

In [5]:
# Preview the first rows of the test data.
test.head()

In [6]:
# List the column labels of the training data.
train.columns

In [7]:
# List the column labels of the test data.
test.columns

## 2.EDA

In [8]:
# Count missing values per column in the training data.
train.isna().sum()

In [9]:
# Count missing values per column in the test data.
test.isna().sum()

In [10]:
# Survival balance: share of each outcome (pie, left) next to raw counts (bars, right).
f, ax = plt.subplots(1, 2, figsize=(18, 8), facecolor='gray')
train['Survived'].value_counts().plot.pie(explode=[0, 0.1], autopct='%1.1f%%', ax=ax[0], shadow=True)
ax[0].set_title('Survived')
ax[0].set_ylabel('')
# The column is passed as x=...: in seaborn 0.12+ `data` is the only positional
# parameter, so a positional column name raises TypeError.
sns.countplot(x='Survived', data=train, ax=ax[1])
ax[1].set_title('Survived')
plt.show()

In [11]:
# Passenger class: head count per class (left) and survival split per class (right).
f, ax = plt.subplots(1, 2, figsize=(18, 8), facecolor='gray')
train['Pclass'].value_counts().plot.bar(color=['#CD7F32', '#FFDF00', '#D3D3D3'], ax=ax[0])
ax[0].set_title('Number of Passengers By Pclass')
ax[0].set_ylabel('Count')
# The column is passed as x=...: in seaborn 0.12+ `data` is the only positional
# parameter, so a positional column name raises TypeError.
sns.countplot(x='Pclass', hue='Survived', data=train, ax=ax[1])
ax[1].set_title('Pclass:Perished vs Survived')
plt.show()

In [12]:
# Age distribution split by survival, grouped by passenger class (left) and by sex (right).
f, ax = plt.subplots(1, 2, figsize=(18, 8), facecolor='gray')
# x/y are passed by keyword: in seaborn 0.12+ `data` is the only positional
# parameter, so positional column names raise TypeError.
sns.violinplot(x="Pclass", y="Age", hue="Survived", data=train, split=True, ax=ax[0])
ax[0].set_title('Pclass and Age vs Survived')
ax[0].set_yticks(range(0, 110, 10))
sns.violinplot(x="Sex", y="Age", hue="Survived", data=train, split=True, ax=ax[1])
ax[1].set_title('Sex and Age vs Survived')
ax[1].set_yticks(range(0, 110, 10))
plt.show()

In [13]:
# Correlation heatmap of the numeric columns.
# Non-numeric columns are removed first: pandas 2.0+ no longer drops them
# silently inside DataFrame.corr() and raises on string columns instead.
numeric_corr = train.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, cmap='bwr', linewidths=0.2)
fig = plt.gcf()
fig.set_size_inches(10, 8)
plt.show()

## 3.前処理

In [14]:
# Impute missing Age values with a random-forest regressor that is trained on
# the training rows whose Age is known.
age_columns = ['Age', 'Pclass', 'Sex', 'Parch', 'SibSp']

# One-hot encode the label feature(s); Age stays in column 0. Casting to float
# keeps `.values` a plain numeric array instead of a mixed/object one.
age_df = pd.get_dummies(train[age_columns]).astype(float)

# Split the training rows by whether Age is known and convert to numpy.
known_age = age_df[age_df.Age.notnull()].values
unknown_age = age_df[age_df.Age.isnull()].values

# Column 0 (Age) is the target; the remaining columns are the predictors.
X = known_age[:, 1:]
y = known_age[:, 0]

# Fit the age-estimation model.
rfr = RandomForestRegressor(random_state=0, n_estimators=100, n_jobs=-1)
rfr.fit(X, y)

# Predict and fill the missing Age values of the training data.
# The guard avoids calling predict() on an empty matrix when nothing is missing.
if len(unknown_age):
    predictedAges = rfr.predict(unknown_age[:, 1:])
    train.loc[train.Age.isnull(), 'Age'] = predictedAges

# Fill the missing Age values of the test data with the same model, so train
# and test go through identical preprocessing. reindex() guarantees the same
# column order as the matrix the model was fitted on.
test_age_df = (
    pd.get_dummies(test[age_columns])
    .reindex(columns=age_df.columns, fill_value=0)
    .astype(float)
)
test_age_missing = test_age_df.Age.isnull()
if test_age_missing.any():
    test.loc[test_age_missing, 'Age'] = rfr.predict(
        test_age_df.loc[test_age_missing].values[:, 1:]
    )

In [15]:
# Fill missing Fare values with the mean fare over the training and test data combined.
fare = pd.concat([train['Fare'], test['Fare']])
fare_mean = fare.mean()

# Assign the result back instead of calling fillna(inplace=True) on a selected
# column: that chained in-place call does not update the parent DataFrame under
# pandas Copy-on-Write.
train['Fare'] = train['Fare'].fillna(fare_mean)
test['Fare'] = test['Fare'].fillna(fare_mean)

In [16]:
# Remove the Cabin column from both datasets, then re-check the remaining
# missing values in the training data.
train = train.drop(columns='Cabin')
test = test.drop(columns='Cabin')

train.isnull().sum()

In [17]:
# Passenger count per port of embarkation.
# The column is passed as x=...: in seaborn 0.12+ `data` is the only positional
# parameter, so a positional column name raises TypeError.
sns.countplot(x='Embarked', data=train)
plt.title('Number of Passengers Boarded')
plt.show()

In [18]:
# Fill missing Embarked values with 'S' in both datasets.
# Assign the result back instead of calling fillna(inplace=True) on a selected
# column: that chained in-place call does not update the parent DataFrame under
# pandas Copy-on-Write.
train['Embarked'] = train['Embarked'].fillna('S')
test['Embarked'] = test['Embarked'].fillna('S')

train.isnull().sum()

In [19]:
# Remove the Name and Ticket columns from both datasets, then show what is left.
unused_columns = ['Name', 'Ticket']
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)

train.columns

In [20]:
# Encode Sex numerically in both datasets: male -> 0, female -> 1.
sex_codes = {'Sex': {'male': 0, 'female': 1}}
train = train.replace(sex_codes)
test = test.replace(sex_codes)

train.head()

In [21]:
# One-hot encode Embarked. Train and test are encoded together so both end up
# with the same dummy columns, then the result is split back by position.
n_train = len(train)  # split point derived from the data instead of a hard-coded 891
embarked = pd.concat([train['Embarked'], test['Embarked']])

embarked_ohe = pd.get_dummies(embarked)

# Positional split: the first n_train rows came from train, the rest from test.
embarked_ohe_train = embarked_ohe.iloc[:n_train]
embarked_ohe_test = embarked_ohe.iloc[n_train:]

# Attach the dummy columns (aligned on each frame's own index) and drop the
# original label column.
train = pd.concat([train, embarked_ohe_train], axis=1)
test = pd.concat([test, embarked_ohe_test], axis=1)

train.drop('Embarked', axis=1, inplace=True)
test.drop('Embarked', axis=1, inplace=True)

train.head()

## 4.ベースラインモデルの構築

In [22]:
# Build the numpy matrices for modelling by column position.
# NOTE(review): presumably column 0 is PassengerId in both frames and column 1
# of train is Survived -- confirm against the column listing.
train_features = train.iloc[:, 2:]
train_target = train.iloc[:, 1]
X = train_features.to_numpy()
y = train_target.to_numpy()

X_test = test.iloc[:, 1:].to_numpy()

In [23]:
# Hold out 30% of the training rows for validation; the seed fixes the split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

ランダムフォレストによる結果

In [24]:
# Baseline random forest fitted on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
rfc = RandomForestClassifier(
    max_depth=10,
    min_samples_leaf=1,
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)
rfc

In [25]:
# Accuracy of the baseline forest on the training split and on the held-out split.
rfc_train_score = round(rfc.score(X_train, y_train), 3)
rfc_valid_score = round(rfc.score(X_valid, y_valid), 3)
print(f'Train Score: {rfc_train_score}')
print(f' Test Score: {rfc_valid_score}')

過学習が発生していると考えられる

グリッドサーチによるハイパラ調整

In [26]:
# Manual grid search: fit one forest per (max_depth, min_samples_leaf) pair on
# the training split and print its accuracy on both splits.
param_grid = {'max_depth': [3, 5, 7],
              'min_samples_leaf': [1, 2, 4]}

# Enumerate every combination, max_depth varying slowest.
grid_points = [(depth, leaf)
               for depth in param_grid['max_depth']
               for leaf in param_grid['min_samples_leaf']]

for max_depth, min_samples_leaf in grid_points:
    rfc_grid = RandomForestClassifier(
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        n_estimators=100,
        n_jobs=-1,
        random_state=42,
    )
    rfc_grid.fit(X_train, y_train)
    grid_train_score = round(rfc_grid.score(X_train, y_train), 3)
    grid_valid_score = round(rfc_grid.score(X_valid, y_valid), 3)
    print(f'max_depth: {max_depth}, min_samples_leaf: {min_samples_leaf}')
    print(f'    Train Score: {grid_train_score}, Test Score: {grid_valid_score}')

クロスバリデーション

In [27]:
# 5-fold cross-validated grid search over param_grid on the full training matrix.
base_forest = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
rfc_gs = GridSearchCV(base_forest, param_grid, cv=5)
rfc_gs.fit(X, y)

print(f'Best Parameters: {rfc_gs.best_params_}')
print(f'CV Score: {round(rfc_gs.best_score_, 3)}')

## 5.特徴量エンジニアリング

In [28]:
# Work on copies so the feature-engineering experiment leaves train/test untouched.
train_fe, test_fe = train.copy(), test.copy()

In [29]:
# New feature: Family = SibSp + Parch, added to the copies only.
train_fe['Family'] = train['SibSp'].add(train['Parch'])
test_fe['Family'] = test['SibSp'].add(test['Parch'])

train_fe.head()

In [30]:
# Rebuild the matrices from the feature-engineered frames, using the same
# positional column selection as the baseline.
X_fe = train_fe.iloc[:, 2:].to_numpy()
y_fe = train_fe.iloc[:, 1].to_numpy()

X_fe_test = test_fe.iloc[:, 1:].to_numpy()

# Same split settings as the baseline (30% hold-out, seed 42).
X_fe_train, X_fe_valid, y_fe_train, y_fe_valid = train_test_split(
    X_fe,
    y_fe,
    test_size=0.3,
    random_state=42,
)

# Fit a forest on the extended feature set and report accuracy on both splits.
rfc_fe = RandomForestClassifier(
    max_depth=7,
    min_samples_leaf=1,
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
)
rfc_fe.fit(X_fe_train, y_fe_train)

fe_train_score = round(rfc_fe.score(X_fe_train, y_fe_train), 3)
fe_valid_score = round(rfc_fe.score(X_fe_valid, y_fe_valid), 3)
print(f'Train Score: {fe_train_score}')
print(f' Test Score: {fe_valid_score}')

スコアが下がったので不採用

## 6.モデル構築と調整

ロジスティック回帰モデル

In [31]:
# Logistic regression fitted on the training split, scored on both splits.
lr = LogisticRegression(random_state=42).fit(X_train, y_train)

lr_train_score = round(lr.score(X_train, y_train), 3)
lr_valid_score = round(lr.score(X_valid, y_valid), 3)

print('Logistic Regression \n')
print(f'Train Score: {lr_train_score}')
print(f' Test Score: {lr_valid_score}')

多層パーセプトロン

In [32]:
# Multilayer perceptron with hidden layers of 100, 100 and 10 units, fitted on
# the training split and scored on both splits.
mlpc = MLPClassifier(hidden_layer_sizes=(100, 100, 10), random_state=0).fit(X_train, y_train)

mlpc_train_score = round(mlpc.score(X_train, y_train), 3)
mlpc_valid_score = round(mlpc.score(X_valid, y_valid), 3)

print('Multilayer Perceptron \n')
print(f'Train Score: {mlpc_train_score}')
print(f' Test Score: {mlpc_valid_score}')

## 7.アンサンブリング

In [33]:
# Replace any NaN left in the test matrix with the mean of the SAME feature
# column. The previous fill value, np.nanmean(X_test), was a single scalar
# averaged over every feature column at once, so a missing value in one column
# was filled with a blend of all the other features.
# Column means are taken from the training matrix X, the data the models were
# fitted on. Both matrices are cast to float first: np.nan_to_num returns
# object-dtype input unchanged, and np.array(...) makes a copy so the source
# frame is not modified.
X_test = np.array(X_test, dtype=float)
train_col_means = np.nanmean(np.asarray(X, dtype=float), axis=0)
nan_rows, nan_cols = np.where(np.isnan(X_test))
X_test[nan_rows, nan_cols] = train_col_means[nan_cols]

In [34]:
# Soft-voting ensemble: average the class probabilities of the three fitted
# models, then take the column index with the highest mean probability.
rfc_pred, lr_pred, mlpc_pred = (
    model.predict_proba(X_test) for model in (rfc, lr, mlpc)
)

pred_proba = (rfc_pred + lr_pred + mlpc_pred) / 3
pred = np.argmax(pred_proba, axis=1)

## 8.提出コードの作成

In [35]:
# Show the shape of the ensemble prediction array.
pred.shape

In [37]:
# Display the ensemble prediction array.
pred

In [38]:
# Passenger ids of the test rows as an integer numpy array (used as the submission index).
passengerID = test['PassengerId'].to_numpy().astype(int)

In [43]:
# Submission frame: one 'Survived' column holding the predictions, indexed by passenger id.
my_solution = pd.DataFrame({'Survived': pred}, index=passengerID)

In [44]:
# Preview the first 10 rows of the submission frame.
my_solution.head(10)

In [45]:
# Write the submission file; the index is written as the 'PassengerId' column.
my_solution.to_csv('submission.csv', index_label='PassengerId')