<a href="https://colab.research.google.com/github/haidoro/Titanic/blob/master/GBDT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Titanic分類
## GBDT活用

In [0]:
# データ加工・処理・分析ライブラリ
import numpy as np
import numpy.random as random
import scipy as sp
from pandas import Series,DataFrame
import pandas as pd
import re
# 可視化ライブラリ
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline
sns.set()

# 機械学習ライブラリ
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import log_loss
# 小数第3位まで表示
%precision 3

pd.set_option('display.max_rows', 900)

In [0]:
# インポート
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Titanicデータを読み込み
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

print('Train欠損値',train.isnull().sum(),len(train))
print('Test欠損値',test.isnull().sum(),len(train))


Train欠損値 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64 891
Test欠損値 PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64 891


In [0]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [0]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [0]:
def change(v):
  data = re.search(r'Mr\.|Mrs|Miss', v)
  if data != None:
    return data.group()


train['hon'] = [change(v) for v in train['Name']]
test['hon'] = [change(v) for v in test['Name']]

In [0]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,hon
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr.
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr.


In [0]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,hon
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,Mr.
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,Mrs
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,Mr.
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,Mr.
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,Mrs


In [0]:
train['Age'] = train['Age'].fillna(train['Fare'].map(lambda x: 50 if x > 70 else 15))
train = train.dropna(subset=['Embarked'])
age_bins =[0,10,20,30,40,50,80]
train['age_group'] = pd.cut(train['Age'],age_bins)

In [0]:
train_X = pd.get_dummies(train[['Survived','Embarked','Pclass', 'age_group','Sex','hon']],dummy_na=True)
# train_X['sex'] = train['Sex'].apply(lambda x : 0 if x=='male' else 1 )
train_X.head()

Unnamed: 0,Survived,Pclass,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan,"age_group_(0, 10]","age_group_(10, 20]","age_group_(20, 30]","age_group_(30, 40]","age_group_(40, 50]","age_group_(50, 80]",age_group_nan,Sex_female,Sex_male,Sex_nan,hon_Miss,hon_Mr.,hon_Mrs,hon_nan
0,0,3,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0
1,1,1,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0
2,1,3,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0
3,1,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0
4,0,3,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0


In [0]:
train_y = train_X['Survived']
train_X = train_X.drop('Survived',axis=1)

# 訓練データとテストデータに分ける
# X_train, X_test, y_train, y_test = train_test_split(
#    train_X, train_y, random_state=0)

In [0]:
# 学習データを学習データとバリデーションデータに分ける
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
from sklearn.model_selection import KFold

train_X.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in train_X.columns.values]

kf = KFold(n_splits=6, shuffle=True, random_state=42)
tr_idx, va_idx = list(kf.split(train_X))[0]
tr_x, va_x = train_X.iloc[tr_idx], train_X.iloc[va_idx]
tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]



In [0]:


# 特徴量と目的変数をxgboostのデータ構造に変換する
dtrain = xgb.DMatrix(tr_x, label=tr_y)
dvalid = xgb.DMatrix(va_x, label=va_y)



# ハイパーパラメータの設定
params = {'objective': 'binary:logistic', 'silent': 1, 'random_state': 42}
num_round = 13


watchlist = [(dtrain,'train'),(dvalid,'eval')]

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
model = xgb.train(params, dtrain, num_round, evals=watchlist)

# バリデーションデータでのスコアの確認
va_pred = model.predict(dvalid)
score = log_loss(va_y, va_pred)
print(f'logloss: {score:.4f}')

[0]	train-error:0.162162	eval-error:0.161074
[1]	train-error:0.162162	eval-error:0.161074
[2]	train-error:0.163514	eval-error:0.161074
[3]	train-error:0.163514	eval-error:0.161074
[4]	train-error:0.162162	eval-error:0.154362
[5]	train-error:0.162162	eval-error:0.154362
[6]	train-error:0.162162	eval-error:0.154362
[7]	train-error:0.162162	eval-error:0.154362
[8]	train-error:0.162162	eval-error:0.154362
[9]	train-error:0.162162	eval-error:0.167785
[10]	train-error:0.159459	eval-error:0.167785
[11]	train-error:0.159459	eval-error:0.167785
[12]	train-error:0.159459	eval-error:0.167785
logloss: 0.3725


  if getattr(data, 'base', None) is not None and \


In [0]:
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())
test['Age'] = test['Age'].fillna(test['Fare'].map(lambda x: 50 if x > 70 else 15))
test['age_group'] = pd.cut(test['Age'],age_bins)

In [0]:
test_X = pd.get_dummies(test[['Embarked','Pclass', 'age_group','Sex','hon']],dummy_na=True)
# test_X['sex'] = test['Sex'].apply(lambda x : 0 if x=='male' else 1 )
test_X.head()

Unnamed: 0,Pclass,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan,"age_group_(0, 10]","age_group_(10, 20]","age_group_(20, 30]","age_group_(30, 40]","age_group_(40, 50]","age_group_(50, 80]",age_group_nan,Sex_female,Sex_male,Sex_nan,hon_Miss,hon_Mr.,hon_Mrs,hon_nan
0,3,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0
1,3,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0
2,2,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0
3,3,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0
4,3,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0


In [0]:
test_X.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in test_X.columns.values]
dtest = xgb.DMatrix(test_X)
prediction = model.predict(dtest)

In [0]:
test['Survived'] = prediction
test['Survived'] = test['Survived'].map(lambda x: 1 if x >= 0.5 else 0)
my_submission = test[['PassengerId', 'Survived']]
my_submission.head(10)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,0
7,899,0
8,900,1
9,901,0


In [0]:
my_submission.to_csv('./submission.csv',index=False)