Kaggleのタイタニックタスクのパラメータチューニング
https://www.kaggle.com/c/titanic/

k_yoshida 2018/04/19

In [0]:
import pandas as pd
import io
import requests
import xgboost as xgb
import numpy as np
from sklearn import preprocessing
from google.colab import files
from scipy.stats import uniform, randint
from sklearn.model_selection import RandomizedSearchCV, train_test_split

In [0]:
url="https://raw.githubusercontent.com/kuiski/kaggle_titanic/master/input/train.csv"
s=requests.get(url).content
c=pd.read_csv(io.StringIO(s.decode('utf-8')))

In [44]:
c[:3]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [45]:
feature_data = c.loc[:, ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
feature_data[:4]

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S


In [46]:
label_data = c.loc[:, ['Survived']]
label_data[:3]

Unnamed: 0,Survived
0,0
1,1
2,1


In [0]:
sex_encorder = preprocessing.LabelEncoder()
sex_encorder.fit(c['Sex'].astype(str))
feature_data['Sex'] = sex_encorder.transform(feature_data['Sex'])

In [0]:
embarked_encorder = preprocessing.LabelEncoder()
embarked_encorder.fit(c['Embarked'].astype(str))
feature_data['Embarked'] = embarked_encorder.transform(feature_data['Embarked'].astype(str))

In [49]:
feature_data.loc[:3]

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,2
1,1,0,38.0,1,0,71.2833,0
2,3,0,26.0,0,0,7.925,2
3,1,0,35.0,1,0,53.1,2


In [50]:
X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.2)
X_train[:3], X_test[:3], y_train[:3], y_test[:3]

(     Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
 216       3    0  27.0      0      0   7.9250         2
 3         1    0  35.0      1      0  53.1000         2
 388       3    1   NaN      0      0   7.7292         1,
      Pclass  Sex   Age  SibSp  Parch      Fare  Embarked
 557       1    1   NaN      0      0  227.5250         0
 653       3    0   NaN      0      0    7.8292         1
 861       2    1  21.0      1      0   11.5000         2,
      Survived
 216         1
 3           1
 388         0,
      Survived
 557         0
 653         1
 861         0)

In [54]:
xgb_model = xgb.XGBClassifier()
param_distributions={
    'max_depth': randint(3, 7),
    'subsample': uniform(0.5,0.5),
    'colsample_bytree': uniform(0.5,0.5),
    'learning_rate': uniform(0.05,0.3)
}
rs = RandomizedSearchCV(xgb_model,
                        param_distributions,
                        cv=5,
                        n_iter=30,
                        scoring="neg_log_loss",
                        n_jobs=1,
                        verbose=1)
rs.fit(X_train, y_train['Survived'])

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    6.6s finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
          fit_params=None, iid=True, n_iter=30, n_jobs=1,
          param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7ff5d5ca4f98>, 'subsample': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7ff5d5bc2a58>, 'colsample_bytree': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7ff5d5bc2240>, 'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7ff5d5bc28d0>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', s

In [55]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, rs.predict(X_test))

  if diff:


0.8268156424581006

In [56]:
rs.best_params_

{'colsample_bytree': 0.9550645410717153,
 'learning_rate': 0.10306685530415823,
 'max_depth': 3,
 'subsample': 0.8043919410903095}