In [43]:
#モジュール用意

import pandas as pd
import csv as csv
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.externals import joblib
from sklearn.model_selection import cross_val_score, GridSearchCV
import numpy as np
from sklearn.pipeline import Pipeline


In [26]:
#設定

#データ読み込み
train_df = pd.read_csv("kaggle Titanic train.csv", header=0)
test_df = pd.read_csv("kaggle Titanic test.csv", header=0)



#ohe-hot-encodingのカラム設定
ohe_cols = ["Pclass", "Sex", "SibSp", "Parch", "Cabin"]



#評価指標
scoring_index = "accuracy"
scoring_index = "roc_auc"



#アルゴリズム
#RandomForestClassifier
model_name = 'rf_001'
clf = Pipeline([('scl', StandardScaler()), ('est', RandomForestClassifier(n_estimators=100, max_depth=3, random_state=2))])      

#GradientBoostingClassifier
model_name = 'gb_001'
clf = Pipeline([('scl', StandardScaler()), ('est', GradientBoostingClassifier(n_estimators=100, max_depth=3, random_state=2))])



In [27]:
#学習

#準備
#trainデータの特徴量スライス
ID = train_df.iloc[:, 0]
X = train_df.iloc[:, 2:-1]
y = train_df.iloc[:, 1]

#testデータの特徴量スライス
IDs = test_df.iloc[:, [0]]
X_pred = test_df.iloc[:, 1:-1]

#hold-out
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)



#trainデータの整備
#one-hot encoding
X_train_ohe = pd.get_dummies(X_train, dummy_na=True, columns=ohe_cols)
X_train_ohe = X_train_ohe.drop("Name", axis=1)                               #名前とチケット名はとりあえず削除
X_train_ohe = X_train_ohe.drop("Ticket", axis=1)

#imputer
imp = Imputer(missing_values="NaN", strategy="mean", axis=0)
imp.fit(X_train_ohe)
X_train_ohe = pd.DataFrame(imp.transform(X_train_ohe), columns=X_train_ohe.columns.values)
#print(X_train_ohe.columns.values)

#RFECV
selector = RFECV(estimator=RandomForestClassifier(random_state=0), step=0.05)
selector.fit(X_train_ohe, y_train)
X_train_selected = selector.transform(X_train_ohe)
X_train_selected = pd.DataFrame(X_train_selected, columns=X_train_ohe.columns.values[selector.support_])
#print(X_train_ohe_selected)



#testデータの整備
#one-hot encoding
X_test_ohe = pd.get_dummies(X_test, dummy_na=True, columns=ohe_cols)
X_test_ohe = X_test_ohe.drop("Name", axis=1)
X_test_ohe = X_test_ohe.drop("Ticket", axis=1)

#列合わせ(①modelデータにしかないものは0を挿入 ②testデータにしかないものは削除)
X_train_ohe_columns = pd.DataFrame(None, columns=X_train_ohe.columns.values, dtype=float)
X_test_exp = pd.concat([X_train_ohe_columns, X_test_ohe])
X_test_exp.loc[:, list(set(X_train_ohe_columns)-set(X_test_ohe.columns.values))] = \
    X_test_exp.loc[:, list(set(X_train_ohe_columns)-set(X_test_ohe.columns.values))].fillna(0, axis=1)          #①
X_test_exp = X_test_exp.drop(list(set(X_test_exp.columns.values)-set(X_train_ohe_columns)), axis=1)     #②
#X_test_exp

#並び替え
X_test_exp = X_test_exp.reindex_axis(X_train_ohe.columns.values, axis=1)
#X_test_exp

#imputer
X_test_exp = pd.DataFrame(imp.transform(X_test_exp), columns=X_train_ohe.columns.values)
#X_test_exp

#RFECV
X_test_selected = X_test_exp.loc[:, X_train_ohe.columns.values[selector.support_]]
#X_test_selected



#モデリング
#学習
clf.fit(X_train_selected, y_train)

#分類器の保存
joblib.dump(clf, 'titanic_pred_'+ model_name + '.pkl')    #分離器を保存



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




['titanic_pred_gb_001.pkl']

In [46]:
#評価①

#アルゴリズム準備
pipe_rf = Pipeline([('scl', StandardScaler()), ('est', RandomForestClassifier(random_state=2))])      
pipe_gb = Pipeline([('scl', StandardScaler()), ('est', GradientBoostingClassifier(random_state=2))])
pipe_svc = Pipeline([('scl', StandardScaler()), ('est', SVC())])

#パラメーター準備
param_grid_rf = {'est__n_estimators': [10, 50, 100], 'est__max_depth': [3, 5, 7]}
param_grid_gb = {'est__n_estimators': [10, 50, 100], 'est__max_depth': [3, 5, 7]}
param_grid_svc = {'est__gamma': [0.01, 0.1, 1], 'est__C': [0.01, 0.1, 1]}

#scorer準備
scorer = ['accuracy', 'f1', 'roc_auc']                        #scorerの選択が不要であればコメントアウト、その場合はグリッドサーチも変更



#グリッドサーチ
for argorithm, param_grid, scorer in zip([pipe_rf, pipe_gb, pipe_svc], [param_grid_rf, param_grid_gb, param_grid_svc], scorer):

    grid_search = GridSearchCV(argorithm, param_grid=param_grid, scoring=scorer, cv=10)
    grid_search = grid_search.fit(X_train_selected, y_train)

    
    
#グリッドサーチ結果最適
print('Best estimator:', grid_search.best_estimator_)
print('Best score:', grid_search.best_score_)
print('Best param:', grid_search.best_params_)
print('scoreer:', grid_search.scorer_)

#グリッドサーチ結果詳細
#results = pd.DataFrame(grid_search.cv_results_)
#display(results.T)



#モデルのランキング





Best estimator: Pipeline(memory=None,
     steps=[('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('est', SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])
Best score: 0.8424520284009842
Best param: {'est__C': 1, 'est__gamma': 0.01}
scoreer: make_scorer(roc_auc_score, needs_threshold=True)


In [39]:
#評価②

#交差検証
##results = cross_val_score(clf, X_train_selected, y_train, scoring="roc_auc", cv=5)
#results = cross_val_score(clf, X_test_selected, y_test, scoring=scoring_index, cv=5)
#print('cv score', np.average(results), '+-', np.std(results))
#print('cv raw score', results)

#predict_proba
#score = pd.DataFrame(clf.predict_proba(X_test_selected)[:, 1], columns=['pred_score'])
#score.to_csv('predict_proba_' + model_name + '.csv', index=False)

#feature importances
#importance = pd.DataFrame([clf.feature_importances_], columns=X_train_ohe.columns.values[selector.support_])
#importance.T.to_csv('feature_importance_' + model_name + '.csv', index=True)


In [36]:
#予測

#one-hot encoding
X_pred_ohe = pd.get_dummies(X_pred, dummy_na=True, columns=ohe_cols)
X_pred_ohe = X_pred_ohe.drop("Name", axis=1)
X_pred_ohe = X_pred_ohe.drop("Ticket", axis=1)

#列合わせ(①modelデータにしかないものは0を挿入 ②testデータにしかないものは削除)
X_train_ohe_columns = pd.DataFrame(None, columns=X_train_ohe.columns.values, dtype=float)
X_pred_exp = pd.concat([X_train_ohe_columns, X_pred_ohe])
X_pred_exp.loc[:, list(set(X_train_ohe_columns)-set(X_pred_ohe.columns.values))] = \
    X_pred_exp.loc[:, list(set(X_train_ohe_columns)-set(X_pred_ohe.columns.values))].fillna(0, axis=1)          #①
X_pred_exp = X_pred_exp.drop(list(set(X_pred_exp.columns.values)-set(X_train_ohe_columns)), axis=1)     #②
#X_test_exp

#並び替え
X_pred_exp = X_pred_exp.reindex_axis(X_train_ohe.columns.values, axis=1)
#X_test_exp

#imputer
X_pred_exp = pd.DataFrame(imp.transform(X_pred_exp), columns=X_train_ohe.columns.values)
#X_test_exp

#RFECV
X_pred_selected = X_pred_exp.loc[:, X_train_ohe.columns.values[selector.support_]]

#predict
prediction = pd.DataFrame(clf.predict(X_pred_selected), columns=set(['Survived']))
IDs.join(prediction).to_csv('titanic_pred_'+ model_name + '.csv', index=False)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  # Remove the CWD from sys.path while we load stuff.
  app.launch_new_instance()
