# タイタニックデータセットのための簡易Auto ML

このノートブックでは、タイタニックのテーブルデータに対して、複数のアルゴリズムをパイプラインに並べ、同時にGridSearchを行う一連の流れを整理しました。  
簡易的に複数のアルゴリズムを使用し、最適なパラメータをGridSearchにて探索します。  

1. 準備（ライブラリのインポート）
2. データの準備、設定
3. データセットの読み込み
4. データの前処理  
  4-1. 敬称の抽出  
  4-2. 欠損値の補完(年齢データを予測するモデルを構築し、予測結果にて欠損値を補完する)  
  4-3. One Hot エンコーディング  
  4-4. Simple Imputerを用いた欠損値補完  
  4-5. RFEを用いた次元圧縮  
5. pipelineの設定  
  5-1. 使用するアルゴリズムをPipelineとして並べる  
  5-2. GridSearchに使用するパラメータの設定  
6. pipelineにて定義したアルゴリズムをGrid Searchとともに実行する  
7. 最終的に使用するアルゴリズムの選択  
8. 全データを使った学習  
9. テストデータの予測  
  9-1. テストデータの欠損値の確認  
  9-2. 敬称の抽出  
  9-3. テストデータと訓練データの比較  
  9-4. One Hot エンコーディング  
  9-5. カラムの選択  
  9-6. カラムの並び順序の担保
  9-7. 欠損値補完  
  9-8. 次元圧縮(selectorを使って使用するカラムを選択する)  
10. 予測
11. 提出用ファイルの作成

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

## 1. 準備

In [None]:
import numpy as np
import pandas as pd
import os
import pickle
import time

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# missing value compensation
from sklearn.impute import SimpleImputer

# Feature selection (dimensionality reduction)
from sklearn.feature_selection import RFE

# Train Algorithm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
import lightgbm as lgb

 # select model
from ipywidgets import interact,interactive,fixed,interact_manual
from IPython.display import display
import ipywidgets as widgets

In [None]:
# Show up to 50 columns when a DataFrame is displayed.
pd.options.display.max_columns = 50

## 2. データの設定

In [None]:
# Paths to the training and test CSV files.
train_file = '/kaggle/input/titanic/train.csv'
test_file = '/kaggle/input/titanic/test.csv'

# Output directory and base file name of the submission CSV.
# NOTE(review): the directory created here ('/kaggle/working/submit/') is not
# the one stored in submit_file_dir ('/kaggle/working/') — confirm which one
# is intended.
os.makedirs('/kaggle/working/submit/', exist_ok=True)
submit_file_dir = '/kaggle/working/'
submit_file_name = 'submission.csv'

In [None]:
# Name of the ID column written to the submission file.
ID_column = 'PassengerId'

# Name of the target column (the value to predict) in the submission file.
target_value = 'Survived'

In [None]:
# Categorical columns that are one-hot encoded later on.
ohe_columns = [
    'Pclass',
    'Sex',
    'SibSp',
    'Parch',
    'Embarked',
    'Title_Group',
]

# Column name -> dtype mapping: every categorical column is read as `object`.
my_dtype = dict.fromkeys(ohe_columns, object)
print(my_dtype)

In [None]:
# Columns removed from the features when scoring (test) data is prepared.
score_reject_items = [ID_column, 'Ticket', 'Cabin']
# The training features additionally exclude the target column.
train_reject_items = [*score_reject_items, target_value]

## 3. データセットの読み出し
### 3-1. train data

In [None]:
# Read the training CSV; the columns listed in my_dtype are loaded as objects.
dataset = pd.read_csv(train_file,
                     header=0,    # the first CSV row (row index 0) is the header
                     dtype=my_dtype)

# Remove the ID column, the other rejected columns and the target from the features.
# X is a pandas DataFrame.
X = pd.DataFrame(dataset).drop(columns=train_reject_items, axis=1)

# y is a pandas Series holding the target column.
y = pd.Series(dataset[target_value])

# Report the shapes and the class balance of the target.
print('----------------------------------------------------------------')
print('X shape: (%i,%i)' %X.shape)
print('----------------------------------------------------------------')
print('y shape: (%i,)' %y.shape)
print('----------------------------------------------------------------')
print(y.value_counts())
print('Survived（1：yes、0：no）')
print('----------------------------------------------------------------')
print()
print('dataset:raw data')
display(dataset.head())
print('X:train dataset')
# Last expression of the cell: features joined with the target, first rows.
X.join(y).head()

### 3-2. test data

In [None]:
# Read the test CSV with the same dtype mapping as the training data.
dataset_s = pd.read_csv(test_file,
                          header=0,    # the first CSV row (row index 0) is the header
                          dtype=my_dtype)

# Keep the ID separately and remove the rejected columns from the features.
ID_s = dataset_s.iloc[:,[0]]    # column 0 is PassengerId; keep it as the ID
X_s = dataset_s.drop(score_reject_items, axis=1)    # drop the ID and the other rejected columns

# Report the shapes and the column dtypes.
print('-----------------------------------')
print('Raw Shape: (%i, %i)' %dataset_s.shape)
print('X_s Shape: (%i, %i)' %X_s.shape)
print('-----------------------------------')
print(X_s.dtypes)
# Last expression of the cell: ID joined with the features, first rows.
ID_s.join(X_s).head()

In [None]:
# Number of missing values in each column of the raw training data.
dataset.isnull().sum()

In [None]:
# Data type of each column of the raw training data.
dataset.dtypes

## 4. データ前処理
### 4-1. 敬称の抽出

In [None]:
# Extract the title from Name: the text between the comma and the next period.
X['Title'] = X.Name.str.extract(r',\s*([^\.]*)\s*\.', expand = False)
# Show the distinct titles found in the training data.
print(X.Title.unique())

In [None]:
# Bin each raw title into a coarser 'Title_Group' category.
# These lists are module-level because the test-data preprocessing reuses them.
Mr = ['Mr']
Crew1 = ['Don', 'Rev', 'Capt']
Crew2 = ['Major', 'Col', 'Dr']
Women_Masters = ['Mrs', 'Miss', 'Master']
# 'Mlle' (Mademoiselle) was previously misspelled as 'Mile'. The resulting
# groups are unchanged because unmatched titles already default to
# 'Affluence', but the list now names the title it is meant to cover.
Affluence = ['Mme', 'Ms', 'Lady', 'Sir', 'Mlle', 'the Countess', 'Jonkheer']

# Start with 'Mr' vs. the default 'Affluence', then overwrite group by group.
# Any title that is in none of the lists keeps the default 'Affluence'.
X['Title_Group'] = np.where(X['Title'] == Mr[0], 'Mr', 'Affluence')
X['Title_Group'] = np.where(X['Title'].isin(Crew1),  "Crew1", X['Title_Group'])
X['Title_Group'] = np.where(X['Title'].isin(Crew2),  "Crew2", X['Title_Group'])
X['Title_Group'] = np.where(X['Title'].isin(Women_Masters),  "Women_Masters", X['Title_Group'])
X['Title_Group'] = np.where(X['Title'].isin(Affluence),  "Affluence", X['Title_Group'])
# Name and the raw Title are no longer needed once the group is assigned.
X = X.drop(columns=['Name', 'Title'])

In [None]:
# Mean of the 'Survived' column for each Title_Group.
Titleplot = X.join(y)['Survived'].groupby(X['Title_Group']).mean()
Titleplot

### 4-2. 欠損値の補完(年齢データを予測するモデルを構築し、予測結果にて欠損値を補完する)

In [None]:
# Columns used to build the age-prediction model (Age itself is the target).
cols_age_prediction = ['Age', 'Pclass', 'Sex', 'Parch', 'SibSp']

# One-hot encode the selected columns.
age_X_df = X[cols_age_prediction]
age_X_df = pd.get_dummies(age_X_df)

# Divide the rows into two groups: known age (used for training) and
# unknown age (rows whose age will be predicted).
known_age_df = age_X_df[age_X_df.Age.notnull()]
unknown_age_df = age_X_df[age_X_df.Age.isnull()]

In [None]:
# Features (everything except Age) and target (Age) for the rows with a known age.
Age_X, Age_y = known_age_df.drop(columns=['Age'],axis=1) , known_age_df['Age']
# Holdout (split the data).
# NOTE(review): Age_X_test / Age_y_test are not used in this cell, so the
# regressor is fitted on 70% of the known ages and is not evaluated here —
# confirm this is intended.
Age_X_train, Age_X_test, Age_y_train, Age_y_test=train_test_split(Age_X,
                                               Age_y,
                                               test_size=0.3,
                                               random_state=1)
# Training
Age_lgb = lgb.LGBMRegressor()
Age_lgb.fit(Age_X_train, Age_y_train)

# Predict the age of the rows whose age is missing.
Age_y_pred = Age_lgb.predict(unknown_age_df.drop(columns = ['Age'], axis=1))

# Fill the missing ages with the predicted values.
X.loc[(X.Age.isnull()), 'Age'] = Age_y_pred

In [None]:
# Confirm the number of missing values per column after the age imputation.
X.isnull().sum()

### 4-3. One Hot エンコーディング
ohe_columnsにて定義したColumnに対してOne Hotエンコーディングを実施する

In [None]:
def one_hot_encoding(data, ohe_columns):
    """One-hot encode the given columns of a DataFrame.

    Args:
        data: DataFrame to encode.
        ohe_columns: names of the columns to expand into dummy columns.

    Returns:
        The encoded DataFrame. Its shape is printed and its first rows are
        displayed as a side effect.
    """
    # dummy_na=True also creates a dummy column for missing values.
    encoded = pd.get_dummies(data, columns=ohe_columns, dummy_na=True)
    n_rows, n_cols = encoded.shape
    print('X_ohe shape:(%i,%i)' % (n_rows, n_cols))
    display(encoded.head())
    return encoded


X_ohe = one_hot_encoding(X, ohe_columns)

In [None]:
# Keep the one-hot encoded column names as an array for later reuse.
X_ohe_columns = X_ohe.columns.values
X_ohe_columns

### 4-4. Simple Imputerを用いた欠損値補完

In [None]:
def imputing_nan(X_ohe_for_training, X_ohe_apply_to):
    """Fit an imputer on one DataFrame and use it to fill NaNs in another.

    Args:
        X_ohe_for_training: DataFrame the imputer is fitted on; its column
            names are used for the result.
        X_ohe_apply_to: DataFrame whose missing values are filled.

    Returns:
        A tuple (imputed DataFrame, column-name array, fitted imputer). The
        first rows of the imputed DataFrame are displayed as a side effect.
    """
    column_names = X_ohe_for_training.columns.values

    # SimpleImputer with its default settings (mean imputation).
    imputer = SimpleImputer()
    imputer.fit(X_ohe_for_training)

    filled_values = imputer.transform(X_ohe_apply_to)
    X_ohe = pd.DataFrame(filled_values, columns=column_names)
    display(X_ohe.head())
    return X_ohe, column_names, imputer


X_ohe, X_ohe_columns, imp = imputing_nan(X_ohe, X_ohe)

In [None]:
# Shape of the feature matrix after imputation.
X_ohe.shape

### 4-5. RFEを用いた次元圧縮

In [None]:
def dimension_compression(X_ohe, y, n_features_to_select=35, step=.05):
    """Select features with recursive feature elimination (RFE).

    A RandomForestClassifier (100 trees, random_state=1) ranks the features
    and RFE keeps `n_features_to_select` of them.

    Args:
        X_ohe: DataFrame of (imputed) one-hot encoded features.
        y: target values aligned with the rows of X_ohe.
        n_features_to_select: number of features kept after the elimination.
            Defaults to 35, the value that was previously hard-coded.
        step: passed through to RFE as its `step` argument. Defaults to .05,
            the value that was previously hard-coded.

    Returns:
        A tuple (X_fin, selector): the DataFrame restricted to the selected
        columns and the fitted RFE object, whose `support_` mask can be
        reused to select the same columns from other data.
    """
    start = time.time()
    selector = RFE(RandomForestClassifier(n_estimators=100, random_state=1),
                   n_features_to_select=n_features_to_select,  # dimensions kept
                   step=step)
    selector.fit(X_ohe, y)
    X_ohe_columns = X_ohe.columns.values

    # selector.support_ is a boolean mask over the input columns.
    X_fin = X_ohe.loc[:, X_ohe_columns[selector.support_]]

    # The labels of these two prints were swapped before: the elapsed time
    # was printed bare and 'Duration Time:' preceded the shape.
    duration = time.time() - start
    print('Duration Time:', duration, 's')
    print('X_fin shape:(%i,%i)' % X_fin.shape)
    display(X_fin.head())
    return X_fin, selector


X_fin, selector = dimension_compression(X_ohe, y)

In [None]:
# Report the shape of the selected features and the class balance of the target.
print('-----------------------------------')
print('X_fin shape: (%i,%i)' %X_fin.shape)
print('-----------------------------------')
print(y.value_counts())
print('--------------------------------------------------')
print('Survived（1：yes、0：no）:')
print('--------------------------------------------------')
print('y shape: (%i,)' %y.shape)
print('--------------------------------------------------')

## 5. Pipelineの定義
### 5-1. Pipeline  
使用するアルゴリズムをPipelineとして並べる

In [None]:
# Candidate classifiers keyed by a short model name.
_estimators = {
    'knn': KNeighborsClassifier(),
    'logistic': LogisticRegression(random_state=1),
    'rsvc': SVC(C=1.0, kernel='rbf', class_weight='balanced', random_state=1),
    'lsvc': LinearSVC(C=1.0, class_weight='balanced', random_state=1),
    'tree': DecisionTreeClassifier(random_state=1),
    'rf': RandomForestClassifier(random_state=1),
    'gb': GradientBoostingClassifier(random_state=1),
    'mlp': MLPClassifier(hidden_layer_sizes=(3, 3), max_iter=1000, random_state=1),
    'xgb': xgb.XGBClassifier(verbosity=0),
    'lgbm': lgb.LGBMClassifier(),
}

# Dictionary of pipelines: model name -> Pipeline. Every pipeline has the same
# two steps, a StandardScaler named 'scl' followed by the estimator named 'est'.
pipelines = {
    model_name: Pipeline([('scl', StandardScaler()), ('est', estimator)])
    for model_name, estimator in _estimators.items()
}

### 5-2. GridSearchに使用するパラメータの設定

In [None]:
# Grid-search parameter grids, keyed by the same model names as `pipelines`.
# The 'est__' prefix addresses the estimator step of each pipeline.
params = {
    'knn' : {'est__n_neighbors':[5,7,10], 'est__weights':['uniform','distance'],},

    'logistic': {'est__C':[1, 100],},

    'rsvc': {'est__C':[1, 100],},

    'lsvc': {'est__C':[1, 100],},

    'tree': {'est__max_depth': list(range(10, 20)),
             'est__criterion': ['gini', 'entropy'],},

    'rf': {'est__n_estimators':[320, 340],
            'est__max_depth': [8, 10,16],
            'est__random_state': [0],},

    # The former 'est__loss': ['deviance'] entry is omitted: it only restated
    # the classifier's default loss, and the name 'deviance' is rejected by
    # recent scikit-learn releases. 'mae' was likewise dropped from
    # 'est__criterion' because recent releases no longer accept it, while
    # 'friedman_mse' is accepted by old and new releases alike.
    'gb': {'est__learning_rate': [0.01, 0.1],
            'est__min_samples_split': np.linspace(0.1, 0.5, 2),
            'est__min_samples_leaf': np.linspace(0.1, 0.5, 2),
            'est__max_depth':[3,5],
            'est__max_features':['log2','sqrt'],
            'est__criterion': ['friedman_mse'],
            'est__subsample':[0.5, 1.0],
            'est__n_estimators':[10],},

    'mlp': {'est__solver': ['lbfgs'],
            'est__max_iter': [10000],
            'est__alpha': 10.0 ** -np.arange(1, 3),
            'est__hidden_layer_sizes':np.arange(10, 12),},

    'xgb': {'est__n_estimators':[100,500,],
            'est__max_depth':[6, 8,10,],
            'est__learning_rate':[0.001, 0.01, 0.1, 1,],
            'est__min_child_weight': [1,6],},

    'lgbm': {'est__max_depth':[20, 50, 60,100],
            'est__learning_rate':[0.001, 0.01, 0.1,1],
            'est__num_leaves':[15, 31,64, 128],
            'est__n_estimators':[100, 500, 700],},
}

In [None]:
# Holdout: keep 30% of the selected-feature data as a test split.
X_train, X_test, y_train, y_test=train_test_split(X_fin,
                                               y,
                                               test_size=0.3,
                                               random_state=1)

## 6. pipelineにて定義したアルゴリズムをGrid Searchとともに実行する

In [None]:
import warnings
# Silence all warnings for the rest of the session (grid search is noisy).
warnings.simplefilter('ignore')

# Metric optimised by the grid search.
evaluation_scoring = 'f1'

# Results collected per model name.
scores = {}        # (model name, 'train' | 'test') -> accuracy
best_params = {}   # model name -> best parameter combination
best_scores = {}   # model name -> best cross-validated score

# Create the directory for the saved models once, before the loop.
os.makedirs('/kaggle/working/models/pipeline_models', exist_ok=True)

# Run a 5-fold grid search for every pipeline.
for pipe_name, pipeline in pipelines.items():
    print(pipe_name)
    print(params[pipe_name])
    start = time.time()
    gs = GridSearchCV(estimator=pipeline,
                      param_grid=params[pipe_name],
                      scoring=evaluation_scoring,
                      cv=5,
                      return_train_score=False)
    # training
    gs.fit(X_train, y_train)

    print('time', time.time()-start)
    # Accuracy of the refitted best model on the train and holdout splits.
    scores[(pipe_name,'train')] = accuracy_score(y_train, gs.predict(X_train))
    scores[(pipe_name,'test')] = accuracy_score(y_test, gs.predict(X_test))
    best_params[pipe_name] = gs.best_params_
    best_scores[pipe_name] = gs.best_score_

    # Save the fitted best estimator. GridSearchCV fits clones of `pipeline`,
    # so pickling `pipeline` itself would store an unfitted model. The `with`
    # block guarantees the file is closed.
    file_name = '/kaggle/working/models/pipeline_models/'+pipe_name+str(time.time())+'.pkl'
    with open(file_name, 'wb') as model_file:
        pickle.dump(gs.best_estimator_, model_file)

print('---accuracy---')
pd.Series(scores).unstack()

In [None]:
# Dump the raw accuracy scores, best CV scores and best parameters per model.
print(scores)
print()
print(best_scores)
print()
print(best_params)

## 7. 最終的に使用するアルゴリズムの選択

In [None]:
def get_answer(x):
    """Return the argument unchanged."""
    return x

Select the algorithm with the radio buttons

In [None]:
# Radio-button widget offering one option per pipeline name.
model_selection = get_answer(widgets.RadioButtons(options=pipelines.keys()))
display(model_selection)

In [None]:
# Read the currently selected radio-button value as the model name.
selected_model_name = model_selection.value
print(selected_model_name)

In [None]:
# Best grid-search parameters found for the selected model.
best_params[selected_model_name]

In [None]:
# Turn every value of best_params[selected_model_name] into a list.
def params_parser(best_params, selected_model_name):
    """Wrap each best-parameter value of the chosen model in a one-element list.

    Args:
        best_params: mapping of model name -> {parameter name: value}.
        selected_model_name: key of the model whose parameters are wrapped.

    Returns:
        A new dict {parameter name: [value]}, in the original key order.
    """
    wrapped = {}
    for param_name, param_value in best_params[selected_model_name].items():
        wrapped[param_name] = [param_value]
    return wrapped

In [None]:
# Best parameters of the selected model, each wrapped in a single-element list.
best_param_parsed = params_parser(best_params, selected_model_name)
print(best_param_parsed)

## 8. 全データを使った学習

In [None]:
def train_on_selected_model(pipe_name, pipelines, params, evaluation_scoring, tag):
    """Fit the selected pipeline on the full training data and save it.

    Reads the module-level X_fin, y, X_train, X_test, y_train and y_test, and
    writes into the module-level `scores` dict.

    Args:
        pipe_name: key of the pipeline to train.
        pipelines: mapping of model name -> Pipeline.
        params: parameter grid passed to GridSearchCV.
        evaluation_scoring: scoring metric passed to GridSearchCV.
        tag: text inserted into the name of the saved pickle file.

    Returns:
        The fitted GridSearchCV object.
    """
    start = time.time()
    gs = GridSearchCV(estimator=pipelines[pipe_name],
                      param_grid=params,
                      scoring=evaluation_scoring,
                      cv=5,
                      return_train_score=False)
    # Fit on all of the selected-feature training data.
    model = gs.fit(X_fin, y)

    print('time', time.time()-start)
    # NOTE(review): X_train and X_test are both drawn from X_fin, which the
    # model was just fitted on, so the 'test' entry is not a true holdout
    # score. It also overwrites the entry stored by the grid-search loop.
    scores[(pipe_name,'train')] = accuracy_score(y_train, gs.predict(X_train))
    scores[(pipe_name,'test')] = accuracy_score(y_test, gs.predict(X_test))

    # Create directories
    os.makedirs('/kaggle/working/models/', exist_ok=True)
    # Save the model; the `with` block guarantees the file is closed.
    file_name = '/kaggle/working/models/' + pipe_name + '_' + tag + str(time.time())+'.pkl'
    with open(file_name, 'wb') as model_file:
        pickle.dump(model, model_file)

    return model

In [None]:
# Refit the chosen model with its best parameters and save it with the tag 'selected'.
selected_model = train_on_selected_model(selected_model_name, pipelines, best_param_parsed, evaluation_scoring, 'selected')

In [None]:
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np

# Hard class predictions on the training data.
y_pred_m = selected_model.predict(X_fin)

# A ROC curve needs continuous scores: hard 0/1 predictions give only a single
# operating point. Prefer class-1 probabilities, fall back to the decision
# function, and only as a last resort use the hard predictions.
if hasattr(selected_model, 'predict_proba'):
    y_score_m = selected_model.predict_proba(X_fin)[:, 1]
elif hasattr(selected_model, 'decision_function'):
    y_score_m = selected_model.decision_function(X_fin)
else:
    y_score_m = y_pred_m

# False positive rate, true positive rate and the thresholds they belong to.
# NOTE(review): this is computed on the data the model was fitted on.
fpr, tpr, thresholds = metrics.roc_curve(y, y_score_m)

# AUC
auc = metrics.auc(fpr, tpr)

# Plot ROC curve
plt.plot(fpr, tpr, label='ROC curve (area = %.2f)'%auc)
plt.legend()
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid(True)

In [None]:
from sklearn.metrics import f1_score
# F1 score of the selected model's predictions against y.
print(f1_score(y, y_pred_m))

## 9. テストデータの予測
### 9-1. テストデータの欠損値の確認

In [None]:
# Number of missing values in each column of the test features.
X_s.isnull().sum()

### 9-2. 敬称の抽出

In [None]:
# Extract the title from Name, exactly as for the training data.
X_s['Title'] = X_s['Name'].str.extract(r',\s*([^\.]*)\s*\.', expand = False)
title_s = X_s['Title']

# Start with 'Mr' vs. the default 'Affluence', then overwrite group by group
# in the same order as for the training data.
X_s['Title_Group'] = np.where(title_s == Mr[0], 'Mr', 'Affluence')
for group_label, group_titles in (
    ("Crew1", Crew1),
    ("Crew2", Crew2),
    ("Women_Masters", Women_Masters),
    ("Affluence", Affluence),
):
    X_s['Title_Group'] = np.where(title_s.isin(group_titles), group_label, X_s['Title_Group'])

# Name and the raw Title are no longer needed once the group is assigned.
X_s = X_s.drop(columns=['Name', 'Title'])

In [None]:
# First rows of the test features after the title grouping.
X_s.head()

### 9-3. テストデータと訓練データの比較

In [None]:
# One-hot encode the age-prediction columns of the test data.
age_df_s = X_s[cols_age_prediction]
age_df_s = pd.get_dummies(age_df_s)


# Rows of the test data whose age is missing.
unknown_age_s = age_df_s[age_df_s.Age.isnull()].values
unknown_age_s_df = age_df_s[age_df_s.Age.isnull()]

# Compare the encoded columns of the training and test data.
cols_train_Age= set(unknown_age_df.columns.values)
cols_test_Age = set(unknown_age_s_df.columns.values)

diff1 = cols_train_Age - cols_test_Age
print('Columns of existing in training data: %s' %diff1)

diff2 = cols_test_Age - cols_train_Age
print('Columns of existing in test data for submission: %s' %diff2)

# Align the test features with the features the age model was trained on.
# reindex drops the test-only columns (diff2), adds the training-only columns
# (diff1) filled with 0, and restores the training column order. Dropping
# diff2 alone would leave a feature mismatch whenever diff1 is not empty.
age_feature_cols = unknown_age_df.drop(columns=['Age']).columns
age_features_s = unknown_age_s_df.reindex(columns=age_feature_cols, fill_value=0)

# Predict the age for the test rows only when some ages are actually missing.
if len(age_features_s) > 0:
    Age_s_y_pred = Age_lgb.predict(age_features_s)
    X_s.loc[(X_s.Age.isnull()), 'Age'] = Age_s_y_pred

In [None]:
# Number of missing values per test column after the age imputation.
X_s.isnull().sum()

### 9-4. One-hot-encoding Process

In [None]:
# One-hot encode the test features with the same settings as the training data
# (dummy_na=True also creates a dummy column for missing values).
X_ohe_s = pd.get_dummies(X_s,
                         dummy_na=True,
                         columns=ohe_columns)
print('X_ohe_s shape:(%i,%i)' % X_ohe_s.shape)
X_ohe_s.head(3)

### 9-5. カラムの選択

Data cleaning after one-hot-encoding

In [None]:
# Column sets of the encoded training data and the encoded test data.
cols_model= set(X_ohe.columns.values)
cols_score = set(X_ohe_s.columns.values)

# Columns present only in the training data.
diff1 = cols_model - cols_score
print('Columns of existing in training data:: %s' %diff1)

# Columns present only in the test data.
diff2 = cols_score - cols_model
print('Columns of existing in test data for submission: %s' %diff2)

In [None]:
# Empty float DataFrame whose columns are the training one-hot column names.
dataset_cols_m = pd.DataFrame(None,
                         columns=X_ohe_columns,
                         dtype=float)
display(dataset_cols_m)

In [None]:
# Concatenate the empty training-column frame with the encoded test data so
# that the result carries the columns of both frames.
X_ohe_s = pd.concat([dataset_cols_m, X_ohe_s])
print(X_ohe_s.shape)
display(X_ohe_s.head(3))

In [None]:
# Column sets of the training data and of the test data after the concat.
set_Xm = set(X_ohe.columns.values)
set_Xs = set(X_ohe_s.columns.values)
# Columns that exist only in the test data are removed.
print(set_Xs-set_Xm)
X_ohe_s = X_ohe_s.drop(list(set_Xs-set_Xm),axis=1)

print(X_ohe_s.shape)
display(X_ohe_s.head(3))

In [None]:
# Training columns that were absent from the raw test encoding.
# cols_model / cols_score were captured before the test frame was concatenated
# with the training-column frame. set_Xs was captured after it and already
# contains every training column, so set_Xm - set_Xs is always empty and the
# fill below would never run if it were based on those sets.
missing_in_test = sorted(cols_model - cols_score)
print(set(missing_in_test))
# Those columns are dummy columns for categories the test data never takes,
# so they must be 0 rather than NaN.
if missing_in_test:
    X_ohe_s[missing_in_test] = X_ohe_s[missing_in_test].fillna(0)
X_ohe_s.head(3)

### 9-6. カラムの並び順序の担保  

In [None]:
# Put the test columns in the same order as the training columns.
X_ohe_s = X_ohe_s.reindex(X_ohe_columns, axis=1)
X_ohe_s.head(3)
print(X_ohe_s.shape)

### 9-7. 欠損値補完  
訓練データを使って生成したimpを用いて欠損値を補完する。テストデータで計算するわけではない。

In [None]:
# Number of missing values per encoded test column before imputation.
X_ohe_s.isnull().sum()

In [None]:
# Total number of missing values before imputation.
print('欠損個数（数値変数の欠損補完前）',X_ohe_s.isnull().sum().sum())    # sum per column, then sum over the columns

# (Important) Transform with the imputer `imp` that was fitted on the modelling data.
# Fitting a new imputer here would recompute the statistics from the test data,
# so the values learned from the modelling data are reused instead.
X_ohe_s = pd.DataFrame(imp.transform(X_ohe_s),columns=X_ohe_columns)

# Total number of missing values after imputation.
print('欠損個数（数値変数の欠損補完後）',X_ohe_s.isnull().sum().sum())

### 9-8. 次元圧縮(selectorを使って使用するカラムを選択する)

In [None]:
# Keep only the columns chosen by the RFE selector fitted on the training data.
# A trailing `X_fin_s = X_ohe_s` used to overwrite this selection with the full
# column set; it is removed so the test features match the training features.
X_fin_s = X_ohe_s.loc[:, X_ohe_columns[selector.support_]]
print(X_fin_s.shape)
X_fin_s.head(3)

In [None]:
# Report the shape of the final test feature matrix.
print('-----------------------------------')
print('X_fin_s shape: (%i,%i)' %X_fin_s.shape)
print('-----------------------------------')

## 10. 予測

In [None]:
# Predict the target for the test data with the selected model.
y_pred = selected_model.predict(X_fin_s)
print(len(y_pred))

## 11. 提出用ファイルの作成

In [None]:
# Pair each test ID with its predicted target value.
result_for_submit = pd.DataFrame({ID_column:dataset_s[ID_column], target_value:y_pred})

# Write the submission CSV as '<model name>_<submit_file_name>' in submit_file_dir.
os.makedirs(submit_file_dir, exist_ok=True)
result_for_submit.to_csv(submit_file_dir+ selected_model_name + '_' + submit_file_name , index=False)