# 【問題1】クロスバリデーション
事前学習期間では検証データをはじめに分割しておき、それに対して指標値を計算することで検証を行っていました。（ホールドアウト法）しかし、分割の仕方により精度は変化します。実践的には クロスバリデーション（交差検証） を行います。分割を複数回行い、それぞれに対して学習と検証を行う方法です。複数回の分割のためにscikit-learnにはKFoldクラスが用意されています。<br>
事前学習期間の課題で作成したベースラインモデルに対してKFoldクラスによるクロスバリデーションを行うコードを作成し実行してください。<br>

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import sklearn.datasets
import sklearn.ensemble
import sklearn.model_selection
import optuna

from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import auc
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
#データ読み込み
df = pd.read_csv("/Users/takahashihideyuki/dive/データ格納/Week3/application_train.csv")
df.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
#学習から検証までを関数化 　　KFoldを使用
def learning_to_verification(X, y, model):
    X = np.array(X)
    y = np.array(y)
    
    # クロスバリデーションで分割する。
    auc_list = []
    cnt = 1
    kf = KFold(n_splits=5, random_state=0, shuffle=True)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        #標準化
        scaler = StandardScaler()
        scaler.fit(X_train) #訓練用のデータでfit
        X_train_std = scaler.transform(X_train) #訓練用データをtransform
        X_test_std = scaler.transform(X_test) #検証用データをtransform

        #学習〜予測
        clf = model
        clf.fit(X_train_std, y_train) # 学習
        pred = clf.predict(X_test_std) #クラスの予測
        pred_proba = clf.predict_proba(X_test_std) #クラスの予測確率
        pred_proba_posi = pred_proba[:, 0] #クラスの予測確率（ポジティブクラスを抜粋）

        # 評価
        # ROC曲線の計算
        fpr, tpr, thresholds = metrics.roc_curve(y_test, pred_proba_posi, pos_label=0)

        #AUC（Area Under the Curve）を計算
        auc_value = auc(fpr, tpr)
        print("{}回目のAUC値は{:.2f}です。".format(cnt, auc_value))
        auc_list.append(auc_value)
        cnt += 1 
        
    auc_mean = sum(auc_list) / len(auc_list)
    print("AUC平均値は{:.2f}です。".format(auc_mean))

In [4]:
#ベースラインモデル（特徴量は前回の課題で選定したもの、重要度0.01以上の上位26変数を抽出）
base_columns = np.array(['EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH', 'DAYS_ID_PUBLISH',
       'DAYS_REGISTRATION', 'AMT_ANNUITY', 'DAYS_LAST_PHONE_CHANGE',
       'AMT_CREDIT', 'DAYS_EMPLOYED', 'AMT_INCOME_TOTAL',
       'REGION_POPULATION_RELATIVE', 'EXT_SOURCE_1',
       'HOUR_APPR_PROCESS_START', 'AMT_REQ_CREDIT_BUREAU_YEAR',
       'OBS_30_CNT_SOCIAL_CIRCLE', 'YEARS_BEGINEXPLUATATION_AVG',
       'APARTMENTS_AVG', 'LANDAREA_AVG', 'OWN_CAR_AGE',
       'BASEMENTAREA_AVG', 'NONLIVINGAREA_AVG', 'YEARS_BUILD_AVG',
       'COMMONAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'ENTRANCES_AVG',
       'CNT_CHILDREN'])

X = df[base_columns].fillna(df[base_columns].median()) #欠損値を中央値で補完
y = df.loc[:, 'TARGET']

learning_to_verification(X, y, model=SGDClassifier(loss="log")) #学習から検証までを関数で実行

1回目のAUC値は0.71です。
2回目のAUC値は0.72です。
3回目のAUC値は0.72です。
4回目のAUC値は0.71です。
5回目のAUC値は0.72です。
AUC平均値は0.72です。


# 【問題2】グリッドサーチ
これまで分類器のパラメータには触れず、デフォルトの設定を使用していました。パラメータの詳細は今後のSprintで学んでいくことになります。<br>
機械学習の前提として、パラメータは状況に応じて最適なものを選ぶ必要があります。<br>
最適なパラメータを探していくことを パラメータチューニング と呼びます。<br>
パラメータチューニングをある程度自動化する単純な方法としては グリッドサーチ があります。<br>
scikit-learnのGridSearchCVを使い、グリッドサーチを行うコードを作成してください。<br>
そして、ベースラインモデルに対して何らかしらのパラメータチューニングを行なってください。<br>
どのパラメータをチューニングするかは、使用した手法の公式ドキュメントを参考にしてください。<br>
GridSearchCVクラスには引数としてモデル、探索範囲、さらにクロスバリデーションを何分割で行うかを与えます。<br>
クロスバリデーションの機能も含まれているため、これを使用する場合はKFoldクラスを利用する必要はありません。

In [5]:
#学習から検証までを関数化　　　GridSearchCVを使用
def learn_to_verifi_GS(X, y, model, parameters):
    # 訓練データと検証データの分割。訓練データ75%、検証データ25%として分割する。
    X = np.array(X)
    y = np.array(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)
    
    #標準化
    scaler = StandardScaler()
    scaler.fit(X_train) #訓練用のデータでfit
    X_train_std = scaler.transform(X_train) #訓練用データをtransform
    X_test_std = scaler.transform(X_test) #検証用データをtransform
    
    #学習〜予測
    clf = GridSearchCV(model, parameters, cv=5)    
    clf.fit(X_train_std, y_train) # 学習
    display(pd.DataFrame(clf.cv_results_))
    print(clf.best_estimator_)
    print(clf.best_params_)
    pred = clf.predict(X_test_std) #クラスの予測
    pred_proba = clf.predict_proba(X_test_std) #クラスの予測確率
    pred_proba_posi = pred_proba[:, 0] #クラスの予測確率（ポジティブクラスを抜粋）

    # ROC曲線の計算
    fpr, tpr, thresholds = metrics.roc_curve(y_test, pred_proba_posi, pos_label=0)

    #AUC（Area Under the Curve）を計算
    auc_value = auc(fpr, tpr)
    print("AUC平均値は{:.2f}です。".format(auc_value))

In [6]:
learn_to_verifi_GS(X, y , model = SGDClassifier(loss="log"), 
                                  parameters = {'alpha':[0.0001, 0.001, 0.01], "validation_fraction":[0.1, 0.5, 0.9], 
                                                           "early_stopping":[True, False]})

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_early_stopping,param_validation_fraction,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.553661,0.037973,0.004352,0.00044,0.0001,True,0.1,"{'alpha': 0.0001, 'early_stopping': True, 'val...",0.919483,0.919006,0.918876,0.919481,0.919481,0.919266,0.000268,12
1,0.520926,0.058091,0.004226,0.000225,0.0001,True,0.5,"{'alpha': 0.0001, 'early_stopping': True, 'val...",0.919158,0.917857,0.917315,0.918723,0.919243,0.918459,0.000754,17
2,0.559192,0.122802,0.005943,0.001065,0.0001,True,0.9,"{'alpha': 0.0001, 'early_stopping': True, 'val...",0.897435,0.9104,0.9117,0.915015,0.914647,0.909839,0.006443,18
3,0.648053,0.114152,0.003405,0.000133,0.0001,False,0.1,"{'alpha': 0.0001, 'early_stopping': False, 'va...",0.919093,0.919418,0.919418,0.918397,0.919503,0.919166,0.000409,15
4,0.635268,0.076781,0.00377,0.000458,0.0001,False,0.5,"{'alpha': 0.0001, 'early_stopping': False, 'va...",0.918876,0.91905,0.918985,0.919481,0.919568,0.919192,0.000279,14
5,0.783893,0.162229,0.003801,0.000753,0.0001,False,0.9,"{'alpha': 0.0001, 'early_stopping': False, 'va...",0.91957,0.919266,0.919461,0.918289,0.919677,0.919253,0.000501,13
6,0.65375,0.05839,0.004822,0.000382,0.001,True,0.1,"{'alpha': 0.001, 'early_stopping': True, 'vali...",0.919657,0.919353,0.919288,0.919243,0.919611,0.91943,0.00017,10
7,0.518256,0.01027,0.004615,0.000111,0.001,True,0.5,"{'alpha': 0.001, 'early_stopping': True, 'vali...",0.919353,0.919353,0.919115,0.91907,0.919503,0.919279,0.000163,11
8,0.42742,0.021352,0.004952,0.000123,0.001,True,0.9,"{'alpha': 0.001, 'early_stopping': True, 'vali...",0.919418,0.919093,0.917662,0.919481,0.919438,0.919019,0.000692,16
9,0.419083,0.01927,0.003424,0.000174,0.001,False,0.1,"{'alpha': 0.001, 'early_stopping': False, 'val...",0.919635,0.919396,0.919353,0.919265,0.919655,0.919461,0.000156,8


SGDClassifier(alpha=0.01, early_stopping=True, loss='log')
{'alpha': 0.01, 'early_stopping': True, 'validation_fraction': 0.1}
AUC平均値は0.72です。


##### #覚書：グリッドサーチでscoringを指定しない場合、split(n)_test_scoreにはestimatorのscoreメソッドが使われる。(今回の場合はaccuracy)

# 【問題3】Kaggle Notebooksからの調査
KaggleのNotebooksから様々なアイデアを見つけ出して、列挙してください。

- ベイズ最適化<br>
  https://www.kaggle.com/willkoehrsen/automated-model-tuning<br>
  自動ハイパーパラメーター調整：勾配降下法、`ベイジアン最適化`、または進化アルゴリズムなどの方法を使用して、<br>
  最適なハイパーパラメーターのガイド付き検索を実行します。<br>
  https://towardsdatascience.com/an-introductory-example-of-bayesian-optimization-in-python-with-hyperopt-aae40fff4ff0<br>
  最近では、機械学習モデルの`ベイズハイパーパラメーター最適化`が、手動、ランダム、またはグリッド検索よりも効率的であることを示唆しています。<br>
　　- テストセット全体のパフォーマンスが向上<br>
　　- 最適化に必要な時間の短縮<br>

  optuna  https://optuna.org/
  

- LightGBM<br>
　　「Kaggler」の上位6割以上が LightGBM を用いている。<br>
  https://rightcode.co.jp/blog/information-technology/lightgbm-useful-for-kaggler<br>
  Optuna の拡張機能 LightGBM Tuner によるハイパーパラメータ自動最適化<br>
  https://tech.preferred.jp/ja/blog/hyperparameter-tuning-with-optuna-integration-lightgbm-tuner/

# 【問題4】高い汎化性能のモデル作成
問題3で見つけたアイデアと、独自のアイデアを組み合わせ高い汎化性能のモデル作りを進めてください。<br>
その過程として、何を行うことで、クロスバリデーションの結果がどの程度変化したかを表にまとめてください。

### ロジスティック回帰（ベイズ最適化）

In [7]:
def objective(trial):
    X = df[base_columns].fillna(df[base_columns].median()) #欠損値を中央値で補完
    y = df.loc[:, 'TARGET']
    X = np.array(X)
    y = np.array(y)
    
    # クロスバリデーションで分割する。
    auc_list = []
    cnt = 1
    kf = KFold(n_splits=5, random_state=0, shuffle=True)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        #標準化
        scaler = StandardScaler()
        scaler.fit(X_train) #訓練用のデータでfit
        X_train_std = scaler.transform(X_train) #訓練用データをtransform
        X_test_std = scaler.transform(X_test) #検証用データをtransform

        #学習〜予測
        alpha = trial.suggest_categorical('alpha', [0.0001, 0.001, 0.01])
        early_stopping = trial.suggest_categorical("early_stopping", [True, False])
        validation_fraction = trial.suggest_categorical("validation_fraction", [0.1, 0.5, 0.9])
        clf = SGDClassifier(loss="log", alpha=alpha, early_stopping=early_stopping, 
                                         validation_fraction=validation_fraction)
        clf.fit(X_train_std, y_train) # 学習
        pred = clf.predict(X_test_std) #クラスの予測
        pred_proba = clf.predict_proba(X_test_std) #クラスの予測確率
        pred_proba_posi = pred_proba[:, 0] #クラスの予測確率（ポジティブクラスを抜粋）

        # 評価
        # ROC曲線の計算
        fpr, tpr, thresholds = metrics.roc_curve(y_test, pred_proba_posi, pos_label=0)

        #AUC（Area Under the Curve）を計算
        auc_value = auc(fpr, tpr)
#         print("{}回目のAUC値は{:.2f}です。".format(cnt, auc_value))
        auc_list.append(auc_value)
        cnt += 1 
        
    auc_mean = sum(auc_list) / len(auc_list)
    return auc_mean
#     print("AUC平均値は{:.2f}です。".format(auc_mean))    

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

trial = study.best_trial

print('AUC平均値: {:.2f}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

[I 2020-08-15 02:49:31,494] Trial 0 finished with value: 0.7105943671667247 and parameters: {'alpha': 0.0001, 'early_stopping': True, 'validation_fraction': 0.1}. Best is trial 0 with value: 0.7105943671667247.
[I 2020-08-15 02:49:37,586] Trial 1 finished with value: 0.71106722065669 and parameters: {'alpha': 0.0001, 'early_stopping': True, 'validation_fraction': 0.1}. Best is trial 1 with value: 0.71106722065669.
[I 2020-08-15 02:49:41,690] Trial 2 finished with value: 0.7217064678063714 and parameters: {'alpha': 0.001, 'early_stopping': False, 'validation_fraction': 0.1}. Best is trial 2 with value: 0.7217064678063714.
[I 2020-08-15 02:49:46,479] Trial 3 finished with value: 0.7213227958638029 and parameters: {'alpha': 0.01, 'early_stopping': True, 'validation_fraction': 0.9}. Best is trial 2 with value: 0.7217064678063714.
[I 2020-08-15 02:49:50,682] Trial 4 finished with value: 0.7228416556807715 and parameters: {'alpha': 0.01, 'early_stopping': False, 'validation_fraction': 0.9}. 

[I 2020-08-15 02:52:23,830] Trial 38 finished with value: 0.7228457165085392 and parameters: {'alpha': 0.01, 'early_stopping': False, 'validation_fraction': 0.1}. Best is trial 19 with value: 0.7229370407121546.
[I 2020-08-15 02:52:30,208] Trial 39 finished with value: 0.7225769513475765 and parameters: {'alpha': 0.001, 'early_stopping': True, 'validation_fraction': 0.1}. Best is trial 19 with value: 0.7229370407121546.
[I 2020-08-15 02:52:34,475] Trial 40 finished with value: 0.7228681621508125 and parameters: {'alpha': 0.01, 'early_stopping': False, 'validation_fraction': 0.1}. Best is trial 19 with value: 0.7229370407121546.
[I 2020-08-15 02:52:38,670] Trial 41 finished with value: 0.7228453553699535 and parameters: {'alpha': 0.01, 'early_stopping': False, 'validation_fraction': 0.1}. Best is trial 19 with value: 0.7229370407121546.
[I 2020-08-15 02:52:42,860] Trial 42 finished with value: 0.7229499031576841 and parameters: {'alpha': 0.01, 'early_stopping': False, 'validation_fracti

[I 2020-08-15 02:55:08,210] Trial 76 finished with value: 0.7228562123609885 and parameters: {'alpha': 0.01, 'early_stopping': False, 'validation_fraction': 0.1}. Best is trial 62 with value: 0.7229626407066112.
[I 2020-08-15 02:55:12,240] Trial 77 finished with value: 0.7228667700385529 and parameters: {'alpha': 0.01, 'early_stopping': False, 'validation_fraction': 0.5}. Best is trial 62 with value: 0.7229626407066112.
[I 2020-08-15 02:55:18,232] Trial 78 finished with value: 0.7227685675432525 and parameters: {'alpha': 0.01, 'early_stopping': True, 'validation_fraction': 0.1}. Best is trial 62 with value: 0.7229626407066112.
[I 2020-08-15 02:55:22,306] Trial 79 finished with value: 0.7228596267921559 and parameters: {'alpha': 0.01, 'early_stopping': False, 'validation_fraction': 0.5}. Best is trial 62 with value: 0.7229626407066112.
[I 2020-08-15 02:55:26,419] Trial 80 finished with value: 0.7228311371905158 and parameters: {'alpha': 0.01, 'early_stopping': False, 'validation_fractio

AUC平均値: 0.72
Best hyperparameters: {'alpha': 0.01, 'early_stopping': False, 'validation_fraction': 0.1}


### LightGBM（グリッドサーチ）

In [8]:
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [9]:
learn_to_verifi_GS(X, y , model = lgb.LGBMClassifier(), 
                                  parameters = {'learning_rate':[0.001, 0.01, 0.1], "reg_alpha":[0.0, 0.1, 1.0], 
                                                           "reg_lambda":[0.0, 0.1, 1.0]})

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_reg_alpha,param_reg_lambda,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,2.379331,0.038106,0.102046,0.003422,0.001,0.0,0.0,"{'learning_rate': 0.001, 'reg_alpha': 0.0, 're...",0.919548,0.919548,0.919527,0.919546,0.919546,0.919543,8e-06,10
1,2.506601,0.523708,0.10301,0.003779,0.001,0.0,0.1,"{'learning_rate': 0.001, 'reg_alpha': 0.0, 're...",0.919548,0.919548,0.919527,0.919546,0.919546,0.919543,8e-06,10
2,2.876561,0.256737,0.115882,0.005402,0.001,0.0,1.0,"{'learning_rate': 0.001, 'reg_alpha': 0.0, 're...",0.919548,0.919548,0.919527,0.919546,0.919546,0.919543,8e-06,10
3,2.298124,0.025415,0.102583,0.002102,0.001,0.1,0.0,"{'learning_rate': 0.001, 'reg_alpha': 0.1, 're...",0.919548,0.919548,0.919527,0.919546,0.919546,0.919543,8e-06,10
4,2.297094,0.036188,0.103316,0.002285,0.001,0.1,0.1,"{'learning_rate': 0.001, 'reg_alpha': 0.1, 're...",0.919548,0.919548,0.919527,0.919546,0.919546,0.919543,8e-06,10
5,2.307605,0.032908,0.102702,0.002356,0.001,0.1,1.0,"{'learning_rate': 0.001, 'reg_alpha': 0.1, 're...",0.919548,0.919548,0.919527,0.919546,0.919546,0.919543,8e-06,10
6,2.258341,0.008137,0.103308,0.003376,0.001,1.0,0.0,"{'learning_rate': 0.001, 'reg_alpha': 1.0, 're...",0.919548,0.919548,0.919527,0.919546,0.919546,0.919543,8e-06,10
7,2.266639,0.025702,0.101899,0.000919,0.001,1.0,0.1,"{'learning_rate': 0.001, 'reg_alpha': 1.0, 're...",0.919548,0.919548,0.919527,0.919546,0.919546,0.919543,8e-06,10
8,2.309863,0.077294,0.104044,0.003093,0.001,1.0,1.0,"{'learning_rate': 0.001, 'reg_alpha': 1.0, 're...",0.919548,0.919548,0.919527,0.919546,0.919546,0.919543,8e-06,10
9,2.290113,0.014111,0.113692,0.00146,0.01,0.0,0.0,"{'learning_rate': 0.01, 'reg_alpha': 0.0, 'reg...",0.919548,0.919548,0.919527,0.919546,0.919546,0.919543,8e-06,10


LGBMClassifier(reg_alpha=0.1, reg_lambda=1.0)
{'learning_rate': 0.1, 'reg_alpha': 0.1, 'reg_lambda': 1.0}
AUC平均値は0.74です。


### LightGBM（ベイズ最適化）

In [10]:
def objective_lightGBM(trial):
    X = df[base_columns].fillna(df[base_columns].median()) #欠損値を中央値で補完
    y = df.loc[:, 'TARGET']
    X = np.array(X)
    y = np.array(y)
    
    # クロスバリデーションで分割する。
    auc_list = []
    cnt = 1
    kf = KFold(n_splits=5, random_state=0, shuffle=True)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        #標準化
        scaler = StandardScaler()
        scaler.fit(X_train) #訓練用のデータでfit
        X_train_std = scaler.transform(X_train) #訓練用データをtransform
        X_test_std = scaler.transform(X_test) #検証用データをtransform
     
        learning_rate = trial.suggest_categorical('learning_rate', [0.001, 0.01, 0.1])
        reg_alpha = trial.suggest_categorical('reg_alpha', [0.0, 0.1, 1.0])
        reg_lambda = trial.suggest_categorical('reg_lambda', [0.0, 0.1, 1.0])
        
        #学習〜予測
        clf = lgb.LGBMClassifier(learning_rate=learning_rate, reg_alpha=reg_alpha, reg_lambda=reg_lambda)
        clf.fit(X_train_std, y_train) # 学習
        pred = clf.predict(X_test_std) #クラスの予測
        pred_proba = clf.predict_proba(X_test_std) #クラスの予測確率
        pred_proba_posi = pred_proba[:, 0] #クラスの予測確率（ポジティブクラスを抜粋）

        # 評価
        # ROC曲線の計算
        fpr, tpr, thresholds = metrics.roc_curve(y_test, pred_proba_posi, pos_label=0)

        #AUC（Area Under the Curve）を計算
        auc_value = auc(fpr, tpr)
        auc_list.append(auc_value)
        cnt += 1 
        
    auc_mean = sum(auc_list) / len(auc_list)
    return auc_mean

study = optuna.create_study(direction='maximize')
study.optimize(objective_lightGBM, n_trials=100)

trial = study.best_trial

print('AUC平均値: {:.2f}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

[I 2020-08-15 03:02:20,535] Trial 0 finished with value: 0.7261078363796309 and parameters: {'learning_rate': 0.01, 'reg_alpha': 1.0, 'reg_lambda': 0.0}. Best is trial 0 with value: 0.7261078363796309.
[I 2020-08-15 03:02:36,851] Trial 1 finished with value: 0.7442468799316831 and parameters: {'learning_rate': 0.1, 'reg_alpha': 1.0, 'reg_lambda': 0.0}. Best is trial 1 with value: 0.7442468799316831.
[I 2020-08-15 03:02:51,559] Trial 2 finished with value: 0.7431192624093862 and parameters: {'learning_rate': 0.1, 'reg_alpha': 0.0, 'reg_lambda': 0.0}. Best is trial 1 with value: 0.7442468799316831.
[I 2020-08-15 03:03:09,560] Trial 3 finished with value: 0.7261078363796309 and parameters: {'learning_rate': 0.01, 'reg_alpha': 1.0, 'reg_lambda': 0.0}. Best is trial 1 with value: 0.7442468799316831.
[I 2020-08-15 03:03:27,600] Trial 4 finished with value: 0.7261054508250437 and parameters: {'learning_rate': 0.01, 'reg_alpha': 1.0, 'reg_lambda': 0.1}. Best is trial 1 with value: 0.7442468799

[I 2020-08-15 03:13:13,650] Trial 40 finished with value: 0.7134954161546071 and parameters: {'learning_rate': 0.001, 'reg_alpha': 1.0, 'reg_lambda': 1.0}. Best is trial 14 with value: 0.744698766207899.
[I 2020-08-15 03:13:29,227] Trial 41 finished with value: 0.744698766207899 and parameters: {'learning_rate': 0.1, 'reg_alpha': 1.0, 'reg_lambda': 1.0}. Best is trial 14 with value: 0.744698766207899.
[I 2020-08-15 03:13:47,779] Trial 42 finished with value: 0.744698766207899 and parameters: {'learning_rate': 0.1, 'reg_alpha': 1.0, 'reg_lambda': 1.0}. Best is trial 14 with value: 0.744698766207899.
[I 2020-08-15 03:14:03,571] Trial 43 finished with value: 0.744698766207899 and parameters: {'learning_rate': 0.1, 'reg_alpha': 1.0, 'reg_lambda': 1.0}. Best is trial 14 with value: 0.744698766207899.
[I 2020-08-15 03:14:19,257] Trial 44 finished with value: 0.744698766207899 and parameters: {'learning_rate': 0.1, 'reg_alpha': 1.0, 'reg_lambda': 1.0}. Best is trial 14 with value: 0.744698766

[I 2020-08-15 03:23:57,382] Trial 80 finished with value: 0.7446319773604497 and parameters: {'learning_rate': 0.1, 'reg_alpha': 0.1, 'reg_lambda': 1.0}. Best is trial 14 with value: 0.744698766207899.
[I 2020-08-15 03:24:12,989] Trial 81 finished with value: 0.744698766207899 and parameters: {'learning_rate': 0.1, 'reg_alpha': 1.0, 'reg_lambda': 1.0}. Best is trial 14 with value: 0.744698766207899.
[I 2020-08-15 03:24:28,628] Trial 82 finished with value: 0.744698766207899 and parameters: {'learning_rate': 0.1, 'reg_alpha': 1.0, 'reg_lambda': 1.0}. Best is trial 14 with value: 0.744698766207899.
[I 2020-08-15 03:24:44,448] Trial 83 finished with value: 0.744698766207899 and parameters: {'learning_rate': 0.1, 'reg_alpha': 1.0, 'reg_lambda': 1.0}. Best is trial 14 with value: 0.744698766207899.
[I 2020-08-15 03:25:00,199] Trial 84 finished with value: 0.744698766207899 and parameters: {'learning_rate': 0.1, 'reg_alpha': 1.0, 'reg_lambda': 1.0}. Best is trial 14 with value: 0.74469876620

AUC平均値: 0.74
Best hyperparameters: {'learning_rate': 0.1, 'reg_alpha': 1.0, 'reg_lambda': 1.0}


## まとめ

In [11]:
conclusion = pd.DataFrame([0.72, 0.72, 0.74, 0.74])
conclusion.index = ["No.1: ロジスティック回帰（グリッドサーチ）", "No.2: ロジスティック回帰（ベイズ最適化）", "No.3: LightGBM（グリッドサーチ）", "No.4: LightGBM（ベイズ最適化）"]
conclusion.columns = ["AUC（平均値）"]
conclusion

Unnamed: 0,AUC（平均値）
No.1: ロジスティック回帰（グリッドサーチ）,0.72
No.2: ロジスティック回帰（ベイズ最適化）,0.72
No.3: LightGBM（グリッドサーチ）,0.74
No.4: LightGBM（ベイズ最適化）,0.74


### PCの処理能力の問題もあり多くは出来ていないが、ロジスティック回帰とLightGBMにてグリッドサーチとベイズ最適化を実施。LightGBMが若干良いが、ほぼ変わらない結果となった。

# 【問題5】最終的なモデルの選定
最終的にこれは良いというモデルを選び、推定した結果をKaggleに提出してスコアを確認してください。<br>
どういったアイデアを取り入れ、どの程度のスコアになったかを記載してください。

In [12]:
#testデータ読み込み
df_t = pd.read_csv("/Users/takahashihideyuki/dive/データ格納/Week3/application_test.csv")
df_t.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,...,0,0,0,0,,,,,,


In [13]:
#ベースラインモデル（特徴量は前回の課題で選定したもの、重要度0.01以上の上位26変数を抽出）
base_columns = np.array(['EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH', 'DAYS_ID_PUBLISH',
       'DAYS_REGISTRATION', 'AMT_ANNUITY', 'DAYS_LAST_PHONE_CHANGE',
       'AMT_CREDIT', 'DAYS_EMPLOYED', 'AMT_INCOME_TOTAL',
       'REGION_POPULATION_RELATIVE', 'EXT_SOURCE_1',
       'HOUR_APPR_PROCESS_START', 'AMT_REQ_CREDIT_BUREAU_YEAR',
       'OBS_30_CNT_SOCIAL_CIRCLE', 'YEARS_BEGINEXPLUATATION_AVG',
       'APARTMENTS_AVG', 'LANDAREA_AVG', 'OWN_CAR_AGE',
       'BASEMENTAREA_AVG', 'NONLIVINGAREA_AVG', 'YEARS_BUILD_AVG',
       'COMMONAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'ENTRANCES_AVG',
       'CNT_CHILDREN'])

X_t = df_t[base_columns].fillna(df_t[base_columns].median()) #欠損値を中央値で補完

In [14]:
#testデータ用　学習から検証までを関数化　　　GridSearchCVを使用
def learn_to_verifi_GS_test(X, y, X_t, model, parameters):
    # 訓練データと検証データの分割。訓練データ75%、検証データ25%として分割する。
    X = np.array(X)
    y = np.array(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)
    
    #標準化
    scaler = StandardScaler()
    scaler.fit(X_train) #訓練用のデータでfit
    X_train_std = scaler.transform(X_train) #訓練用データをtransform
    X_test_std = scaler.transform(X_test) #検証用データをtransform
    X_t_std = scaler.transform(X_t) #testデータをtransform
    
    #学習〜予測
    clf = GridSearchCV(model, parameters, cv=5)    
    clf.fit(X_train_std, y_train) # 学習
    pred = clf.predict(X_t_std) #testデータのクラスを予測
    pred_proba = clf.predict_proba(X_t_std) #testデータのクラスの予測確率
    pred_proba_posi = pred_proba[:, 0] #testデータのクラスの予測確率（ポジティブクラスを抜粋）

    return pred_proba_posi

In [15]:
pred_proba_posi = learn_to_verifi_GS_test(X, y, X_t, model = lgb.LGBMClassifier(), 
                                                                             parameters = {'learning_rate':[0.001, 0.01, 0.1], "reg_alpha":[0.0, 0.1, 1.0], 
                                                                                                       "reg_lambda":[0.0, 0.1, 1.0]})

In [16]:
#kaglle提出用のデータフレームを作成
pred_X_t = pd.Series(pred_proba_posi)
df_submission = pd.concat([df_t["SK_ID_CURR"], pred_X_t], axis=1)
df_submission.columns = ["SK_ID_CURR", "TARGET"]
df_submission

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.964660
1,100005,0.934697
2,100013,0.983271
3,100028,0.965880
4,100038,0.827023
...,...,...
48739,456221,0.973997
48740,456222,0.912871
48741,456223,0.969193
48742,456224,0.935711


In [17]:
#kaggle提出用にcsvを出力
df_submission.to_csv('df_submission.csv', index=False)

### LightGBM（グリッドサーチ、説明変数は重要度0.01以上の上位26変数を抽出）AUC値:0.74のモデルをkaggleへ提出。Public Scoreは0.27358とかなり低い結果となった。