# インポート

In [1]:

import numpy as np
import pandas as pd
import os
import pickle
import gc 

# 分布確認
import ydata_profiling as pdp

# 可視化
import matplotlib.pyplot as plt

# 前処理
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder

# バリデーション
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

# 評価指標
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

# モデリング: lightgbm
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")

import sys

# matplotlibで日本語表示したい場合はこれをinstallしてインポートする
import japanize_matplotlib

from pathlib import Path

# Path().resolve()
# sys.path.append(os.path.join(Path().resolve(), '../source/'))




## データ確認

In [2]:

# Read the two input CSVs in order: training data first, then test data.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/input/train.csv", "../data/input/test.csv")
)

# データセットの作成

* 性別は数値ではないため、One-Hotエンコーディングで数値化する

In [3]:
# One-hot encode the "Sex" column. The training and test frames are encoded
# independently, and the resulting indicator columns are stored as int64.
df_one = (
    pd.get_dummies(df_train[["Sex"]], dummy_na=False, drop_first=False)
    .astype(np.int64)
)

df_sex_test = (
    pd.get_dummies(df_test[["Sex"]], dummy_na=False, drop_first=False)
    .astype(np.int64)
)


In [4]:
# Assemble the model inputs.
# Features: the one-hot encoded Sex columns followed by Pclass and Fare.
x_train = pd.concat((df_one, df_train.loc[:, ["Pclass", "Fare"]]), axis="columns")
x_test = pd.concat((df_sex_test, df_test.loc[:, ["Pclass", "Fare"]]), axis="columns")

# Target and row identifiers are kept as single-column DataFrames.
y_train = df_train.loc[:, ["Survived"]]
id_train = df_train.loc[:, ["PassengerId"]]

# 検証の方針

## 検証データ
* 交差検証を用いる


## モデル
* LightGBMを用いる

# ベースラインの作成

## ハイパーパラメータの探索

In [5]:

# Import the project-local helper module and force a reload so that edits to
# src/util.py are picked up without restarting the kernel.
# NOTE(review): these imports live mid-notebook; consider moving them to the
# import cell at the top so all dependencies are visible in one place.
from src import util
import importlib
importlib.reload(util)

# Search for the best hyperparameters on the training data.
# NOTE(review): the captured log output looks like an Optuna study; the exact
# search space and scoring live in util.optimize / util.objective_func — confirm there.
best_trial = util.optimize(util.objective_func, x_train, y_train)
print(best_trial.params)
# Printed BEFORE the update below, so this shows the module's default parameters.
print(util.params)

# Merge the tuned values into the module-level parameter dict (in-place mutation
# of util.params; the reload above restores the defaults on re-run), then
# display the merged result as the cell output.
util.params.update(best_trial.params)
util.params

[I 2023-10-22 21:52:54,039] A new study created in memory with name: no-name-644fcb45-4975-4100-8403-e9271323ecf7




[I 2023-10-22 21:52:54,405] Trial 0 finished with value: 0.7541711129244868 and parameters: {'num_leaves': 181, 'min_data_in_leaf': 61, 'min_sum_hessian_in_leaf': 4.792414358623587e-05, 'feature_fraction': 0.7756573845414456, 'bagging_fraction': 0.8597344848927815, 'lambda_l1': 0.492522233779106, 'lambda_l2': 83.76388146302445}. Best is trial 0 with value: 0.7541711129244868.




[I 2023-10-22 21:52:54,715] Trial 1 finished with value: 0.7588600841127363 and parameters: {'num_leaves': 178, 'min_data_in_leaf': 99, 'min_sum_hessian_in_leaf': 0.00015009027543233888, 'feature_fraction': 0.6715890080754348, 'bagging_fraction': 0.8645248536920208, 'lambda_l1': 0.567922374174008, 'lambda_l2': 0.01732652966363563}. Best is trial 1 with value: 0.7588600841127363.




[I 2023-10-22 21:52:54,906] Trial 2 finished with value: 0.7867553825874083 and parameters: {'num_leaves': 107, 'min_data_in_leaf': 149, 'min_sum_hessian_in_leaf': 3.52756635172055e-05, 'feature_fraction': 0.5877258780737462, 'bagging_fraction': 0.7657756869209191, 'lambda_l1': 1.3406343673102123, 'lambda_l2': 3.4482904089131434}. Best is trial 2 with value: 0.7867553825874083.




[I 2023-10-22 21:52:55,176] Trial 3 finished with value: 0.7867553825874083 and parameters: {'num_leaves': 219, 'min_data_in_leaf': 146, 'min_sum_hessian_in_leaf': 0.0006808799287054756, 'feature_fraction': 0.8612216912851107, 'bagging_fraction': 0.6614794569265892, 'lambda_l1': 0.2799978022399009, 'lambda_l2': 0.08185645330667264}. Best is trial 2 with value: 0.7867553825874083.




[I 2023-10-22 21:52:55,414] Trial 4 finished with value: 0.7867553825874083 and parameters: {'num_leaves': 81, 'min_data_in_leaf': 128, 'min_sum_hessian_in_leaf': 1.889360449174926e-05, 'feature_fraction': 0.7168505863397641, 'bagging_fraction': 0.7154313816648219, 'lambda_l1': 0.9434967110751797, 'lambda_l2': 0.5050346330980694}. Best is trial 2 with value: 0.7867553825874083.




[I 2023-10-22 21:52:55,668] Trial 5 finished with value: 0.7519239219132509 and parameters: {'num_leaves': 85, 'min_data_in_leaf': 88, 'min_sum_hessian_in_leaf': 0.004788147156768277, 'feature_fraction': 0.9720800091019398, 'bagging_fraction': 0.7509183379421682, 'lambda_l1': 3.1319282717196035, 'lambda_l2': 0.029005047452739414}. Best is trial 2 with value: 0.7867553825874083.
[I 2023-10-22 21:52:55,797] Trial 6 finished with value: 0.6161634548992531 and parameters: {'num_leaves': 87, 'min_data_in_leaf': 86, 'min_sum_hessian_in_leaf': 0.003971252247766701, 'feature_fraction': 0.6252276826982534, 'bagging_fraction': 0.7415171321313522, 'lambda_l1': 87.54657140659076, 'lambda_l2': 1.1965765212602313}. Best is trial 2 with value: 0.7867553825874083.




[I 2023-10-22 21:52:56,261] Trial 7 finished with value: 0.7642834724750486 and parameters: {'num_leaves': 160, 'min_data_in_leaf': 28, 'min_sum_hessian_in_leaf': 0.0030131614432849746, 'feature_fraction': 0.8015300642054637, 'bagging_fraction': 0.7725340032332324, 'lambda_l1': 0.23499322154972468, 'lambda_l2': 0.1646202117975735}. Best is trial 2 with value: 0.7867553825874083.




[I 2023-10-22 21:52:56,474] Trial 8 finished with value: 0.7867553825874083 and parameters: {'num_leaves': 111, 'min_data_in_leaf': 138, 'min_sum_hessian_in_leaf': 0.00423029374725911, 'feature_fraction': 0.7552111687390055, 'bagging_fraction': 0.8346568914811361, 'lambda_l1': 2.206714812711709, 'lambda_l2': 3.1594683442464033}. Best is trial 2 with value: 0.7867553825874083.




[I 2023-10-22 21:52:56,824] Trial 9 finished with value: 0.7822610005649364 and parameters: {'num_leaves': 175, 'min_data_in_leaf': 170, 'min_sum_hessian_in_leaf': 1.776580803025408e-05, 'feature_fraction': 0.8818414207216692, 'bagging_fraction': 0.6218331872684371, 'lambda_l1': 0.05982625838323253, 'lambda_l2': 1.9490717640641542}. Best is trial 2 with value: 0.7867553825874083.




[I 2023-10-22 21:52:57,042] Trial 10 finished with value: 0.716163454899253 and parameters: {'num_leaves': 32, 'min_data_in_leaf': 200, 'min_sum_hessian_in_leaf': 8.920338990414188e-05, 'feature_fraction': 0.5040305717020104, 'bagging_fraction': 0.9940542446575642, 'lambda_l1': 0.010612397212799442, 'lambda_l2': 18.289897792948295}. Best is trial 2 with value: 0.7867553825874083.




[I 2023-10-22 21:52:57,258] Trial 11 finished with value: 0.6386353650116126 and parameters: {'num_leaves': 243, 'min_data_in_leaf': 150, 'min_sum_hessian_in_leaf': 0.0006685126747113572, 'feature_fraction': 0.5850272097958577, 'bagging_fraction': 0.5204920216297158, 'lambda_l1': 6.343590915843685, 'lambda_l2': 0.07778945107272228}. Best is trial 2 with value: 0.7867553825874083.




[I 2023-10-22 21:52:57,611] Trial 12 finished with value: 0.7867553825874083 and parameters: {'num_leaves': 256, 'min_data_in_leaf': 179, 'min_sum_hessian_in_leaf': 0.0005743102897337068, 'feature_fraction': 0.8634109743354949, 'bagging_fraction': 0.6568977276069294, 'lambda_l1': 0.09732674745359815, 'lambda_l2': 0.01029965396374567}. Best is trial 2 with value: 0.7867553825874083.




[I 2023-10-22 21:52:57,831] Trial 13 finished with value: 0.7575418994413408 and parameters: {'num_leaves': 219, 'min_data_in_leaf': 131, 'min_sum_hessian_in_leaf': 1.0828053071219166e-05, 'feature_fraction': 0.5018388179694605, 'bagging_fraction': 0.6116401896337238, 'lambda_l1': 8.398203621097178, 'lambda_l2': 0.18579132432411227}. Best is trial 2 with value: 0.7867553825874083.




[I 2023-10-22 21:52:58,128] Trial 14 finished with value: 0.7519239219132509 and parameters: {'num_leaves': 27, 'min_data_in_leaf': 165, 'min_sum_hessian_in_leaf': 0.00025111611178878413, 'feature_fraction': 0.6825093053054962, 'bagging_fraction': 0.6704686734571154, 'lambda_l1': 0.16687286319156577, 'lambda_l2': 6.433085314484019}. Best is trial 2 with value: 0.7867553825874083.




[I 2023-10-22 21:52:58,338] Trial 15 finished with value: 0.750800326407633 and parameters: {'num_leaves': 138, 'min_data_in_leaf': 120, 'min_sum_hessian_in_leaf': 0.001215432268892957, 'feature_fraction': 0.5847432736056486, 'bagging_fraction': 0.5281566468339491, 'lambda_l1': 0.03848513214435228, 'lambda_l2': 0.5839440501965582}. Best is trial 2 with value: 0.7867553825874083.




[I 2023-10-22 21:52:58,882] Trial 16 finished with value: 0.7227104387671834 and parameters: {'num_leaves': 216, 'min_data_in_leaf': 6, 'min_sum_hessian_in_leaf': 6.308169236196065e-05, 'feature_fraction': 0.8281194759550805, 'bagging_fraction': 0.7996604755114874, 'lambda_l1': 0.35049776025647444, 'lambda_l2': 0.07096214030849714}. Best is trial 2 with value: 0.7867553825874083.




[I 2023-10-22 21:52:59,161] Trial 17 finished with value: 0.7755194275312285 and parameters: {'num_leaves': 127, 'min_data_in_leaf': 191, 'min_sum_hessian_in_leaf': 0.00029213848222772206, 'feature_fraction': 0.93890695089651, 'bagging_fraction': 0.7017228274653975, 'lambda_l1': 1.4242686509049345, 'lambda_l2': 0.4478620870734111}. Best is trial 2 with value: 0.7867553825874083.
[I 2023-10-22 21:52:59,382] Trial 18 finished with value: 0.7159688657334756 and parameters: {'num_leaves': 64, 'min_data_in_leaf': 153, 'min_sum_hessian_in_leaf': 0.0001402088130472093, 'feature_fraction': 0.741012426616628, 'bagging_fraction': 0.612176646078341, 'lambda_l1': 0.8030387388559193, 'lambda_l2': 5.372294855684903}. Best is trial 2 with value: 0.7867553825874083.




[I 2023-10-22 21:52:59,595] Trial 19 finished with value: 0.7867553825874083 and parameters: {'num_leaves': 204, 'min_data_in_leaf': 113, 'min_sum_hessian_in_leaf': 0.0017255827970869692, 'feature_fraction': 0.9050192251205349, 'bagging_fraction': 0.5680242926216419, 'lambda_l1': 0.20767859726212454, 'lambda_l2': 1.1743561635328021}. Best is trial 2 with value: 0.7867553825874083.




[I 2023-10-22 21:53:00,067] Trial 20 finished with value: 0.7330173874835226 and parameters: {'num_leaves': 152, 'min_data_in_leaf': 63, 'min_sum_hessian_in_leaf': 0.0005615868726974036, 'feature_fraction': 0.83128840684204, 'bagging_fraction': 0.6843014804540657, 'lambda_l1': 0.030654691807557245, 'lambda_l2': 0.0429279051193992}. Best is trial 2 with value: 0.7867553825874083.




[I 2023-10-22 21:53:00,291] Trial 21 finished with value: 0.7867553825874083 and parameters: {'num_leaves': 57, 'min_data_in_leaf': 142, 'min_sum_hessian_in_leaf': 3.915555273944388e-05, 'feature_fraction': 0.717234697124583, 'bagging_fraction': 0.7187026569751934, 'lambda_l1': 1.0456517316541138, 'lambda_l2': 0.345790157472468}. Best is trial 2 with value: 0.7867553825874083.




[I 2023-10-22 21:53:00,549] Trial 22 finished with value: 0.7867553825874083 and parameters: {'num_leaves': 116, 'min_data_in_leaf': 121, 'min_sum_hessian_in_leaf': 2.527352002561534e-05, 'feature_fraction': 0.7097128253270057, 'bagging_fraction': 0.7163989045903905, 'lambda_l1': 0.5082139072171981, 'lambda_l2': 0.15713254479051275}. Best is trial 2 with value: 0.7867553825874083.




[I 2023-10-22 21:53:00,876] Trial 23 finished with value: 0.7867553825874083 and parameters: {'num_leaves': 96, 'min_data_in_leaf': 157, 'min_sum_hessian_in_leaf': 1.01501827040368e-05, 'feature_fraction': 0.788761423267231, 'bagging_fraction': 0.6545353244312025, 'lambda_l1': 1.601664292282324, 'lambda_l2': 0.6316387258745321}. Best is trial 2 with value: 0.7867553825874083.




[I 2023-10-22 21:53:01,096] Trial 24 finished with value: 0.7867553825874083 and parameters: {'num_leaves': 67, 'min_data_in_leaf': 178, 'min_sum_hessian_in_leaf': 2.39707589308838e-05, 'feature_fraction': 0.9996311567105678, 'bagging_fraction': 0.7777958439193616, 'lambda_l1': 0.16897778747758307, 'lambda_l2': 0.2956512946370067}. Best is trial 2 with value: 0.7867553825874083.




[I 2023-10-22 21:53:01,363] Trial 25 finished with value: 0.7867553825874083 and parameters: {'num_leaves': 47, 'min_data_in_leaf': 131, 'min_sum_hessian_in_leaf': 8.487882337152676e-05, 'feature_fraction': 0.6499234858545734, 'bagging_fraction': 0.7309054283754788, 'lambda_l1': 3.8090488693339095, 'lambda_l2': 1.1626919405799148}. Best is trial 2 with value: 0.7867553825874083.




[I 2023-10-22 21:53:01,690] Trial 26 finished with value: 0.7867553825874083 and parameters: {'num_leaves': 10, 'min_data_in_leaf': 93, 'min_sum_hessian_in_leaf': 4.047888403355595e-05, 'feature_fraction': 0.7347599648632755, 'bagging_fraction': 0.6906680173840315, 'lambda_l1': 0.7404512199588413, 'lambda_l2': 0.07075679412507242}. Best is trial 2 with value: 0.7867553825874083.




[I 2023-10-22 21:53:01,928] Trial 27 finished with value: 0.7867553825874083 and parameters: {'num_leaves': 107, 'min_data_in_leaf': 110, 'min_sum_hessian_in_leaf': 1.5675552319385728e-05, 'feature_fraction': 0.8270010486440649, 'bagging_fraction': 0.5642713113720017, 'lambda_l1': 0.3269580935097336, 'lambda_l2': 0.799520459204495}. Best is trial 2 with value: 0.7867553825874083.




[I 2023-10-22 21:53:02,174] Trial 28 finished with value: 0.6800138095537003 and parameters: {'num_leaves': 138, 'min_data_in_leaf': 74, 'min_sum_hessian_in_leaf': 0.007707394018185613, 'feature_fraction': 0.7568074608774176, 'bagging_fraction': 0.8079390078064106, 'lambda_l1': 0.9953208739743914, 'lambda_l2': 0.28022987043123815}. Best is trial 2 with value: 0.7867553825874083.




[I 2023-10-22 21:53:02,380] Trial 29 finished with value: 0.7867553825874083 and parameters: {'num_leaves': 76, 'min_data_in_leaf': 145, 'min_sum_hessian_in_leaf': 3.216329135917105e-05, 'feature_fraction': 0.6984804982745697, 'bagging_fraction': 0.6397548071515035, 'lambda_l1': 0.4224534932644461, 'lambda_l2': 16.64848046417389}. Best is trial 2 with value: 0.7867553825874083.


acc(best)=0.7868


{'num_leaves': 107,
 'min_data_in_leaf': 149,
 'min_sum_hessian_in_leaf': 3.52756635172055e-05,
 'feature_fraction': 0.5877258780737462,
 'bagging_fraction': 0.7657756869209191,
 'lambda_l1': 1.3406343673102123,
 'lambda_l2': 3.4482904089131434}

{'num_leaves': 107, 'min_data_in_leaf': 149, 'min_sum_hessian_in_leaf': 3.52756635172055e-05, 'feature_fraction': 0.5877258780737462, 'bagging_fraction': 0.7657756869209191, 'lambda_l1': 1.3406343673102123, 'lambda_l2': 3.4482904089131434}
{'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 'learning_rate': 0.1, 'num_leaves': 16, 'n_estimators': 100000, 'random_state': 123, 'importance_type': 'gain'}


{'boosting_type': 'gbdt',
 'objective': 'binary',
 'metric': 'auc',
 'learning_rate': 0.1,
 'num_leaves': 107,
 'n_estimators': 100000,
 'random_state': 123,
 'importance_type': 'gain',
 'min_data_in_leaf': 149,
 'min_sum_hessian_in_leaf': 3.52756635172055e-05,
 'feature_fraction': 0.5877258780737462,
 'bagging_fraction': 0.7657756869209191,
 'lambda_l1': 1.3406343673102123,
 'lambda_l2': 3.4482904089131434}

### モデルの学習

In [6]:

# Run 5-split cross-validation with the tuned parameters.
# The helper's three return values are unpacked as the feature-importance
# table, the metrics, and the list of fitted models (one per split, presumably
# — confirm in util.train_cv).
imp, metrics, model_list = util.train_cv(
    x_train,
    y_train,
    id_train,
    util.params,
    n_splits=5,
)

-------------------- 0 --------------------
x_train (712, 4) y_valid (712, 1)
x_valid (179, 4) y_valid (179, 1)
y_train:0.384, y_tr:0.383, y_va:0.385
[100]	training's auc: 0.857026	valid_1's auc: 0.805072
[accuracy] tr: 0.79, va: 0.79
-------------------- 1 --------------------
x_train (713, 4) y_valid (713, 1)
x_valid (178, 4) y_valid (178, 1)
y_train:0.384, y_tr:0.384, y_va:0.382
[100]	training's auc: 0.850918	valid_1's auc: 0.820922
[accuracy] tr: 0.79, va: 0.76
-------------------- 2 --------------------
x_train (713, 4) y_valid (713, 1)
x_valid (178, 4) y_valid (178, 1)
y_train:0.384, y_tr:0.384, y_va:0.382
[100]	training's auc: 0.847991	valid_1's auc: 0.838636
[accuracy] tr: 0.78, va: 0.80
-------------------- 3 --------------------
x_train (713, 4) y_valid (713, 1)
x_valid (178, 4) y_valid (178, 1)
y_train:0.384, y_tr:0.384, y_va:0.382
[100]	training's auc: 0.849068	valid_1's auc: 0.859358
[accuracy] tr: 0.79, va: 0.79
-------------------- 4 --------------------
x_train (713, 4)

### 重要度の評価

In [7]:
# Show the features ordered from highest to lowest "imp" value, with a fresh 0..n-1 index.
imp.sort_values(by="imp", ascending=False, ignore_index=True)

Unnamed: 0,col,imp,imp_std
0,Sex_female,761.003072,46.443744
1,Sex_male,325.90129,24.337061
2,Pclass,260.195238,35.713838
3,Fare,71.360444,31.700849


# モデル推論

In [8]:
# Build the inference features with the same column layout used for training.
# Encoding the test set on its own can yield a different set or order of dummy
# columns than the training set (e.g. a category missing from one split), so the
# dummies are aligned to the training dummy columns: categories absent from the
# test data become all-zero columns, categories unseen in training are dropped.
df_sex_test = (
    pd.get_dummies(df_test[["Sex"]], dummy_na=False, drop_first=False)
    .reindex(columns=df_one.columns, fill_value=0)
    .astype(np.int64)
)
x_test = pd.concat([df_sex_test, df_test[["Pclass", "Fare"]]], axis=1)

# Fail loudly instead of predicting on silently misaligned features.
assert list(x_test.columns) == list(x_train.columns), "test features do not match training features"

# Row identifiers of the test set, kept as a single-column DataFrame.
id_test = df_test[["PassengerId"]]

In [9]:
# Predict the test set with every cross-validation model and collect the
# results side by side: one column per model, named "<fold index>_model".
solution = pd.DataFrame(
    {
        f"{fold}_model": fitted.predict(x_test)
        for fold, fitted in enumerate(model_list)
    }
)
solution

Unnamed: 0,0_model,1_model,2_model,3_model,4_model
0,0,0,0,0,0
1,1,1,1,1,1
2,0,0,0,0,0
3,0,0,0,0,0
4,1,1,1,1,1
...,...,...,...,...,...
413,0,0,0,0,0
414,1,1,1,1,1
415,0,0,0,0,0
416,0,0,0,0,0


In [10]:
# Majority vote across models: take the row-wise mode of the per-model predictions.
# DataFrame.mode(axis=1) returns one column per tied mode (NaN-padded), so with an
# even number of models a tie would produce extra columns and NaNs that break the
# later int cast / single-column DataFrame. Keep only the first mode column, which
# is always populated; on a tie this resolves to the smallest label.
# The result stays 2-D with shape (n_rows, 1), as before.
solution_max = solution.mode(axis=1).iloc[:, [0]].values

In [11]:
# Passenger IDs of the test set, used as the index of the submission table.
PassengerId = np.array(df_test["PassengerId"]).astype(int)

# Combine the majority-vote predictions with the passenger IDs.
my_solution = pd.DataFrame(solution_max.astype(int), index=PassengerId, columns=["Survived"])

# Write the submission file. DataFrame.to_csv does not create missing
# directories, so make sure the output folder exists first (fresh checkouts
# do not necessarily contain ../data/output/002/).
submission_path = Path("../data/output/002/submission.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
my_solution.to_csv(submission_path, index_label=["PassengerId"])