lightgbmを用いた結果の予測。ハイパーパラメーターの探索はoptunaを用いた。

In [1]:
# 必要に応じで実行
# !pip install optuna

# ライブラリーのインポート

In [2]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

import lightgbm as lgb
import optuna

# データのインポート

In [3]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

データの分析は'titanic_randamforest_optuna.ipynb'に書いてあるので省略

# データの前処理

In [4]:
# 結果の提出時に使用
test_id = test_data["PassengerId"]

# データの統合
data = pd.concat([train_data, test_data], sort=False)  # テストデータ、訓練データを結合

# 欠損値の補填
data['Age'].fillna(data['Age'].mean(),inplace=True)
data["Embarked"].fillna('S', inplace=True)
data['Fare'].fillna(data['Fare'].median(), inplace=True)

# カテゴリーデータの変換
data["Embarked"] = data["Embarked"].map({"S": 0, "C": 1, "Q": 2})
data["Sex"].replace(["male", "female"], [0, 1], inplace=True)

# 不要な特徴量の削除
data.drop(["Name", "PassengerId", "Ticket", "Cabin"], axis=1, inplace=True)

# データの再分割
train_data = data[:len(train_data)]
test_data = data[len(train_data):]

# 入力(x)と正解(y)の分割
x_train = train_data.iloc[:, 1:]
y_train = train_data.iloc[:, :1]
x_test = test_data.iloc[:, 1:]

# 訓練用と検証用にデータを分割

In [5]:
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.3, stratify=y_train)

# データセットの作成

In [8]:
categorical_features = ["Pclass","Sex","Age", "SibSp", "Parch","Fare", "Embarked"]

lgb_train = lgb.Dataset(x_train, y_train, free_raw_data=False, categorical_feature=categorical_features)
lgb_valid = lgb.Dataset(x_valid, y_valid, free_raw_data=False, reference=lgb_train, categorical_feature=categorical_features)

# モデルの構築

In [9]:
# optunaを用いたハイパーパラメータの最適化
def objective(trial):
    # ハイパーパラメータの探索範囲
    params = {
        "objective": "regression",  # 回帰
        "learning_rate": trial.suggest_float("learnig_rate", 0.01, 0.2),  # 学習率
        "num_iterations": trial.suggest_int("num_iterations", 16, 128), # 木の数
        "num_leaves": trial.suggest_int("num_leaves", 16, 128)  # 木にある分岐の個数
    }


    # モデルの訓練
    model = lgb.train(params,
                      lgb_train,
                      valid_sets=[lgb_train, lgb_valid],
                      verbose_eval=20,  # 学習過程の表示間隔
                      num_boost_round=500,  # 学習回数の最大値
                      early_stopping_rounds=10)  # 連続して10回性能が向上しなければ終了

    y_pred = model.predict(x_valid, num_iteration=model.best_iteration)  # 訓練済みのモデルを使用
    score = log_loss(y_valid, y_pred)  # 二値の交差エントロピー誤差
    return score

if __name__ == '__main__':
    study = optuna.create_study(sampler=optuna.samplers.RandomSampler())
    study.optimize(objective, n_trials=30)

[32m[I 2022-12-14 12:07:21,596][0m A new study created in memory with name: no-name-edc029ae-53b2-409a-be0a-e943bc9a4dd5[0m
[32m[I 2022-12-14 12:07:21,641][0m Trial 0 finished with value: 0.43375177794382536 and parameters: {'learnig_rate': 0.18953850536169975, 'num_iterations': 120, 'num_leaves': 85}. Best is trial 0 with value: 0.43375177794382536.[0m
[32m[I 2022-12-14 12:07:21,679][0m Trial 1 finished with value: 0.43309052780649687 and parameters: {'learnig_rate': 0.10779439387614834, 'num_iterations': 124, 'num_leaves': 82}. Best is trial 1 with value: 0.43309052780649687.[0m
[32m[I 2022-12-14 12:07:21,700][0m Trial 2 finished with value: 0.4496001533442619 and parameters: {'learnig_rate': 0.19642343271464835, 'num_iterations': 88, 'num_leaves': 108}. Best is trial 1 with value: 0.43309052780649687.[0m
[32m[I 2022-12-14 12:07:21,740][0m Trial 3 finished with value: 0.43495182470783544 and parameters: {'learnig_rate': 0.03721203792199908, 'num_iterations': 86, 'num_le

[32m[I 2022-12-14 12:07:21,802][0m Trial 5 finished with value: 0.44450368253941136 and parameters: {'learnig_rate': 0.10495334371057316, 'num_iterations': 19, 'num_leaves': 55}. Best is trial 1 with value: 0.43309052780649687.[0m


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 126
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] Start training from score 0.383628
Training until validation scores don't improve for 10 rounds
[20]	training's l2: 0.123087	valid_1's l2: 0.137844
Early stopping, best iteration is:
[18]	training's l2: 0.124538	valid_1's l2: 0.137435
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 126
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] Start training from score 0.383628
Training until validation scores don't improve for 10 rounds
[20]	training's l2: 0.133516	valid_1's l2: 0.139395
[40]	training's l2: 0.121224	valid_1's l2: 0.136606
Early stopping, best iteration is:
[40]

[32m[I 2022-12-14 12:07:21,839][0m Trial 6 finished with value: 0.43590510690720713 and parameters: {'learnig_rate': 0.09529175681000031, 'num_iterations': 36, 'num_leaves': 74}. Best is trial 1 with value: 0.43309052780649687.[0m
[32m[I 2022-12-14 12:07:21,875][0m Trial 7 finished with value: 0.4385353221649326 and parameters: {'learnig_rate': 0.04731195986222645, 'num_iterations': 55, 'num_leaves': 102}. Best is trial 1 with value: 0.43309052780649687.[0m
[32m[I 2022-12-14 12:07:21,912][0m Trial 8 finished with value: 0.4423480411855757 and parameters: {'learnig_rate': 0.028562159213974446, 'num_iterations': 78, 'num_leaves': 49}. Best is trial 1 with value: 0.43309052780649687.[0m
[32m[I 2022-12-14 12:07:21,936][0m Trial 9 finished with value: 0.43456668971075213 and parameters: {'learnig_rate': 0.12510603045806243, 'num_iterations': 110, 'num_leaves': 70}. Best is trial 1 with value: 0.43309052780649687.[0m
[32m[I 2022-12-14 12:07:21,966][0m Trial 10 finished with val

[32m[I 2022-12-14 12:07:22,000][0m Trial 11 finished with value: 0.4353838061203591 and parameters: {'learnig_rate': 0.042697676632543834, 'num_iterations': 70, 'num_leaves': 121}. Best is trial 1 with value: 0.43309052780649687.[0m


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 126
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] Start training from score 0.383628
Training until validation scores don't improve for 10 rounds
[20]	training's l2: 0.137718	valid_1's l2: 0.14217
Did not meet early stopping. Best iteration is:
[36]	training's l2: 0.125123	valid_1's l2: 0.138042
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 126
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] Start training from score 0.383628
Training until validation scores don't improve for 10 rounds
[20]	training's l2: 0.152447	valid_1's l2: 0.153828
[40]	training's l2: 0.13843	valid_1's l2: 0.141878
Did not meet early stopping.

[32m[I 2022-12-14 12:07:22,084][0m Trial 12 finished with value: 0.4342085250746539 and parameters: {'learnig_rate': 0.1949220847421678, 'num_iterations': 120, 'num_leaves': 43}. Best is trial 1 with value: 0.43309052780649687.[0m
[32m[I 2022-12-14 12:07:22,102][0m Trial 13 finished with value: 0.4310932211789753 and parameters: {'learnig_rate': 0.10173522700455469, 'num_iterations': 67, 'num_leaves': 107}. Best is trial 13 with value: 0.4310932211789753.[0m
[32m[I 2022-12-14 12:07:22,120][0m Trial 14 finished with value: 0.4334060005098295 and parameters: {'learnig_rate': 0.17016926364653856, 'num_iterations': 54, 'num_leaves': 48}. Best is trial 13 with value: 0.4310932211789753.[0m
[32m[I 2022-12-14 12:07:22,138][0m Trial 15 finished with value: 0.4337401092910711 and parameters: {'learnig_rate': 0.14213264234096412, 'num_iterations': 125, 'num_leaves': 53}. Best is trial 13 with value: 0.4310932211789753.[0m
[32m[I 2022-12-14 12:07:22,169][0m Trial 16 finished with va

[32m[I 2022-12-14 12:07:22,217][0m Trial 18 finished with value: 0.4336606214838728 and parameters: {'learnig_rate': 0.07115580711897779, 'num_iterations': 105, 'num_leaves': 69}. Best is trial 13 with value: 0.4310932211789753.[0m
[32m[I 2022-12-14 12:07:22,252][0m Trial 19 finished with value: 0.457553910343315 and parameters: {'learnig_rate': 0.016970068317248278, 'num_iterations': 92, 'num_leaves': 40}. Best is trial 13 with value: 0.4310932211789753.[0m


[LightGBM] [Info] Start training from score 0.383628
Training until validation scores don't improve for 10 rounds
[20]	training's l2: 0.124098	valid_1's l2: 0.138032
Early stopping, best iteration is:
[14]	training's l2: 0.128305	valid_1's l2: 0.137737
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 126
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] Start training from score 0.383628
Training until validation scores don't improve for 10 rounds
[20]	training's l2: 0.135616	valid_1's l2: 0.14052
[40]	training's l2: 0.12246	valid_1's l2: 0.136991
Early stopping, best iteration is:
[32]	training's l2: 0.126304	valid_1's l2: 0.13676
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 126
[LightGBM] [Info] Number of data points in th

[32m[I 2022-12-14 12:07:22,268][0m Trial 20 finished with value: 0.43453593489198833 and parameters: {'learnig_rate': 0.14374668642358668, 'num_iterations': 119, 'num_leaves': 91}. Best is trial 13 with value: 0.4310932211789753.[0m
[32m[I 2022-12-14 12:07:22,299][0m Trial 21 finished with value: 0.4362447716950644 and parameters: {'learnig_rate': 0.05488880788193643, 'num_iterations': 52, 'num_leaves': 73}. Best is trial 13 with value: 0.4310932211789753.[0m
[32m[I 2022-12-14 12:07:22,316][0m Trial 22 finished with value: 0.4529039384058766 and parameters: {'learnig_rate': 0.0786886094910236, 'num_iterations': 21, 'num_leaves': 126}. Best is trial 13 with value: 0.4310932211789753.[0m
[32m[I 2022-12-14 12:07:22,366][0m Trial 23 finished with value: 0.45446845556051935 and parameters: {'learnig_rate': 0.013493803430709243, 'num_iterations': 125, 'num_leaves': 47}. Best is trial 13 with value: 0.4310932211789753.[0m
[32m[I 2022-12-14 12:07:22,399][0m Trial 24 finished with

[32m[I 2022-12-14 12:07:22,424][0m Trial 25 finished with value: 0.43142276980682814 and parameters: {'learnig_rate': 0.12465302962121681, 'num_iterations': 73, 'num_leaves': 50}. Best is trial 24 with value: 0.42950255161148276.[0m
[32m[I 2022-12-14 12:07:22,455][0m Trial 26 finished with value: 0.43479030481907244 and parameters: {'learnig_rate': 0.06960989694176617, 'num_iterations': 45, 'num_leaves': 118}. Best is trial 24 with value: 0.42950255161148276.[0m


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 126
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] Start training from score 0.383628
Training until validation scores don't improve for 10 rounds
[20]	training's l2: 0.148297	valid_1's l2: 0.150091
[40]	training's l2: 0.133651	valid_1's l2: 0.140099
Did not meet early stopping. Best iteration is:
[52]	training's l2: 0.128591	valid_1's l2: 0.138338
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 126
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] Start training from score 0.383628
Training until validation scores don't improve for 10 rounds
[20]	training's l2: 0.140978	valid_1's l2: 0.143146
Did not meet early stoppin

[32m[I 2022-12-14 12:07:22,522][0m Trial 27 finished with value: 0.4422193080329448 and parameters: {'learnig_rate': 0.01814351394102402, 'num_iterations': 126, 'num_leaves': 64}. Best is trial 24 with value: 0.42950255161148276.[0m
[32m[I 2022-12-14 12:07:22,545][0m Trial 28 finished with value: 0.433941349395174 and parameters: {'learnig_rate': 0.12842405017896763, 'num_iterations': 116, 'num_leaves': 26}. Best is trial 24 with value: 0.42950255161148276.[0m
[32m[I 2022-12-14 12:07:22,569][0m Trial 29 finished with value: 0.43687168117296205 and parameters: {'learnig_rate': 0.16790596423764448, 'num_iterations': 56, 'num_leaves': 41}. Best is trial 24 with value: 0.42950255161148276.[0m


[40]	training's l2: 0.161534	valid_1's l2: 0.161996
[60]	training's l2: 0.148955	valid_1's l2: 0.150736
[80]	training's l2: 0.14266	valid_1's l2: 0.144975
[100]	training's l2: 0.139484	valid_1's l2: 0.142391
[120]	training's l2: 0.134241	valid_1's l2: 0.140438
Did not meet early stopping. Best iteration is:
[126]	training's l2: 0.13307	valid_1's l2: 0.14004
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 126
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] Start training from score 0.383628
Training until validation scores don't improve for 10 rounds
[20]	training's l2: 0.128873	valid_1's l2: 0.138548
Early stopping, best iteration is:
[28]	training's l2: 0.123615	valid_1's l2: 0.136779
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] To

In [10]:
# ベストなハイパーパラメーターを表示
print(study.best_params)

{'learnig_rate': 0.14289386207184387, 'num_iterations': 103, 'num_leaves': 61}


# 最適なパラメーターの組み合わせを用いて訓練

In [11]:
# ベストなハイパーパラメーターを使って予測
params = {
        "objective": "regression",  # 回帰
        "learning_rate": study.best_params['learnig_rate'],  # 学習率
        "num_iterations": study.best_params['num_iterations'], # 木の数
        "num_leaves": study.best_params['num_leaves']  # 木にある分岐の個数
    }

# モデルの訓練
model = lgb.train(params,
                  lgb_train,
                  valid_sets=[lgb_train, lgb_valid],
                  verbose_eval=20,  # 学習過程の表示間隔
                  num_boost_round=500,  # 学習回数の最大値
                  early_stopping_rounds=10)  # 連続して10回性能が向上しなければ終了

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 126
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 7
[LightGBM] [Info] Start training from score 0.383628
Training until validation scores don't improve for 10 rounds
[20]	training's l2: 0.126978	valid_1's l2: 0.137939
[40]	training's l2: 0.115996	valid_1's l2: 0.135797
Early stopping, best iteration is:
[34]	training's l2: 0.118474	valid_1's l2: 0.135572




# 結果の予測と提出

In [12]:
y_pred = model.predict(x_test, num_iteration=model.best_iteration)

y_pred = (y_pred > 0.5).astype(int) # 結果を0か1に
y_pred = pd.Series(y_pred, name='Survived')

submission_data = pd.concat([test_id, y_pred], axis=1)
submission_data.to_csv('submission_titanic_lightgbm.csv', index=False)

In [13]:
submission_data.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0
