In [55]:
import numpy as np
import pandas as pd
import os
import pickle
import gc 

# 分布確認
import ydata_profiling as pdp

# 可視化
import matplotlib.pyplot as plt

# 前処理
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder

# バリデーション
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

# 評価指標
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

# モデリング: lightgbm
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")

import sys

# matplotlibで日本語表示したい場合はこれをinstallしてインポートする
import japanize_matplotlib

from pathlib import Path

# Path().resolve()
# sys.path.append(os.path.join(Path().resolve(), '../source/'))

# 自作関数インポート
from src import util



## データ確認

In [2]:

# Read the training table and the test table, in that order, from the input directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/input/train.csv", "../data/input/test.csv")
)

# データセットの作成

* 性別は数値でないため、One-Hotエンコーディングで数値化する

In [3]:
# One-hot encode the Sex column of each table and store the indicator columns as int64.
# Training-set indicators:
df_one = pd.get_dummies(
    df_train[["Sex"]], dummy_na=False, drop_first=False
).astype(np.int64)

# Test-set indicators, built the same way:
df_sex_test = pd.get_dummies(
    df_test[["Sex"]], dummy_na=False, drop_first=False
).astype(np.int64)


In [21]:
# Assemble the model inputs: the Sex indicator columns followed by Pclass and Fare.
x_train = pd.concat((df_one, df_train.loc[:, ["Pclass", "Fare"]]), axis="columns")
# Target and identifier are kept as single-column DataFrames.
y_train = df_train.loc[:, ["Survived"]]
id_train = df_train.loc[:, ["PassengerId"]]

# Test features use the same column layout as the training features.
x_test = pd.concat((df_sex_test, df_test.loc[:, ["Pclass", "Fare"]]), axis="columns")

Unnamed: 0,Sex_female,Sex_male,Pclass,Fare
0,0,1,3,7.25
1,1,0,1,71.2833
2,1,0,3,7.925
3,1,0,1,53.1
4,0,1,3,8.05


# 検証の方針

## 検証データ
* 交差検証を用いる


## モデル
* LightGBMを用いる

# ベースラインの作成

## ハイパーパラメータの探索

#### 最適化処理（探索の実行）

## 探索結果の確認

In [28]:
# Report the best trial of the hyper-parameter search: its score first, then its parameters.
# NOTE(review): `study` is not defined in this part of the notebook — presumably it is the
# search object created by an earlier cell; confirm it exists on a fresh kernel run.
trial = study.best_trial
best_score_line = "acc(best)={:.4f}".format(trial.value)
print(best_score_line)
display(trial.params)

acc(best)=0.7868


{'num_leaves': 107,
 'min_data_in_leaf': 149,
 'min_sum_hessian_in_leaf': 3.52756635172055e-05,
 'feature_fraction': 0.5877258780737462,
 'bagging_fraction': 0.7657756869209191,
 'lambda_l1': 1.3406343673102123,
 'lambda_l2': 3.4482904089131434}

### モデルの学習

In [32]:
# Run cross-validation with 5 splits.
# NOTE(review): `train_cv` and `params` are defined outside this part of the notebook;
# the call is unpacked into importances, metrics and the list of per-fold models.
cv_outputs = train_cv(
    x_train,
    y_train,
    id_train,
    params,
    n_splits=5,
)
imp, metrics, model_list = cv_outputs

-------------------- 0 --------------------
x_train (712, 4) y_valid (712, 1)
x_valid (179, 4) y_valid (179, 1)
y_train:0.384, y_tr:0.383, y_va:0.385
[100]	training's auc: 0.857026	valid_1's auc: 0.805072
[accuracy] tr: 0.79, va: 0.79
-------------------- 1 --------------------
x_train (713, 4) y_valid (713, 1)
x_valid (178, 4) y_valid (178, 1)
y_train:0.384, y_tr:0.384, y_va:0.382
[100]	training's auc: 0.850918	valid_1's auc: 0.820922
[accuracy] tr: 0.79, va: 0.76
-------------------- 2 --------------------
x_train (713, 4) y_valid (713, 1)
x_valid (178, 4) y_valid (178, 1)
y_train:0.384, y_tr:0.384, y_va:0.382
[100]	training's auc: 0.847991	valid_1's auc: 0.838636
[accuracy] tr: 0.78, va: 0.80
-------------------- 3 --------------------
x_train (713, 4) y_valid (713, 1)
x_valid (178, 4) y_valid (178, 1)
y_train:0.384, y_tr:0.384, y_va:0.382
[100]	training's auc: 0.849068	valid_1's auc: 0.859358
[accuracy] tr: 0.79, va: 0.79
-------------------- 4 --------------------
x_train (713, 4)

### 重要度の評価

In [33]:
# Show features from most to least important, renumbering the rows from 0.
imp.sort_values(by="imp", ascending=False).reset_index(drop=True)

Unnamed: 0,col,imp,imp_std
0,Sex_female,761.003072,46.443744
1,Sex_male,325.90129,24.337061
2,Pclass,260.195238,35.713838
3,Fare,71.360444,31.700849


# モデル推論

In [34]:
# One-hot encode Sex for the test set and append Pclass and Fare, mirroring how the
# training features were built.
df_sex_test = pd.get_dummies(df_test[["Sex"]], dummy_na=False, drop_first=False)
df_sex_test = df_sex_test.astype(np.int64)
x_test = pd.concat([df_sex_test, df_test[["Pclass", "Fare"]]], axis=1)

# get_dummies only creates columns for the categories present in the frame it is given,
# so the test frame is not guaranteed to match the training frame. Force it onto the
# training feature columns (same names, same order); an indicator column that does not
# occur in the test set is filled with 0, and pre-existing NaN values are left untouched.
x_test = x_test.reindex(columns=x_train.columns, fill_value=0)

# Identifier column of the test set, kept as a single-column DataFrame.
id_test = df_test[["PassengerId"]]
x_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Sex_female  418 non-null    int64  
 1   Sex_male    418 non-null    int64  
 2   Pclass      418 non-null    int64  
 3   Fare        417 non-null    float64
dtypes: float64(1), int64(3)
memory usage: 13.2 KB


In [36]:
# Collect the test-set predictions of every cross-validation model, keyed "<i>_model".
fold_predictions = {}

for i, model in enumerate(model_list):
    test_pred = model.predict(x_test)
    fold_predictions[str(i) + "_model"] = test_pred

# Turn the collected predictions into a table with one column per model, and display it.
solution = pd.DataFrame(fold_predictions)
solution

Unnamed: 0,0_model,1_model,2_model,3_model,4_model
0,0,0,0,0,0
1,1,1,1,1,1
2,0,0,0,0,0
3,0,0,0,0,0
4,1,1,1,1,1
...,...,...,...,...,...
413,0,0,0,0,0
414,1,1,1,1,1
415,0,0,0,0,0
416,0,0,0,0,0


In [37]:
# Majority vote across the models: take the row-wise mode of their predictions.
# DataFrame.mode(axis=1) returns one column per tied mode and pads rows that have fewer
# modes with NaN, so a tie (possible with an even number of models) would add extra,
# NaN-padded columns and break the later integer cast / single-column DataFrame.
# Keep only the first mode column so the result always has shape (n_rows, 1).
solution_max = solution.mode(axis=1).iloc[:, [0]].values

In [39]:
# Passenger IDs of the test set, used as the index of the submission table.
PassengerId = np.array(df_test["PassengerId"]).astype(int)

# Combine the majority-vote predictions with the passenger IDs in a single "Survived" column.
my_solution = pd.DataFrame(solution_max.astype(int), index=PassengerId, columns=["Survived"])

# Write the submission file. to_csv does not create missing parent directories, so make
# sure the output directory exists first; otherwise the write fails on a fresh checkout.
submission_path = Path("../data/output/001/submission.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
my_solution.to_csv(submission_path, index_label=["PassengerId"])