In [18]:
import numpy as np
import pandas as pd

In [19]:
# Read the preprocessed training table and separate the label column
# ("target") from the feature columns.
train = pd.read_csv("../data/train_preprocessed.csv")
train_y = train["target"]
train_x = train.drop(columns=["target"])

# Read the preprocessed test features, discarding every row that has at
# least one missing value.
test_x = pd.read_csv("../data/test_preprocessed.csv").dropna(how="any")

In [20]:
from sklearn.model_selection import KFold

# Shuffled 4-fold splitter with a fixed seed; only the first fold is used,
# giving a single train/validation hold-out split.
kf = KFold(n_splits=4, shuffle=True, random_state=71)
tr_idx, va_idx = next(iter(kf.split(train_x)))

# Slice features and labels with the same positional indices, fold by fold.
tr_x, tr_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx]
va_x, va_y = train_x.iloc[va_idx], train_y.iloc[va_idx]


In [21]:
import lightgbm as lgb
from sklearn.metrics import log_loss



In [22]:
def evaluate(features):
    """Score a feature subset with a short LightGBM run on the hold-out split.

    Fits a binary-objective LightGBM model for 10 boosting rounds on the
    `features` columns of the module-level `tr_x` (labels `tr_y`), tracking
    binary log loss on both the training and validation sets, then predicts
    on the same columns of `va_x`.

    Args:
        features: column labels to select from `tr_x` and `va_x`.

    Returns:
        The log loss of the validation predictions against `va_y`.
    """
    train_set = lgb.Dataset(tr_x[features], label=tr_y)
    valid_set = lgb.Dataset(va_x[features], label=va_y)

    booster = lgb.train(
        {
            'objective': 'binary',
            'seed': 71,
            'verbose': 0,
            'metrics': 'binary_logloss',
        },
        train_set,
        num_boost_round=10,
        valid_sets=[train_set, valid_set],
        valid_names=["train", "valid"],
    )

    valid_pred = booster.predict(va_x[features])
    return log_loss(va_y, valid_pred)


In [23]:
# Greedy forward selection
# Goal: search over feature combinations to find the best one.
# For simplicity, keep adding to the already-selected features whichever
# feature improves the score.

selected = set()              # features chosen so far
best_score = float("inf")     # lowest score seen so far (lower is better)

In [24]:
# Greedy forward selection loop: each round tries every not-yet-selected
# feature on top of the current selection, keeps the one with the lowest
# score from evaluate(), and stops as soon as no candidate improves on the
# best score so far (or every column has been selected).
while len(selected) < len(train_x.columns):
    scores = []
    for feature in train_x.columns:
        if feature in selected:
            continue

        # Build the trial list in the frame's own column order. `selected`
        # is a set, and iterating a set of string labels has no stable order
        # across interpreter sessions, so list(selected) could hand
        # evaluate() the columns in a different order after a kernel restart.
        features_selected = [
            col for col in train_x.columns if col in selected or col == feature
        ]
        scores.append((feature, evaluate(features_selected)))

    if not scores:
        # No candidate left to try (e.g. duplicated column labels); stop
        # instead of failing on an empty candidate list.
        break

    # min() returns the first lowest-scoring entry, which is what a stable
    # sort followed by [0] yields, without sorting the whole list.
    best_feature, best_score_in_loop = min(scores, key=lambda tpl: tpl[1])

    if best_score_in_loop < best_score:
        selected.add(best_feature)
        best_score = best_score_in_loop

        print("Best score", best_score, "Selected features", selected)
    else:
        # The best candidate of this round did not improve the score.
        break


print("Final selected features", selected)

This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=false, if your old models produce bad results
[1]	train's binary_logloss: 0.481952	valid's binary_logloss: 0.491663
[2]	train's binary_logloss: 0.477722	valid's binary_logloss: 0.488398
[3]	train's binary_logloss: 0.47433	valid's binary_logloss: 0.485966
[4]	train's binary_logloss: 0.471627	valid's binary_logloss: 0.484193
[5]	train's binary_logloss: 0.469454	valid's binary_logloss: 0.482878
[6]	train's binary_logloss: 0.467708	valid's binary_logloss: 0.481963
[7]	train's binary_logloss: 0.466295	valid's binary_logloss: 0.481144
[8]	train's binary_logloss: 0.465128	valid's binary_logloss: 0.480764
[9]	train's binary_logloss: 0.464177	valid's binary_logloss: 0.480541
[10]	train's binary_logloss: 0.463425	valid's binary_logloss: 0.480289
This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=fals