**Reference:** https://towardsdatascience.com/kagglers-guide-to-lightgbm-hyperparameter-tuning-with-optuna-in-2021-ed048d9838b5

In [32]:
%run ../common_utils.py
import lightgbm as lgbm
import optuna
from IPython.utils import io

In [11]:
random_state = 1
number_of_splits = 5

In [12]:
def load_data(val_data=False):
    train, test, metadata = load_all_data()
    nonCategorical, categorical = get_cat_and_non_cat_data(metadata)
    categorical.remove('district')
    all_features = list(train.columns)
    all_features.remove('price')
    numerical_features = ['area_total','area_kitchen','area_living','floor','rooms','ceiling',
        'bathrooms_shared','bathrooms_private','balconies','loggias','phones','building_id','constructed','stories']
    categorical_to_numerical(train, ['street','address'])
    categorical_to_numerical(test, ['street','address'])
    if not val_data:
        X_train, y_train, test_labels = pre_process_numerical(features = all_features, numerical_features = numerical_features, train = train, test = test,
                            outliers_value=7, val_data=val_data, val_split=0.2, random_state=42, scaler="std",
                            add_R="True", add_rel_height="True", droptable=[],
                            one_hot_encode=False, cat_features=categorical, drop_old=True)
        y_train_log = np.log(y_train)
        return X_train, y_train, y_train_log, test_labels
    else:
        X_train, y_train, X_test, y_test, test_labels = pre_process_numerical(features = all_features, numerical_features = numerical_features, train = train, test = test,
                            outliers_value=7, val_data=val_data, val_split=0.2, random_state=42, scaler="std",
                            add_R="True", add_rel_height="True", droptable=[],
                            one_hot_encode=False, cat_features=categorical, drop_old=True)
        y_train_log = np.log(y_train)
        return X_train, y_train, y_train_log, X_test, y_test, test_labels
X_train, y_train, y_train_log, test_labels = load_data()

Std


In [60]:
from optuna.integration import LightGBMPruningCallback

def objective(trial, X, y):
    
    param_grid = {
        # "device_type": trial.suggest_categorical("device_type", ['gpu']),
        "num_iterations": trial.suggest_int("num_iterations",1e2,1e5,log=True),
        "n_estimators": trial.suggest_int("n_estimators", 1e1,1e4,log=True),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.3,log=True),
        "num_leaves": trial.suggest_int("num_leaves", 5, 2000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 18),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.5, 0.95, step=0.1
        ),
        "bagging_freq": trial.suggest_int("bagging_freq", 1,11,step=1),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.2, 0.95, step=0.1
        ),
    }
    
    X_train = X
    y_train = y
    
    scores = []
        
    cv = GroupKFold(n_splits=number_of_splits)
    groups = X_train["building_id"]

    with io.capture_output() as captured:
        for train_index, test_index in cv.split(X_train, y_train, groups):
            X_train2, X_test = X_train.iloc[train_index], X_train.iloc[test_index]
            y_train2, y_test = y_train.iloc[train_index], y_train.iloc[test_index]

            model = lgbm.LGBMRegressor(**param_grid, random_state=random_state)

            model.fit(
                X_train2,
                y_train2,
                eval_set=[(X_test, y_test)],
                eval_metric='rmse',
                verbose=False,
                early_stopping_rounds=100,
                callbacks = [LightGBMPruningCallback(trial, 'rmse')],
            )
            prediction = np.exp(model.predict(X_test))
            score = root_mean_squared_log_error(prediction, np.exp(y_test))
            scores.append(score)
    return np.average(scores)

In [61]:
study = optuna.create_study(direction="minimize", study_name="LGBM Classifier")
with io.capture_output() as captured:
    func = lambda trial: objective(trial, X_train, y_train_log)
    study.optimize(func, n_trials=100)

[32m[I 2021-11-08 17:02:48,432][0m A new study created in memory with name: LGBM Classifier[0m
[32m[I 2021-11-08 17:03:34,840][0m Trial 0 finished with value: 0.3892327397815446 and parameters: {'num_iterations': 28575, 'n_estimators': 107, 'learning_rate': 0.0012743551434119557, 'num_leaves': 1245, 'max_depth': 6, 'min_data_in_leaf': 3100, 'lambda_l1': 30, 'lambda_l2': 80, 'min_gain_to_split': 8.108115609100546, 'bagging_fraction': 0.6, 'bagging_freq': 5, 'feature_fraction': 0.2}. Best is trial 0 with value: 0.3892327397815446.[0m
[32m[I 2021-11-08 17:03:35,467][0m Trial 1 finished with value: 0.8433245390866679 and parameters: {'num_iterations': 4653, 'n_estimators': 134, 'learning_rate': 0.04773111221019114, 'num_leaves': 525, 'max_depth': 17, 'min_data_in_leaf': 9500, 'lambda_l1': 55, 'lambda_l2': 35, 'min_gain_to_split': 7.987673188857646, 'bagging_fraction': 0.5, 'bagging_freq': 1, 'feature_fraction': 0.8}. Best is trial 0 with value: 0.3892327397815446.[0m
[32m[I 2021-

In [62]:
print(f"\tBest value (rmse): {study.best_value:.5f}")
print(f"\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

	Best value (rmse): 0.28736
	Best params:
		num_iterations: 669
		n_estimators: 444
		learning_rate: 0.14046319634459542
		num_leaves: 365
		max_depth: 12
		min_data_in_leaf: 500
		lambda_l1: 65
		lambda_l2: 15
		min_gain_to_split: 14.66837136723361
		bagging_fraction: 0.8
		bagging_freq: 8
		feature_fraction: 0.7


In [64]:
# print(f"\tBest value (rmse): {study.best_value:.5f}")
# print(f"\tBest params:")

# for key, value in study.best_params.items():
#     print(f"\t\t{key}: {value}")

	Best value (rmse): 0.26229
	Best params:
		num_iterations: 405
		n_estimators: 4596
		learning_rate: 0.22799605867389866
		num_leaves: 65
		max_depth: 12
		min_data_in_leaf: 200
		lambda_l1: 80
		lambda_l2: 5
		min_gain_to_split: 2.2609116007082672
		bagging_fraction: 0.9
		bagging_freq: 7
		feature_fraction: 0.7


In [96]:
from optuna.integration import LightGBMPruningCallback

def objective(trial, X, y):
    
    param_grid = {
        "num_iterations": trial.suggest_categorical("num_iterations",[10000]),
        "n_estimators": trial.suggest_int("n_estimators", 1e2,1e3,log=True),
        "learning_rate": trial.suggest_categorical("learning_rate", [0.05]),
        "num_leaves": trial.suggest_categorical("num_leaves", [40]),
        "max_depth": trial.suggest_categorical("max_depth", [10]),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20,500, step=50),
#         "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
#         "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
#         "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_fraction": trial.suggest_categorical(
            "bagging_fraction",[ 0.9]
        ),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [5]),
        "feature_fraction": trial.suggest_categorical(
            "feature_fraction",[0.8]
        ),
    }
    
    X_train = X
    y_train = y
    
    scores = []
        
    cv = GroupKFold(n_splits=number_of_splits)
    groups = X_train["building_id"]

    with io.capture_output() as captured:
        for train_index, test_index in cv.split(X_train, y_train, groups):
            X_train2, X_test = X_train.iloc[train_index], X_train.iloc[test_index]
            y_train2, y_test = y_train.iloc[train_index], y_train.iloc[test_index]

            model = lgbm.LGBMRegressor(**param_grid, random_state=random_state,early_stopping_round=100,silent=True,metric='regression',num_threads=4)

            model.fit(
                X_train2,
                y_train2,
                eval_set=[(X_test, y_test)],
                eval_metric='rmse',
                verbose=False,
                early_stopping_rounds=100,
                callbacks = [LightGBMPruningCallback(trial, 'rmse')],
            )
            prediction = np.exp(model.predict(X_test))
            score = root_mean_squared_log_error(prediction, np.exp(y_test))
            scores.append(score)
    return np.average(scores)

In [97]:
study = optuna.create_study(direction="minimize", study_name="LGBM Classifier")
with io.capture_output() as captured:
    func = lambda trial: objective(trial, X_train, y_train_log)
    study.optimize(func, n_trials=20)

[32m[I 2021-11-09 21:10:12,163][0m A new study created in memory with name: LGBM Classifier[0m
[32m[I 2021-11-09 21:10:49,925][0m Trial 0 finished with value: 0.21078099747803117 and parameters: {'num_iterations': 10000, 'n_estimators': 102, 'learning_rate': 0.05, 'num_leaves': 40, 'max_depth': 10, 'min_data_in_leaf': 470, 'bagging_fraction': 0.9, 'bagging_freq': 5, 'feature_fraction': 0.8}. Best is trial 0 with value: 0.21078099747803117.[0m
[32m[I 2021-11-09 21:11:23,376][0m Trial 1 finished with value: 0.20877721690751275 and parameters: {'num_iterations': 10000, 'n_estimators': 312, 'learning_rate': 0.05, 'num_leaves': 40, 'max_depth': 10, 'min_data_in_leaf': 320, 'bagging_fraction': 0.9, 'bagging_freq': 5, 'feature_fraction': 0.8}. Best is trial 1 with value: 0.20877721690751275.[0m
[32m[I 2021-11-09 21:11:55,633][0m Trial 2 finished with value: 0.20012411838768807 and parameters: {'num_iterations': 10000, 'n_estimators': 152, 'learning_rate': 0.05, 'num_leaves': 40, 'm

In [98]:
print(f"\tBest value (rmse): {study.best_value:.5f}")
print(f"\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

	Best value (rmse): 0.20012
	Best params:
		num_iterations: 10000
		n_estimators: 152
		learning_rate: 0.05
		num_leaves: 40
		max_depth: 10
		min_data_in_leaf: 20
		bagging_fraction: 0.9
		bagging_freq: 5
		feature_fraction: 0.8


In [99]:
study.best_params

{'num_iterations': 10000,
 'n_estimators': 152,
 'learning_rate': 0.05,
 'num_leaves': 40,
 'max_depth': 10,
 'min_data_in_leaf': 20,
 'bagging_fraction': 0.9,
 'bagging_freq': 5,
 'feature_fraction': 0.8}