In [32]:
%run ../common_utils.py
import lightgbm as lgbm
import optuna
from IPython.utils import io

In [11]:
random_state = 1
number_of_splits = 5

In [12]:
def load_data(val_data=False):
    train, test, metadata = load_all_data()
    nonCategorical, categorical = get_cat_and_non_cat_data(metadata)
    categorical.remove('district')
    all_features = list(train.columns)
    all_features.remove('price')
    numerical_features = ['area_total','area_kitchen','area_living','floor','rooms','ceiling',
        'bathrooms_shared','bathrooms_private','balconies','loggias','phones','building_id','constructed','stories']
    categorical_to_numerical(train, ['street','address'])
    categorical_to_numerical(test, ['street','address'])
    if not val_data:
        X_train, y_train, test_labels = pre_process_numerical(features = all_features, numerical_features = numerical_features, train = train, test = test,
                            outliers_value=7, val_data=val_data, val_split=0.2, random_state=42, scaler="std",
                            add_R="True", add_rel_height="True", droptable=[],
                            one_hot_encode=False, cat_features=categorical, drop_old=True)
        y_train_log = np.log(y_train)
        return X_train, y_train, y_train_log, test_labels
    else:
        X_train, y_train, X_test, y_test, test_labels = pre_process_numerical(features = all_features, numerical_features = numerical_features, train = train, test = test,
                            outliers_value=7, val_data=val_data, val_split=0.2, random_state=42, scaler="std",
                            add_R="True", add_rel_height="True", droptable=[],
                            one_hot_encode=False, cat_features=categorical, drop_old=True)
        y_train_log = np.log(y_train)
        return X_train, y_train, y_train_log, X_test, y_test, test_labels
X_train, y_train, y_train_log, test_labels = load_data()

Std


In [43]:
from optuna.integration import LightGBMPruningCallback

def objective(trial, X, y):
    
    param_grid = {
        # "device_type": trial.suggest_categorical("device_type", ['gpu']),
        # "num_iterations": trail.suggest_int("num_iterations",100,100000),
        "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.2, 0.95, step=0.1
        ),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.2, 0.95, step=0.1
        ),
    }
    
    X_train = X
    y_train = y
    
    scores = []
        
    cv = GroupKFold(n_splits=number_of_splits)
    groups = X_train["building_id"]

    with io.capture_output() as captured:
        for train_index, test_index in cv.split(X_train, y_train, groups):
            X_train2, X_test = X_train.iloc[train_index], X_train.iloc[test_index]
            y_train2, y_test = y_train.iloc[train_index], y_train.iloc[test_index]

            model = lgbm.LGBMRegressor(**param_grid, random_state=random_state)

            model.fit(
                X_train2,
                y_train2,
                eval_set=[(X_test, y_test)],
                eval_metric='rmse',
                verbose=False,
                early_stopping_rounds=100,
                callbacks = [LightGBMPruningCallback(trial, 'rmse')],
            )
            prediction = np.exp(model.predict(X_test))
            score = root_mean_squared_log_error(prediction, np.exp(y_test))
            scores.append(score)
    return np.average(scores)

In [44]:
study = optuna.create_study(direction="minimize", study_name="LGBM Classifier")
func = lambda trial: objective(trial, X_train, y_train_log)
study.optimize(func, n_trials=20)

[32m[I 2021-11-08 16:44:43,950][0m A new study created in memory with name: LGBM Classifier[0m
[32m[I 2021-11-08 16:44:46,687][0m Trial 0 finished with value: 0.4417989729424875 and parameters: {'n_estimators': 10000, 'learning_rate': 0.056143547959128404, 'num_leaves': 2640, 'max_depth': 7, 'min_data_in_leaf': 6300, 'lambda_l1': 100, 'lambda_l2': 20, 'min_gain_to_split': 9.79170572798174, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.4}. Best is trial 0 with value: 0.4417989729424875.[0m
[32m[I 2021-11-08 16:44:47,764][0m Trial 1 finished with value: 0.8433245390866679 and parameters: {'n_estimators': 10000, 'learning_rate': 0.2588314657638129, 'num_leaves': 2700, 'max_depth': 12, 'min_data_in_leaf': 8700, 'lambda_l1': 55, 'lambda_l2': 45, 'min_gain_to_split': 10.836157168763341, 'bagging_fraction': 0.30000000000000004, 'bagging_freq': 1, 'feature_fraction': 0.4}. Best is trial 0 with value: 0.4417989729424875.[0m
[32m[I 2021-11-08 16:44:49,887][0m Trial

In [45]:
print(f"\tBest value (rmse): {study.best_value:.5f}")
print(f"\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

	Best value (rmse): 0.24673
	Best params:
		n_estimators: 10000
		learning_rate: 0.1513938509701189
		num_leaves: 500
		max_depth: 6
		min_data_in_leaf: 300
		lambda_l1: 20
		lambda_l2: 80
		min_gain_to_split: 0.4047223383263967
		bagging_fraction: 0.6000000000000001
		bagging_freq: 1
		feature_fraction: 0.4


In [46]:
study.optimize(func, n_trials=100)

[32m[I 2021-11-08 16:45:15,653][0m Trial 20 pruned. Trial was pruned at iteration 1.[0m
[32m[I 2021-11-08 16:45:17,751][0m Trial 21 finished with value: 0.325868412614642 and parameters: {'n_estimators': 10000, 'learning_rate': 0.19416665608886288, 'num_leaves': 1000, 'max_depth': 6, 'min_data_in_leaf': 1600, 'lambda_l1': 70, 'lambda_l2': 85, 'min_gain_to_split': 6.735601057586257, 'bagging_fraction': 0.6000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001}. Best is trial 11 with value: 0.24673024944334526.[0m
[32m[I 2021-11-08 16:45:19,617][0m Trial 22 finished with value: 0.31313302274960997 and parameters: {'n_estimators': 10000, 'learning_rate': 0.16969468011379307, 'num_leaves': 920, 'max_depth': 5, 'min_data_in_leaf': 1200, 'lambda_l1': 55, 'lambda_l2': 85, 'min_gain_to_split': 11.87177647944143, 'bagging_fraction': 0.6000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001}. Best is trial 11 with value: 0.24673024944334526.[0m
[

In [None]:
print(f"\tBest value (rmse): {study.best_value:.5f}")
print(f"\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")