In [32]:
%run ../common_utils.py
import lightgbm as lgbm
import optuna
from IPython.utils import io

In [11]:
random_state = 1
number_of_splits = 5

In [12]:
def load_data(val_data=False):
    train, test, metadata = load_all_data()
    nonCategorical, categorical = get_cat_and_non_cat_data(metadata)
    categorical.remove('district')
    all_features = list(train.columns)
    all_features.remove('price')
    numerical_features = ['area_total','area_kitchen','area_living','floor','rooms','ceiling',
        'bathrooms_shared','bathrooms_private','balconies','loggias','phones','building_id','constructed','stories']
    categorical_to_numerical(train, ['street','address'])
    categorical_to_numerical(test, ['street','address'])
    if not val_data:
        X_train, y_train, test_labels = pre_process_numerical(features = all_features, numerical_features = numerical_features, train = train, test = test,
                            outliers_value=7, val_data=val_data, val_split=0.2, random_state=42, scaler="std",
                            add_R="True", add_rel_height="True", droptable=[],
                            one_hot_encode=False, cat_features=categorical, drop_old=True)
        y_train_log = np.log(y_train)
        return X_train, y_train, y_train_log, test_labels
    else:
        X_train, y_train, X_test, y_test, test_labels = pre_process_numerical(features = all_features, numerical_features = numerical_features, train = train, test = test,
                            outliers_value=7, val_data=val_data, val_split=0.2, random_state=42, scaler="std",
                            add_R="True", add_rel_height="True", droptable=[],
                            one_hot_encode=False, cat_features=categorical, drop_old=True)
        y_train_log = np.log(y_train)
        return X_train, y_train, y_train_log, X_test, y_test, test_labels
X_train, y_train, y_train_log, test_labels = load_data()

Std


In [50]:
from optuna.integration import LightGBMPruningCallback

def objective(trial, X, y):
    
    param_grid = {
        # "device_type": trial.suggest_categorical("device_type", ['gpu']),
        "num_iterations": trial.suggest_int("num_iterations",1e2,1e5,log=True),
        "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.3,log=True),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.2, 0.95, step=0.1
        ),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.2, 0.95, step=0.1
        ),
    }
    
    X_train = X
    y_train = y
    
    scores = []
        
    cv = GroupKFold(n_splits=number_of_splits)
    groups = X_train["building_id"]

    with io.capture_output() as captured:
        for train_index, test_index in cv.split(X_train, y_train, groups):
            X_train2, X_test = X_train.iloc[train_index], X_train.iloc[test_index]
            y_train2, y_test = y_train.iloc[train_index], y_train.iloc[test_index]

            model = lgbm.LGBMRegressor(**param_grid, random_state=random_state)

            model.fit(
                X_train2,
                y_train2,
                eval_set=[(X_test, y_test)],
                eval_metric='rmse',
                verbose=False,
                early_stopping_rounds=100,
                callbacks = [LightGBMPruningCallback(trial, 'rmse')],
            )
            prediction = np.exp(model.predict(X_test))
            score = root_mean_squared_log_error(prediction, np.exp(y_test))
            scores.append(score)
    return np.average(scores)

In [51]:
study = optuna.create_study(direction="minimize", study_name="LGBM Classifier")
with io.capture_output() as captured:
    func = lambda trial: objective(trial, X_train, y_train_log)
    study.optimize(func, n_trials=20)

[32m[I 2021-11-08 16:50:44,471][0m A new study created in memory with name: LGBM Classifier[0m
[32m[I 2021-11-08 16:50:52,452][0m Trial 0 finished with value: 0.4728145117165127 and parameters: {'num_iterations': 10356, 'n_estimators': 10000, 'learning_rate': 0.014750231021632037, 'num_leaves': 880, 'max_depth': 12, 'min_data_in_leaf': 5400, 'lambda_l1': 90, 'lambda_l2': 30, 'min_gain_to_split': 10.001016608034877, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.30000000000000004}. Best is trial 0 with value: 0.4728145117165127.[0m
[32m[I 2021-11-08 16:50:53,138][0m Trial 1 finished with value: 0.8433245390866679 and parameters: {'num_iterations': 12039, 'n_estimators': 10000, 'learning_rate': 0.1441497317897493, 'num_leaves': 1880, 'max_depth': 7, 'min_data_in_leaf': 8300, 'lambda_l1': 95, 'lambda_l2': 75, 'min_gain_to_split': 8.44187032554026, 'bagging_fraction': 0.2, 'bagging_freq': 1, 'feature_fraction': 0.9}. Best is trial 0 with value: 0.4728145117165127

In [52]:
print(f"\tBest value (rmse): {study.best_value:.5f}")
print(f"\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

	Best value (rmse): 0.24273
	Best params:
		num_iterations: 533
		n_estimators: 10000
		learning_rate: 0.07132775528350026
		num_leaves: 260
		max_depth: 9
		min_data_in_leaf: 400
		lambda_l1: 15
		lambda_l2: 15
		min_gain_to_split: 0.3981077952081069
		bagging_fraction: 0.7
		bagging_freq: 1
		feature_fraction: 0.30000000000000004


In [53]:
with io.capture_output() as captured:
    study.optimize(func, n_trials=100)

[32m[I 2021-11-08 16:51:36,354][0m Trial 20 finished with value: 0.2625959016767369 and parameters: {'num_iterations': 562, 'n_estimators': 10000, 'learning_rate': 0.14645860434928423, 'num_leaves': 640, 'max_depth': 9, 'min_data_in_leaf': 400, 'lambda_l1': 35, 'lambda_l2': 90, 'min_gain_to_split': 0.133397776501884, 'bagging_fraction': 0.4, 'bagging_freq': 1, 'feature_fraction': 0.30000000000000004}. Best is trial 19 with value: 0.24273186852976608.[0m
[32m[I 2021-11-08 16:51:41,907][0m Trial 21 finished with value: 0.26510378891220715 and parameters: {'num_iterations': 615, 'n_estimators': 10000, 'learning_rate': 0.14243499371211632, 'num_leaves': 640, 'max_depth': 9, 'min_data_in_leaf': 200, 'lambda_l1': 40, 'lambda_l2': 95, 'min_gain_to_split': 0.20164301580740196, 'bagging_fraction': 0.4, 'bagging_freq': 1, 'feature_fraction': 0.30000000000000004}. Best is trial 19 with value: 0.24273186852976608.[0m
[32m[I 2021-11-08 16:51:47,929][0m Trial 22 finished with value: 0.273144

In [54]:
print(f"\tBest value (rmse): {study.best_value:.5f}")
print(f"\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

	Best value (rmse): 0.23855
	Best params:
		num_iterations: 677
		n_estimators: 10000
		learning_rate: 0.1638297627426783
		num_leaves: 400
		max_depth: 8
		min_data_in_leaf: 200
		lambda_l1: 25
		lambda_l2: 85
		min_gain_to_split: 0.009538882474017303
		bagging_fraction: 0.6000000000000001
		bagging_freq: 1
		feature_fraction: 0.30000000000000004
