In [1]:
%run ../common_utils.py

In [2]:
import lightgbm as lgbm
from sklearn.metrics import make_scorer

## Global parameters

In [3]:
# Seed passed to the lgbm8 model in the manual-tuning section; the other models set their own random_state.
random_state = 1

# Parameter Tuning

## Parameters and defaults:
 * Regularization:
    * reg_lambda (L2) = 0
    * reg_alpha (L1) = 0
    * num_leaves = 31
    * subsample = 1
    * max_depth = -1
* Training
    * n_estimators = 100
    * early_stopping_rounds = None
    * categorical_feature (don't use one hot!) = 'auto'
    * learning_rate = eta = 0.1
* Reproducibility
    * random_state = None

# Load Data

In [4]:
def load_data(val_data=False):
    """Load the raw data sets and run the shared numerical pre-processing.

    Args:
        val_data: Forwarded to ``pre_process_numerical``. When truthy, the
            pre-processing is expected to hand back an extra
            ``X_test`` / ``y_test`` pair (presumably the validation split
            controlled by ``val_split=0.2`` -- confirm in the helper).

    Returns:
        ``(X_train, y_train, y_train_log, test_labels)`` when ``val_data`` is
        falsy, otherwise
        ``(X_train, y_train, y_train_log, X_test, y_test, test_labels)``.
        ``y_train_log`` is ``np.log(y_train)``.
    """
    train, test, metadata = load_all_data()

    # Only the categorical column list is needed; 'district' is taken out of it.
    _, cat_features = get_cat_and_non_cat_data(metadata)
    cat_features.remove('district')

    # Every column except the target is a candidate feature.
    feature_names = list(train.columns)
    feature_names.remove('price')

    numeric_cols = [
        'area_total', 'area_kitchen', 'area_living', 'floor', 'rooms', 'ceiling',
        'bathrooms_shared', 'bathrooms_private', 'balconies', 'loggias', 'phones',
        'building_id', 'constructed', 'stories',
    ]

    # The return value is ignored, so the helper presumably converts in place.
    for frame in (train, test):
        categorical_to_numerical(frame, ['street', 'address'])

    # NOTE(review): add_R / add_rel_height are the strings "True", not booleans;
    # confirm that this is what pre_process_numerical expects.
    processed = pre_process_numerical(
        features=feature_names, numerical_features=numeric_cols, train=train, test=test,
        outliers_value=7, val_data=val_data, val_split=0.2, random_state=42, scaler="std",
        add_R="True", add_rel_height="True", droptable=[],
        one_hot_encode=False, cat_features=cat_features, drop_old=True,
    )

    if val_data:
        X_train, y_train, X_test, y_test, test_labels = processed
        return X_train, y_train, np.log(y_train), X_test, y_test, test_labels

    X_train, y_train, test_labels = processed
    return X_train, y_train, np.log(y_train), test_labels


In [5]:
# Default call (val_data=False): no validation split is requested, so four values come back.
X_train, y_train, y_train_log, test_labels = load_data()

Std


# Different models

In [6]:
# Model 1: small, depth-limited trees with a fixed number of boosting rounds
# (no early-stopping parameter is set).
lgbm1 = lgbm.LGBMRegressor(
    num_leaves=10, max_depth=5,
    random_state=42, silent=True, metric='mse', n_jobs=4,
    num_iterations=2000,
    # NOTE(review): subsample is set without subsample_freq -- check the
    # LightGBM parameter docs on whether row sub-sampling is active this way.
    colsample_bytree=0.95, subsample=0.9,
    learning_rate=0.05,
)

In [7]:
# Model 2: library defaults apart from the seed, a large round budget and
# early_stopping_rounds=10.
lgbm2 = lgbm.LGBMRegressor(random_state=33, early_stopping_rounds=10, num_iterations=10000)

In [8]:
# Model 3: larger trees than lgbm1, feature/bagging sub-sampling, and a large
# round budget combined with early_stopping_round=20.
lgbm3 = lgbm.LGBMRegressor(
    num_leaves=40, max_depth=10,
    random_state=42, silent=True, metric='regression', num_threads=4,
    num_iterations=10000,
    feature_fraction=0.8, bagging_fraction=0.9, bagging_freq=5,
    learning_rate=0.05,
    early_stopping_round=20,
)

# Select features and perform cross validation

In [9]:
# Feature subset fed to the cross-validation runs and searches in this notebook.
selected_features = [
    'area_total', 'area_kitchen', 'floor', 'bathrooms_private',
    'r', 'district', 'constructed', 'stories', 'rel_height', 'parking', 'building_id',
]
X_train_selected = X_train[selected_features]

## LGBM1

In [10]:
%%capture --no-display

number_splits = 5

scores, average_score, best_model, best_index = lgbm_groupKFold(
    number_of_splits=number_splits,
    model=lgbm1,
    X_train=X_train_selected, 
    y_train=y_train_log,
)

In [11]:
# Print the four values returned by the lgbm1 cross-validation run.
for label, value in (
    ("scores:        ", scores),
    ("average score: ", average_score),
    ("best model:    ", best_model),
    ("best index:    ", best_index),
):
    print(label, value)

scores:         [0.23472800746853872, 0.22887643149785344, 0.20719755838490833, 0.23352677808408173, 0.24369446889177043]
average score:  0.2296046488654305
best model:     LGBMRegressor(colsample_bytree=0.95, learning_rate=0.05, max_depth=5,
              metric='mse', n_jobs=4, num_iterations=2000, num_leaves=10,
              random_state=42, subsample=0.9)
best index:     2


In [12]:
# predict_and_store(best_model, test_labels[selected_features], test_labels, path=".\submissions\LGBM3.0.csv", exponential=True)

## LGBM2 

In [13]:
%%capture --no-display

number_splits = 5

scores, average_score, best_model, best_index = lgbm_groupKFold(
    number_of_splits=number_splits,
    model=lgbm2,
    X_train=X_train_selected, 
    y_train=y_train_log)

In [14]:
# Print the four values returned by the lgbm2 cross-validation run.
for label, value in (
    ("scores:        ", scores),
    ("average score: ", average_score),
    ("best model:    ", best_model),
    ("best index:    ", best_index),
):
    print(label, value)

scores:         [0.22651257767456692, 0.2353128963600227, 0.21032635729741744, 0.22784189465539958, 0.24042514218130573]
average score:  0.2280837736337425
best model:     LGBMRegressor(early_stopping_rounds=10, num_iterations=10000, random_state=33)
best index:     2


## LGBM3

In [15]:
%%capture --no-display

number_splits = 5

scores, average_score, best_model, best_index = lgbm_groupKFold(
    number_of_splits=number_splits,
    model=lgbm3,
    X_train=X_train_selected, 
    y_train=y_train_log)

In [16]:
# Print the four values returned by the lgbm3 cross-validation run.
for label, value in (
    ("scores:        ", scores),
    ("average score: ", average_score),
    ("best model:    ", best_model),
    ("best index:    ", best_index),
):
    print(label, value)

scores:         [0.23185107984399478, 0.22446232592298868, 0.20850908530040685, 0.22853958860850024, 0.2407515034729897]
average score:  0.22682271662977604
best model:     LGBMRegressor(bagging_fraction=0.9, bagging_freq=5, early_stopping_round=20,
              feature_fraction=0.8, learning_rate=0.05, max_depth=10,
              metric='regression', num_iterations=10000, num_leaves=40,
              num_threads=4, random_state=42)
best index:     2


# Scikit-learn randomized search with cross-validation

In [None]:
from sklearn.model_selection import GroupKFold, RandomizedSearchCV
from scipy.stats import uniform

# Base estimator: only the fixed settings live here; every tuned
# hyper-parameter is sampled from `distributions` by the search.
lgbm4 = lgbm.LGBMRegressor(
    random_state=42,
    silent=True,
    metric='regression',
    num_threads=4,
)

# Candidate values per hyper-parameter (sampled, not exhaustively enumerated).
distributions = dict(num_iterations = np.logspace(2,5,10).astype(int),
                     num_leaves = np.linspace(10,50,21).astype(int),
                     max_depth = np.linspace(5,19,8).astype(int),
                     feature_fraction = np.arange(0.7,0.999,0.05),
                     bagging_fraction = np.arange(0.7,0.999,0.05),
                     bagging_freq = np.arange(1,11,1),
                     learning_rate = np.geomspace(0.001,0.1,10))

# Group label per row: flats of one building must not be split across folds.
groups = X_train["building_id"]

# `groups` is only honoured by a group-aware splitter, so GroupKFold is passed
# explicitly (5 folds, matching the fold count used elsewhere in the notebook).
# NOTE(review): custom_asymmetric_eval is handed over as the scorer directly,
# while elsewhere in this notebook it is wrapped with
# make_scorer(..., greater_is_better=False) -- confirm which form matches its
# signature.
clf = RandomizedSearchCV(
    lgbm4,
    distributions,
    random_state=42,
    scoring=custom_asymmetric_eval,
    cv=GroupKFold(n_splits=5),
)

search = clf.fit(
    X=X_train_selected,
    y=y_train_log,
    groups=groups)

search.best_params_

In [None]:
# Show the best estimator found by the randomized search.
search.best_estimator_

In [None]:
# Model 5: hard-coded hyper-parameters, presumably copied from the best
# estimator of a previous randomized-search run (values lie on the search grid).
lgbm5 = lgbm.LGBMRegressor(
    bagging_fraction=0.75,
    bagging_freq=7,
    feature_fraction=0.9000000000000001,
    learning_rate=0.03593813663804626,
    max_depth=13,
    metric='regression',
    num_iterations=10000,
    num_leaves=44,
    num_threads=4,
    random_state=42,
)

In [None]:
%%capture --no-display

number_splits = 5

scores, average_score, best_model, best_index = lgbm_groupKFold(
    number_of_splits=number_splits,
    model=lgbm5,
    X_train=X_train_selected, 
    y_train=y_train_log)

In [None]:
# Print the four values returned by the lgbm5 cross-validation run.
for label, value in (
    ("scores:        ", scores),
    ("average score: ", average_score),
    ("best model:    ", best_model),
    ("best index:    ", best_index),
):
    print(label, value)

In [None]:
# Model 6: same settings as lgbm5 plus early_stopping_rounds=10.
lgbm6 = lgbm.LGBMRegressor(
    bagging_fraction=0.75,
    bagging_freq=7,
    feature_fraction=0.9000000000000001,
    learning_rate=0.03593813663804626,
    max_depth=13,
    metric='regression',
    num_iterations=10000,
    num_leaves=44,
    num_threads=4,
    random_state=42,
    early_stopping_rounds=10,
)

In [None]:
%%capture --no-display

number_splits = 5

scores, average_score, best_model, best_index = lgbm_groupKFold(
    number_of_splits=number_splits,
    model=lgbm6,
    X_train=X_train_selected, 
    y_train=y_train_log)

In [None]:
# Print the four values returned by the lgbm6 cross-validation run.
for label, value in (
    ("scores:        ", scores),
    ("average score: ", average_score),
    ("best model:    ", best_model),
    ("best index:    ", best_index),
):
    print(label, value)

In [None]:
%%capture --no-display

lgbm7 = lgbm.LGBMRegressor(
    random_state=42, 
    silent=True, 
    metric='regression',
    num_threads=4, 
)





distributions = dict(num_iterations = np.logspace(2,5,10).astype(int),
                     num_leaves = np.linspace(10,50,21).astype(int),
                     max_depth = np.linspace(5,19,8).astype(int),
                     feature_fraction = np.arange(0.7,0.999,0.05),
                     bagging_fraction = np.arange(0.7,0.999,0.05),
                     bagging_freq = np.arange(1,11,1),
                     learning_rate = np.geomspace(0.001,0.1,10))

scorer = make_scorer(custom_asymmetric_eval, greater_is_better=False)
clf = RandomizedSearchCV(lgbm7, distributions, random_state=42, scoring='neg_root_mean_squared_error')

groups = X_train["building_id"]

search = clf.fit(  
    X=X_train_selected, 
    y=y_train_log, 
    groups=groups,
    eval_metric='neg_root_mean_squared_error')

print(search.best_params_)
print(search.best_estimator_)

In [None]:
%%capture --no-display

number_splits = 5

scores, average_score, best_model, best_index = lgbm_groupKFold(
    number_of_splits=number_splits,
    model=search.best_estimator_,
    X_train=X_train_selected, 
    y_train=y_train_log)

In [None]:
# Print the four values returned by the best-estimator cross-validation run.
for label, value in (
    ("scores:        ", scores),
    ("average score: ", average_score),
    ("best model:    ", best_model),
    ("best index:    ", best_index),
):
    print(label, value)

# Manual tuning

In [17]:
# Candidate values for each hyper-parameter of the manual grid search
# (same grids as the randomized-search cells).
num_iterations = np.logspace(2, 5, num=10).astype(int)
num_leaves = np.linspace(10, 50, num=21).astype(int)
max_depth = np.linspace(5, 19, num=8).astype(int)
feature_fraction = np.arange(0.7, 0.999, step=0.05)
bagging_fraction = np.arange(0.7, 0.999, step=0.05)
bagging_freq = np.arange(1, 11, step=1)
learning_rate = np.geomspace(0.001, 0.1, num=10)

# Number of cross-validation folds.
number_splits = 5

In [18]:
# Base estimator for the manual-tuning section, seeded from the notebook-level
# random_state.
lgbm8 = lgbm.LGBMRegressor(
    random_state=random_state, silent=True, metric='regression', num_threads=4,
)

In [None]:
# best_average_score = 1
# for n_it in num_iterations:
#     print("level 0")
#     for n_leaves in num_leaves:
#         print("level 1")
#         for max_d in max_depth:
#             for feat_frac in feature_fraction:
#                 for bag_frac in bagging_fraction:
#                     for bag_freq in bagging_freq:
#                         for l_rate in learning_rate:
#                             model = lgbm.LGBMRegressor(
#                                         num_iterations = n_it,
#                                         num_leaves = n_leaves,
#                                         max_depth = max_d,
#                                         feature_fraction = feat_frac,
#                                         bagging_fraction = bag_frac,
#                                         bagging_freq = bag_freq,
#                                         learning_rate = l_rate,
#                                         random_state=random_state, 
#                                         silent=True, 
#                                         metric='regression',
#                                         num_threads=4, 
#                                     )
                            
                                          
#                             scores, average_score, _, _ = lgbm_groupKFold(
#                                 number_of_splits=number_splits,
#                                 model=model,
#                                 X_train=X_train, 
#                                 y_train=y_train_log,
#                                 eval_metric = 'neg_root_mean_squared_error'
#                             )
                            
#                             if average_score < best_average_score:
#                                 best_average_score = average_score
#                                 best_params = dict(
#                                         num_iterations = n_it,
#                                         num_leaves = n_leaves,
#                                         max_depth = max_d,
#                                         feature_fraction = feat_frac,
#                                         bagging_fraction = bag_frac,
#                                         bagging_freq = bag_freq,
#                                         learning_rate = l_rate,
#                                 )
#                                 print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
#                                 print(average_score)
#                                 print(best_params)

Promising next step — LightGBM hyperparameter tuning with Optuna: https://towardsdatascience.com/kagglers-guide-to-lightgbm-hyperparameter-tuning-with-optuna-in-2021-ed048d9838b5