In [35]:
%run ../common_utils.py

In [6]:
import lightgbm as lgbm

# Parameter Tuning

## Parameters and defaults:
 * Regularization:
    * reg_lambda (L2) = 0
    * reg_alpha (L1) = 0
    * num_leaves = 31
    * subsample = 1
    * max_depth = -1
* Training
    * n_estimators = 100
    * early_stopping_rounds = None
    * categorical_feature (don't use one hot!) = 'auto'
    * learning_rate = eta = 0.1
* Reproducibility
    * random_state = None

# Load Data

In [9]:
def load_data():
    """Load the raw data and build the model-ready training and test frames.

    The function reads the train/test frames and metadata through
    ``load_all_data``, converts the ``street`` and ``address`` columns of both
    frames with ``categorical_to_numerical``, and hands everything to
    ``pre_process_numerical`` with a fixed set of preprocessing options.

    Returns:
        tuple: ``(X_train, y_train, y_train_log, test_labels)`` where the
        first, second and fourth items are the three values returned by
        ``pre_process_numerical`` (in that order) and ``y_train_log`` is
        ``np.log`` applied to the second one.
    """
    train_df, test_df, metadata = load_all_data()

    # Only the categorical column list is used here; 'district' is taken out
    # of it before the list is passed on as ``cat_features``.
    _, cat_features = get_cat_and_non_cat_data(metadata)
    cat_features.remove('district')

    # Every training column except the target is offered as a feature.
    feature_names = list(train_df.columns)
    feature_names.remove('price')

    numeric_columns = [
        'area_total', 'area_kitchen', 'area_living', 'floor', 'rooms', 'ceiling',
        'bathrooms_shared', 'bathrooms_private', 'balconies', 'loggias', 'phones',
        'building_id', 'constructed', 'stories',
    ]

    # Both frames get the same street/address conversion; a fresh column list
    # is passed on each call.
    for frame in (train_df, test_df):
        categorical_to_numerical(frame, ['street', 'address'])

    # NOTE(review): add_R and add_rel_height are passed as the *strings*
    # "True", not booleans — confirm that is what the helper expects.
    features, target, test_features = pre_process_numerical(
        features=feature_names,
        numerical_features=numeric_columns,
        train=train_df,
        test=test_df,
        outliers_value=7,
        val_data=False,
        val_split=0.0,
        random_state=42,
        scaler="std",
        add_R="True",
        add_rel_height="True",
        droptable=[],
        one_hot_encode=False,
        cat_features=cat_features,
        drop_old=True,
    )

    # The log-transformed target is returned alongside the raw one.
    return features, target, np.log(target), test_features

In [10]:
# Build the training features, the raw and log-transformed targets, and the
# processed test frame that the cells below rely on.
X_train, y_train, y_train_log, test_labels = load_data()

# Different models

In [7]:
# LightGBM regressor configuration evaluated in the cross-validation below.
lgbm1 = lgbm.LGBMRegressor(
    # Boosting schedule
    n_estimators=2000,
    learning_rate=0.05,
    # Tree complexity
    max_depth=5,
    num_leaves=10,
    # Row / column sampling
    # NOTE(review): no subsample_freq is set here; LightGBM documents that
    # row bagging is only enabled when the bagging frequency is non-zero, so
    # confirm that subsample=0.9 actually takes effect.
    subsample=0.9,
    colsample_bytree=0.95,
    # Reproducibility and runtime
    random_state=42,
    n_jobs=4,
    # NOTE(review): `silent` is version-dependent in LightGBM's sklearn API —
    # verify it is still accepted by the installed release.
    silent=True,
    metric='mse',
)

# Select features and perform cross validation

In [27]:
# Columns fed to the model; the same list is reused to slice the test frame
# when predictions are written out.
selected_features = ['area_total','area_kitchen','floor','bathrooms_private',
            'r','district','constructed','stories','rel_height','parking','building_id']
# Training frame restricted to the selected columns.
X_train_selected = X_train[selected_features]

In [31]:
%%capture --no-display

# Number of folds handed to the grouped K-fold helper.
number_splits = 5

# Cross-validate lgbm1 on the selected features against the log-transformed
# target. The four results are printed in the next cell; presumably they are
# the per-fold scores, their mean, the fold model the helper kept, and that
# model's fold index — confirm against lgbm_groupKFold's definition.
scores, average_score, best_model, best_index = lgbm_groupKFold(
    number_of_splits=number_splits,
    model=lgbm1,
    X_train=X_train_selected, 
    y_train=y_train_log)

In [32]:
# Report the four values returned by the cross-validation call.
print("scores: ", scores)
print("average score: ", average_score)
print("best model: ", best_model)
print("best index: ", best_index)

scores:  [0.23472800746853872, 0.22887643149785344, 0.20719755838490833, 0.23352677808408173, 0.24369446889177043]
average score:  0.2296046488654305
best model:  LGBMRegressor(colsample_bytree=0.95, learning_rate=0.05, max_depth=5,
              metric='mse', n_estimators=2000, n_jobs=4, num_leaves=10,
              random_state=42, subsample=0.9)
best index:  2


In [36]:
# Predict on the test frame (restricted to the selected columns) with the model
# returned by cross-validation and store the result as a submission file.
# The path uses forward slashes: the previous ".\submissions\..." literal relied
# on the invalid escape sequences "\s" and "\L" and only resolved on Windows,
# whereas forward slashes work on both Windows and POSIX.
# NOTE(review): exponential=True presumably undoes the log transform applied to
# the training target — confirm against predict_and_store.
predict_and_store(best_model, test_labels[selected_features], test_labels, path="./submissions/LGBM3.0.csv", exponential=True)