# Imports

In [1]:
%run ../common_utils.py

In [2]:
from sklearn.ensemble import GradientBoostingRegressor

# Global parameters

In [3]:
random_state = 1

In [4]:
number_of_splits = 5

# Loading Data

In [5]:
def load_data(val_data, one_hot):
    """Load the raw data and push it through the shared numerical pre-processing.

    Parameters
    ----------
    val_data : bool
        Forwarded to ``pre_process_numerical``. When truthy, the helper's
        result is unpacked into an additional ``X_test`` / ``y_test`` pair.
    one_hot : bool
        Forwarded to ``pre_process_numerical`` as ``one_hot_encode``.

    Returns
    -------
    tuple
        ``(X_train, y_train, y_train_log, test_labels)`` when ``val_data`` is
        falsy, otherwise
        ``(X_train, y_train, y_train_log, X_test, y_test, test_labels)``.
        ``y_train_log`` is ``np.log(y_train)``.
    """
    train, test, metadata = load_all_data()
    # Only the categorical column list is used below.
    _, categorical = get_cat_and_non_cat_data(metadata)

    # Every column except the target; captured before the in-place conversion below.
    all_features = list(train.columns)
    all_features.remove('price')

    numerical_features = [
        'area_total', 'area_kitchen', 'area_living', 'floor', 'rooms', 'ceiling',
        'bathrooms_shared', 'bathrooms_private', 'balconies', 'loggias', 'phones',
        'building_id', 'constructed', 'stories',
    ]

    for frame in (train, test):
        categorical_to_numerical(frame, ['street', 'address'])

    # NOTE(review): add_R / add_rel_height are passed as the string "True", not
    # the boolean True - confirm that this is what pre_process_numerical expects.
    preprocess_kwargs = dict(
        features=all_features,
        numerical_features=numerical_features,
        train=train,
        test=test,
        outliers_value=7,
        val_data=val_data,
        val_split=0.2,
        random_state=42,
        scaler="std",
        add_R="True",
        add_rel_height="True",
        droptable=[],
        one_hot_encode=one_hot,
        cat_features=categorical,
        drop_old=True,
    )
    processed = pre_process_numerical(**preprocess_kwargs)

    if val_data:
        X_train, y_train, X_test, y_test, test_labels = processed
        return X_train, y_train, np.log(y_train), X_test, y_test, test_labels

    X_train, y_train, test_labels = processed
    return X_train, y_train, np.log(y_train), test_labels

In [6]:
X_train, y_train, y_train_log, test_labels = load_data(val_data=False, one_hot=True)

Hot encoding
Std


# Tune Model

## Original Model

In [23]:
# Reference ("original") model configuration that the manual tuning below
# starts from. Every hyper-parameter is spelled out explicitly.
grad_boost_regr = GradientBoostingRegressor(
    learning_rate=0.01,
    n_estimators=2000,
    subsample=1.0,
    # 'mse' is the deprecated spelling of this criterion; 'squared_error' is
    # the name used by every other estimator in this notebook.
    criterion='squared_error',
    min_samples_split=4,
    min_samples_leaf=2,
    min_weight_fraction_leaf=0.0,
    max_depth=9,
    min_impurity_decrease=0.0,
    init=None,
    random_state=0,
    max_features=None,
    alpha=0.9,
    verbose=0,
    max_leaf_nodes=None,
    warm_start=False,
    validation_fraction=0.1,
    n_iter_no_change=None,
    tol=0.0001,
    ccp_alpha=0.0
)

# Manual Tuning of Parameters
Tuning based on strategy from: https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/

In [45]:
# Baseline estimator for the manual tuning: only the learning rate, loss and
# split criterion are pinned; no other hyper-parameter is passed.
gb1 = GradientBoostingRegressor(
    learning_rate=0.1,
    loss='squared_error',
    criterion='squared_error',
    verbose=0,
    warm_start=False,
)

## Start with Learning Rate 0.1
### Determine Number of Estimators

In [None]:
## CODE FROM LGBM
# Exhaustive grid search over seven LightGBM hyper-parameters, kept as a
# template for the gradient-boosting tuning further down.
# NOTE(review): `lgbm` and `lgbm_groupKFold` are not imported in the visible
# cells - presumably they come from common_utils.py; confirm before running.
# NOTE: the full grid is 10 * 21 * 8 * 6 * 6 * 10 * 10 (about six million)
# combinations, each cross-validated, so narrow the ranges before running it.

num_iterations = np.logspace(2,5,10).astype(int)
num_leaves = np.linspace(10,50,21).astype(int)
max_depth = np.linspace(5,19,8).astype(int)
feature_fraction = np.arange(0.7,0.999,0.05)
bagging_fraction = np.arange(0.7,0.999,0.05)
bagging_freq = np.arange(1,11,1)
learning_rate = np.geomspace(0.001,0.1,10)

# Only configurations scoring below this threshold are recorded.
best_average_score = 1
# Stays None when no configuration beats the threshold above.
best_params = None
for n_it in num_iterations:
    print("level 0")
    for n_leaves in num_leaves:
        print("level 1")
        for max_d in max_depth:
            for feat_frac in feature_fraction:
                for bag_frac in bagging_fraction:
                    for bag_freq in bagging_freq:
                        for l_rate in learning_rate:
                            # Build the candidate once so that the model and the
                            # recorded best parameters cannot drift apart.
                            params = dict(
                                num_iterations = n_it,
                                num_leaves = n_leaves,
                                max_depth = max_d,
                                feature_fraction = feat_frac,
                                bagging_fraction = bag_frac,
                                bagging_freq = bag_freq,
                                learning_rate = l_rate,
                            )
                            model = lgbm.LGBMRegressor(
                                **params,
                                random_state=random_state,
                                silent=True,
                                metric='regression',
                                num_threads=4,
                            )

                            # Fixed: the global is `number_of_splits`; the former
                            # `number_splits` was undefined and raised NameError.
                            scores, average_score, _, _ = lgbm_groupKFold(
                                number_of_splits=number_of_splits,
                                model=model,
                                X_train=X_train,
                                y_train=y_train_log,
                                eval_metric = 'neg_root_mean_squared_error'
                            )

                            # Lower average score is treated as better.
                            if average_score < best_average_score:
                                best_average_score = average_score
                                best_params = params
                                print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
                                print(average_score)
                                print(best_params)

In [8]:
# Sweep n_estimators over 10, 20, ..., 100 at a fixed learning rate of 0.1 and
# keep the setting with the lowest average cross-validation score.
n_estimators = np.linspace(10,100,10).astype(int)

# Only settings scoring below this threshold are recorded.
best_average_score = 1
# Stays None when no setting beats the threshold above.
best_params = None

for n_est in n_estimators:
    print("current number of trees: ", n_est)
    params = dict(n_estimators=n_est)
    model = GradientBoostingRegressor(
        **params,
        learning_rate=0.1,
        loss='squared_error',
        criterion='squared_error',
        verbose=0,
        warm_start=False,
        # Seed the estimator with the notebook-wide seed so that the scores
        # compared across sweeps are reproducible.
        random_state=random_state,
    )
    scores, average_score, _, _ = gradient_boost_groupKFold(
        number_of_splits=number_of_splits,
        model=model,
        X_train=X_train,
        y_train=y_train_log,
    )
    print(average_score)
    # Lower average score is treated as better.
    if average_score < best_average_score:
        best_average_score = average_score
        best_params = params
        print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
        print("scores: ", scores)
        print("average score: ", average_score)
        print("parameters: ", best_params)

current number of trees:  10
starting on split  1  of cross validation
starting on split  2  of cross validation
starting on split  3  of cross validation
starting on split  4  of cross validation
starting on split  5  of cross validation
0.4310192643415668
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
scores:  [0.42028064836827045, 0.4249393960816709, 0.42153340139603845, 0.45489915183511026, 0.433443724026744]
average score:  0.4310192643415668
parameters:  {'n_estimators': 10}
current number of trees:  20
starting on split  1  of cross validation
starting on split  2  of cross validation
starting on split  3  of cross validation
starting on split  4  of cross validation
starting on split  5  of cross validation
0.31295718756813046
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
scores:  [0.3063632315769583, 0.31480732011403334, 0.2993034154884508, 0.3218096868183128, 0.32250228384289725]
average score:  0.31

The score is still decreasing as the number of estimators increases, so we extend the search to larger values to see where it stops improving.

In [11]:
# Continue the n_estimators sweep over 110, 120, ..., 200 at learning rate 0.1.
n_estimators = np.linspace(110,200,10).astype(int)

# Threshold to beat. NOTE(review): presumably the best average score reached in
# the preceding 10-100 sweep, pasted in by hand - confirm against that output.
best_average_score = 0.23017944010276176

for n_est in n_estimators:
    print("       -----------------       ")
    print("current number of trees: ", n_est)
    params = dict(n_estimators=n_est)
    model = GradientBoostingRegressor(
        **params,
        learning_rate=0.1,
        loss='squared_error',
        criterion='squared_error',
        verbose=0,
        warm_start=False,
        # Seed the estimator with the notebook-wide seed so that the scores
        # compared against the hard-coded threshold are reproducible.
        random_state=random_state,
    )
    scores, average_score, _, _ = gradient_boost_groupKFold(
        number_of_splits=number_of_splits,
        model=model,
        X_train=X_train,
        y_train=y_train_log,
    )
    print(average_score)
    # Lower average score is treated as better.
    if average_score < best_average_score:
        best_average_score = average_score
        best_params = params
        print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
        print("scores: ", scores)
        print("average score: ", average_score)
        print("parameters: ", best_params)

current number of trees:  110
starting on split  1  of cross validation
starting on split  2  of cross validation
starting on split  3  of cross validation
starting on split  4  of cross validation
starting on split  5  of cross validation
0.22860768369618864
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
scores:  [0.22373627221984732, 0.2324311234939978, 0.20723369456348217, 0.22642120181339162, 0.25321612639022434]
average score:  0.22860768369618864
parameters:  {'n_estimators': 110}
current number of trees:  120
starting on split  1  of cross validation
starting on split  2  of cross validation
starting on split  3  of cross validation
starting on split  4  of cross validation
starting on split  5  of cross validation
0.2273934783028113
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
scores:  [0.2233758035353712, 0.23197532231310808, 0.2049344308050802, 0.22504068831439966, 0.25164114654609737]
average score

In [13]:
# Continue the n_estimators sweep over 210, 220, ..., 300 at learning rate 0.1.
n_estimators = np.linspace(210,300,10).astype(int)

# Threshold to beat. NOTE(review): presumably the best average score reached in
# the preceding 110-200 sweep, pasted in by hand - confirm against that output.
best_average_score = 0.2219789624211495

for n_est in n_estimators:
    print("       -----------------       ")
    print("current number of trees: ", n_est)
    params = dict(n_estimators=n_est)
    model = GradientBoostingRegressor(
        **params,
        learning_rate=0.1,
        loss='squared_error',
        criterion='squared_error',
        verbose=0,
        warm_start=False,
        # Seed the estimator with the notebook-wide seed so that the scores
        # compared against the hard-coded threshold are reproducible.
        random_state=random_state,
    )
    scores, average_score, _, _ = gradient_boost_groupKFold(
        number_of_splits=number_of_splits,
        model=model,
        X_train=X_train,
        y_train=y_train_log,
    )
    print(average_score)
    # Lower average score is treated as better.
    if average_score < best_average_score:
        best_average_score = average_score
        best_params = params
        print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
        print("scores: ", scores)
        print("average score: ", average_score)
        print("parameters: ", best_params)

       -----------------       
current number of trees:  210
starting on split  1  of cross validation
starting on split  2  of cross validation
starting on split  3  of cross validation
starting on split  4  of cross validation
starting on split  5  of cross validation
0.22085599697697603
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
scores:  [0.21815351913980666, 0.22602322896939744, 0.19480056415356642, 0.2197107258912381, 0.2455919467308714]
average score:  0.22085599697697603
parameters:  {'n_estimators': 210}
       -----------------       
current number of trees:  220
starting on split  1  of cross validation
starting on split  2  of cross validation
starting on split  3  of cross validation
starting on split  4  of cross validation
starting on split  5  of cross validation
0.22143197066121587
       -----------------       
current number of trees:  230
starting on split  1  of cross validation
starting on split  2  of cross validation
s

Conclusion: with a learning rate of 0.1, 300 trees are taken as optimal (the search was stopped at this point). This seems like a high but perhaps still reasonable value. Tuning will continue with this value for now.

In [15]:
optimal_n_estimators = 300

### Tuning tree-specific parameters
#### max_depth & min_samples_split

In [17]:
# Joint grid over max_depth (4, 6, ..., 14) and min_samples_split (seven
# log-spaced values from 10 to 10**4, truncated to integers), with the number
# of trees fixed at optimal_n_estimators and the learning rate at 0.1.
max_depth = np.arange(start=4,stop=16,step=2).astype(int)
min_samples_split = np.logspace(1, 4, num=7, endpoint=True).astype(int)

# Only settings scoring below this threshold are recorded.
best_average_score = 1
# Stays None when no setting beats the threshold above.
best_params = None

for m_depth in max_depth:
    for m_s_split in min_samples_split:
        print("       -----------------       ")
        print("current max depth:         ", m_depth)
        print("current min samples split: ", m_s_split)
        params = dict(
            max_depth=m_depth,
            min_samples_split=m_s_split,
        )
        model = GradientBoostingRegressor(
            **params,
            n_estimators=optimal_n_estimators,
            learning_rate=0.1,
            loss='squared_error',
            criterion='squared_error',
            verbose=0,
            warm_start=False,
            # Seed the estimator with the notebook-wide seed so that grid
            # points are compared on reproducible scores.
            random_state=random_state,
        )
        scores, average_score, _, _ = gradient_boost_groupKFold(
            number_of_splits=number_of_splits,
            model=model,
            X_train=X_train,
            y_train=y_train_log,
        )
        print(average_score)
        # Lower average score is treated as better.
        if average_score < best_average_score:
            best_average_score = average_score
            best_params = params
            print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
            print("scores: ", scores)
            print("average score: ", average_score)
            print("parameters: ", best_params)

       -----------------       
current max depth:          4
current min samples split:  10
starting on split  1  of cross validation
starting on split  2  of cross validation
starting on split  3  of cross validation
starting on split  4  of cross validation
starting on split  5  of cross validation
0.21362385337499581
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
scores:  [0.20958452633987557, 0.2137921012210202, 0.18228777368167004, 0.20949461204185538, 0.2529602535905579]
average score:  0.21362385337499581
parameters:  {'max_depth': 4, 'min_samples_split': 10}
       -----------------       
current max depth:          4
current min samples split:  31
starting on split  1  of cross validation
starting on split  2  of cross validation
starting on split  3  of cross validation
starting on split  4  of cross validation
starting on split  5  of cross validation
0.21305527942135574
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<

In [90]:
# Scratch cell: display the current min_samples_split candidate values.
min_samples_split

array([   10.        ,    31.6227766 ,   100.        ,   316.22776602,
        1000.        ,  3162.27766017, 10000.        ])

In [67]:
# Scratch cell: inspect the raw linspace grid. Note that this rebinds
# n_estimators to an array that has NOT been cast with .astype(int), unlike the
# grids built in the tuning cells.
n_estimators = np.linspace(10,100,10)
n_estimators

array([ 10.,  20.,  30.,  40.,  50.,  60.,  70.,  80.,  90., 100.])

In [84]:
# Scratch cell: 0.5% of X_train.size.
# NOTE(review): .size is presumably the total element count (rows x columns),
# not the number of samples - if this was meant as a 0.5%-of-samples heuristic
# for min_samples_split, the row count should be used instead; confirm intent.
X_train.size*0.005

8476.76