# Imports

In [1]:
%run ../common_utils.py

In [2]:
from sklearn.ensemble import GradientBoostingRegressor

# Global parameters

In [3]:
# Seed passed as `random_state` to the GradientBoostingRegressor instances built in the tuning cells below.
random_state = 1

In [4]:
# Number of cross-validation splits handed to gradient_boost_groupKFold throughout the notebook.
number_of_splits = 5

# Loading Data

In [5]:
def load_data(val_data, one_hot):
    """Load the raw data and push it through the shared numerical pre-processing.

    Parameters
    ----------
    val_data : bool
        Forwarded to ``pre_process_numerical``. When truthy, five values are
        unpacked from that call (including an ``X_test``/``y_test`` pair);
        otherwise three.
    one_hot : bool
        Forwarded to ``pre_process_numerical`` as ``one_hot_encode``.

    Returns
    -------
    tuple
        ``(X_train, y_train, y_train_log, test_labels)`` when ``val_data`` is
        falsy, else ``(X_train, y_train, y_train_log, X_test, y_test,
        test_labels)``. ``y_train_log`` is ``np.log(y_train)``.
    """
    train, test, metadata = load_all_data()
    _, categorical = get_cat_and_non_cat_data(metadata)

    # The feature list is captured before the string columns are converted below.
    all_features = list(train.columns)
    all_features.remove('price')
    numerical_features = ['area_total','area_kitchen','area_living','floor','rooms','ceiling',
        'bathrooms_shared','bathrooms_private','balconies','loggias','phones','building_id','constructed','stories']

    for frame in (train, test):
        categorical_to_numerical(frame, ['street','address'])

    # Both branches use exactly the same pre-processing arguments; only the
    # number of returned values depends on val_data.
    # NOTE(review): add_R / add_rel_height are passed as the *strings* "True",
    # not booleans - confirm this is what pre_process_numerical expects.
    preprocess_kwargs = dict(
        features=all_features, numerical_features=numerical_features, train=train, test=test,
        outliers_value=7, val_data=val_data, val_split=0.2, random_state=42, scaler="std",
        add_R="True", add_rel_height="True", droptable=[],
        one_hot_encode=one_hot, cat_features=categorical, drop_old=True)
    processed = pre_process_numerical(**preprocess_kwargs)

    if val_data:
        X_train, y_train, X_test, y_test, test_labels = processed
        return X_train, y_train, np.log(y_train), X_test, y_test, test_labels

    X_train, y_train, test_labels = processed
    return X_train, y_train, np.log(y_train), test_labels

In [6]:
# Full training set (no validation hold-out), one-hot encoded; y_train_log is the log of y_train and is the target used by the tuning cells.
X_train, y_train, y_train_log, test_labels = load_data(val_data=False, one_hot=True)

Hot encoding
Std


# Tune Model

## Original Model

In [7]:
# Reference configuration that the manual tuning below starts from.
# The split criterion is spelled 'squared_error' (the current name of the former
# 'mse' option) so this cell agrees with every other cell in the notebook and does
# not break on scikit-learn releases where the old spelling has been removed.
grad_boost_regr = GradientBoostingRegressor(
    learning_rate=0.01,
    n_estimators=2000,
    subsample=1.0,
    criterion='squared_error',
    min_samples_split=4,
    min_samples_leaf=2,
    min_weight_fraction_leaf=0.0,
    max_depth=9,
    min_impurity_decrease=0.0,
    init=None,
    random_state=0,
    max_features=None,
    alpha=0.9,
    verbose=0,
    max_leaf_nodes=None,
    warm_start=False,
    validation_fraction=0.1,
    n_iter_no_change=None,
    tol=0.0001,
    ccp_alpha=0.0
)

# Manual Tuning of Parameters
Tuning based on strategy from: https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/

In [8]:
# Baseline regressor for the manual tuning walkthrough: learning rate 0.1 with
# squared-error loss and split criterion; everything else is left at the defaults.
gb1 = GradientBoostingRegressor(
    loss='squared_error',
    criterion='squared_error',
    learning_rate=0.1,
    warm_start=False,
    verbose=0,
)

## Start with Learning Rate 0.1
### Determine Number of Estimators

In [None]:
# First sweep over the number of trees (10, 20, ..., 100) at learning rate 0.1.
n_estimators = np.linspace(10,100,10).astype(int)

# Start from +inf rather than an arbitrary 1 so the first evaluated setting always
# becomes the incumbent, whatever the scale of the score. best_params is reset so a
# value left behind by an earlier cell can never be reported as this sweep's result.
best_average_score = np.inf
best_params = None

for n_est in n_estimators:
    print("current number of trees: ", n_est)
    params = dict(n_estimators = n_est)
    model = GradientBoostingRegressor(
                **params,
                learning_rate = 0.1,
                loss = 'squared_error',
                criterion = 'squared_error',
                verbose = 0,
                warm_start = False,
                random_state = random_state,
            )
    # Only the per-split scores and their average are used here; the two
    # remaining return values are discarded.
    scores, average_score, _, _ = gradient_boost_groupKFold(number_of_splits=number_of_splits,
                                                            model=model,
                                                            X_train=X_train,
                                                            y_train=y_train_log
                                                            )
    print(average_score)
    if average_score < best_average_score:
        # Lower is better: remember the new incumbent and report it.
        best_average_score = average_score
        best_params = params
        print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
        print("scores: ", scores)
        print("average score: ", average_score)
        print("parameters: ", best_params)

The score is still decreasing as the number of estimators grows, so we extend the range to look for the point where it stops improving.

In [None]:
# Second sweep over the number of trees: 110, 120, ..., 200.
n_estimators = np.arange(110, 201, 10)

# Score to beat. Hard-coded; presumably the best average score of the previous sweep.
best_average_score = 0.23017944010276176

for n_est in n_estimators:
    print("       -----------------       ")
    print("current number of trees: ", n_est)
    params = {"n_estimators": n_est}
    model = GradientBoostingRegressor(
        loss='squared_error',
        criterion='squared_error',
        learning_rate=0.1,
        random_state=random_state,
        warm_start=False,
        verbose=0,
        **params,
    )
    scores, average_score, _, _ = gradient_boost_groupKFold(
        number_of_splits=number_of_splits,
        model=model,
        X_train=X_train,
        y_train=y_train_log,
    )
    print(average_score)
    if not average_score < best_average_score:
        continue  # no improvement over the incumbent
    best_average_score = average_score
    best_params = params
    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
    print("scores: ", scores)
    print("average score: ", average_score)
    print("parameters: ", best_params)

In [None]:
# Third sweep over the number of trees: 210, 220, ..., 300.
n_estimators = np.arange(210, 301, 10)

# Score to beat. Hard-coded; presumably the best average score of the previous sweep.
best_average_score = 0.2219789624211495

for n_est in n_estimators:
    print("       -----------------       ")
    print("current number of trees: ", n_est)
    params = {"n_estimators": n_est}
    model = GradientBoostingRegressor(
        loss='squared_error',
        criterion='squared_error',
        learning_rate=0.1,
        random_state=random_state,
        warm_start=False,
        verbose=0,
        **params,
    )
    scores, average_score, _, _ = gradient_boost_groupKFold(
        number_of_splits=number_of_splits,
        model=model,
        X_train=X_train,
        y_train=y_train_log,
    )
    print(average_score)
    if not average_score < best_average_score:
        continue  # no improvement over the incumbent
    best_average_score = average_score
    best_params = params
    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
    print("scores: ", scores)
    print("average score: ", average_score)
    print("parameters: ", best_params)

Conclusion: with a learning rate of 0.1, [STOPPED] 300 trees are optimal. This seems like a high but perhaps still reasonable value. Tuning will continue with this value for now.

In [9]:
# Number of trees fixed for the remaining tuning steps (largest value tried in the sweeps above).
optimal_n_estimators = 300

### Tuning tree-specific parameters
#### max_depth & min_samples_split

In [None]:
# Joint grid over tree depth (4, 6, ..., 14) and min_samples_split
# (7 log-spaced values between 10 and 10000, truncated to integers).
max_depth = np.arange(start=4,stop=16,step=2).astype(int)
min_samples_split = np.logspace(1, 4, num=7, endpoint=True).astype(int)

# Start from +inf rather than an arbitrary 1 so the first evaluated combination
# always becomes the incumbent. best_params is reset so a value left behind by an
# earlier cell can never be reported as the result of this grid search.
best_average_score = np.inf
best_params = None

for m_depth in max_depth:
    for m_s_split in min_samples_split:
        print("       -----------------       ")
        print("current max depth:         ", m_depth)
        print("current min samples split: ", m_s_split)
        params = dict(
            max_depth = m_depth,
            min_samples_split = m_s_split,
        )
        model = GradientBoostingRegressor(
                    **params,
                    n_estimators = optimal_n_estimators,
                    learning_rate = 0.1,
                    loss = 'squared_error',
                    criterion = 'squared_error',
                    verbose = 0,
                    warm_start = False,
                    random_state = random_state,
                )
        # Only the per-split scores and their average are used here.
        scores, average_score, _, _ = gradient_boost_groupKFold(number_of_splits=number_of_splits,
                                                                model=model,
                                                                X_train=X_train,
                                                                y_train=y_train_log
                                                                )
        print(average_score)
        if average_score < best_average_score:
            # Lower is better: remember the new incumbent and report it.
            best_average_score = average_score
            best_params = params
            print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
            print("scores: ", scores)
            print("average score: ", average_score)
            print("parameters: ", best_params)

**Result:** 
```
scores:  [0.20500680772749033, 0.20547976952996316, 0.17665855113277418, 0.20171066397528792, 0.22715854765728632]
average score:  0.2032028680045604
parameters:  {'max_depth': 14, 'min_samples_split': 1000}
```

In [14]:
# max_depth = 13 reflects the follow-up check below (it scored better than the grid optimum of 14);
# min_samples_split = 1000 is the value found by the grid search.
optimal_max_depth = 13
optimal_min_samples_split = 1000

**Check** if this can be improved by varying `max_depth` around the grid-search optimum of 14, while keeping `min_samples_split` at 1000 (this was always optimal)

In [13]:
# Score to beat: the best average score from the max_depth / min_samples_split grid search.
best_average_score = 0.2032028680045604
for m_depth in (13, 16, 17):
    print("       -----------------       ")
    print("current max depth:         ", m_depth)
    params = {"max_depth": m_depth}
    model = GradientBoostingRegressor(
        n_estimators=optimal_n_estimators,
        min_samples_split=optimal_min_samples_split,
        loss='squared_error',
        criterion='squared_error',
        learning_rate=0.1,
        random_state=random_state,
        warm_start=False,
        verbose=0,
        **params,
    )
    scores, average_score, _, _ = gradient_boost_groupKFold(
        number_of_splits=number_of_splits,
        model=model,
        X_train=X_train,
        y_train=y_train_log,
    )
    print(average_score)
    if not average_score < best_average_score:
        continue  # no improvement over the incumbent
    best_average_score = average_score
    best_params = params
    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
    print("scores: ", scores)
    print("average score: ", average_score)
    print("parameters: ", best_params)

       -----------------       
current max depth:          13
starting on split  1  of cross validation
starting on split  2  of cross validation
starting on split  3  of cross validation
starting on split  4  of cross validation
starting on split  5  of cross validation
0.202723888466623
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
scores:  [0.20330882091356947, 0.20530917286186795, 0.1769646788996571, 0.20436452577377298, 0.22367224388424758]
average score:  0.202723888466623
parameters:  {'max_depth': 13}
       -----------------       
current max depth:          16
starting on split  1  of cross validation
starting on split  2  of cross validation
starting on split  3  of cross validation
starting on split  4  of cross validation
starting on split  5  of cross validation
0.207592587678627
       -----------------       
current max depth:          17
starting on split  1  of cross validation
starting on split  2  of cross validation
startin

**Results:** 

**`max_depth = 13, average_score = 0.202723888466623` --> OPTIMAL**

`max_depth = 15, average_score = 0.20544291334639767`

`max_depth = 16, average_score = 0.207592587678627`

`max_depth = 17, average_score = 0.20443446714202365`


#### min_samples_leaf

In [25]:
# Candidate leaf sizes for this run: 7 and 8.
min_samples_leaf = np.arange(7, 9)

# Hard-coded score to beat.
best_average_score = 0.205

for m_s_leaf in min_samples_leaf:
    print("       -----------------       ")
    print("current min samples leaf:         ", m_s_leaf)
    params = {"min_samples_leaf": m_s_leaf}
    model = GradientBoostingRegressor(
        n_estimators=optimal_n_estimators,
        max_depth=optimal_max_depth,
        min_samples_split=optimal_min_samples_split,
        loss='squared_error',
        criterion='squared_error',
        learning_rate=0.1,
        random_state=random_state,
        warm_start=False,
        verbose=0,
        **params,
    )
    scores, average_score, _, _ = gradient_boost_groupKFold(
        number_of_splits=number_of_splits,
        model=model,
        X_train=X_train,
        y_train=y_train_log,
    )
    print(average_score)
    if not average_score < best_average_score:
        continue  # no improvement over the incumbent
    best_average_score = average_score
    best_params = params
    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
    print("scores: ", scores)
    print("average score: ", average_score)
    print("parameters: ", best_params)

       -----------------       
current min samples leaf:          7
starting on split  1  of cross validation
starting on split  2  of cross validation
starting on split  3  of cross validation


KeyboardInterrupt: 

In [19]:
# Deliberately larger than the best-scoring value; chosen to limit overfitting (see the note below).
optimal_min_samples_leaf = 40

**Note:** using `optimal_min_samples_leaf = 1` leads to a lower average score (`0.2032028680045604`) than `optimal_min_samples_leaf = 40`, which gives an average score of `0.20446437151272606`, but we chose the higher value to avoid overfitting.

#### max_features

In [27]:
# Candidate values for max_features: 7, 8, ..., 12.
max_features = np.arange(7, 13)

# Score to beat: the average score obtained with min_samples_leaf = 40.
best_average_score = 0.20446437151272606

for m_feature in max_features:
    print("       -----------------       ")
    print("current max features:         ", m_feature)
    params = {"max_features": m_feature}
    model = GradientBoostingRegressor(
        n_estimators=optimal_n_estimators,
        max_depth=optimal_max_depth,
        min_samples_split=optimal_min_samples_split,
        min_samples_leaf=optimal_min_samples_leaf,
        loss='squared_error',
        criterion='squared_error',
        learning_rate=0.1,
        random_state=random_state,
        warm_start=False,
        verbose=0,
        **params,
    )
    scores, average_score, _, _ = gradient_boost_groupKFold(
        number_of_splits=number_of_splits,
        model=model,
        X_train=X_train,
        y_train=y_train_log,
    )
    print(average_score)
    if not average_score < best_average_score:
        continue  # no improvement over the incumbent
    best_average_score = average_score
    best_params = params
    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
    print("scores: ", scores)
    print("average score: ", average_score)
    print("parameters: ", best_params)

       -----------------       
current max features:          7
starting on split  1  of cross validation
starting on split  2  of cross validation
starting on split  3  of cross validation
starting on split  4  of cross validation
starting on split  5  of cross validation
0.2122976175736293
       -----------------       
current max features:          8
starting on split  1  of cross validation
starting on split  2  of cross validation
starting on split  3  of cross validation
starting on split  4  of cross validation
starting on split  5  of cross validation
0.215511875218214
       -----------------       
current max features:          9
starting on split  1  of cross validation
starting on split  2  of cross validation
starting on split  3  of cross validation
starting on split  4  of cross validation
starting on split  5  of cross validation
0.21502492793268552
       -----------------       
current max features:          10
starting on split  1  of cross validation
starting o

In [24]:
# Coarser sweep over max_features: the odd values 1, 3, ..., 19.
max_features = np.arange(1, 21, 2)

# Score to beat: the average score obtained with min_samples_leaf = 40.
best_average_score = 0.20446437151272606

for m_feature in max_features:
    print("       -----------------       ")
    print("current max features:         ", m_feature)
    params = {"max_features": m_feature}
    model = GradientBoostingRegressor(
        n_estimators=optimal_n_estimators,
        max_depth=optimal_max_depth,
        min_samples_split=optimal_min_samples_split,
        min_samples_leaf=optimal_min_samples_leaf,
        loss='squared_error',
        criterion='squared_error',
        learning_rate=0.1,
        random_state=random_state,
        warm_start=False,
        verbose=0,
        **params,
    )
    scores, average_score, _, _ = gradient_boost_groupKFold(
        number_of_splits=number_of_splits,
        model=model,
        X_train=X_train,
        y_train=y_train_log,
    )
    print(average_score)
    if not average_score < best_average_score:
        continue  # no improvement over the incumbent
    best_average_score = average_score
    best_params = params
    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
    print("scores: ", scores)
    print("average score: ", average_score)
    print("parameters: ", best_params)

       -----------------       
current max features:          1
starting on split  1  of cross validation
starting on split  2  of cross validation
starting on split  3  of cross validation
starting on split  4  of cross validation
starting on split  5  of cross validation
0.274590576242699
       -----------------       
current max features:          3
starting on split  1  of cross validation
starting on split  2  of cross validation
starting on split  3  of cross validation
starting on split  4  of cross validation
starting on split  5  of cross validation
0.23499314773903132
       -----------------       
current max features:          5
starting on split  1  of cross validation
starting on split  2  of cross validation
starting on split  3  of cross validation
starting on split  4  of cross validation
starting on split  5  of cross validation
0.22470103800770844
       -----------------       
current max features:          7
starting on split  1  of cross validation
starting o

# New features and data cleaning

In [8]:
# Hyper-parameters carried into the final models.
# n_estimators, max_depth, min_samples_split and min_samples_leaf repeat the values
# chosen in the manual tuning section above.
# NOTE(review): the visible tuning cells do not derive max_features = 40 or
# subsample = 0.95 - confirm where these values came from.
optimal_n_estimators = 300
optimal_max_depth = 13
optimal_min_samples_split = 1000
optimal_min_samples_leaf = 40
optimal_max_features = 40
optimal_subsample = 0.95
original_learning_rate = 0.1

In [9]:
# Estimator used for cross-validation. Relative to the tuned setting, the learning
# rate is divided by 10 and the number of trees multiplied by 10.
model = GradientBoostingRegressor(
    loss='squared_error',
    criterion='squared_error',
    learning_rate=original_learning_rate / 10,
    n_estimators=optimal_n_estimators*10,
    subsample=optimal_subsample,
    max_depth=optimal_max_depth,
    max_features=optimal_max_features,
    min_samples_split=optimal_min_samples_split,
    min_samples_leaf=optimal_min_samples_leaf,
    random_state=random_state,
    warm_start=False,
    verbose=0,
)

In [19]:
# Separate estimator with the same configuration as the cross-validated one; this
# instance is the one fitted on the full training set for the submission.
model_no_cv = GradientBoostingRegressor(
    loss='squared_error',
    criterion='squared_error',
    learning_rate=original_learning_rate / 10,
    n_estimators=optimal_n_estimators*10,
    subsample=optimal_subsample,
    max_depth=optimal_max_depth,
    max_features=optimal_max_features,
    min_samples_split=optimal_min_samples_split,
    min_samples_leaf=optimal_min_samples_leaf,
    random_state=random_state,
    warm_start=False,
    verbose=0,
)

In [12]:
# Define the features (this is all)
# Column groups used by the cleaning / feature-engineering helpers.
all_numerical_features = [
    "area_total", "area_kitchen", "area_living", "floor", "ceiling", "stories", "rooms",
    "bathrooms_private", "bathrooms_shared", "balconies", "loggias", "phones",
    "latitude", "longitude", "constructed",
]
float_numerical_features = [
    "area_total", "area_kitchen", "area_living", "ceiling",
    "latitude", "longitude", "constructed",
]
int_numerical_features = [  # Ordinal categories
    "floor", "stories", "rooms", "bathrooms_private",
    "bathrooms_shared", "balconies", "loggias", "phones",
]
cat_features = [  # All are non-ordinal
    "layout", "condition", "district", "material", "parking", "heating", "seller",
]

# Every column handed to clean_data: grouping key, numerical, categorical, boolean.
# The string columns "street" and "address" are deliberately left out in this cell.
features = (
    ["building_id"]  # For grouping
    + all_numerical_features
    + cat_features
    + ["windows_court", "windows_street", "new", "elevator_without",
       "elevator_passenger", "elevator_service", "garbage_chute"]  # Bool
)

droptable = []

# Load -> clean -> engineer -> scale -> one-hot encode -> drop.
train, test, metaData = load_all_data()
train_labels, train_targets, test_labels = clean_data(
    train, test, features,
    float_numerical_features, int_numerical_features, cat_features,
    log_targets=False, log_area=True, fillNan=True,
)
train_labels, test_labels, added_features = feature_engineering(
    train_labels, test_labels,
    float_numerical_features, int_numerical_features, cat_features,
)
train_labels, test_labels = normalize(
    train_labels, test_labels, float_numerical_features, scaler="minMax",
)
# Every categorical column except "layout" is one-hot encoded.
train_labels, test_labels = one_hot_encoder(
    train_labels, test_labels,
    [name for name in cat_features if name != "layout"],
    drop_old=True,
)
# droptable is empty here, so these two drops currently remove nothing.
train_labels.drop(droptable, axis=1, inplace=True)
test_labels.drop(droptable, axis=1, inplace=True)

minMax


In [18]:
# Cross-validate `model` on the engineered features against log-transformed targets.
# Four values come back: the per-split scores, their average, and a "best" model with its
# index (how "best" is picked is decided inside gradient_boost_groupKFold, not visible here).
scores, average_score, best_model, best_index = gradient_boost_groupKFold(number_of_splits=number_of_splits,
                                                        model=model,
                                                        X_train=train_labels, 
                                                        y_train=np.log(train_targets)
                                                        )

starting on split  1  of cross validation
starting on split  2  of cross validation
starting on split  3  of cross validation
starting on split  4  of cross validation
starting on split  5  of cross validation
Average score 0.2052103426074366


In [20]:
# Report the outcome of the cross-validation run above.
print("scores:        ", scores)
print("average score: ", average_score)
print("best model:    ", best_model)
print("best index:    ", best_index)

scores:         [0.21432832954051392, 0.18445627736361914, 0.1999504554104062, 0.19235765459822682, 0.23495899612441673]
average score:  0.2052103426074366
best model:     GradientBoostingRegressor(criterion='squared_error', learning_rate=0.01,
                          max_depth=13, max_features=40, min_samples_leaf=40,
                          min_samples_split=1000, n_estimators=3000,
                          random_state=1, subsample=0.95)
best index:     1


In [21]:
# Fit the separate (not cross-validated) estimator on the full training set, using log-transformed targets.
model_no_cv.fit(train_labels,np.log(train_targets))

GradientBoostingRegressor(criterion='squared_error', learning_rate=0.01,
                          max_depth=13, max_features=40, min_samples_leaf=40,
                          min_samples_split=1000, n_estimators=3000,
                          random_state=1, subsample=0.95)

In [24]:
# Write the submission file. The path uses forward slashes: the previous
# ".\submissions\..." literal relied on invalid escape sequences ("\s", "\G") and
# only resolved to a sub-directory on Windows. Forward slashes work on every platform.
# exponential=True is passed because the model was trained on log targets
# (presumably the helper inverts the log - confirm in predict_and_store).
predict_and_store(model_no_cv, test_labels, test, path="./submissions/GB4.0.csv", exponential=True)

# LOOOOT of extra features

In [5]:
# Column groups used by the cleaning / feature-engineering helpers.
all_numerical_features = [
    "area_total", "area_kitchen", "area_living", "floor", "ceiling", "stories", "rooms",
    "bathrooms_private", "bathrooms_shared", "balconies", "loggias", "phones",
    "latitude", "longitude", "constructed",
]
float_numerical_features = [
    "area_total", "area_kitchen", "area_living", "ceiling",
    "latitude", "longitude", "constructed",
]
int_numerical_features = [  # Ordinal categories
    "floor", "stories", "rooms", "bathrooms_private",
    "bathrooms_shared", "balconies", "loggias", "phones",
]
cat_features = [  # All are non-ordinal
    "layout", "condition", "district", "material", "parking", "heating", "seller",
]

# Every column handed to clean_data: grouping key, numerical, categorical, boolean,
# and - unlike the previous section - the string columns as well.
features = (
    ["building_id"]  # For grouping
    + all_numerical_features
    + cat_features
    + ["windows_court", "windows_street", "new", "elevator_without",
       "elevator_passenger", "elevator_service", "garbage_chute"]  # Bool
    + ["street", "address"]  # Strings
)

droptable = []

# Load -> clean -> engineer (all optional feature groups enabled) -> scale -> one-hot encode -> drop.
train, test, metaData = load_all_data()
train_labels, train_targets, test_labels = clean_data(
    train, test, features,
    float_numerical_features, int_numerical_features, cat_features,
    log_targets=False, log_area=True, fillNan=True,
)
train_labels, test_labels, added_features = feature_engineering(
    train_labels,
    test_labels,
    add_base_features=True,
    add_bool_features=True,
    add_weak_features=True,
    add_dist_to_metro=True,
    add_close_to_uni=True,
    add_dist_to_hospital=True,
    add_floor_features=True,
    add_street_info=True,
)
train_labels, test_labels = normalize(
    train_labels, test_labels, float_numerical_features, scaler="minMax",
)
# Every categorical column except "layout" is one-hot encoded.
train_labels, test_labels = one_hot_encoder(
    train_labels, test_labels,
    [name for name in cat_features if name != "layout"],
    drop_old=True,
)
# droptable is empty here, so these two drops currently remove nothing.
train_labels.drop(droptable, axis=1, inplace=True)
test_labels.drop(droptable, axis=1, inplace=True)

minMax


In [6]:
# Remove the raw string columns now that feature engineering has run on them.
# errors="ignore" makes the cell safe to re-run: the drop is in place, so without it
# a second execution raises KeyError because the columns are already gone.
# (Assumes train_labels / test_labels are pandas DataFrames.)
droptable = ['street','address']
train_labels.drop(droptable, inplace=True, axis=1, errors="ignore")
test_labels.drop(droptable, inplace=True, axis=1, errors="ignore")

In [7]:
# Hyper-parameters for the final models (same values as in the previous section).
optimal_n_estimators = 300
optimal_max_depth = 13
optimal_min_samples_split = 1000
optimal_min_samples_leaf = 40
optimal_max_features = 40
optimal_subsample = 0.95
original_learning_rate = 0.1

# Two independent estimators with one shared configuration: `model` is handed to
# cross-validation, `model_no_cv` is fitted on the full training set. Relative to the
# tuned setting, the learning rate is divided by 10 and the tree count multiplied by 10.
model, model_no_cv = (
    GradientBoostingRegressor(
        loss='squared_error',
        criterion='squared_error',
        learning_rate=original_learning_rate / 10,
        n_estimators=optimal_n_estimators*10,
        subsample=optimal_subsample,
        max_depth=optimal_max_depth,
        max_features=optimal_max_features,
        min_samples_split=optimal_min_samples_split,
        min_samples_leaf=optimal_min_samples_leaf,
        random_state=random_state,
        warm_start=False,
        verbose=0,
    )
    for _ in range(2)
)

In [8]:
# Cross-validate `model` on the extended feature set against log-transformed targets.
scores, average_score, best_model, best_index = gradient_boost_groupKFold(
    number_of_splits=number_of_splits,
    model=model,
    X_train=train_labels, 
    y_train=np.log(train_targets))

starting on split  1  of cross validation
starting on split  2  of cross validation
starting on split  3  of cross validation
starting on split  4  of cross validation
starting on split  5  of cross validation


In [9]:
# Report the outcome of the cross-validation run on the extended feature set.
print("scores:        ", scores)
print("average score: ", average_score)
print("best model:    ", best_model)
print("best index:    ", best_index)

scores:         [0.2152763450188409, 0.1795831890053143, 0.1984155296145181, 0.1920902203136464, 0.23641330039989722]
average score:  0.20435571687044338
best model:     GradientBoostingRegressor(criterion='squared_error', learning_rate=0.01,
                          max_depth=13, max_features=40, min_samples_leaf=40,
                          min_samples_split=1000, n_estimators=3000,
                          random_state=1, subsample=0.95)
best index:     1


In [None]:
# Fit on the full training set (log targets) and write the submission file.
# The path uses forward slashes: the previous ".\submissions\..." literal relied on
# invalid escape sequences ("\s", "\G") and only resolved to a sub-directory on Windows.
model_no_cv.fit(train_labels,np.log(train_targets))
predict_and_store(model_no_cv, test_labels, test, path="./submissions/GB5.0.csv", exponential=True)

## Predict price per square meter

In [10]:
# Target for the per-square-metre model: price divided by total area.
# NOTE(review): `train` is the frame returned by load_all_data(), while `train_targets` comes
# out of clean_data(..., log_area=True). Confirm both share the same index and that
# `area_total` here is in the intended units (not log-transformed) before trusting this ratio.
price_per_square_meter = train_targets/train['area_total']

In [None]:
# Cross-validate on log(price per square metre); area_total is removed from the inputs
# for this run.
scores, average_score, best_model, best_index = gradient_boost_groupKFold(
    number_of_splits=number_of_splits,
    model=model,
    X_train=train_labels.drop(['area_total'],axis=1), 
    y_train=np.log(price_per_square_meter))

starting on split  1  of cross validation
starting on split  2  of cross validation
starting on split  3  of cross validation
starting on split  4  of cross validation


In [None]:
# Fit the per-square-metre model on the full training set (area_total removed from the
# inputs, log target) and write the submission file. The test set's area_total is passed
# along with price_per_sq=True (presumably so the helper can scale predictions back to
# total prices - confirm in predict_and_store).
# The path uses forward slashes: the previous ".\submissions\..." literal relied on
# invalid escape sequences ("\s", "\G") and only resolved to a sub-directory on Windows.
model_no_cv.fit(train_labels.drop(['area_total'],axis=1),np.log(price_per_square_meter))
predict_and_store(model_no_cv, test_labels.drop(['area_total'],axis=1), test, path="./submissions/GB5.1.csv", exponential=True, price_per_sq = True, total_area_df = test['area_total'])