# Project: House Prices - Advanced Regression Techniques

## Grid Search

In [1]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from IPython.display import display # Allows the use of display() for DataFrames
import pickle

# Pretty display for notebooks
%matplotlib inline

### Load the corresponding dataset via pickle

Run only one of the following cells

In [26]:
# Loading datasets via pickle:
# Work with the top two most corralated features
features = pd.read_pickle('features_top2.pkl')
log_prices = pd.read_pickle('log_prices_top2.pkl')
public_features = pd.read_pickle('public_features_top2.pkl')

In [53]:
# Loading datasets via pickle:
# Work with the top ten most corralated features
features = pd.read_pickle('features_top10.pkl')
log_prices = pd.read_pickle('log_prices_top10.pkl')
public_features = pd.read_pickle('public_features_top10.pkl')

In [21]:
# Loading datasets via pickle:
# Work with all features
features = pd.read_pickle('features_all.pkl')
log_prices = pd.read_pickle('log_prices_all.pkl')
public_features = pd.read_pickle('public_features_all.pkl')

### Split the data

In [3]:
# Import 'train_test_split'
from sklearn.model_selection import train_test_split

# Shuffle and split the data into training and testing subsets
X_train, X_test, y_train, y_test = train_test_split(
        features, log_prices, test_size=0.2, random_state=2
    )

### Gridsearch to all regressors

#### Defining hyperparameters for all regressors to be tuned

In [4]:
# DecisionTreeRegressor: 
splitter = ['best', 'random']
max_features = ['auto', 'sqrt', 'log2']
max_depth_range = np.arange(1, 11)
min_samples_split = np.arange(2, 9)
min_samples_leaf = np.arange(1, 9)
param_grid_A = dict(splitter=splitter, max_features=max_features, max_depth=max_depth_range, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)

In [5]:
# SVR: 
C = [0.9, 1.0, 1.1]
kernel = ['linear', 'rbf']
max_iter = [1500]
param_grid_B = dict(C=C, kernel=kernel, max_iter=max_iter)

In [6]:
# ElasticNet: 
alpha = [1.0, 1.1]
tol = [0.00005, 0.0001, 0.00015, 0.0002]
param_grid_C = dict(alpha=alpha, tol=tol)

In [7]:
# Lasso: 
alpha = [0.01, 1, 10]
tol = [0.00005, 0.0001, 0.00015, 0.0002]
selection = ['cyclic', 'random']
param_grid_D = dict(alpha=alpha, tol=tol, selection=selection)

In [8]:
# LassoLars: 
alpha = [0.0001, 0.001, 0.01, 1]
normalize = ['True', 'False']
param_grid_E = dict(alpha=alpha, normalize=normalize)

In [9]:
# BayesianRidge: 
tol = [0.0001, 0.001, 0.01, 0.1,  1, 10, 50, 100, 500]
n_iter= [1000]
normalize = ['True', 'False']
# values = np.arange(0.5, 8) / 1000000.0
# alpha_1 = values
# alpha_2 = values
# lambda_1 = values
# lambda_2 = values
param_grid_F = dict(n_iter=n_iter, tol=tol, normalize=normalize)

In [10]:
# GradientBoostingRegressor:
# alpha = [0.9, 1]
# learning_rate = [0.01, 0.1, 1]
# n_estimators = np.arange(100, 200, 50)
# loss = ['ls', 'lad']
max_depth_range = np.arange(1, 6)
criterion = ['friedman_mse', 'mse']
max_features = ['auto', 'sqrt', 'log2']
min_samples_split = np.arange(2, 6)
min_samples_leaf = np.arange(1, 10)
param_grid_G = dict(max_depth=max_depth_range, criterion=criterion, max_features=max_features, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)

In [11]:
# ExtraTreesRegressor: 
# n_estimators = np.arange(50, 500, 50)
# max_features = ['auto', 'sqrt', 'log2']
# max_depth_range = np.arange(1, 6)
min_samples_split = np.arange(2, 6)
min_samples_leaf = np.arange(1, 6)
param_grid_H = dict(min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)

In [12]:
# BaggingRegressor: 
n_estimators = np.arange(50, 500, 50)
# max_features = [1.0, 2.0]
# max_samples = np.arange(0.0, 2.0)
param_grid_I = dict(n_estimators=n_estimators)

In [13]:
# AdaBoostRegressor: 
learning_rate = [0.001, 0.01, 1.0]
# n_estimators = np.arange(50, 500, 50)
loss = ['linear', 'square', 'exponential']
param_grid_J = dict(learning_rate=learning_rate, loss=loss)

In [14]:
# XGBRegressor:
colsample_bytree = [0.9, 1.0]
gamma = [0.0, 0.1]
learning_rate = [0.1, 0.5]
max_depth = np.arange(3, 5)
min_child_weight = np.arange(1, 4)
n_estimators = np.linspace(100, 10000, 100)
reg_alpha =  [0.0, 0.05, 0.1, 0.2, 0.3, 0.4]
reg_lambda = [0.6, 0.7, 0.8, 0.9, 0.95, 1.0]
subsample = [0.6, 0.7, 0.8, 0.9, 0.95, 1.0]

param_grid_K = dict(colsample_bytree =colsample_bytree, gamma=gamma, max_depth=max_depth, reg_alpha=reg_alpha, reg_lambda=reg_lambda,  subsample=subsample)

#### Load the models dictionary file via pickle

In [15]:
# load the regressors dictioanry
filename = 'regs_dict.dict'

# load the models dictionary according to selected filename
regs_dict = pickle.load(open(filename, 'rb'))

In [17]:
# top2 features with untuned regressors
# selected_regressors = regs_dict['top_2']['untuned']

# top10 features with untuned regressors
# selected_regressors = regs_dict['top_10']['untuned']

# all features with untuned regressors
selected_regressors = regs_dict['all']['untuned']

#### Combine regressors and hyperparameters in a tuple

In [18]:
# tune all models
models = ['DecisionTreeRegressor', 'SVR', 'ElasticNet', 'Lasso', 'LassoLars', 'BayesianRidge', 'GradientBoostingRegressor', 'ExtraTreesRegressor', 'BaggingRegressor', 'AdaBoostRegressor', 'XGBRegressor']
param_grids = [param_grid_A, param_grid_B, param_grid_C, param_grid_D, param_grid_E, param_grid_F, param_grid_G, param_grid_H, param_grid_I, param_grid_J, param_grid_K]
regressor_tuples = zip(models, param_grids)

In [45]:
# debugging one by one
models = ['SVR']
param_grids = [param_grid_B]
regressor_tuples = zip(models, param_grids)

### Define tune_parameters

In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold

from time import time

# initialize empty dictionary
tuned_regressors = {}

# loop throught all regressors and perform gridsearch
def tune_parameters(regressor_tuples):
    for reg, param_grid in regressor_tuples:
        # Regressor name
        print '{}\n'.format(selected_regressors[reg].__class__.__name__)
        
        # Indicate the classifier and the training set size
        print "Training a {} using a training set size of {}. . .".format(selected_regressors[reg].__class__.__name__, len(X_train))
        
        # Old regressor
        print '{} Old regressor:\n{}\n'.format(selected_regressors[reg].__class__.__name__, selected_regressors[reg])
        
        start = time()
        from sklearn.metrics import mean_squared_error as mse
        def rmse(y_pred, y_test):
            #y_pred = reg.predict(X_test)
            mse_score = mse(y_test, y_pred)
            rmse_score = np.sqrt(mse_score)
            return rmse_score

        # Make an rmse scoring function using 'make_scorer' 
        rmse_scorer = make_scorer(rmse, greater_is_better=False)
        
        # K-fold
        kf = KFold(n_splits=4, random_state=2, shuffle=True)
        cv = kf

        # Perform grid search on the classifier using the f1_scorer as the scoring method
        grid_obj = GridSearchCV(selected_regressors[reg], param_grid, scoring=rmse_scorer)

        # Fit the grid search object to the training data and find the optimal parameters
        start_train = time()
        grid_obj = grid_obj.fit(X_train, y_train)
        end_train = time()
        
        # Get the estimator
        reg = grid_obj.best_estimator_
        tuned_regressors[reg.__class__.__name__] = reg
        end_grid = time()
        
        # tuned regressor
        print 'Tuned regressor:\n{}\n'.format(reg)

        # Calculate rmsle
        start_test = time()
        # Predict 
        y_pred = reg.predict(X_test)
        rmse_score = rmse(y_pred, y_test)
        
        end_test = time()
        train_time = end_train - start_train
        grid_time = end_grid - start
        test_time = end_test - start_test
        
        # calculate r2 score
        from sklearn.metrics import r2_score
        score = reg.score(X_test, y_test)
        
        # training time
        print 'Trained model in: {}'.format(train_time)
        
        # testing time
        print 'Test model in : {}'.format(test_time)
        
        # grid search time
        print 'GridSearchCV performed in : {}\n'.format(grid_time)
        
        # r2 score
        print 'r2 score is: {}'.format(score)
        
        # rmsle score
        print 'rmsle score is: {}\n'.format(rmse_score)

### Perform grid search of all models

In [20]:
tune_parameters(regressor_tuples)

DecisionTreeRegressor

Training a DecisionTreeRegressor using a training set size of 1164. . .
DecisionTreeRegressor Old regressor:
DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=2,
           splitter='best')

Tuned regressor:
DecisionTreeRegressor(criterion='mse', max_depth=7, max_features='auto',
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=8, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=2,
           splitter='best')

Trained model in: 259.116488934
Test model in : 0.00585913658142
GridSearchCV performed in : 259.117077827

r2 score is: 0.725685598036
rmsle score is: 0.213334028096

SVR

Training a SVR using a training set size of 1164. . .
SVR Old regressor:
SVR(C=1.0, cache_size=200, coef0=0.0

  y = column_or_1d(y, warn=True)


Tuned regressor:
SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=1500, shrinking=True, tol=0.001, verbose=False)

Trained model in: 7.74384188652
Test model in : 0.105340003967
GridSearchCV performed in : 7.7450940609

r2 score is: -0.0055317210357
rmsle score is: 0.408445124859

ElasticNet

Training a ElasticNet using a training set size of 1164. . .
ElasticNet Old regressor:
ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=2, selection='cyclic', tol=0.0001, warm_start=False)

Tuned regressor:
ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=2, selection='cyclic', tol=0.0002, warm_start=False)

Trained model in: 3.20535993576
Test model in : 0.00382208824158
GridSearchCV performed in : 3.20547318459

r2 score is: 0.



Tuned regressor:
Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=2,
   selection='random', tol=5e-05, warm_start=False)

Trained model in: 15.4352319241
Test model in : 0.00264286994934
GridSearchCV performed in : 15.4354019165

r2 score is: 0.840188763331
rmsle score is: 0.16283189797

LassoLars

Training a LassoLars using a training set size of 1164. . .
LassoLars Old regressor:
LassoLars(alpha=1.0, copy_X=True, eps=2.2204460492503131e-16,
     fit_intercept=True, fit_path=True, max_iter=500, normalize=True,
     positive=False, precompute='auto', verbose=False)

Tuned regressor:
LassoLars(alpha=0.0001, copy_X=True, eps=2.2204460492503131e-16,
     fit_intercept=True, fit_path=True, max_iter=500, normalize='True',
     positive=False, precompute='auto', verbose=False)

Trained model in: 1.16700983047
Test model in : 0.00208282470703
GridSearchCV performed in : 1.16716885567

r2 score is: 0.9013678829

  estimator.fit(X_train, y_train, **fit_params)
  best_estimator.fit(X, y, **self.fit_params)


Tuned regressor:
ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=4,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=10, n_jobs=1, oob_score=False, random_state=2,
          verbose=0, warm_start=False)

Trained model in: 15.0411059856
Test model in : 0.0138528347015
GridSearchCV performed in : 15.0414431095

r2 score is: 0.830549275096
rmsle score is: 0.167670841697

BaggingRegressor

Training a BaggingRegressor using a training set size of 1164. . .
BaggingRegressor Old regressor:
BaggingRegressor(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=1, oob_score=False, random_state=2,
         verbose=0, warm_start=False)

Tuned regressor:
BaggingRegressor(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features

#### save the regressors in the corresponding dictionary

In [22]:
# top2 features with tuned regressors
# regs_dict['top_2']['tuned'] = tuned_regressors

# top10 features with tuned regressors
# regs_dict['top_10']['tuned'] = tuned_regressors

# all features with tuned regressors
regs_dict['all']['tuned'] = tuned_regressors

### Save the regressor dict via pickle

In [23]:
# Save regressors in dictionary
filename = 'regs_dict.dict'
pickle.dump(regs_dict, open(filename, 'wb'))
print '{} saved!'.format(filename)

regs_dict.dict saved!
