# Project: House Prices - Advanced Regression Techniques

## Default Regression Models

In [1]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from IPython.display import display # Allows the use of display() for DataFrames

# plotting with pyplot
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
from scipy.stats import norm

# to work with files
import pickle

# Pretty display for notebooks
%matplotlib inline

### Load the corresponding dataset via pickle

Run only one of the following cells

In [24]:
# Read the pickled datasets restricted to the two most correlated features.
# The three files are read in the same order as the names on the left-hand side.
features, log_prices, public_features = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        'features_top2.pkl',
        'log_prices_top2.pkl',
        'public_features_top2.pkl',
    )
)

In [25]:
# Read the pickled datasets restricted to the ten most correlated features.
# The three files are read in the same order as the names on the left-hand side.
features, log_prices, public_features = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        'features_top10.pkl',
        'log_prices_top10.pkl',
        'public_features_top10.pkl',
    )
)

In [3]:
# Read the pickled datasets that keep every feature.
# The three files are read in the same order as the names on the left-hand side.
features, log_prices, public_features = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        'features_all.pkl',
        'log_prices_all.pkl',
        'public_features_all.pkl',
    )
)

### Split the data

In [38]:
# 'train_test_split' shuffles and partitions the data in a single call
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical from run to run
X_train, X_test, y_train, y_test = train_test_split(
    features,
    log_prices,
    test_size=0.2,
    random_state=2,
)

### Test regression Models

In [22]:
from time import time

def train_regressor(reg, X_train, y_train):
    ''' Fits a regressor to the training data and prints how long fitting took.

    Parameters
    ----------
    reg : estimator exposing ``fit(X, y)``; it is fitted in place.
    X_train : training features, handed to ``reg.fit`` unchanged.
    y_train : training targets, handed to ``reg.fit`` unchanged.

    Returns
    -------
    None. The elapsed wall-clock time is printed, not returned.
    '''

    # Start the clock, train the regressor, then stop the clock
    start = time()
    reg.fit(X_train, y_train)
    end = time()

    # Print the results. print() with a single argument produces the same
    # output on Python 2 and Python 3, unlike the print statement.
    print("Trained model in {:.4f} seconds".format(end - start))
    
def make_prediction(reg, X_test, y_test):
    ''' Scores a fitted regressor on a test set and times its predictions.

    Parameters
    ----------
    reg : fitted estimator exposing ``score(X, y)`` and ``predict(X)``.
    X_test : test features.
    y_test : true targets for ``X_test``. The notebook passes log sale
        prices, which is why the RMSE computed here is reported as RMSLE.

    Returns
    -------
    tuple ``(total_time, score, rmsle_score, y_pred)``
        total_time  : seconds spent in ``reg.predict`` plus the RMSE computation
                      (the ``reg.score`` call is not timed).
        score       : whatever ``reg.score(X_test, y_test)`` returns
                      (callers print it as the R2 score).
        rmsle_score : square root of the mean squared error between
                      ``y_test`` and ``y_pred``.
        y_pred      : predictions returned by ``reg.predict(X_test)``.
    '''

    # Kept as a function-local import so the cell defining this function does
    # not need scikit-learn until a prediction is actually made.
    from sklearn.metrics import mean_squared_error as mse

    score = reg.score(X_test, y_test)

    start = time()
    # The error metric: RMSE on the log of the sale prices.
    y_pred = reg.predict(X_test)
    rmsle_score = np.sqrt(mse(y_test, y_pred))
    end = time()
    total_time = end - start

    return total_time, score, rmsle_score, y_pred
    
def train_predict(reg, X_train, y_train, X_test, y_test):
    ''' Trains a regressor, evaluates it on the test set and prints a report.

    Parameters
    ----------
    reg : estimator to fit and evaluate; it is fitted in place.
    X_train, y_train : training features and targets.
    X_test, y_test : test features and targets.

    Returns
    -------
    None. The class name, training-set size, fit time, prediction time,
    R2 score and RMSLE are printed.
    '''

    # Indicate the regressor and the training set size
    print("Training a {} using a training set size of {}. . .".format(reg.__class__.__name__, len(X_train)))

    # Train the regressor
    train_regressor(reg, X_train, y_train)

    # Evaluate once and reuse the result: the original called make_prediction
    # three times, which predicted and scored the test set three times over.
    total_time, score, rmsle_score, _ = make_prediction(reg, X_test, y_test)

    # Print the results of prediction on the test set
    print("Test model in {:.4f} seconds".format(total_time))
    print("R2 score for test set: {}".format(score))
    print("RMSLE score for test set: {}\n".format(rmsle_score))

In [39]:
# Import the supervised learning models from sklearn

from sklearn.tree import DecisionTreeRegressor
from sklearn import svm
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import LassoLars
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor
import xgboost as xgb

# Initialize the models with their default hyper-parameters.
# random_state is fixed only on the estimators where the original cell fixed it.
# The reg_A..reg_K names are kept because the cell that stores the regressors
# in regs_dict refers to them individually.
reg_A = DecisionTreeRegressor(random_state=2)
reg_B = svm.SVR()
reg_C = ElasticNet(random_state=2)
reg_D = Lasso(random_state=2)
reg_E = LassoLars()
reg_F = BayesianRidge()
reg_G = GradientBoostingRegressor() # default loss function for regression is least squares ('ls')
reg_H = ExtraTreesRegressor(random_state=2)
reg_I = BaggingRegressor(random_state=2)
reg_J = AdaBoostRegressor(random_state=2)

# Install xgboost https://xgboost.readthedocs.io/en/latest/build.html
reg_K = xgb.XGBRegressor()

# Execute the 'train_predict' function for each regressor.
# print() with a single argument behaves the same on Python 2 and Python 3.
for reg in [reg_A, reg_B, reg_C, reg_D, reg_E, reg_F, reg_G, reg_H, reg_I, reg_J, reg_K]:
    print("\n{}: \n".format(reg.__class__.__name__))
    train_predict(reg, X_train, y_train, X_test, y_test)


DecisionTreeRegressor: 

Training a DecisionTreeRegressor using a training set size of 1164. . .
Trained model in 0.0764 seconds
Test model in 0.0010 seconds
R2 score for test set: 0.681487629803
RMSLE score for test set: 0.229878831485


SVR: 

Training a SVR using a training set size of 1164. . .
Trained model in 0.2621 seconds
Test model in 0.0175 seconds
R2 score for test set: 0.0163711960787
RMSLE score for test set: 0.4039721706


ElasticNet: 

Training a ElasticNet using a training set size of 1164. . .
Trained model in 0.0325 seconds
Test model in 0.0007 seconds
R2 score for test set: 0.824931281324
RMSLE score for test set: 0.170427669581


Lasso: 

Training a Lasso using a training set size of 1164. . .
Trained model in 0.0026 seconds
Test model in 0.0008 seconds
R2 score for test set: 0.818117192459
RMSLE score for test set: 0.173712733227


LassoLars: 

Training a LassoLars using a training set size of 1164. . .
Trained model in 0.0412 seconds
Test model in 0.0007 seconds




RMSLE score for test set: 0.174096124767


BaggingRegressor: 

Training a BaggingRegressor using a training set size of 1164. . .
Trained model in 0.1938 seconds
Test model in 0.0056 seconds
R2 score for test set: 0.819752024583
RMSLE score for test set: 0.172930272807


AdaBoostRegressor: 

Training a AdaBoostRegressor using a training set size of 1164. . .
Trained model in 0.3396 seconds
Test model in 0.0091 seconds
R2 score for test set: 0.760534238021
RMSLE score for test set: 0.199323058721


XGBRegressor: 

Training a XGBRegressor using a training set size of 1164. . .
Trained model in 0.3249 seconds
Test model in 0.0025 seconds
R2 score for test set: 0.842019424436
RMSLE score for test set: 0.161896580093



### Export the regressors to a dictionary via pickle

#### An empty dictionary is already provided, use the following line to reset the entire dictionary of regressors

In [2]:
## an empty dictionary is already provided, use this to reset the entire dictionary of regressors

# initialize only the first time, otherwise the dictionary will be reset
# regs_dict = {'top_2': {'untuned': {}, 'tuned': {}}, 'top_10': {'untuned': {}, 'tuned': {}}, 'all': {'untuned': {}, 'tuned': {}}}
# Save regressors in dictionary
# filename = 'regs_dict.dict'
# pickle.dump(regs_dict, open(filename, 'wb'))
# print '{} saved!'.format(filename)

regs_dict.dict saved!


#### Load the models dictionary file via pickle

In [30]:
# load the regressors dictionary
filename = 'regs_dict.dict'

# load the models dictionary according to selected filename.
# The 'with' block closes the file handle even if unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a regs_dict.dict that this project wrote itself.
with open(filename, 'rb') as dict_file:
    regs_dict = pickle.load(dict_file)

In [31]:
# Store each fitted regressor in the dictionary, keyed by its class name,
# under the 'untuned' entry of the feature set that was used for training.
for reg in [reg_A, reg_B, reg_C, reg_D, reg_E, reg_F, reg_G, reg_H, reg_I, reg_J, reg_K]:
    # select only one of the following three lines according to the features used
    # regs_dict['top_2']['untuned'][reg.__class__.__name__] = reg
    # regs_dict['top_10']['untuned'][reg.__class__.__name__] = reg
    regs_dict['all']['untuned'][reg.__class__.__name__] = reg
    # print() with a single argument behaves the same on Python 2 and Python 3
    print('{} saved!'.format(reg.__class__.__name__))

DecisionTreeRegressor saved!
SVR saved!
ElasticNet saved!
Lasso saved!
LassoLars saved!
BayesianRidge saved!
GradientBoostingRegressor saved!
ExtraTreesRegressor saved!
BaggingRegressor saved!
AdaBoostRegressor saved!
XGBRegressor saved!


### Save the regressor dict via pickle

In [32]:
# Save the regressors dictionary to disk
filename = 'regs_dict.dict'
# The 'with' block flushes and closes the file before the success message is
# printed; the original left the handle open for the garbage collector.
with open(filename, 'wb') as dict_file:
    pickle.dump(regs_dict, dict_file)
print('{} saved!'.format(filename))

regs_dict.dict saved!
