# Project: House Prices - Advanced Regression Techniques

## Ensemble generation

In [37]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
import pickle

# Pretty display for notebooks
%matplotlib inline

In [36]:
# Load the Ames housing dataset from the raw CSV files.
# `test` is needed later: its "Id" column becomes the index of the submission frame.
train = pd.read_csv('data/house_prices/train.csv')
test = pd.read_csv('data/house_prices/test.csv')

### Load the corresponding dataset via pickle

Run only one of the following cells

In [3]:
# Loading datasets via pickle:
# work with the top two most correlated features.
# `features` / `log_prices` feed the train/test split; `public_features` is the
# input used for the competition predictions.
features = pd.read_pickle('features_top2.pkl')
log_prices = pd.read_pickle('log_prices_top2.pkl')
public_features = pd.read_pickle('public_features_top2.pkl')

In [27]:
# Loading datasets via pickle:
# work with the top ten most correlated features.
# `features` / `log_prices` feed the train/test split; `public_features` is the
# input used for the competition predictions.
features = pd.read_pickle('features_top10.pkl')
log_prices = pd.read_pickle('log_prices_top10.pkl')
public_features = pd.read_pickle('public_features_top10.pkl')

In [38]:
# Loading datasets via pickle:
# work with all features.
# `features` / `log_prices` feed the train/test split; `public_features` is the
# input used for the competition predictions.
features = pd.read_pickle('features_all.pkl')
log_prices = pd.read_pickle('log_prices_all.pkl')
public_features = pd.read_pickle('public_features_all.pkl')

### Split the data

In [39]:
# Import 'train_test_split'
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the shuffle reproducible
X_train, X_test, y_train, y_test = train_test_split(
    features,
    log_prices,
    random_state=2,
    test_size=0.2,
)

### Load the models dictionary file via pickle

In [40]:
# Load the regressors dictionary
filename = 'regs_dict.dict'

# NOTE(security): unpickling can execute arbitrary code, so only load a file
# that was produced by this project.
# The context manager closes the file handle even if unpickling fails; the
# previous `pickle.load(open(...))` left the handle open.
with open(filename, 'rb') as regs_file:
    regs_dict = pickle.load(regs_file)

#### Select the set of regressors to ensemble from the loaded dictionary

In [45]:
# top2 features with untuned regressors
# selected_regressors = regs_dict['top_2']['untuned']

# top2 features with tuned regressors
# selected_regressors = regs_dict['top_2']['tuned']

# top10 features with untuned regressors
# selected_regressors = regs_dict['top_10']['untuned']

# top10 features with tuned regressors
# selected_regressors = regs_dict['top_10']['tuned']

# all features with untuned regressors
# selected_regressors = regs_dict['all']['untuned']

# All features with tuned regressors (the active selection)
selected_regressors = regs_dict['all']['tuned']

# Gather every regressor of the selected set, in the dictionary's iteration order
models = [selected_regressors[key] for key in selected_regressors]

#### Select specific models from the dictionary to ensemble

In [70]:
# List here only the regressors that should act as base models of the ensemble;
# anything not listed is left out.
models = [
    regs_dict['all']['tuned']['BayesianRidge'],
    regs_dict['all']['tuned']['GradientBoostingRegressor'],
]

### Define ensemble

In [42]:
# Ensemble that averages the predictions of its base models
class Ensemble(object):
    """Average the predictions of several regressors.

    Parameters
    ----------
    base_models : iterable
        Objects exposing ``predict(X)``. Each must return one prediction per
        row of ``X`` (shape ``(n,)`` or ``(n, 1)``).
    """

    def __init__(self, base_models):
        # Store the models that were actually passed in. The previous version
        # read the global ``models`` here and silently ignored the argument.
        self.base_models = list(base_models)

    def predict(self, X, dictionary=None):
        """Return the element-wise mean of all base-model predictions.

        Parameters
        ----------
        X : array-like with a ``shape`` attribute
            Rows to predict; ``X.shape[0]`` fixes the output length.
        dictionary : unused
            Kept only so existing callers keep working.

        Returns
        -------
        numpy.ndarray of shape ``(X.shape[0],)``
        """
        individual = np.zeros((len(self.base_models), X.shape[0]))
        for i, model in enumerate(self.base_models):
            # ravel() lets both flat (n,) and column-shaped (n, 1) outputs
            # fill one row of the buffer.
            individual[i, :] = np.ravel(model.predict(X))

        return np.mean(individual, axis=0)

In [71]:
# load the models into the ensemble
ensemble = Ensemble(models)

### Results from ensemble

In [72]:
# calculate r2 score and rmsle score of ensemble
from sklearn.metrics import mean_squared_error as mse

y_pred = ensemble.predict(X_test)
def rmse(y_pred, y_test):
    """Return the square root of the mean squared error between the inputs.

    Note the argument order: predictions first, reference values second.
    """
    return np.sqrt(mse(y_test, y_pred))

rmse_score = rmse(y_pred, y_test)

# y_test comes from `log_prices`, so this RMSE of log values is labelled rmsle.
# print(...) with a single argument produces the same output on Python 2 and 3.
print('rmsle score is: {}'.format(rmse_score))

from sklearn.metrics import r2_score
# Store the result under its own name: assigning it back to `r2_score` would
# overwrite the imported function with a float.
r2 = r2_score(y_test, y_pred)

print('r2 score is: {}'.format(r2))

rmsle score is: 0.122978915078
r2 score is: 0.908843077059


### Apply ensemble to competition dataset and save in csv for submission

In [73]:
### export to csv for kaggle submission
# Predict on the competition features, then exponentiate to undo the log transform
y_pred = np.exp(ensemble.predict(public_features))
# Single-column frame for export, indexed by the Id column of the test CSV
pred_df = pd.DataFrame(data=y_pred, index=test["Id"], columns=["SalePrice"])

#### All regressors with default parameters

In [77]:
# The top two most correlated features.
# The frame's index is written out as the 'Id' column.
pred_df.to_csv('submission_top2.csv', header=True, index_label='Id')

In [20]:
# The top ten most correlated features.
# The frame's index is written out as the 'Id' column.
pred_df.to_csv('submission_top10.csv', header=True, index_label='Id')

In [11]:
# All engineered features.
# The frame's index is written out as the 'Id' column.
pred_df.to_csv('submission_all.csv', header=True, index_label='Id')

#### All regressors with tuned parameters

In [65]:
# The top two most correlated features.
# The frame's index is written out as the 'Id' column.
pred_df.to_csv('submission_top2_tuned.csv', header=True, index_label='Id')

In [37]:
# The top ten most correlated features.
# The frame's index is written out as the 'Id' column.
pred_df.to_csv('submission_top10_tuned.csv', header=True, index_label='Id')

In [49]:
# All engineered features.
# The frame's index is written out as the 'Id' column.
pred_df.to_csv('submission_all_tuned.csv', header=True, index_label='Id')

#### custom

In [74]:
# All engineered features.
# NOTE(review): presumably the export for the hand-picked model list — confirm
# which `models` selection was active before submitting.
pred_df.to_csv('submission_custom3.csv', header=True, index_label='Id')