# Prediction Testbed for House Prices V3

## Goal: Obtain top 1% of Regression Performance

#### Steps:
1. Manipulate training set to perform a linear regression model
2. Train model using Linear Regression, XGBoost and Random Forest
3. Test Model - using a Stacking Model to improve performance

Using both categorical and numerical features

### Components of Module
1. Implement method for selecting Numerical Features
2. Implement method for selecting Categorical Features
3. Encode Categorical Features
4. Combine Numerical and Categorical Training Sets
5. Generate Training and Validation Set
6. Construct General Purpose, Stacked ML Model
7. Evaluate Performance on Validation Set

### Import Libraries and Datasets

In [25]:
# import libraries
import pandas as pd
from typing import Optional
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, BayesianRidge, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error  # using this metric as the main model performance metric
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
%matplotlib inline

Implement Python Module

In [23]:
# implementing data processing module
# include the following regressors: RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, XGBRegressor, LGBMRegressor, SGDRegressor, SVR, MLPRegressor

class StackedClassicalML:
    # Generate a general composite solution to regression problem using classical ML Regression techniques, with K-fold Cross Validation
    """
    LinearRegression
    SGDRegressor
    RandomForestRegressor
    GradientBoostingRegressor
    AdaBoostRegressor
    XGBRegressor
    LGBMRegressor
    SVR --> Support Vector Regressor
    MLPRegressor --> Multilayer Perceptron --> Neural Network
    """
    
    # constructor
    def __init__(self, train_df: pd.DataFrame, train_ground: pd.DataFrame, test_df: pd.DataFrame, k_fold: int = 5):
        self.train_raw = train_df.copy()
        self.train_ground = train_ground
        self.test_raw = test_df
        self.k_fold = k_fold
        self.train_df = self.train_raw.copy()
        self.test_df = self.test_raw.copy()
        self.train_combined = None
        self.encode_error_columns = []  # in case there are issues with categorical encoding
        self.final_columns = list(self.train_raw.columns)
        self.train_dataset = None
        self.test_dataset = None

        # select all features by default
        self.train_features_num, self.train_features_cat = StackedClassicalML.split_numeric_categorical(self, self.train_raw)
        self.num_features = list(self.train_features_num.columns)
        self.cat_features = list(self.train_features_cat.columns)

        # generate segmented datasets
        self.train_df_num = self.train_df[self.num_features]
        self.train_df_cat = self.train_df[self.cat_features]
        self.test_df_num = self.test_df[self.num_features]
        self.test_df_cat = self.test_df[self.cat_features]

        # ML params
        self.k = 5
        self.rf_parameters = {'n_estimators': [10, 50, 100, 500], 'max_depth': [4, 8, 16, 32, 64, None]}  # Random Forest
        self.gb_parameters = {'n_estimators': [10, 50, 100, 500, 1000], 'max_depth': [2, 4 , 8, 16]}  # Gradient Boosting Tree
        self.xgb_parameters = {'n_estimators': [10, 50, 100, 500, 1000], 'max_depth': [2, 4 , 8, 16]}  # XG Boost
        self.lgb_parameters = {'n_estimators': [10, 50, 100, 500, 1000], 'max_depth': [2, 4 , 8, 16]}  # LGBM
        
        

    # display features separately
    def display_describe_features(self):
        # display output
        display(self.train_features_num.describe())
        print('')
        display(self.train_features_cat.describe())

    # function to select num features
    def select_num_features(self, features: list = None):
        if features:
            self.num_features = features

        # output final datasets
        self.train_df_num = self.train_raw[self.num_features]
        self.test_df_num = self.test_raw[self.num_features]
        
    # function to select cat_features
    def select_cat_features(self, features: list = None):
        if features:
            self.cat_features = features

        # output final datasets
        self.train_df_cat = self.train_raw[self.cat_features]
        self.test_df_cat = self.test_raw[self.cat_features]

    def update_columns(self, num_features: Optional[list], cat_features: Optional[list]):
        # update both num and cat features
        self.select_num_features(num_features)
        self.select_cat_features(cat_features)

        print('Training and Test Datasets Updated')

    # perform categorical encoding on overall categorical dataset
    def label_encode(self):
        # have to encode all categorical variables at the start, train labelencoder, for both training and test sets
        # impute first, ensuring no data leakage
        train_df_cat_temp = StackedClassicalML.impute_mode(self.train_df_cat.copy())
        test_df_cat_temp = StackedClassicalML.impute_mode(self.test_df_cat.copy())

        # loop through columns
        for col in self.train_df_cat:
            encoder = LabelEncoder()
            encoder.fit(self.train_df_cat[col])  # train label encoder
            # after training label encoder, transform both training and test sets, using encoder trained on training set
            try:
                train_df_cat_temp[col] = encoder.transform(self.train_df_cat[col])
                test_df_cat_temp[col] = encoder.transform(self.test_df_cat[col])
            except:
                self.encode_error_columns.append(col)
                print(f"Error for Column {col}")
                continue  # move onto next loop

        # change
        self.train_df_cat = train_df_cat_temp.copy()
        self.test_df_cat = test_df_cat_temp.copy()
        return train_df_cat_temp

    # combine final training set before performing normalisation
    def combine_encoded_data(self):
        # process numerical data
        self.process_numerical_data()

        # combine both numerical and categorical features
        self.train_combined = pd.concat([self.train_df_num, self.train_df_cat], axis=1)
        self.test_combined = pd.concat([self.test_df_num, self.test_df_cat], axis=1)

    # normalise training and test datasets
    def finalise_dataset(self):
        self.train_dataset = StackedClassicalML.normalise(self.train_combined)
        self.test_dataset = StackedClassicalML.normalise(self.test_combined)
        
    # process numerical data, using mean
    def process_numerical_data(self):
        self.train_df_num = StackedClassicalML.impute_mean(self.train_df_num)
        self.test_df_num = StackedClassicalML.impute_mean(self.test_df_num)

    # train all default models, with k-fold cross validation
    def train_isolated_models(self):
        
        # train RandomForest
        rf = RandomForestRegressor()
        self.rf_cv = GridSearchCV(rf, self.rf_parameters, cv=self.k)
        self.rf_cv.fit(self.train_dataset, self.train_ground)

        # train Gradient Boosting Regressor
        gb = GradientBoostingRegressor()
        self.gb_cv = GridSearchCV(gb, self.gb_parameters, cv=self.k)
        self.gb_cv.fit(self.train_dataset, self.train_ground)

        # train XGBoost
        xgb = XGBRegressor()
        self.xgb_cv = GridSearchCV(xgb, self.xgb_parameters, cv=self.k)
        self.xgb_cv.fit(self.train_dataset, self.train_ground)

        # train AdaBoost
        adb = AdaBoostRegressor()
        self.adb_cv = GridSearchCV(adb, self.xgb_parameters, cv=self.k)
        self.adb_cv.fit(self.train_dataset, self.train_ground)

        # LGBMRegressor(n_estimators=200,learning_rate=0.1,n_jobs=-1),

    
# split categorical and numerical variables
    @staticmethod
    def split_numeric_categorical(self, data: pd.DataFrame) -> tuple:
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        train_features_num = data.select_dtypes(include=numerics)
        train_features_cat = data.select_dtypes(include=object)

        # return dataframes in a tuple
        return (train_features_num, train_features_cat)


    # perform normalisation
    @staticmethod
    def normalise(data: pd.DataFrame) -> pd.DataFrame:
        # apply StandardScaler
        scaler = StandardScaler()
        output = scaler.fit_transform(data)
        return output
        

    @staticmethod
    def impute_mode(data: pd.DataFrame) -> pd.DataFrame:
        # impute categorical features using mode
        return data.fillna(value=data.mode())

    @staticmethod
    def impute_mean(data: pd.DataFrame) -> pd.DataFrame:
        # impute numerical features using mean
        return data.fillna(value=data.mean())


IndentationError: unexpected indent (3604750009.py, line 150)

In [15]:
# read datasets
truth_col = 'SalePrice'
train_raw = pd.read_csv("train.csv")
test_raw = pd.read_csv("test.csv")
train_df = train_raw.copy()
train_truth = train_df[[truth_col]].copy()
train_df.drop(truth_col, axis=1, inplace=True)
test_df = test_raw.copy()
sample_submission = pd.read_csv("sample_submission.csv")

In [16]:
# initialise mod
mod = StackedClassicalML(train_df, train_truth, test_df)
mod.label_encode()
mod.combine_encoded_data()

# remove error data and update again
allowed_cat_features = [x for x in mod.cat_features if x not in mod.encode_error_columns]
mod.update_columns(num_features=mod.num_features, cat_features=allowed_cat_features)
mod.label_encode()
mod.combine_encoded_data()

Error for Column MSZoning
Error for Column Utilities
Error for Column Exterior1st
Error for Column Exterior2nd
Error for Column KitchenQual
Error for Column Functional
Error for Column SaleType
Training and Test Datasets Updated


In [17]:
mod.finalise_dataset()

In [20]:
pd.DataFrame(mod.train_dataset)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,63,64,65,66,67,68,69,70,71,72
0,-1.730865,0.073375,-0.229372,-0.207142,0.651479,-0.517200,1.050994,0.878668,0.511418,0.575425,...,0.854650,-0.768736,-0.318475,0.11211,0.0689,0.289745,0.063305,0.457447,0.191594,0.208502
1,-1.728492,-0.872563,0.451936,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.574410,1.171992,...,0.139585,-0.768736,-0.318475,0.11211,0.0689,0.289745,0.063305,0.457447,0.191594,0.208502
2,-1.726120,0.073375,-0.093110,0.073480,0.651479,-0.517200,0.984752,0.830215,0.323060,0.092907,...,0.139585,-0.768736,-0.318475,0.11211,0.0689,0.289745,0.063305,0.457447,0.191594,0.208502
3,-1.723747,0.309859,-0.456474,-0.096897,0.651479,-0.517200,-1.863632,-0.720298,-0.574410,-0.499274,...,-1.290545,1.301075,0.801942,0.11211,0.0689,0.289745,0.063305,0.457447,0.191594,-3.426284
4,-1.721374,0.073375,0.633618,0.375148,1.374795,-0.517200,0.951632,0.733308,1.364570,0.463568,...,0.139585,-0.768736,-0.318475,0.11211,0.0689,0.289745,0.063305,0.457447,0.191594,0.208502
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1.721374,0.073375,-0.365633,-0.260560,-0.071836,-0.517200,0.918511,0.733308,-0.574410,-0.973018,...,0.139585,-0.768736,-0.318475,0.11211,0.0689,0.289745,0.063305,0.457447,0.191594,0.208502
1456,1.723747,-0.872563,0.679039,0.266407,-0.071836,0.381743,0.222975,0.151865,0.084843,0.759659,...,0.139585,-0.768736,0.801942,0.11211,0.0689,0.289745,0.063305,-1.390058,0.191594,0.208502
1457,1.726120,0.309859,-0.183951,-0.147810,0.651479,3.078570,-1.002492,1.024029,-0.574410,-0.369871,...,-1.290545,-0.768736,-0.318475,0.11211,0.0689,0.289745,0.063305,-3.237563,-4.759330,0.208502
1458,1.728492,-0.872563,-0.093110,-0.080160,-0.795151,0.381743,-0.704406,0.539493,-0.574410,-0.865548,...,0.854650,-0.768736,0.801942,0.11211,0.0689,0.289745,0.063305,0.457447,0.191594,0.208502


In [32]:
mod.test_combined

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleCondition
0,1461,20,80.0,11622,5,6,1961,1961,0.0,468.0,...,,Attchd,Unf,TA,TA,Y,,MnPrv,,Normal
1,1462,20,81.0,14267,6,6,1958,1958,108.0,923.0,...,,Attchd,Unf,TA,TA,Y,,,Gar2,Normal
2,1463,60,74.0,13830,5,5,1997,1998,0.0,791.0,...,TA,Attchd,Fin,TA,TA,Y,,MnPrv,,Normal
3,1464,60,78.0,9978,6,6,1998,1998,20.0,602.0,...,Gd,Attchd,Fin,TA,TA,Y,,,,Normal
4,1465,120,43.0,5005,8,5,1992,1992,0.0,263.0,...,,Attchd,RFn,TA,TA,Y,,,,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,21.0,1936,4,7,1970,1970,0.0,0.0,...,,,,,,Y,,,,Normal
1455,2916,160,21.0,1894,4,5,1970,1970,0.0,252.0,...,,CarPort,Unf,TA,TA,Y,,,,Abnorml
1456,2917,20,160.0,20000,5,7,1960,1996,0.0,1224.0,...,TA,Detchd,Unf,TA,TA,Y,,,,Abnorml
1457,2918,85,62.0,10441,5,5,1992,1992,0.0,337.0,...,,,,,,Y,,MnPrv,Shed,Normal


In [None]:
    DecisionTreeRegressor(),
    LinearRegression(n_jobs=-1),
    RandomForestRegressor(n_estimators=200,n_jobs=-1),
    AdaBoostRegressor(n_estimators=200,learning_rate=0.1),
    XGBRegressor(n_estimators=200,learning_rate=0.1,n_jobs=-1),
    LGBMRegressor(n_estimators=200,learning_rate=0.1,n_jobs=-1),
    GradientBoostingRegressor(n_estimators=200,learning_rate=0.1)

In [7]:
# read datasets
train_raw = pd.read_csv("train.csv")
test_raw = pd.read_csv("test.csv")
train_df = train_raw.copy()
test_df = test_raw.copy()
sample_submission = pd.read_csv("sample_submission.csv")

In [8]:
sample_submission.head()

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.68357
3,1464,179317.477511
4,1465,150730.079977


### Understanding the training set, and performing feature engineering

Break up numeric and categorical features

In [9]:
train_price = train_df['SalePrice'].values.ravel()
train_features = train_df.drop('SalePrice', axis=1).copy()

In [10]:
# break up numeric and categorical features
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
train_features_num = train_features.select_dtypes(include=numerics)
train_features_cat = train_features.select_dtypes(include=object)

Handle Numeric Features

In [11]:
# check for correlation with price
train_features_num['SalePrice'] = train_price

In [12]:
rel_features_num_df = train_features_num.corr()[['SalePrice']].sort_values(by='SalePrice')
rel_features_num_df

Unnamed: 0,SalePrice
KitchenAbvGr,-0.135907
EnclosedPorch,-0.128578
MSSubClass,-0.084284
OverallCond,-0.077856
YrSold,-0.028923
LowQualFinSF,-0.025606
Id,-0.021917
MiscVal,-0.02119
BsmtHalfBath,-0.016844
BsmtFinSF2,-0.011378


In [13]:
# select features which makes sense
num_features = rel_features_num_df[rel_features_num_df['SalePrice'].abs() > 0.5]  # selecting only those with more than 50% correlation

In [14]:
train_df_num_final = train_df[num_features.index].drop('SalePrice', axis=1)

Handle Categorical Features

In [15]:
train_features_cat.columns

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')

In [16]:
# let the model train and see what happens
train_features_cat.describe()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
count,1460,1460,91,1460,1460,1460,1460,1460,1460,1460,...,1379,1379,1379,1379,1460,7,281,54,1460,1460
unique,5,2,2,4,4,2,5,3,25,9,...,6,3,5,5,3,3,4,4,9,6
top,RL,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,...,Attchd,Unf,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal
freq,1151,1454,50,925,1311,1459,1052,1382,225,1260,...,870,605,1311,1326,1340,3,157,49,1267,1198


In [17]:
# have to perform encoding of categorical features
train_features_cat_df = train_features_cat.describe().transpose()

In [18]:
# select only columns with sufficient data, and which the greatest frequency cannot be more than a threshold
freq_threshold = int(len(train_features_cat) * 0.90)  # mode frequency should be less than 90%, otherwise there isn't much information
train_features_cat_df = train_features_cat_df[train_features_cat_df['count'] > 1000]
train_features_cat_df = train_features_cat_df[train_features_cat_df['freq'] < freq_threshold]

### Manipulate Training Dataset

Select relevant columns and impute missing values

In [19]:
train_df_cat_final = train_df[train_features_cat_df.index]

In [20]:
train_df_cat_final = train_df_cat_final.fillna(value=train_df_cat_final.mode())

### Encode Categorical Features

In [21]:
# perform label encoding of categorical variables
for col in train_df_cat_final:
    train_df_cat_final[col] = LabelEncoder().fit_transform(train_df_cat_final[col])


### Combine both Numerical and Categorical Features and Perform Standard Scaling

In [22]:
train_df_final = pd.concat([train_df_num_final, train_df_cat_final], axis=1)

Split Training Set into Training Set and Test Set

In [23]:
tdf, vdf, tprice, vprice = train_test_split(train_df_final, train_price, test_size=0.2, random_state=12)

Perform Transformation of Data Set

In [24]:
def transform_dataset(test_df, num_cols):

    # break up numeric and categorical features
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    test_features_num = test_df.select_dtypes(include=numerics)
    test_features_cat = test_df.select_dtypes(include=object)
    
    # select num_columns
    test_features_num = test_features_num[num_cols]

    # apply label encoding of categorical variables
    for col in test_features_cat:
        test_features_cat[col] = LabelEncoder().fit_transform(test_features_cat[col])

    # generate final test set and perform standardisation
    test_df_final = pd.concat([test_features_num, test_features_cat], axis=1)

    return test_df_final
    

In [25]:
dd = transform_dataset(test_raw, num_features.index[:-1])

### Use Test Set as the Validation Set for Model Selection

In [28]:
# perform standard scaling
scaler = StandardScaler()
scaler.fit(train_df_final)
train_df_ff = pd.DataFrame(scaler.transform(train_df_final), columns=train_df_final.columns)


In [29]:
# transform test set before prediction
test_df = test_df[train_df_ff.columns]

In [30]:
# break up numeric and categorical features
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
test_features_num = test_df.select_dtypes(include=numerics)
test_features_cat = test_df.select_dtypes(include=object)

In [31]:
# apply label encoding of categorical variables
for col in test_features_cat:
    test_features_cat[col] = LabelEncoder().fit_transform(test_features_cat[col])

In [32]:
# generate final test set and perform standardisation
test_df_final = pd.concat([test_features_num, test_features_cat], axis=1)
test_df_ff = pd.DataFrame(scaler.transform(test_df_final), columns=test_df_final.columns)

### Train Model and Generate Predictions

In [33]:
test_df_ff.fillna(value=test_df_ff.mean(), inplace=True)

1. Implement Random Forest Model

In [44]:
# Perform Grid Search CV on Random Forest Model
rf = RandomForestRegressor()
rf_parameters = {
    'n_estimators': [10, 50, 100, 500],
    'max_depth': [4, 8, 16, 32, 64, None],
}
rf_cv = GridSearchCV(rf, rf_parameters, cv=5)
rf_cv.fit(train_df_ff.values, train_price)

In [47]:
# print results from CV
def print_cv_results(cv_model):
    # return results asd dataframe
    results = pd.DataFrame(cv_model.cv_results_)[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']].sort_values(by='rank_test_score')
    return results


In [48]:
print_cv_results(rf_cv)

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
15,"{'max_depth': 32, 'n_estimators': 500}",0.852911,0.029228,1
11,"{'max_depth': 16, 'n_estimators': 500}",0.851967,0.029521,2
14,"{'max_depth': 32, 'n_estimators': 100}",0.851829,0.034245,3
19,"{'max_depth': 64, 'n_estimators': 500}",0.851381,0.030427,4
18,"{'max_depth': 64, 'n_estimators': 100}",0.850037,0.031123,5
17,"{'max_depth': 64, 'n_estimators': 50}",0.849387,0.028472,6
7,"{'max_depth': 8, 'n_estimators': 500}",0.848857,0.025833,7
10,"{'max_depth': 16, 'n_estimators': 100}",0.848147,0.034222,8
6,"{'max_depth': 8, 'n_estimators': 100}",0.848097,0.030485,9
9,"{'max_depth': 16, 'n_estimators': 50}",0.847401,0.027515,10


2. Implement Gradient Boosted Trees

In [50]:
gb = GradientBoostingRegressor()
gb_parameters = {
    'n_estimators': [10, 50, 100, 500, 1000],
    'max_depth': [2, 4 , 8, 16]
}
gb_cv = GridSearchCV(gb, gb_parameters, cv=5)
gb_cv.fit(train_df_ff.values, train_price)

In [52]:
print_cv_results(gb_cv)

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
3,"{'max_depth': 2, 'n_estimators': 500}",0.881417,0.034573,1
7,"{'max_depth': 4, 'n_estimators': 100}",0.879644,0.034774,2
4,"{'max_depth': 2, 'n_estimators': 1000}",0.876592,0.038791,3
8,"{'max_depth': 4, 'n_estimators': 500}",0.875497,0.037481,4
9,"{'max_depth': 4, 'n_estimators': 1000}",0.874657,0.035225,5
6,"{'max_depth': 4, 'n_estimators': 50}",0.871728,0.038516,6
2,"{'max_depth': 2, 'n_estimators': 100}",0.86196,0.037053,7
1,"{'max_depth': 2, 'n_estimators': 50}",0.846438,0.022863,8
14,"{'max_depth': 8, 'n_estimators': 1000}",0.841385,0.038638,9
13,"{'max_depth': 8, 'n_estimators': 500}",0.840714,0.044227,10


3. Implement Linear Regression with L2 Regularisation

In [71]:
lr_parameters = {
    'alpha': [1, 2, 5, 10, 20, 50, 100, 200, 300, 400, 500]
    }
lr = Ridge()
lr_cv = GridSearchCV(lr, lr_parameters, cv=5)
lr_cv.fit(train_df_ff.values, train_price)

In [72]:
print_cv_results(lr_cv)

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
6,{'alpha': 100},0.801938,0.056878,1
7,{'alpha': 200},0.801412,0.055022,2
5,{'alpha': 50},0.801211,0.057931,3
8,{'alpha': 300},0.799981,0.053401,4
4,{'alpha': 20},0.799907,0.058643,5
3,{'alpha': 10},0.799195,0.058905,6
2,{'alpha': 5},0.798762,0.059042,7
1,{'alpha': 2},0.798473,0.059128,8
0,{'alpha': 1},0.798371,0.059157,9
9,{'alpha': 400},0.798203,0.051981,10


4. Implement Bayesian Ridge Regression

In [75]:
br_parameters = {
    'n_iter': [100, 300, 500, 1000]
    }
br = BayesianRidge()
br_cv = GridSearchCV(br, br_parameters, cv=5)
br_cv.fit(train_df_ff.values, train_price)

In [76]:
print_cv_results(br_cv)

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
0,{'n_iter': 100},0.80014,0.058523,1
1,{'n_iter': 300},0.80014,0.058523,1
2,{'n_iter': 500},0.80014,0.058523,1
3,{'n_iter': 1000},0.80014,0.058523,1


In [None]:
# collate results from all best models


5. Implement Neural Network Regressor

In [120]:
nn = MLPRegressor(hidden_layer_sizes=(1000, 2000, 2000, 2000, 1000), activation='relu', max_iter=6000)
nn.fit(train_df_ff.values, train_price)

In [121]:
nn_test_price = nn.predict(test_df_ff.values)
test_price = nn_test_price

In [None]:
nn = MLPRegressor()
nn_parameters = {
    'hidden_layer_sizes': [(100), (100, 100)]
    }
nn_cv = GridSearchCV(nn, nn_parameters, cv=5)
nn_cv.fit(train_df_ff.values, train_price)

In [117]:
nn_test_price = nn.predict(test_df_ff.values)
test_price = nn_test_price

In [127]:
# generate prediction
gb_test_price = gb_cv.best_estimator_.predict(test_df_ff.values)
rf_test_price = rf_cv.best_estimator_.predict(test_df_ff.values)
test_price = (gb_test_price + rf_test_price + nn_test_price) / 3

In [128]:
output = pd.DataFrame()
output['Id'] = test_raw['Id']
output['SalePrice'] = test_price

### Export Predicted Prices

In [129]:
filepath = 'submission.csv'
output.to_csv(filepath, index=False)  # remove index

In [130]:
output

Unnamed: 0,Id,SalePrice
0,1461,117911.723937
1,1462,157199.090566
2,1463,176489.393485
3,1464,185850.935169
4,1465,206085.102903
...,...,...
1454,2915,74008.480992
1455,2916,82531.878998
1456,2917,138881.748849
1457,2918,124737.989645
