In [119]:
# Common imports
import numpy as np
import os

# Seed NumPy's global random number generator so that code drawing from it
# produces the same values on every run of this notebook.
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Font sizes for axis labels and tick labels, applied to every figure drawn
# after this cell.
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Where to save the figures
IMAGES_PATH = "images/"

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        Base file name (without extension) of the image to write.
    tight_layout : bool
        When true, ``plt.tight_layout()`` is called before saving.
    fig_extension : str
        File extension; also passed to ``plt.savefig`` as ``format``.
    resolution : int
        Passed to ``plt.savefig`` as ``dpi``.
    """
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    # savefig does not create missing directories, so make sure the target
    # directory exists. isdir + makedirs (rather than exist_ok=True) keeps
    # this working on Python 2 as well as Python 3.
    target_dir = os.path.dirname(path)
    if target_dir and not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)


In [120]:
import pandas as pd

HOUSING_PATH = "data/"

def load_housing_data(housing_path=HOUSING_PATH):
    """Read the training and test CSV files located in `housing_path`.

    Returns a ``(train, test)`` tuple of DataFrames loaded from
    ``train.csv`` and ``test.csv`` respectively.
    """
    train_df, test_df = (
        pd.read_csv(os.path.join(housing_path, csv_name))
        for csv_name in ("train.csv", "test.csv")
    )
    return train_df, test_df

In [121]:
# Load the training and test frames from the default HOUSING_PATH, then
# preview the first rows of the training frame.
housing, housing_test = load_housing_data()
housing.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [122]:
# Target column, kept both on its raw scale and as its natural log.
housing_labels = housing["SalePrice"].copy()
housing_labels_log = np.log(housing_labels)

# Split the predictors by dtype: numeric columns (target removed) and
# object-typed columns.
# NOTE(review): the numeric columns include the row identifier `Id` shown by
# housing.head(); consider excluding it from the features — confirm.
housing_num = housing.select_dtypes(include=[np.number]).drop("SalePrice", axis=1)
housing_cat = housing.select_dtypes(include="object")

# Column names of each group, as plain Python lists.
attribs_num = housing_num.columns.tolist()
attribs_cat = housing_cat.columns.tolist()

In [123]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing: fill missing values with the column median, then
# standardize.
pipeline_num = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

In [124]:
# OneHotEncoder must be in scope before this cell executes, so import it here
# rather than relying on a later cell having been run first.
from sklearn.preprocessing import OneHotEncoder

# Categorical preprocessing: fill missing values with each column's most
# frequent value, then one-hot encode.
pipeline_cat = Pipeline([
        ('imputer', SimpleImputer(strategy="most_frequent")),
        # handle_unknown="ignore": a category that was absent when the encoder
        # was fitted is encoded as all zeros at transform time instead of
        # raising an error.
        ('one_hot_encoder', OneHotEncoder(handle_unknown="ignore")),
    ])

In [125]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Send the numeric and categorical columns through their own pipelines; the
# ColumnTransformer concatenates the two results.
full_pipeline = ColumnTransformer(transformers=[
    ("num", pipeline_num, attribs_num),
    ("cat", pipeline_cat, attribs_cat),
])

# Fit the preprocessing on the training frame and transform it in one pass.
housing_prepared = full_pipeline.fit_transform(housing)

In [126]:
from sklearn.linear_model import LinearRegression

# Baseline model: linear regression fitted on the prepared training matrix
# against the log of the sale price.
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels_log)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [127]:
# Sanity check: run the first five training rows through the already-fitted
# preprocessing pipeline and compare the predictions with their log labels.
some_data, some_labels = housing.iloc[:5], housing_labels_log.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)

print("Predictions:", lin_reg.predict(some_data_prepared))
print("Labels:", list(some_labels))

('Predictions:', array([12.23909825, 12.21342766, 12.27024074, 12.0035779 , 12.57480225]))
('Labels:', [12.247694320220994, 12.109010932687042, 12.31716669303576, 11.84939770159144, 12.429216196844383])


In [128]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the linear model; each score is a negated MSE.
scores = cross_val_score(
    lin_reg,
    housing_prepared,
    housing_labels_log,
    scoring="neg_mean_squared_error",
    cv=10,
)
# Flip the sign and take the square root to get per-fold RMSE, then summarize.
pd.Series(np.sqrt(-scores)).describe()

count    10.000000
mean      0.150392
std       0.048050
min       0.097441
25%       0.123460
50%       0.130596
75%       0.167647
max       0.262219
dtype: float64

In [129]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with a fixed seed, fitted on the full prepared training set
# (fit returns the estimator itself).
forest_reg = RandomForestRegressor(random_state=42).fit(housing_prepared, housing_labels_log)

# 10-fold cross-validation; scores are negated MSE values.
forest_scores = cross_val_score(
    forest_reg,
    housing_prepared,
    housing_labels_log,
    scoring="neg_mean_squared_error",
    cv=10,
)
# Per-fold RMSE summary.
pd.Series(np.sqrt(-forest_scores)).describe()

count    10.000000
mean      0.149567
std       0.014969
min       0.129736
25%       0.137959
50%       0.151025
75%       0.161912
max       0.169179
dtype: float64

In [130]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [25, 30, 35], 'max_features': [4, 8, 12, 16]},
    # then try 9 (3×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [25, 30, 35], 'max_features': [6, 8, 12]},
  ]

forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+9)*5=105 rounds of training
# scoring is negated MSE, so larger (closer to zero) is better;
# return_train_score=True also records the scores on the training folds.
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(housing_prepared, housing_labels_log)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_estimators': [25, 30, 35], 'max_features': [4, 8, 12, 16]}, {'n_estimators': [25, 30, 35], 'max_features': [6, 8, 12], 'bootstrap': [False]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [131]:
# Report the cross-validated RMSE of every hyperparameter combination tried.
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    # mean_test_score is a negated MSE; flip the sign before the square root.
    print(np.sqrt(-mean_score), params)
# Last expression of the cell: display the best combination found.
grid_search.best_params_

(0.1718736576061073, {'max_features': 4, 'n_estimators': 25})
(0.170402697998944, {'max_features': 4, 'n_estimators': 30})
(0.16983754520742905, {'max_features': 4, 'n_estimators': 35})
(0.16133809650355932, {'max_features': 8, 'n_estimators': 25})
(0.15961099804686302, {'max_features': 8, 'n_estimators': 30})
(0.15904646189363703, {'max_features': 8, 'n_estimators': 35})
(0.1578751012042337, {'max_features': 12, 'n_estimators': 25})
(0.15649378455108337, {'max_features': 12, 'n_estimators': 30})
(0.15497438274483732, {'max_features': 12, 'n_estimators': 35})
(0.1519190970944437, {'max_features': 16, 'n_estimators': 25})
(0.15059657160479, {'max_features': 16, 'n_estimators': 30})
(0.14944992342709365, {'max_features': 16, 'n_estimators': 35})
(0.15977693824521402, {'max_features': 6, 'n_estimators': 25, 'bootstrap': False})
(0.15755046092181155, {'max_features': 6, 'n_estimators': 30, 'bootstrap': False})
(0.15617321148597207, {'max_features': 6, 'n_estimators': 35, 'bootstrap': False

{'bootstrap': False, 'max_features': 12, 'n_estimators': 35}

In [132]:
# Best estimator selected by the grid search.
final_model = grid_search.best_estimator_

X_test = housing_test

# Reuse the preprocessing already fitted on the training frame (transform
# only, no refit) so the test columns line up with what the model was fitted on.
X_test_prepared = full_pipeline.transform(X_test)
# NOTE(review): the grid search was fitted on housing_labels_log, so these
# predictions are presumably log prices; apply np.exp before reporting them
# as prices — confirm against downstream use.
final_predictions = final_model.predict(X_test_prepared)


In [133]:
# Display the selected estimator and its hyperparameters.
final_model

RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=None,
           max_features=12, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=35, n_jobs=None, oob_score=False, random_state=42,
           verbose=0, warm_start=False)

In [158]:
# Inspect the category values learned by the fitted one-hot encoder
# (one array per categorical attribute).
full_pipeline.named_transformers_["cat"].named_steps["one_hot_encoder"].categories_


[array(['C (all)', 'FV', 'RH', 'RL', 'RM'], dtype=object),
 array(['Grvl', 'Pave'], dtype=object),
 array(['Grvl', 'Pave'], dtype=object),
 array(['IR1', 'IR2', 'IR3', 'Reg'], dtype=object),
 array(['Bnk', 'HLS', 'Low', 'Lvl'], dtype=object),
 array(['AllPub', 'NoSeWa'], dtype=object),
 array(['Corner', 'CulDSac', 'FR2', 'FR3', 'Inside'], dtype=object),
 array(['Gtl', 'Mod', 'Sev'], dtype=object),
 array(['Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr',
        'Crawfor', 'Edwards', 'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel',
        'NAmes', 'NPkVill', 'NWAmes', 'NoRidge', 'NridgHt', 'OldTown',
        'SWISU', 'Sawyer', 'SawyerW', 'Somerst', 'StoneBr', 'Timber',
        'Veenker'], dtype=object),
 array(['Artery', 'Feedr', 'Norm', 'PosA', 'PosN', 'RRAe', 'RRAn', 'RRNe',
        'RRNn'], dtype=object),
 array(['Artery', 'Feedr', 'Norm', 'PosA', 'PosN', 'RRAe', 'RRAn', 'RRNn'],
       dtype=object),
 array(['1Fam', '2fmCon', 'Duplex', 'Twnhs', 'TwnhsE'], dtype=object),
 

In [159]:
# Display the names of the categorical attributes passed to the "cat" pipeline.
attribs_cat

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [187]:
import copy
# Column names for the prepared matrix: the numeric attributes first, then one
# "<attribute> = <category>" entry per one-hot column, in encoder order.
feature_list = copy.deepcopy(attribs_num)
catagories_list = full_pipeline.named_transformers_["cat"].named_steps["one_hot_encoder"].categories_
for cat_attrib, cat_values in zip(attribs_cat, catagories_list):
    feature_list.extend(cat_attrib + ' = ' + value for value in cat_values)

# Convert the prepared matrix to a dense array and wrap it in a DataFrame
# labelled with the names built above.
housing_prepared_df = pd.DataFrame(data=housing_prepared.toarray(), columns=feature_list)

In [188]:
# Attach the raw (non-log) sale price as an extra column, aligned on the
# prepared frame's index.
housing_prepared_df["SalePrice"] = pd.Series(housing_labels, index=housing_prepared_df.index)

In [189]:
# Pairwise correlations between all columns of the prepared frame, then each
# column's correlation with SalePrice sorted from largest to smallest.
corr_matrix = housing_prepared_df.corr()
corr_matrix["SalePrice"].sort_values(ascending=False)

SalePrice                  1.000000
OverallQual                0.790982
GrLivArea                  0.708624
GarageCars                 0.640409
GarageArea                 0.623431
TotalBsmtSF                0.613581
1stFlrSF                   0.605852
FullBath                   0.560664
BsmtQual = Ex              0.553105
TotRmsAbvGrd               0.533723
YearBuilt                  0.522897
YearRemodAdd               0.507101
KitchenQual = Ex           0.504094
Foundation = PConc         0.497734
MasVnrArea                 0.472614
Fireplaces                 0.466929
GarageYrBlt                0.466754
ExterQual = Gd             0.452466
ExterQual = Ex             0.451164
BsmtFinType1 = GLQ         0.434597
HeatingQC = Ex             0.434543
GarageFinish = Fin         0.419678
Neighborhood = NridgHt     0.402149
BsmtFinSF1                 0.386420
SaleType = New             0.357509
SaleCondition = Partial    0.352060
LotFrontage                0.334771
MasVnrType = Stone         0