# House Price Predictor

* In this notebook I will be uisng machine learning algorithms to determine the prices of houses using the data set from Kaggle.

### Import Requried Libraires for analysis

In [37]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings("ignore")

### Import the datasets

In [2]:
# Import the training data
path = Path("Data/train.csv")
test_path = Path("Data/test.csv")

In [3]:
# create dataframe from this csv
train_df = pd.read_csv(path)
train_df.head(
)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
test_df = pd.read_csv(test_path)
test_df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [6]:
# check for null values in both datasets
test_df.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           4
LotFrontage      227
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           1
SaleCondition      0
Length: 80, dtype: int64

In [7]:
train_df.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [9]:
# Check columns types and correct if necessary
# Call the object columns to make sure they should be categorized as object or if shouls be replaced
train_df.dtypes[(train_df.dtypes!="int64") & (train_df.dtypes!="float64")]

MSZoning         object
Street           object
Alley            object
LotShape         object
LandContour      object
Utilities        object
LotConfig        object
LandSlope        object
Neighborhood     object
Condition1       object
Condition2       object
BldgType         object
HouseStyle       object
RoofStyle        object
RoofMatl         object
Exterior1st      object
Exterior2nd      object
MasVnrType       object
ExterQual        object
ExterCond        object
Foundation       object
BsmtQual         object
BsmtCond         object
BsmtExposure     object
BsmtFinType1     object
BsmtFinType2     object
Heating          object
HeatingQC        object
CentralAir       object
Electrical       object
KitchenQual      object
Functional       object
FireplaceQu      object
GarageType       object
GarageFinish     object
GarageQual       object
GarageCond       object
PavedDrive       object
PoolQC           object
Fence            object
MiscFeature      object
SaleType        

In [10]:
# Use this helper function to assit in dropping null values and use the mean ti fill the gap
def fill_in(data):
    for i in data.columns:
        # if data type is equal to object we leave it alone
        if data[i].dtype == "object":
            data[i] = data[i].fillna("")
        # if data type is a intger or float we want to take the mean of that and fill it in for the null values
        elif (data[i].dtype == "int64" or data[i].dtype == "float64"):
            data[i] = data[i].fillna(data[i].mean())
        else:
            pass
    return data

In [11]:
train_df = fill_in(train_df)

In [12]:
test_df = fill_in(test_df)

## Prepare the datasets to but used in the machine learning algorithm

In [190]:
# Convert all the data into numerical data
train_df = pd.get_dummies(train_df)
test_df = pd.get_dummies(test_df)

In [191]:
train_df.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,0,0,1,0,0,0,0,1,0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0,0,0,1,0,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,0,0,1,0,0,0,0,1,0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,0,0,1,1,0,0,0,0,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0,0,0,1,0,0,0,0,1,0


In [16]:
# check the target column to determine if it is a classification or regression model
# sale price is numberical so it is going to be a regression model
train_df["SalePrice"]

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64

In [17]:
# reshape the target model so the shape has a 1 to this is so it matches with the training set
y = train_df["SalePrice"].values.reshape(-1, 1)

In [18]:
# store the training data into the X variable
X = train_df.drop(columns = ["SalePrice"])
X.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,0,0,1,0,0,0,0,1,0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0,0,0,1,0,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,0,0,1,0,0,0,0,1,0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,0,0,1,1,0,0,0,0,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0,0,0,1,0,0,0,0,1,0


In [19]:
# make sure the y shape has a 1 in it
# reshape the model so it is (1460, 1)
y.shape

(1460, 1)

In [20]:
X.shape

(1460, 305)

### Fit the data to a Scaler

In [154]:
# create a scaler for the model
scaler = StandardScaler()

# scale the training data

X_scaler = scaler.fit(X)

# use the X_scaler to scale the test data
X_train_scaled = X_scaler.transform(X)
#X_test_scaled = X_scaler.transform(X_test)

### Incorporate PCA to assist on the speed of the algorithm since there is a lot of columns

In [155]:
# incorporate PCA into the train model
pca = PCA(n_components = 3)

X_pca_scaled = pca.fit_transform(X)
#test_pca_scaled = pca.fit_transform(X_test_scaled)

In [170]:
# split the test csv into X_test, y_test
X_test = test_df
# y_test = test_df["SalesPrice"]
X_test_pca_scaled = pca.fit_transform(X_test)

## Import Regression Model (Gradient Boosting)

In [158]:
# use gradient boosting regessor
from sklearn.ensemble import GradientBoostingRegressor
# import the relevent metrics to determine how our model did mse, r2, score
from sklearn.metrics import mean_squared_error, r2_score


In [159]:
gbr = GradientBoostingRegressor()

In [None]:
# Use hyperparameter tunning to assist in getting the best parameters for the model

In [160]:
from sklearn.model_selection import GridSearchCV

In [None]:
## the more parameters that are supplied the longer the model will take to fit

In [161]:
grid_search = {"n_estimators":[50, 75, 100],
              "learning_rate":[0.01, 0.5, 1],
              "max_depth":[2, 3, 4],
              "min_samples_split": [2, 3, 4],
              "min_samples_leaf": [2, 3, 4]
              }

In [111]:
# Pass the grid_search varaible into GridSearchCV
param_search = GridSearchCV(estimator = gbr, param_grid = grid_search, scoring = "neg_mean_squared_error")

In [162]:
param_search.fit(X_pca_scaled, y)

GridSearchCV(estimator=GradientBoostingRegressor(),
             param_grid={'learning_rate': [0.01, 0.5, 1],
                         'max_depth': [2, 3, 4], 'min_samples_leaf': [2, 3, 4],
                         'min_samples_split': [2, 3, 4],
                         'n_estimators': [50, 75, 100]},
             scoring='neg_mean_squared_error')

In [163]:
# Obatin the best parameters by using bese_params_
param_search.best_params_

{'learning_rate': 0.5,
 'max_depth': 4,
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'n_estimators': 50}

In [164]:
# Create model based on above parameters
gradient_model = GradientBoostingRegressor(n_estimators = 50, 
                                           learning_rate = 0.5, 
                                           max_depth = 4, 
                                           min_samples_leaf = 2, 
                                           min_samples_split = 4, 
                                           random_state = 78)

In [165]:
gradient_model.fit(X_pca_scaled, y)

GradientBoostingRegressor(learning_rate=0.5, max_depth=4, min_samples_leaf=2,
                          min_samples_split=4, n_estimators=50,
                          random_state=78)

In [176]:
predictions_x_train = gradient_model.predict(X_pca_scaled)
y

array([[208500],
       [181500],
       [223500],
       ...,
       [266500],
       [142125],
       [147500]], dtype=int64)

In [177]:
r2_score(predictions_x_train, y)

0.9365254484710016

In [178]:
np.sqrt(mean_squared_error(predictions_x_train, y))

18871.292746371862

# Test set analysis

In [179]:
predictions_x_test = gradient_model.predict(X_test_pca_scaled)

In [186]:
pd.DataFrame({'id':X_test['Id'], 'SalePrice':predictions_x_test}).to_csv('file.csv', index=False)

In [122]:
train_score = gradient_model.score(X_train_scaled, y_train, sample_weight = None)
test_score = gradient_model.score(X_test_scaled, y_test, sample_weight = None)

In [123]:
print(f"The training score is {train_score}.")
print(f"The test score is {test_score}.")

The training score is 0.9663538510600024.
The test score is 0.8622282168994633.


In [124]:
mean_squared_error(predictions, y_test)

971182702.8138472

In [153]:
y_test.reshape(1,-1)[0][:5]

array([320000, 165000, 204900, 171000, 262500], dtype=int64)

In [143]:
errors_df = pd.DataFrame({'predictions':predictions,
                         'real':y_test.reshape(1,-1)[0]})
errors_df['diff'] = (errors_df['predictions']-errors_df['real'])**2

In [146]:
np.sqrt(errors_df['diff'].mean())

31163.804370035556

In [149]:
np.sqrt(mean_squared_error(predictions, y_test))

31163.804370035556

In [147]:
r2_score(predictions, y_test)

0.8252946032724282

In [35]:
# scale the training data which is all of the training set
train_scaler = scaler.fit(train_df)

# use the train scaler to scale the testing data
train_scaled = train_scaler.transform(train_df)
test_scaled = train_scaler.transform(test_df)

ValueError: X has 293 features, but StandardScaler is expecting 306 features as input.