In [2]:
import pandas as pd
import numpy as np

In [2]:
# Increase the print output
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [4]:
df = pd.read_csv("C:\\Users\\ASUS\\Desktop\\DataSets\\PropertyPrice_Data.csv")
df.shape

(1459, 26)

In [33]:
df.tail(100)

Unnamed: 0,Id,Road_Type,Property_Shape,House_Type,House_Condition,Construction_Year,Remodel_Year,BsmtFinSF1,Total_Basement_Area,Air_Conditioning,First_Floor_Area,Second_Floor_Area,LowQualFinSF,Underground_Full_Bathroom,Full_Bathroom_Above_Grade,Bedroom_Above_Grade,Kitchen_Quality,Rooms_Above_Grade,Fireplaces,Garage,Garage_Built_Year,Garage_Area,Pool_Area,Miscellaneous_Value,Year_Sold,Sale_Price
1359,1360,Paved,Reg,1Fam,5,2004,2005,1447,1980,Y,1980,0,0,1,2,3,Ex,8,1,Attchd,2004.0,132.736244,0,0,2006,315000
1360,1361,Paved,Reg,1Fam,6,1921,1998,0,612,Y,990,1611,0,0,3,4,TA,8,0,BuiltIn,1998.0,723.31124,0,0,2008,189000
1361,1362,Paved,IR1,1Fam,5,2005,2005,1274,1530,Y,1530,0,0,1,2,3,Gd,7,1,Attchd,2005.0,-70.720555,0,0,2009,260000
1362,1363,Paved,IR1,1Fam,4,1920,2007,0,715,Y,1281,457,0,0,2,4,TA,7,1,Attchd,1920.0,557.211721,0,0,2009,104900
1363,1364,Paved,IR1,1Fam,5,2006,2007,0,616,Y,616,796,0,0,2,3,Gd,6,1,BuiltIn,2007.0,471.923874,0,0,2007,156932
1364,1365,Paved,Reg,TwnhsE,5,2005,2005,0,600,Y,520,600,80,0,2,2,Gd,4,0,Detchd,2005.0,168.968353,0,0,2006,144152
1365,1366,Paved,Reg,1Fam,5,2000,2000,533,814,Y,814,860,0,1,2,3,Gd,7,0,Attchd,2000.0,485.081821,0,0,2010,216000
1366,1367,Paved,IR1,1Fam,5,1999,1999,633,873,Y,882,908,0,1,2,3,Gd,7,0,Attchd,1999.0,104.064866,0,0,2008,193000
1367,1368,Paved,Reg,TwnhsE,6,1977,1977,548,757,Y,925,550,0,0,2,4,TA,6,1,Attchd,1977.0,662.188903,0,0,2006,127000
1368,1369,Paved,Reg,TwnhsE,5,2003,2004,685,848,Y,848,0,0,1,1,1,Gd,4,0,Attchd,2003.0,618.065644,0,0,2009,144000


#### Sampling

In [5]:
from sklearn.model_selection import train_test_split
trainDf, testDf = train_test_split(df, train_size=0.8, random_state = 150)

In [6]:
print(trainDf.shape)
print(testDf.shape)

(1167, 26)
(292, 26)


In [7]:
# Create Source Column in both Train and Test
trainDf['Source'] = "Train"
testDf['Source'] = "Test"

# Combine Train and Test
fullRaw = pd.concat([trainDf, testDf], axis = 0)
fullRaw.shape

(1459, 27)

In [8]:
# Lets drop "Id" column from the data as it is not going to assist us in our model
fullRaw.drop(['Id'], axis = 1, inplace = True) 

#### Missing Value Imputation

In [9]:
# Check for NAs
fullRaw.isnull().sum()

Road_Type                     0
Property_Shape                0
House_Type                    0
House_Condition               0
Construction_Year             0
Remodel_Year                  0
BsmtFinSF1                    0
Total_Basement_Area           0
Air_Conditioning              0
First_Floor_Area              0
Second_Floor_Area             0
LowQualFinSF                  0
Underground_Full_Bathroom     0
Full_Bathroom_Above_Grade     0
Bedroom_Above_Grade           0
Kitchen_Quality               0
Rooms_Above_Grade             0
Fireplaces                    0
Garage                       81
Garage_Built_Year            81
Garage_Area                   0
Pool_Area                     0
Miscellaneous_Value           0
Year_Sold                     0
Sale_Price                    0
Source                        0
dtype: int64

In [10]:
# Check data types of the variables
fullRaw.dtypes

Road_Type                     object
Property_Shape                object
House_Type                    object
House_Condition                int64
Construction_Year              int64
Remodel_Year                   int64
BsmtFinSF1                     int64
Total_Basement_Area            int64
Air_Conditioning              object
First_Floor_Area               int64
Second_Floor_Area              int64
LowQualFinSF                   int64
Underground_Full_Bathroom      int64
Full_Bathroom_Above_Grade      int64
Bedroom_Above_Grade            int64
Kitchen_Quality               object
Rooms_Above_Grade              int64
Fireplaces                     int64
Garage                        object
Garage_Built_Year            float64
Garage_Area                  float64
Pool_Area                      int64
Miscellaneous_Value            int64
Year_Sold                      int64
Sale_Price                     int64
Source                        object
dtype: object

In [10]:
# Garage variable (Categorical)
tempMode = fullRaw.loc[fullRaw["Source"] == "Train", "Garage"].mode()[0] # Step 1 Inference: Should always be on the training set
# tempMode = fullRaw["Garage"].mode()[0] -> results in data leakage
tempMode
fullRaw["Garage"].fillna(tempMode, inplace = True) # Step 2 Action: Should always be on the entire data set (train + test)

In [12]:
fullRaw.isnull().sum()

Road_Type                     0
Property_Shape                0
House_Type                    0
House_Condition               0
Construction_Year             0
Remodel_Year                  0
BsmtFinSF1                    0
Total_Basement_Area           0
Air_Conditioning              0
First_Floor_Area              0
Second_Floor_Area             0
LowQualFinSF                  0
Underground_Full_Bathroom     0
Full_Bathroom_Above_Grade     0
Bedroom_Above_Grade           0
Kitchen_Quality               0
Rooms_Above_Grade             0
Fireplaces                    0
Garage                        0
Garage_Built_Year            81
Garage_Area                   0
Pool_Area                     0
Miscellaneous_Value           0
Year_Sold                     0
Sale_Price                    0
Source                        0
dtype: int64

In [11]:
# Garage_Built_Year (Continuous)
tempMedian = fullRaw.loc[fullRaw["Source"] == "Train", "Garage_Built_Year"].median() # Step 1: Inference (on train)
fullRaw["Garage_Built_Year"].fillna(tempMedian, inplace = True)   # Step 2: Action (Change the data) on fulldata

# All NAs should be gone now
fullRaw.isnull().sum()

Road_Type                    0
Property_Shape               0
House_Type                   0
House_Condition              0
Construction_Year            0
Remodel_Year                 0
BsmtFinSF1                   0
Total_Basement_Area          0
Air_Conditioning             0
First_Floor_Area             0
Second_Floor_Area            0
LowQualFinSF                 0
Underground_Full_Bathroom    0
Full_Bathroom_Above_Grade    0
Bedroom_Above_Grade          0
Kitchen_Quality              0
Rooms_Above_Grade            0
Fireplaces                   0
Garage                       0
Garage_Built_Year            0
Garage_Area                  0
Pool_Area                    0
Miscellaneous_Value          0
Year_Sold                    0
Sale_Price                   0
Source                       0
dtype: int64

#### Dummy Variable Creation

In [1]:
fullRaw2 = pd.get_dummies(fullRaw, drop_first = False)
print(fullRaw2.shape)
fullRaw.shape

NameError: name 'pd' is not defined

In [16]:
fullRaw2.head()

Unnamed: 0,House_Condition,Construction_Year,Remodel_Year,BsmtFinSF1,Total_Basement_Area,First_Floor_Area,Second_Floor_Area,LowQualFinSF,Underground_Full_Bathroom,Full_Bathroom_Above_Grade,Bedroom_Above_Grade,Rooms_Above_Grade,Fireplaces,Garage_Built_Year,Garage_Area,Pool_Area,Miscellaneous_Value,Year_Sold,Sale_Price,Road_Type_Gravel,Road_Type_Paved,Property_Shape_IR1,Property_Shape_IR2,Property_Shape_IR3,Property_Shape_Reg,House_Type_1Fam,House_Type_2fmCon,House_Type_Duplex,House_Type_Twnhs,House_Type_TwnhsE,Air_Conditioning_N,Air_Conditioning_Y,Kitchen_Quality_Ex,Kitchen_Quality_Fa,Kitchen_Quality_Gd,Kitchen_Quality_TA,Garage_2TFes,Garage_2Types,Garage_Attchd,Garage_Basment,Garage_BuiltIn,Garage_CarPort,Garage_Detchd,Source_Test,Source_Train
56,5,1999,2000,649,970,983,756,0,1,2,3,7,0,1999.0,742.697147,0,0,2009,172500,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1
392,7,1959,1959,0,0,882,0,0,0,1,3,5,0,1959.0,409.340977,0,1200,2007,106500,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1
176,5,1988,1989,831,1151,1164,896,0,0,2,4,8,1,1988.0,257.352391,0,0,2007,211000,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1
342,4,1949,1950,0,0,1040,0,0,0,2,2,6,0,1949.0,38.9841,0,0,2006,87500,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1
446,6,1966,2002,247,1517,1888,0,0,0,2,2,6,1,1966.0,459.927746,0,0,2010,190000,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1


#### Divide the data into Train & Test

In [13]:
trainDf = fullRaw2[fullRaw2['Source_Train'] == 1].drop(['Source_Train'], axis = 1).copy()
testDf = fullRaw2[fullRaw2['Source_Train'] == 0].drop(['Source_Train'], axis = 1).copy()

In [14]:
trainX = trainDf.drop(['Sale_Price'], axis = 1).copy()
trainY = trainDf['Sale_Price'].copy()
testX = testDf.drop(['Sale_Price'], axis = 1).copy()
testY = testDf['Sale_Price'].copy()

### ML Regression Algorithms

#### 1. Decision Tree 

In [16]:
from sklearn.tree import DecisionTreeRegressor

In [17]:
M1 = DecisionTreeRegressor(random_state = 123).fit(trainX, trainY) # Indep, Dep
testPrediction = M1.predict(testX)

In [18]:
# RMSE
np.sqrt(np.mean((testY - testPrediction)**2))

47107.45250793839

+/-47107: 200000 +/- 47107 => The house price will be between ~153000 to 247107!

In [19]:
# MAPE
(np.mean(np.abs(((testY - testPrediction)/testY))))*100

16.816111070364325

#### 2. Random Forest

In [20]:
from sklearn.ensemble import RandomForestRegressor

In [21]:
M2 = RandomForestRegressor(random_state = 123).fit(trainX, trainY) # Indep, Dep
testPrediction = M2.predict(testX)

In [22]:
# RMSE
np.sqrt(np.mean((testY - testPrediction)**2))

36194.76924137028

In [23]:
# MAPE
(np.mean(np.abs(((testY - testPrediction)/testY))))*100

11.415975327297813

#### 3. K-Nearest Neighbour

In [24]:
from sklearn.neighbors import KNeighborsRegressor

In [25]:
M3 = KNeighborsRegressor().fit(trainX, trainY) # Indep, Dep
testPrediction = M3.predict(testX)

In [26]:
# RMSE
np.sqrt(np.mean((testY - testPrediction)**2))

48669.893481726496

In [27]:
# MAPE
(np.mean(np.abs(((testY - testPrediction)/testY))))*100

17.634129962705053

#### Try standardizing the data before KNN

In [30]:
trainX.head()

Unnamed: 0,House_Condition,Construction_Year,Remodel_Year,BsmtFinSF1,Total_Basement_Area,First_Floor_Area,Second_Floor_Area,LowQualFinSF,Underground_Full_Bathroom,Full_Bathroom_Above_Grade,Bedroom_Above_Grade,Rooms_Above_Grade,Fireplaces,Garage_Built_Year,Garage_Area,Pool_Area,Miscellaneous_Value,Year_Sold,Road_Type_Paved,Property_Shape_IR2,Property_Shape_IR3,Property_Shape_Reg,House_Type_2fmCon,House_Type_Duplex,House_Type_Twnhs,House_Type_TwnhsE,Air_Conditioning_Y,Kitchen_Quality_Fa,Kitchen_Quality_Gd,Kitchen_Quality_TA,Garage_2Types,Garage_Attchd,Garage_Basment,Garage_BuiltIn,Garage_CarPort,Garage_Detchd
56,5,1999,2000,649,970,983,756,0,1,2,3,7,0,1999.0,742.697147,0,0,2009,1,0,0,1,0,0,1,0,1,0,1,0,0,1,0,0,0,0
392,7,1959,1959,0,0,882,0,0,0,1,3,5,0,1959.0,409.340977,0,1200,2007,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0
176,5,1988,1989,831,1151,1164,896,0,0,2,4,8,1,1988.0,257.352391,0,0,2007,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0
342,4,1949,1950,0,0,1040,0,0,0,2,2,6,0,1949.0,38.9841,0,0,2006,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1
446,6,1966,2002,247,1517,1888,0,0,0,2,2,6,1,1966.0,459.927746,0,0,2010,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0


In [28]:
from sklearn.preprocessing import StandardScaler

Train_Scaling = StandardScaler().fit(trainX) # Step 1: Inference
trainX_Std = Train_Scaling.transform(trainX) # Step 2: Action on train
testX_Std  = Train_Scaling.transform(testX) # Step 2: Action on test

# Add the column names to trainX_Std, testX_Std
trainX_Std = pd.DataFrame(trainX_Std, columns = trainX.columns)
testX_Std = pd.DataFrame(testX_Std, columns = testX.columns)

In [29]:
trainX_Std.head()

Unnamed: 0,House_Condition,Construction_Year,Remodel_Year,BsmtFinSF1,Total_Basement_Area,First_Floor_Area,Second_Floor_Area,LowQualFinSF,Underground_Full_Bathroom,Full_Bathroom_Above_Grade,Bedroom_Above_Grade,Rooms_Above_Grade,Fireplaces,Garage_Built_Year,Garage_Area,Pool_Area,Miscellaneous_Value,Year_Sold,Road_Type_Gravel,Road_Type_Paved,Property_Shape_IR1,Property_Shape_IR2,Property_Shape_IR3,Property_Shape_Reg,House_Type_1Fam,House_Type_2fmCon,House_Type_Duplex,House_Type_Twnhs,House_Type_TwnhsE,Air_Conditioning_N,Air_Conditioning_Y,Kitchen_Quality_Ex,Kitchen_Quality_Fa,Kitchen_Quality_Gd,Kitchen_Quality_TA,Garage_2TFes,Garage_2Types,Garage_Attchd,Garage_Basment,Garage_BuiltIn,Garage_CarPort,Garage_Detchd,Source_Test
0,-0.520449,0.925506,0.740014,0.481959,-0.184658,-0.461179,0.923935,-0.121547,1.122632,0.784584,0.164625,0.312027,-0.941304,0.855037,1.297935,-0.071246,-0.092807,0.883718,-0.058646,0.058646,-0.705744,-0.170589,-0.077682,0.761026,-2.246471,-0.144905,-0.200267,5.862051,-0.290813,-0.262071,0.262071,-0.265786,-0.17841,1.224308,-1.006016,-0.065597,0.0,0.741486,-0.117902,-0.252607,-0.071889,-0.610689,0.0
1,1.260677,-0.392531,-1.265279,-1.010179,-2.452444,-0.728319,-0.798572,-0.121547,-0.826939,-1.035714,0.164625,-0.922331,-0.941304,-0.819031,-0.291337,-0.071246,2.0815,-0.627343,-0.058646,0.058646,1.416945,-0.170589,-0.077682,-1.314015,0.445143,-0.144905,-0.200267,-0.170589,-0.290813,-0.262071,0.262071,-0.265786,-0.17841,-0.816788,0.99402,-0.065597,0.0,0.741486,-0.117902,-0.252607,-0.071889,-0.610689,0.0
2,-0.520449,0.563046,0.202009,0.900402,0.238506,0.017558,1.242918,-0.121547,-0.826939,0.784584,1.372914,0.929206,0.619068,0.394668,-1.015941,-0.071246,-0.092807,-0.627343,-0.058646,0.058646,1.416945,-0.170589,-0.077682,-1.314015,0.445143,-0.144905,-0.200267,-0.170589,-0.290813,-0.262071,0.262071,-0.265786,-0.17841,-0.816788,0.99402,-0.065597,0.0,0.741486,-0.117902,-0.252607,-0.071889,-0.610689,0.0
3,-1.411012,-0.722041,-1.705465,-1.010179,-2.452444,-0.310416,-0.798572,-0.121547,-0.826939,0.784584,-1.043663,-0.305152,-0.941304,-1.237547,-2.05701,-0.071246,-0.092807,-1.382873,-0.058646,0.058646,-0.705744,-0.170589,-0.077682,0.761026,-2.246471,-0.144905,4.993329,-0.170589,-0.290813,3.815757,-3.815757,-0.265786,-0.17841,-0.816788,0.99402,-0.065597,0.0,-1.348644,-0.117902,-0.252607,-0.071889,1.637494,0.0
4,0.370114,-0.161875,0.837833,-0.442293,1.094186,1.932506,-0.798572,-0.121547,-0.826939,0.784584,-1.043663,-0.305152,0.619068,-0.526069,-0.050165,-0.071246,-0.092807,1.639248,-0.058646,0.058646,1.416945,-0.170589,-0.077682,-1.314015,0.445143,-0.144905,-0.200267,-0.170589,-0.290813,-0.262071,0.262071,-0.265786,-0.17841,1.224308,-1.006016,-0.065597,0.0,0.741486,-0.117902,-0.252607,-0.071889,-0.610689,0.0


In [30]:
M3_1 = KNeighborsRegressor().fit(trainX_Std, trainY) # Indep, Dep
testPrediction = M3_1.predict(testX_Std)

In [31]:
print("RMSE: ", np.sqrt(np.mean((testY - testPrediction)**2)))
print("MAPE: ", (np.mean(np.abs(((testY - testPrediction)/testY))))*100)

RMSE:  44748.0593036948
MAPE:  15.424151529931981
