In [3]:
# import modules
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [5]:
# read Boston Housing dataset
path = r"C:\Users\Jwpel\Downloads"
file = "\\train.csv"
df = pd.read_csv(path+file)

In [6]:
# first 5 records
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [7]:
# view shape of data
df.shape

(1460, 81)

In [8]:
# data type and missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [9]:
# view total missing values
df.isna().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [10]:
# removing columns with many missing values
df_dropped = df.drop(["Id", "Alley", "FireplaceQu", "PoolQC", "Fence", "MiscFeature"], axis=1)

In [11]:
# checking to see if those were removed
df_dropped.isna().sum()

MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
Street             0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 75, dtype: int64

In [12]:
# keep "LotFrontage" and replace missing values with mean value
df_dropped["LotFrontage"].fillna(df_dropped["LotFrontage"].mean(), inplace = True)
df_dropped["LotFrontage"].isna()

0       False
1       False
2       False
3       False
4       False
        ...  
1455    False
1456    False
1457    False
1458    False
1459    False
Name: LotFrontage, Length: 1460, dtype: bool

In [13]:
# now removing any records with any other missing values with removing more columns
df_dropped.dropna(axis = 0, how = "any", inplace = True)

In [14]:
# no missing values in dataset
df_dropped.shape
df_dropped.isna().sum()

MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 75, dtype: int64

In [15]:
# how many records and columns now
df_dropped.shape

(1338, 75)

In [16]:
# importing modules for encoding categorical vairables
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [17]:
# renaming dataframe
df_new = df_dropped

In [18]:
# assigning X and y variables
X = df_new.drop("SalePrice", axis = 1)

In [19]:
y = df_new.SalePrice

In [20]:
X.shape

(1338, 74)

In [22]:
y.shape

(1338,)

In [23]:
# viewing X variables
X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,61,0,0,0,0,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,42,0,0,0,0,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,35,272,0,0,0,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,84,0,0,0,0,0,12,2008,WD,Normal


In [24]:
from sklearn.compose import make_column_transformer

In [25]:
obj_list = df_new.select_dtypes(include=[object],).head()
obj_list.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5 entries, 0 to 4
Data columns (total 38 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   MSZoning       5 non-null      object
 1   Street         5 non-null      object
 2   LotShape       5 non-null      object
 3   LandContour    5 non-null      object
 4   Utilities      5 non-null      object
 5   LotConfig      5 non-null      object
 6   LandSlope      5 non-null      object
 7   Neighborhood   5 non-null      object
 8   Condition1     5 non-null      object
 9   Condition2     5 non-null      object
 10  BldgType       5 non-null      object
 11  HouseStyle     5 non-null      object
 12  RoofStyle      5 non-null      object
 13  RoofMatl       5 non-null      object
 14  Exterior1st    5 non-null      object
 15  Exterior2nd    5 non-null      object
 16  MasVnrType     5 non-null      object
 17  ExterQual      5 non-null      object
 18  ExterCond      5 non-null      obj

In [26]:
# transform data types object to category
X[X.select_dtypes(['object']).columns] = X.select_dtypes(['object']).apply(lambda x: x.astype('category'))
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1338 entries, 0 to 1459
Data columns (total 74 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   MSSubClass     1338 non-null   int64   
 1   MSZoning       1338 non-null   category
 2   LotFrontage    1338 non-null   float64 
 3   LotArea        1338 non-null   int64   
 4   Street         1338 non-null   category
 5   LotShape       1338 non-null   category
 6   LandContour    1338 non-null   category
 7   Utilities      1338 non-null   category
 8   LotConfig      1338 non-null   category
 9   LandSlope      1338 non-null   category
 10  Neighborhood   1338 non-null   category
 11  Condition1     1338 non-null   category
 12  Condition2     1338 non-null   category
 13  BldgType       1338 non-null   category
 14  HouseStyle     1338 non-null   category
 15  OverallQual    1338 non-null   int64   
 16  OverallCond    1338 non-null   int64   
 17  YearBuilt      1338 non-null   in

In [27]:
# reducing variables down to 13 
X = X[["LotFrontage", "LotArea", "Neighborhood", "BldgType", "OverallQual", "YearBuilt", "TotalBsmtSF", "CentralAir", "1stFlrSF", "2ndFlrSF",\
     "GrLivArea", "TotRmsAbvGrd", "SaleCondition"]]

In [28]:
X.head()

Unnamed: 0,LotFrontage,LotArea,Neighborhood,BldgType,OverallQual,YearBuilt,TotalBsmtSF,CentralAir,1stFlrSF,2ndFlrSF,GrLivArea,TotRmsAbvGrd,SaleCondition
0,65.0,8450,CollgCr,1Fam,7,2003,856,Y,856,854,1710,8,Normal
1,80.0,9600,Veenker,1Fam,6,1976,1262,Y,1262,0,1262,6,Normal
2,68.0,11250,CollgCr,1Fam,7,2001,920,Y,920,866,1786,6,Normal
3,60.0,9550,Crawfor,1Fam,7,1915,756,Y,961,756,1717,7,Abnorml
4,84.0,14260,NoRidge,1Fam,8,2000,1145,Y,1145,1053,2198,9,Normal


In [29]:
# import scaler module to scale data
from sklearn.preprocessing import StandardScaler

In [30]:
# scaling continuous variables to same scale
X[['LotFrontage', 'LotArea', 'OverallQual', 'YearBuilt', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'TotRmsAbvGrd']] = StandardScaler().fit_transform(X[['LotFrontage', 'LotArea', 'OverallQual', 'YearBuilt', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea','TotRmsAbvGrd']])

In [31]:
X.head()

Unnamed: 0,LotFrontage,LotArea,Neighborhood,BldgType,OverallQual,YearBuilt,TotalBsmtSF,CentralAir,1stFlrSF,2ndFlrSF,GrLivArea,TotRmsAbvGrd,SaleCondition
0,-0.254147,-0.218363,CollgCr,1Fam,0.589337,1.014157,-0.591875,Y,-0.828516,1.128813,0.331321,0.91508,Normal
1,0.422953,-0.107067,Veenker,1Fam,-0.165963,0.100528,0.409598,Y,0.221935,-0.811389,-0.529579,-0.347165,Normal
2,-0.118727,0.05262,CollgCr,1Fam,0.589337,0.94648,-0.434007,Y,-0.662928,1.156076,0.477367,-0.347165,Normal
3,-0.479847,-0.111906,Crawfor,1Fam,0.589337,-1.963596,-0.838543,Y,-0.556848,0.906167,0.344773,0.283958,Abnorml
4,0.603514,0.343926,NoRidge,1Fam,1.344638,0.912642,0.120996,Y,-0.080781,1.580921,1.269088,1.546203,Normal


In [32]:
# encoding categorical variables
column_trans = make_column_transformer((OneHotEncoder(), ["Neighborhood","BldgType","CentralAir","SaleCondition"]), remainder = "passthrough")

In [33]:
# applied transformer to X and reassigned name
X_transformed = column_trans.fit_transform(X)

In [34]:
# creating pipeline for future use
from sklearn.pipeline import make_pipeline

In [38]:
# import linear regression algorithm
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

In [44]:
# split data between test and train
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size = .2, random_state = 9)

In [45]:
X_train.shape
y_train.shape
X_test.shape

(268, 47)

In [46]:
# fit training data with regression model
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [47]:
# assign prediction variable for test data
pred = regr.predict(X_test)

In [49]:
# print mean squared error and variance score
print("Mean Squared Error: %.2f" % mean_squared_error(y_test, pred))
print("Total Variance: %.2f" % r2_score(y_test, pred))
      

Mean Squared Error: 866156257.82
Total Variance: 0.83


In [59]:
# create and assign random forest regressor algorithm, 1000 trees
regr_rf = RandomForestRegressor(n_estimators = 1000, random_state = 9)

In [60]:
# fit training data with random forest regressor
regr_rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=9, verbose=0, warm_start=False)

In [61]:
# assign random forest predictin variable for test data
rf_pred = regr_rf.predict(X_test)

In [62]:
# print random forest regression results
print("Total Variance: %.2f" % r2_score(y_test, rf_pred))

Total Variance: 0.84
