# Build Regression Model for Online App

### Import required libraries

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import LabelEncoder

import warnings
# Suppress every warning for the rest of the session so cell output stays uncluttered.
# NOTE(review): this blanket filter also hides deprecation and data-conversion
# warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

### Read csv data file into pandas dataframe

In [2]:
# Load the dataset from a CSV file located in the working directory.
df = pd.read_csv('houseprices_modified.csv')
# Bare last expression: the frame is rendered as this cell's output.
df

Unnamed: 0,LotFrontageSF,LotAreaSF,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMaterial,...,GrLivArea,GarageType,GarageYrBlt,GarageCars,GarageSF,GarageQual,WoodDeckSF,OpenPorchSF,MiscFeature,SalePrice
0,65,8450,Single Family Detached,2 Story,7,5,2003,2003,Gable,Standard Composite Shingle,...,1710,Attached,2003,2,548,3,0,61,,208500
1,80,9600,Single Family Detached,1 Story,6,8,1976,1976,Gable,Standard Composite Shingle,...,1262,Attached,1976,2,460,3,298,0,,181500
2,68,11250,Single Family Detached,2 Story,7,5,2001,2002,Gable,Standard Composite Shingle,...,1786,Attached,2001,2,608,3,0,42,,223500
3,60,9550,Single Family Detached,2 Story,7,5,1915,1970,Gable,Standard Composite Shingle,...,1717,Detached,1998,3,642,3,0,35,,140000
4,84,14260,Single Family Detached,2 Story,8,5,2000,2000,Gable,Standard Composite Shingle,...,2198,Attached,2000,3,836,3,192,84,,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1451,62,7917,Single Family Detached,2 Story,6,5,1999,2000,Gable,Standard Composite Shingle,...,1647,Attached,1999,2,460,3,0,40,,175000
1452,85,13175,Single Family Detached,1 Story,6,6,1978,1988,Gable,Standard Composite Shingle,...,2073,Attached,1978,2,500,3,349,0,,210000
1453,66,9042,Single Family Detached,2 Story,7,9,1941,2006,Gable,Standard Composite Shingle,...,2340,Attached,1941,1,252,3,0,60,Shed,266500
1454,68,9717,Single Family Detached,1 Story,5,6,1950,1996,Hip,Standard Composite Shingle,...,1078,Attached,1950,1,240,3,366,0,,142125


### Get info on dataset
9 categorical features<br>
22 numeric features

In [3]:
# Print each column's name, non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1456 entries, 0 to 1455
Data columns (total 32 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   LotFrontageSF  1456 non-null   int64 
 1   LotAreaSF      1456 non-null   int64 
 2   BldgType       1456 non-null   object
 3   HouseStyle     1456 non-null   object
 4   OverallQual    1456 non-null   int64 
 5   OverallCond    1456 non-null   int64 
 6   YearBuilt      1456 non-null   int64 
 7   YearRemodAdd   1456 non-null   int64 
 8   RoofStyle      1456 non-null   object
 9   RoofMaterial   1456 non-null   object
 10  ExterQual      1456 non-null   int64 
 11  ExterCond      1456 non-null   int64 
 12  Foundation     1456 non-null   object
 13  BsmtQual       1456 non-null   int64 
 14  BsmtCond       1456 non-null   int64 
 15  BsmtFinSF      1456 non-null   int64 
 16  TotalBsmtSF    1456 non-null   int64 
 17  Heating        1456 non-null   object
 18  HeatingQC      1456 non-null

### Describe dataset

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,LotFrontageSF,LotAreaSF,OverallQual,OverallCond,YearBuilt,YearRemodAdd,ExterQual,ExterCond,BsmtQual,BsmtCond,...,FirstFlrSF,SecondFlrSF,GrLivArea,GarageYrBlt,GarageCars,GarageSF,GarageQual,WoodDeckSF,OpenPorchSF,SalePrice
count,1456.0,1456.0,1456.0,1456.0,1456.0,1456.0,1456.0,1456.0,1456.0,1456.0,...,1456.0,1456.0,1456.0,1456.0,1456.0,1456.0,1456.0,1456.0,1456.0,1456.0
mean,57.296016,10448.784341,6.088599,5.576236,1971.18544,1984.819368,3.39217,3.083791,3.48489,2.934753,...,1157.108516,343.532967,1506.50206,1976.43956,1.764423,471.568681,2.809753,93.833791,46.221154,180151.233516
std,33.877802,9860.763449,1.369669,1.113966,30.20159,20.652143,0.570206,0.351509,0.87409,0.552906,...,369.307331,431.528915,496.815378,26.308749,0.746215,211.986421,0.723822,125.192349,65.352424,76696.59253
min,0.0,1300.0,1.0,1.0,1872.0,1950.0,2.0,1.0,0.0,0.0,...,334.0,0.0,334.0,1872.0,0.0,0.0,0.0,0.0,0.0,34900.0
25%,42.0,7538.75,5.0,5.0,1954.0,1966.75,3.0,3.0,3.0,3.0,...,882.0,0.0,1128.0,1959.0,1.0,329.5,3.0,0.0,0.0,129900.0
50%,63.0,9468.5,6.0,5.0,1972.0,1993.5,3.0,3.0,4.0,3.0,...,1086.0,0.0,1458.5,1978.0,2.0,478.5,3.0,0.0,24.0,163000.0
75%,79.0,11588.0,7.0,6.0,2000.0,2004.0,4.0,3.0,4.0,3.0,...,1389.25,728.0,1775.25,2001.0,2.0,576.0,3.0,168.0,68.0,214000.0
max,313.0,215245.0,10.0,9.0,2010.0,2010.0,5.0,5.0,5.0,4.0,...,3228.0,1818.0,3627.0,2010.0,4.0,1390.0,5.0,857.0,547.0,625000.0


### Get value counts for each categorical feature

In [5]:
# How many rows fall into each building type, most frequent first.
df.loc[:, 'BldgType'].value_counts()

Single Family Detached    1216
Townhouse End Unit         114
Duplex                      52
Townhouse                   43
Two Family Conversion       31
Name: BldgType, dtype: int64

In [6]:
# How many rows fall into each house style, most frequent first.
df.loc[:, 'HouseStyle'].value_counts()

1 Story        726
2 Story        441
1.5 Story      168
Split Level     65
Split Foyer     37
2.5 Story       19
Name: HouseStyle, dtype: int64

In [7]:
# How many rows fall into each roof style, most frequent first.
df.loc[:, 'RoofStyle'].value_counts()

Gable      1140
Hip         283
Flat         13
Gambrel      11
Mansard       7
Shed          2
Name: RoofStyle, dtype: int64

In [8]:
# How many rows fall into each roof material, most frequent first.
df.loc[:, 'RoofMaterial'].value_counts()

Standard Composite Shingle    1432
Tar & Gravel                    11
Wood Shingles                    5
Wood Shakes                      5
Metal                            1
Membrane                         1
Roll                             1
Name: RoofMaterial, dtype: int64

In [9]:
# How many rows fall into each foundation type, most frequent first.
df.loc[:, 'Foundation'].value_counts()

Poured Concrete    643
Cinder Block       634
Brick & Tile       146
Slab                24
Stone                6
Wood                 3
Name: Foundation, dtype: int64

In [10]:
# How many rows fall into each heating type, most frequent first.
df.loc[:, 'Heating'].value_counts()

GasA     1424
GasW       18
Grav        7
Wall        4
OthW        2
Floor       1
Name: Heating, dtype: int64

In [11]:
# How many rows have / do not have central air, most frequent first.
df.loc[:, 'CentralAir'].value_counts()

Yes    1361
No       95
Name: CentralAir, dtype: int64

### Identify all columns in the dataset

In [12]:
# Show the column labels of the loaded frame.
df.columns

Index(['LotFrontageSF', 'LotAreaSF', 'BldgType', 'HouseStyle', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMaterial',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtFinSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir',
       'FirstFlrSF', 'SecondFlrSF', 'GrLivArea', 'GarageType', 'GarageYrBlt',
       'GarageCars', 'GarageSF', 'GarageQual', 'WoodDeckSF', 'OpenPorchSF',
       'MiscFeature', 'SalePrice'],
      dtype='object')

### Encode each categorical feature for use in app

In [14]:
# Learn the category -> integer mapping first, then apply it to the same column.
BldgType_le = LabelEncoder().fit(df['BldgType'])
df['BldgType_Encoded'] = BldgType_le.transform(df['BldgType'])

Below is a quick look at what the encoding does to a categorical feature: each category is mapped to an integer code.

In [15]:
# View encoding
#
# Build a {category: integer code} dictionary from the classes the encoder learned
# and print it. Only the learned classes are needed for this, so the encoded column
# itself is not decoded here.

d1 = dict(zip(BldgType_le.classes_, BldgType_le.transform(BldgType_le.classes_)))
print(d1)

{'Duplex': 0, 'Single Family Detached': 1, 'Townhouse': 2, 'Townhouse End Unit': 3, 'Two Family Conversion': 4}


In [16]:
# Learn the category -> integer mapping first, then apply it to the same column.
HouseStyle_le = LabelEncoder().fit(df['HouseStyle'])
df['HouseStyle_Encoded'] = HouseStyle_le.transform(df['HouseStyle'])

In [17]:
# Learn the category -> integer mapping first, then apply it to the same column.
RoofStyle_le = LabelEncoder().fit(df['RoofStyle'])
df['RoofStyle_Encoded'] = RoofStyle_le.transform(df['RoofStyle'])

In [18]:
# Learn the category -> integer mapping first, then apply it to the same column.
RoofMaterial_le = LabelEncoder().fit(df['RoofMaterial'])
df['RoofMaterial_Encoded'] = RoofMaterial_le.transform(df['RoofMaterial'])

In [19]:
# Learn the category -> integer mapping first, then apply it to the same column.
Foundation_le = LabelEncoder().fit(df['Foundation'])
df['Foundation_Encoded'] = Foundation_le.transform(df['Foundation'])

In [20]:
# Learn the category -> integer mapping first, then apply it to the same column.
Heating_le = LabelEncoder().fit(df['Heating'])
df['Heating_Encoded'] = Heating_le.transform(df['Heating'])

In [21]:
# Learn the category -> integer mapping first, then apply it to the same column.
MiscFeature_le = LabelEncoder().fit(df['MiscFeature'])
df['MiscFeature_Encoded'] = MiscFeature_le.transform(df['MiscFeature'])

In [22]:
# Learn the category -> integer mapping first, then apply it to the same column.
CentralAir_le = LabelEncoder().fit(df['CentralAir'])
df['CentralAir_Encoded'] = CentralAir_le.transform(df['CentralAir'])

In [24]:
# Learn the category -> integer mapping first, then apply it to the same column.
GarageType_le = LabelEncoder().fit(df['GarageType'])
df['GarageType_Encoded'] = GarageType_le.transform(df['GarageType'])

### Info on dataset with additional encoded features

In [25]:
# First five rows, now including the *_Encoded columns added above.
df.head()

Unnamed: 0,LotFrontageSF,LotAreaSF,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMaterial,...,SalePrice,BldgType_Encoded,HouseStyle_Encoded,RoofStyle_Encoded,RoofMaterial_Encoded,Foundation_Encoded,Heating_Encoded,MiscFeature_Encoded,CentralAir_Encoded,GarageType_Encoded
0,65,8450,Single Family Detached,2 Story,7,5,2003,2003,Gable,Standard Composite Shingle,...,208500,1,2,1,3,2,1,0,1,0
1,80,9600,Single Family Detached,1 Story,6,8,1976,1976,Gable,Standard Composite Shingle,...,181500,1,0,1,3,1,1,0,1,0
2,68,11250,Single Family Detached,2 Story,7,5,2001,2002,Gable,Standard Composite Shingle,...,223500,1,2,1,3,2,1,0,1,0
3,60,9550,Single Family Detached,2 Story,7,5,1915,1970,Gable,Standard Composite Shingle,...,140000,1,2,1,3,0,1,0,1,4
4,84,14260,Single Family Detached,2 Story,8,5,2000,2000,Gable,Standard Composite Shingle,...,250000,1,2,1,3,2,1,0,1,0


### Dataset info showing all features, including the new encoded categorical features

In [26]:
# Print each column's name, non-null count and dtype, including the new *_Encoded columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1456 entries, 0 to 1455
Data columns (total 41 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   LotFrontageSF         1456 non-null   int64 
 1   LotAreaSF             1456 non-null   int64 
 2   BldgType              1456 non-null   object
 3   HouseStyle            1456 non-null   object
 4   OverallQual           1456 non-null   int64 
 5   OverallCond           1456 non-null   int64 
 6   YearBuilt             1456 non-null   int64 
 7   YearRemodAdd          1456 non-null   int64 
 8   RoofStyle             1456 non-null   object
 9   RoofMaterial          1456 non-null   object
 10  ExterQual             1456 non-null   int64 
 11  ExterCond             1456 non-null   int64 
 12  Foundation            1456 non-null   object
 13  BsmtQual              1456 non-null   int64 
 14  BsmtCond              1456 non-null   int64 
 15  BsmtFinSF             1456 non-null   

### Model development

In [27]:
# Assemble the model inputs (X) and the prediction target (y).
# The column order below is the order of the features in X, so it is kept exactly.

numeric_features = [
    'LotFrontageSF', 'LotAreaSF', 'OverallQual', 'OverallCond',
    'YearBuilt', 'YearRemodAdd', 'ExterQual', 'ExterCond',
    'BsmtQual', 'BsmtCond', 'BsmtFinSF', 'TotalBsmtSF',
    'HeatingQC', 'FirstFlrSF', 'SecondFlrSF', 'GrLivArea',
    'GarageYrBlt', 'GarageCars', 'GarageSF', 'GarageQual',
    'WoodDeckSF', 'OpenPorchSF',
]
encoded_features = [
    'BldgType_Encoded', 'HouseStyle_Encoded', 'RoofStyle_Encoded',
    'RoofMaterial_Encoded', 'Foundation_Encoded', 'Heating_Encoded',
    'MiscFeature_Encoded', 'CentralAir_Encoded', 'GarageType_Encoded',
]

X = df[numeric_features + encoded_features]
# Double brackets keep the target as a single-column DataFrame.
y = df[['SalePrice']]

In [28]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.

split_parts = train_test_split(X, y, random_state=42, test_size=0.20)
X_train, X_test, y_train, y_test = split_parts

In [29]:
# (rows, columns) of each piece, in the order: train X, test X, train y, test y.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((1164, 31), (292, 31), (1164, 1), (292, 1))

In [30]:
# First five rows of the training features.
X_train.head()

Unnamed: 0,LotFrontageSF,LotAreaSF,OverallQual,OverallCond,YearBuilt,YearRemodAdd,ExterQual,ExterCond,BsmtQual,BsmtCond,...,OpenPorchSF,BldgType_Encoded,HouseStyle_Encoded,RoofStyle_Encoded,RoofMaterial_Encoded,Foundation_Encoded,Heating_Encoded,MiscFeature_Encoded,CentralAir_Encoded,GarageType_Encoded
254,70,8400,5,6,1957,1957,3,4,3,3,...,0,1,0,1,3,1,1,0,1,0
1063,80,14000,7,5,1996,1997,4,3,5,3,...,44,1,2,1,3,2,1,0,1,0
636,50,6000,5,4,1954,1954,3,3,3,3,...,0,4,1,1,3,1,1,0,1,2
1289,60,6600,5,4,1892,1965,3,3,3,3,...,287,1,2,1,3,4,1,0,0,0
514,55,10594,5,5,1926,1950,3,3,3,3,...,0,1,1,1,3,0,3,0,0,4


In [31]:
# First five rows of the training target.
y_train.head(5)

Unnamed: 0,SalePrice
254,145000
1063,328000
636,93000
1289,107500
514,96500


In [32]:
# First five rows of the test features.
X_test.head()

Unnamed: 0,LotFrontageSF,LotAreaSF,OverallQual,OverallCond,YearBuilt,YearRemodAdd,ExterQual,ExterCond,BsmtQual,BsmtCond,...,OpenPorchSF,BldgType_Encoded,HouseStyle_Encoded,RoofStyle_Encoded,RoofMaterial_Encoded,Foundation_Encoded,Heating_Encoded,MiscFeature_Encoded,CentralAir_Encoded,GarageType_Encoded
497,60,9120,7,6,1925,1950,3,4,3,3,...,100,1,1,1,3,2,1,0,1,4
1261,34,4060,6,5,1998,1999,4,3,4,3,...,68,3,0,1,3,2,1,0,1,0
411,100,34650,5,5,1955,1955,3,3,3,3,...,0,4,0,3,3,1,1,0,0,0
1046,100,21750,5,4,1960,2006,3,2,0,0,...,0,1,0,3,3,3,1,0,1,0
1033,0,11500,4,3,1957,1957,3,4,0,0,...,0,1,0,1,3,3,1,0,0,4


In [33]:
# First five rows of the test target.
y_test.head(5)

Unnamed: 0,SalePrice
497,184000
1261,181000
411,145000
1046,115000
1033,84000


### Train RandomForestRegressor model

In [34]:
# Train Random Forest Regressor model
#
# y_train is a single-column DataFrame of shape (n_samples, 1). It is flattened to
# 1-D before fitting so scikit-learn receives the (n_samples,) target it expects for
# a single output instead of a column vector, which would trigger a
# DataConversionWarning (silently hidden by the global warnings filter).
# random_state is fixed so the fitted forest is reproducible.

model = RandomForestRegressor(random_state = 42)
model.fit(X_train, np.ravel(y_train))

### Make predictions

In [35]:
# Make predictions on train data
# (these are the same rows the model was fitted on)

y_pred_train = model.predict(X_train)

### Check model fit using r2_score (R², reported below as a percentage)

In [36]:
# r2_score gives the coefficient of determination (R^2); it is scaled by 100 for display.
acc_train = r2_score(y_true=y_train, y_pred=y_pred_train)
print("The accuracy of training dataset is: ", 100*acc_train)

The accuracy of training dataset is:  98.35953863494822


In [37]:
# Make predictions on test data
# (the held-out rows that were not passed to model.fit)

y_pred_test = model.predict(X_test)

In [38]:
# r2_score gives the coefficient of determination (R^2); it is scaled by 100 for display.
acc_test = r2_score(y_true=y_test, y_pred=y_pred_test)
print("The accuracy of test dataset is: ", 100*acc_test)

The accuracy of test dataset is:  88.21804171056594


### Save trained model into a serialized pickle format

In [39]:
# Persist the fitted model to disk as a binary pickle file.

with open('randomforestregressor_model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)

### Save the fitted label encoders into pickle files to be used in the app

In [40]:
# Persist the fitted BldgType encoder as a binary pickle file.

with open('BldgType_le.pkl', mode='wb') as encoder_file:
    pickle.dump(BldgType_le, encoder_file)

In [41]:
# Persist the fitted HouseStyle encoder as a binary pickle file.

with open('HouseStyle_le.pkl', mode='wb') as encoder_file:
    pickle.dump(HouseStyle_le, encoder_file)

In [42]:
# Save encoders
#
# Nothing to do in this cell: HouseStyle_le is already written to 'HouseStyle_le.pkl'
# by the preceding cell, and dumping it again here would only overwrite that file
# with the same encoder.

In [43]:
# Persist the fitted RoofStyle encoder as a binary pickle file.

with open('RoofStyle_le.pkl', mode='wb') as encoder_file:
    pickle.dump(RoofStyle_le, encoder_file)

In [44]:
# Persist the fitted RoofMaterial encoder as a binary pickle file.

with open('RoofMaterial_le.pkl', mode='wb') as encoder_file:
    pickle.dump(RoofMaterial_le, encoder_file)

In [45]:
# Persist the fitted Foundation encoder as a binary pickle file.

with open('Foundation_le.pkl', mode='wb') as encoder_file:
    pickle.dump(Foundation_le, encoder_file)

In [46]:
# Persist the fitted Heating encoder as a binary pickle file.

with open('Heating_le.pkl', mode='wb') as encoder_file:
    pickle.dump(Heating_le, encoder_file)

In [47]:
# Persist the fitted MiscFeature encoder as a binary pickle file.

with open('MiscFeature_le.pkl', mode='wb') as encoder_file:
    pickle.dump(MiscFeature_le, encoder_file)

In [48]:
# Persist the fitted CentralAir encoder as a binary pickle file.

with open('CentralAir_le.pkl', mode='wb') as encoder_file:
    pickle.dump(CentralAir_le, encoder_file)

In [49]:
# Persist the fitted GarageType encoder as a binary pickle file.

with open('GarageType_le.pkl', mode='wb') as encoder_file:
    pickle.dump(GarageType_le, encoder_file)

### All components required to build the app are complete. App development is done in the "App.py" file, which is consumed by the HuggingFace site