## House Price Modeling Using Linear Regression

**Data Science in Production**

###### data source: https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data?select=train.csv

**`Adegoke Olanrewaju`**

### importing the libraries

In [1]:
import numpy as np

import pandas as pd

import sklearn

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error, mean_squared_log_error

import joblib

### Model Building Section

#### Model training

In [2]:
# Local system path for the dataset

dataPATH = '/Users/OLALYTICS/dsp-olanrewaju-adegoke/data/'

In [3]:
# Loading the train.csv dataset from path

train_csv_master = pd.read_csv(dataPATH + 'train.csv')

train_csv = train_csv_master.copy()

train_csv.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Selecting the relevant features and target from the large dataset

train_csv_use = train_csv[['MSZoning','HouseStyle','YearBuilt',
                        'TotalBsmtSF','MiscVal','SalePrice']]

In [5]:
# Selecting the categorical and continuous features

categorical_features = ['MSZoning','HouseStyle']

continuous_features = ['YearBuilt','TotalBsmtSF','MiscVal']

# Defining the features and target

features = train_csv_use[categorical_features + continuous_features]

target = train_csv_use['SalePrice']


In [6]:
# Splitting of the dataset into train and test sets

X_train, X_test, y_train, y_test = train_test_split(features, target, 
                                    test_size = 0.3, random_state = 42)


#### Preprocessing and feature engineering of the X_train set

###### OneHotEncoding for the categorical variables

In [7]:
oneHot = OneHotEncoder(drop = 'first', sparse = False)

oneHot.fit(X_train[categorical_features])

X_train_cat = oneHot.transform(X_train[categorical_features])

X_train_cat_DF = pd.DataFrame(X_train_cat, columns = oneHot.get_feature_names(categorical_features ))


#### Persisting the OneHotEncoders for X_train set

In [8]:
# Accessing the path to the models folder from root

!cd ../models

In [9]:
# Saving the oneHot in my local system in the root folder

joblib.dump(oneHot, '../models/oneHot.joblib')

['../models/oneHot.joblib']

###### StandardScaling for the continuous variables

In [10]:
stdScaler = StandardScaler()

stdScaler.fit(X_train[continuous_features])

X_train_cont = stdScaler.transform(X_train[continuous_features])

X_train_cont_DF = pd.DataFrame(X_train_cont, columns = continuous_features)


  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):


#### Persisting the StandardScale for X_train set

In [11]:
# Accessing the path to the models folder from root

!cd ../models

In [12]:
# Saving the oneHot in my local system in the root folder

joblib.dump(stdScaler, '../models/stdScaler.joblib')

['../models/stdScaler.joblib']

#### Combining all the preprocessed X_train dataset into a single X_train

In [13]:
X_train = pd.concat([X_train_cont_DF, X_train_cat_DF],axis = 1)

X_train.head()

Unnamed: 0,YearBuilt,TotalBsmtSF,MiscVal,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl
0,-0.032378,0.54613,-0.122987,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1.106434,-1.146608,-0.122987,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.236584,-0.67926,-0.122987,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.138972,1.890034,-0.122987,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.813597,-0.583107,-0.122987,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


#### Building an automatic testing framework
NB: `To always check if the DataFrame remains the same after many manipulations.`

In [14]:
# Saving the X_train_dataset into parquet locally

X_train.to_parquet(dataPATH + 'X_train_df.parquet', index = False)

In [15]:
# Reading the X_train_dataset from the parquet

X_train_df = pd.read_parquet(dataPATH + 'X_train_df.parquet')

X_train_df.head()

Unnamed: 0,YearBuilt,TotalBsmtSF,MiscVal,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl
0,-0.032378,0.54613,-0.122987,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1.106434,-1.146608,-0.122987,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.236584,-0.67926,-0.122987,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.138972,1.890034,-0.122987,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.813597,-0.583107,-0.122987,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [16]:
# Resetting the index of both dataset - X_train and X_train_df

X_train_df = X_train_df.reset_index(drop = True)

X_train = X_train.reset_index(drop = True)       # The X_train here will be the output of the refactoring dataset

In [17]:
# Automatic testing of the code to be sure the DataFrame output remains the same

pd.testing.assert_frame_equal(X_train_df, X_train)

#### Model training and fitting

In [18]:
# Fit the model on the data using LinearRegression

LinReg = LinearRegression()

LinReg.fit(X_train, y_train)


  array.dtypes.apply(is_sparse).any()):


LinearRegression()

#### Persisting the LinearRegression model for X_train dataset

In [19]:
# switching to the models directory

!cd ../models

In [20]:
# saving the LinearRegression to the models folder

joblib.dump(LinReg, '../models/LinReg.joblib')

['../models/LinReg.joblib']

### Model Evaluation

#### Preprocessing and Feature Engineering on the X_test set

###### Preprocessing of the X_test set

In [21]:
# Loading the models file for the OneHotEncoding

loaded_oneHot = joblib.load('../models/oneHot.joblib')

In [22]:
# Transforming the dataset using the model's file for categorical variables 

X_test_cat = loaded_oneHot.transform(X_test[categorical_features])

X_test_cat_DF = pd.DataFrame(X_test_cat, columns = loaded_oneHot.get_feature_names(categorical_features ))


In [23]:
# Loading the models file for the StandardScaler

loaded_stdScaler = joblib.load('../models/stdScaler.joblib')

In [24]:
# Transforming the dataset using the model's file for categorical variables 

X_test_cont = loaded_stdScaler.transform(X_test[continuous_features])

X_test_cont_DF = pd.DataFrame(X_test_cont, columns = continuous_features)


  array.dtypes.apply(is_sparse).any()):


### Combining the X_test set for categorical and continuous features

In [25]:
X_test = pd.concat([X_test_cont_DF, X_test_cat_DF],axis = 1)

X_test.head()

Unnamed: 0,YearBuilt,TotalBsmtSF,MiscVal,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl
0,-0.260141,-0.001718,-0.122987,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.748522,0.901672,-0.122987,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-1.431491,-0.115759,-0.122987,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,-0.780741,-0.124704,-0.122987,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.171509,1.252742,-0.122987,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


#### Building an automatic testing framework for X_test

In [26]:
# Saving the X_test_dataset into parquet

X_test.to_parquet(dataPATH + 'X_test_df.parquet', index = False)

# Reading data from the parquet

X_test_df = pd.read_parquet(dataPATH + 'X_test_df.parquet')

# Resetting the index of both dataset

X_test_df = X_test_df.reset_index(drop = True)

X_test = X_test.reset_index(drop = True)       # The X_test here will be the output of the refactoring dataset

# automatic testing of the code to be sure the DataFrame output didn't change regardless of refactoring

pd.testing.assert_frame_equal(X_test_df, X_test)


#### Model predictions using the X_test  from the train_test_split

In [27]:
# Loading the regression model from models folder

LinReg_model = joblib.load('../models/LinReg.joblib')

In [28]:
y_predicted = LinReg_model.predict(X_test)

y_predicted[:5]

  array.dtypes.apply(is_sparse).any()):


array([155191.34335272, 273887.9445002 , 103572.13405251, 154350.61352743,
       240647.02264699])

### Model evaluation and model performance

In [29]:
def compute_rmsle(y_test: np.ndarray, y_predicted: np.ndarray, precision: int = 2) -> float:
    
    msle = mean_squared_log_error(y_test, y_predicted)
    
    rmsle = np.sqrt(msle)
    
    return round(msle, precision), round(rmsle, precision)


In [30]:

compute_rmsle(y_test, y_predicted)     

(0.06, 0.25)

### Model Inference 

Inference is done on the test.csv dataset

In [31]:
# Loading and reading the given test.csv dataset

dataset = dataPATH + 'test.csv'

test_csv_master = pd.read_csv(dataset)

test_csv = test_csv_master.copy()

test_csv.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


#### Preprocessing of the test_csv dataset

In [32]:
# Defining the categorical and continuous features of the test_csv

categorical_features = ['MSZoning','HouseStyle']

continuous_features = ['YearBuilt','TotalBsmtSF','MiscVal']


In [33]:
# Selecting the categorical and continuous features of the test_csv

test_csv_features = test_csv[categorical_features + continuous_features]

test_csv_features.head()


Unnamed: 0,MSZoning,HouseStyle,YearBuilt,TotalBsmtSF,MiscVal
0,RH,1Story,1961,882.0,0
1,RL,1Story,1958,1329.0,12500
2,RL,2Story,1997,928.0,0
3,RL,2Story,1998,926.0,0
4,RL,1Story,1992,1280.0,0


In [34]:
# Checking for NaN values and correcting it

test_csv_features.isna().sum()

test_csv_features.dropna(inplace = True)

test_csv_features.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_csv_features.dropna(inplace = True)


Unnamed: 0,MSZoning,HouseStyle,YearBuilt,TotalBsmtSF,MiscVal
0,RH,1Story,1961,882.0,0
1,RL,1Story,1958,1329.0,12500
2,RL,2Story,1997,928.0,0
3,RL,2Story,1998,926.0,0
4,RL,1Story,1992,1280.0,0


In [35]:
# OneHotencoding for the categorical variables for the test_csv dataset

In [36]:

loaded_oneHot = joblib.load('../models/oneHot.joblib')

In [37]:
test_csv_cat = loaded_oneHot.transform(test_csv_features[categorical_features])


In [38]:
test_csv_cat_DF = pd.DataFrame(test_csv_cat, columns = loaded_oneHot.get_feature_names(categorical_features ))


In [39]:
# StandardScaler for the continuous variables for the test_csv dataset

In [40]:

loaded_stdScaler = joblib.load('../models/stdScaler.joblib')

In [41]:

test_csv_cont = loaded_stdScaler.transform(test_csv_features[continuous_features])

test_csv_cont_DF = pd.DataFrame(test_csv_cont, columns = continuous_features)


  array.dtypes.apply(is_sparse).any()):


In [42]:
# Combining the test_csv set for categorical and continuous features

test_set = pd.concat([test_csv_cont_DF, test_csv_cat_DF],axis = 1)

test_set.tail()

Unnamed: 0,YearBuilt,TotalBsmtSF,MiscVal,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl
1449,-0.032378,-1.148844,-0.122987,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1450,-0.032378,-1.148844,-0.122987,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1451,-0.357753,0.367241,-0.122987,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1452,0.683447,-0.330426,1.948222,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1453,0.715984,-0.142593,-0.122987,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [43]:
final_test_csv = pd.concat([test_set, target], axis = 1)

final_test_csv.head()

Unnamed: 0,YearBuilt,TotalBsmtSF,MiscVal,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,SalePrice
0,-0.325216,-0.39751,-0.122987,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,208500
1,-0.422828,0.602033,36.862884,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,181500
2,0.846134,-0.294648,-0.122987,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,223500
3,0.878672,-0.299121,-0.122987,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,140000
4,0.683447,0.492464,-0.122987,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,250000


#### Building an automatic testing framework for test.csv

In [45]:
# Saving the final_test_csv into parquet

final_test_csv.to_parquet(dataPATH + 'final_test_csv_df.parquet', index = False)

# Reading data from the parquet

final_test_csv_df = pd.read_parquet(dataPATH + 'final_test_csv_df.parquet')

# Resetting the index of both dataset

final_test_csv_df = final_test_csv_df.reset_index(drop = True)

final_test_csv = final_test_csv.reset_index(drop = True)       # The final_test_csv here will be the output of the refactoring dataset


# Automatic testing of the code to be sure the DataFrame output didn't change regardless of refactoring

pd.testing.assert_frame_equal(final_test_csv_df, final_test_csv)



#### Predicting the house prices using the test_csv dataset

In [46]:
# load the model from models folder

In [47]:
LinReg_model = joblib.load('../models/LinReg.joblib')

In [48]:
pred = LinReg_model.predict(test_set)

pred[:5]

  array.dtypes.apply(is_sparse).any()):


array([112202.71314909, 171175.40084421, 225391.38660654, 225935.80195261,
       197418.35870414])