## Predicting the sales prices and practice feature engineering with Linear Regression

**House Prices - Advanced Regression Techniques**

##### data source: https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data?select=train.csv

`Adegoke Olanrewaju`

## Step 1

In [1]:
# importing the libraries

In [2]:
import numpy as np

import pandas as pd

import sklearn

In [3]:
# loading the dataset from my local folder

In [4]:
dataset_master = pd.read_csv('/Users/OLALYTICS/dsp-olanrewaju-adegoke/data/train.csv')

house_data_csv = dataset_master.copy()

house_data_csv.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
#house_data_csv.info()

In [6]:
# defining the features and target of interest 

useful_data = house_data_csv[['MSZoning','HouseStyle','YearBuilt',
                              'TotalBsmtSF','MiscVal','SalePrice']]

# defining the categorical and continuous features

catFeature = ['MSZoning','HouseStyle']

contFeature = ['YearBuilt','TotalBsmtSF','MiscVal']

# selecting the features and target

features = useful_data[catFeature + contFeature]

target = useful_data['SalePrice']



In [7]:
# splitting the dataset using train_test_split

from sklearn.model_selection import train_test_split

x_train, X_test, y_train, y_test = train_test_split(features, target, 
                            test_size = 0.3, random_state = 42)



#### Step 1 - No 7

### Model buiding section

###### Model training

In [8]:
# Dataset Loadinng and Splitting

In [9]:
# loading the csv

train_csv_master = pd.read_csv('/Users/OLALYTICS/dsp-olanrewaju-adegoke/data/train.csv')

train_csv = train_csv_master.copy()

train_csv.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [10]:
# selecting the relevant features of interest

train_csv_use = train_csv[['MSZoning','HouseStyle','YearBuilt',
                              'TotalBsmtSF','MiscVal','SalePrice']]

In [11]:
# selecting the categorical and continuous features

categorical_features = ['MSZoning','HouseStyle']

continuous_features = ['YearBuilt','TotalBsmtSF','MiscVal']

# defining the features and target

features = train_csv_use[categorical_features + continuous_features]

target = train_csv_use['SalePrice']


In [12]:
# split the dataset into training and testing sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, target, 
                                    test_size = 0.3, random_state = 42)


#### preprocessing and feature engineering of the train set

In [13]:
# OneHotEncoding for categorical variables

from sklearn.preprocessing import OneHotEncoder

# OneHotEncoding for X_train

oneHot = OneHotEncoder(drop = 'first', sparse = False)

oneHot.fit(X_train[categorical_features])

X_train_cat = oneHot.transform(X_train[categorical_features])

X_train_cat_DF = pd.DataFrame(X_train_cat, columns = oneHot.get_feature_names(categorical_features ))


##### Persisting the OneHotEncoders -Step 2 - No 3i

In [14]:
# accessing the models folder from root

!cd ../models

In [15]:
import joblib

joblib.dump(oneHot, '../models/oneHot.joblib')

['../models/oneHot.joblib']

In [16]:
# StandardScaling for the continuous variables

from sklearn.preprocessing import StandardScaler

# Scaling for X_train

stdScaler = StandardScaler()

stdScaler.fit(X_train[continuous_features])

X_train_cont = stdScaler.transform(X_train[continuous_features])

X_train_cont_DF = pd.DataFrame(X_train_cont, columns = continuous_features)


  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):


##### Persisting the Scaler - Step 3 - No 3i

In [17]:
!cd ../models

In [18]:
import joblib

joblib.dump(stdScaler, '../models/stdScaler.joblib')


['../models/stdScaler.joblib']

#### combining all the preprocessing dataset into a single train set

In [19]:
X_train = pd.concat([X_train_cont_DF, X_train_cat_DF],axis = 1)

X_train.head()

Unnamed: 0,YearBuilt,TotalBsmtSF,MiscVal,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl
0,-0.032378,0.54613,-0.122987,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1.106434,-1.146608,-0.122987,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.236584,-0.67926,-0.122987,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.138972,1.890034,-0.122987,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.813597,-0.583107,-0.122987,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


##### model training

In [20]:
# fit tthe model using LinearRegression

from sklearn.linear_model import LinearRegression

LinReg = LinearRegression()

LinReg.fit(X_train, y_train)


  array.dtypes.apply(is_sparse).any()):


LinearRegression()

##### Persisting the trained model - step 2 - No 2i

In [21]:
# switching to the models directory

!cd ../models

In [22]:
!pwd

/Users/OLALYTICS/dsp-olanrewaju-adegoke/notebooks


In [23]:
!ls

house_prices_modeling.ipynb           model-industrialization-1.ipynb
model-industrialization-1-Copy1.ipynb my-1st-notebook.ipynb


In [24]:
# saving the LinearRegression to the models folder

In [25]:
import joblib

# my_model is : LinReg

# my_folder name is : models

# joblib the model

joblib.dump(LinReg, '../models/LinReg.joblib')


['../models/LinReg.joblib']

### Model evaluation for the Splitted X_test

###### preprocessing of the X_test

In [26]:
# OneHotEncoding for categorical variables

from sklearn.preprocessing import OneHotEncoder

oneHot = OneHotEncoder(drop = 'first', sparse = False)

# OneHotEncoding for the test set

oneHot.fit(X_test[categorical_features])

X_test_cat = oneHot.transform(X_test[categorical_features])

X_test_cat_DF = pd.DataFrame(X_test_cat, columns = oneHot.get_feature_names(categorical_features ))


In [27]:
# StandardScaling for the continuous variables

from sklearn.preprocessing import StandardScaler

# StandardScaling for the test set

stdScaler = StandardScaler()

stdScaler.fit(X_test[continuous_features])

X_test_cont = stdScaler.transform(X_test[continuous_features])

X_test_cont_DF = pd.DataFrame(X_test_cont, columns = continuous_features)



  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):


In [28]:
# combining the test set for categorical and continuous features

X_test = pd.concat([X_test_cont_DF, X_test_cat_DF],axis = 1)

X_test.head()

Unnamed: 0,YearBuilt,TotalBsmtSF,MiscVal,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl
0,-0.308311,0.016827,-0.064496,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.765085,0.984199,-0.064496,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-1.554834,-0.105292,-0.064496,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,-0.862321,-0.11487,-0.064496,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.215218,1.360134,-0.064496,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


##### model predictions on the X_test

In [29]:
# making predictions on the test set

y_predicted = LinReg.predict(X_test)

y_predicted[:5]

  array.dtypes.apply(is_sparse).any()):


array([154881.29770683, 277749.55163658, 101224.12235707, 152917.79658876,
       246174.67874271])

### Model evaluation and model performance

In [30]:
from sklearn.metrics import mean_squared_error, mean_squared_log_error

def compute_rmsle(y_test: np.ndarray, y_predicted: np.ndarray, precision: int = 2) -> float:
    
    msle = mean_squared_log_error(y_test, y_predicted)
    
    rmsle = np.sqrt(msle)
    
    return round(msle, precision), round(rmsle, precision)


In [31]:

compute_rmsle(y_test, y_predicted)
      
      

(0.07, 0.27)

#### Step 1 - No 8

In [32]:
dataPATH = '/Users/OLALYTICS/dsp-olanrewaju-adegoke/data/'

dataset = dataPATH + 'test.csv'


In [33]:
# function to read dataset

def read_data(dataset):
    """
    Reads a dataset from a CSV file.

    Args:
        file_path (str): The file path of the CSV file.

    Returns:
        pd.DataFrame: The loaded dataset as a pandas DataFrame.
    """
    try:
        data_csv = pd.read_csv(dataset)
        
        return data_csv
    
    except Exception as e:
        
        print(f"Error: {e}")
        
        return None


In [34]:
testtt = read_data(dataset)

testtt.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


### Model Inference - Inference is done on the test.csv dataset

In [35]:
# loading and reading the given test.csv dataset

dataPATH = '/Users/OLALYTICS/dsp-olanrewaju-adegoke/data/'

dataset = dataPATH + 'test.csv'

test_csv_master = pd.read_csv(dataset)

test_csv = test_csv_master.copy()

test_csv.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


#### preprocessing of the test_csv dataset

In [36]:
# defining the categorical and continuous features of the test_csv

categorical_features = ['MSZoning','HouseStyle']

continuous_features = ['YearBuilt','TotalBsmtSF','MiscVal']


In [37]:
# selecting the categorical and continuous features of the test_csv

test_csv_features = test_csv[categorical_features + continuous_features]

test_csv_features.head()


Unnamed: 0,MSZoning,HouseStyle,YearBuilt,TotalBsmtSF,MiscVal
0,RH,1Story,1961,882.0,0
1,RL,1Story,1958,1329.0,12500
2,RL,2Story,1997,928.0,0
3,RL,2Story,1998,926.0,0
4,RL,1Story,1992,1280.0,0


# OneHotEncoding for categorical variables

from sklearn.preprocessing import OneHotEncoder

# OneHotEncoding for test_csv

oneHot = OneHotEncoder(drop = 'first', sparse = False)

oneHot.fit(test_csv_features[categorical_features])

test_csv_cat = oneHot.transform(test_csv_features[categorical_features])

test_csv_cat_DF = pd.DataFrame(test_csv_cat, columns = oneHot.get_feature_names(categorical_features ))


##### Persist for the inference dataset for OneHotEncoder - loading the existing joblib files to perform the task

In [40]:
# checking for missing and NaN values and correcting it

test_csv_features.isna().sum()

test_csv_features.dropna(inplace = True)

test_csv_features.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_csv_features.dropna(inplace = True)


Unnamed: 0,MSZoning,HouseStyle,YearBuilt,TotalBsmtSF,MiscVal
0,RH,1Story,1961,882.0,0
1,RL,1Story,1958,1329.0,12500
2,RL,2Story,1997,928.0,0
3,RL,2Story,1998,926.0,0
4,RL,1Story,1992,1280.0,0


In [38]:
import joblib

loaded_oneHot = joblib.load('../models/oneHot.joblib')

In [41]:
test_csv_cat = loaded_oneHot.transform(test_csv_features[categorical_features])


In [42]:
test_csv_cat_DF = pd.DataFrame(test_csv_cat, columns = loaded_oneHot.get_feature_names(categorical_features ))


# StandardScaling for the continuous variables

from sklearn.preprocessing import StandardScaler

# StandardScaling for the test_csv

stdScaler = StandardScaler()

stdScaler.fit(test_csv_features[continuous_features])

test_csv_cont = stdScaler.transform(test_csv_features[continuous_features])

test_csv_cont_DF = pd.DataFrame(test_csv_cont, columns = continuous_features)



##### Persist for the inference dataset for StandardScaler - loading the existing joblib files to perform the task

In [43]:
import joblib

loaded_stdScaler = joblib.load('../models/stdScaler.joblib')

In [44]:

test_csv_cont = loaded_stdScaler.transform(test_csv_features[continuous_features])


  array.dtypes.apply(is_sparse).any()):


In [46]:
test_csv_cont_DF = pd.DataFrame(test_csv_cont, columns = continuous_features)


In [47]:
# combining the test set for categorical and continuous features

test_set = pd.concat([test_csv_cont_DF, test_csv_cat_DF],axis = 1)

test_set.tail()

Unnamed: 0,YearBuilt,TotalBsmtSF,MiscVal,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl
1449,-0.032378,-1.148844,-0.122987,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1450,-0.032378,-1.148844,-0.122987,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1451,-0.357753,0.367241,-0.122987,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1452,0.683447,-0.330426,1.948222,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1453,0.715984,-0.142593,-0.122987,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


# already taken into consideration

# checking for missing and NaN values and correcting it

test_set.isna().sum()

test_set.dropna(inplace = True)

test_set.head()

#### predicting the house prices using the test_csv dataset

# making predictions on the test_csv set

y_predicted_test_csv = LinReg.predict(test_set)

y_predicted_test_csv[:5]

### Step 2 - No 2ii

In [None]:
# load the model from models folder

In [48]:
LinReg_model = joblib.load('../models/LinReg.joblib')

In [49]:
pred = LinReg_model.predict(test_set)

pred[:5]

  array.dtypes.apply(is_sparse).any()):


array([112202.71314909, 171175.40084421, 225391.38660654, 225935.80195261,
       197418.35870414])

#### Step 2

### Object Persistance

#### done!!!

#### selecting the features with continuous and categorical variables and also the target variable known as the label.

In [None]:
#useful_data = house_data_csv[['MSZoning','HouseStyle','YearBuilt','TotalBsmtSF','MiscVal','SalePrice']]

#useful_data.head()

# checking for missing values

useful_data.info()

# checking for NaN

useful_data.isna().sum()

# checking for duplicate and correcting it

useful_data.drop_duplicates(inplace = True)

useful_data.duplicated(keep = 'first').sum()

# defining the categorical and continuous features

catFeature = ['MSZoning','HouseStyle']

contFeature = ['YearBuilt','TotalBsmtSF','MiscVal']


# selecting the features and target

features = useful_data[catFeature + contFeature]

target = useful_data['SalePrice']

#### Features engineering - OneHotEncoding for categorical variables and Scaling for continuous variables

# import the libraries

from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Encoding the categorical variables

onehot_encoder = OneHotEncoder(drop = 'first', sparse = False)

onehot_encoder.fit(features[catFeature])

catFeature_encoded = onehot_encoder.transform(features[catFeature])

catFeature_encoded_DF = pd.DataFrame(catFeature_encoded, columns = onehot_encoder.get_feature_names(catFeature))

catFeature_encoded_DF.head()


# Scaling the continuous variables

std_scaler = StandardScaler()

std_scaler.fit(features[contFeature])

contFeature_scaled = std_scaler.transform(features[contFeature])

contFeature_scaled_DF = pd.DataFrame(contFeature_scaled, columns = contFeature)

contFeature_scaled_DF.head()


In [None]:
# combining the dataset after full scaling

combined_dataset = pd.concat([contFeature_scaled_DF, catFeature_encoded_DF, target], axis = 1)

combined_dataset.tail()

# checking for NaN and removing it

combined_dataset.isna().sum()

combined_dataset.dropna(inplace = True)

combined_dataset.head()


# selecting the X and y from the final_dataset

X = combined_dataset.drop('SalePrice', axis = 1)

X.head()

y = combined_dataset['SalePrice']

y.head()

In [None]:
##### train and test split for training the model

from sklearn.model_selection import train_test_split



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [None]:
##### fitting of the model by using the LinearRegression

from sklearn.linear_model import LinearRegression

LinReg = LinearRegression()

LinReg.fit(X_train, y_train)

In [None]:
##### predicting the housing prices using the LinearRegression Model

y_predicted = LinReg.predict(X_test)

y_predicted[:5]

In [None]:
##### evaluating the model using the mean_squared_log_error

from sklearn.metrics import mean_squared_log_error

def compute_rmsle(y_test: np.ndarray, y_predicted: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_predicted))
    return round(rmsle, precision)


print('The mean squared log error is:', compute_rmsle(y_test, y_predicted))


#### testing the equality of the dataframe using parquet

In [None]:
#dataPATH = '/Users/OLALYTICS/dsp-olanrewaju-adegoke/data/'

In [None]:
#combined_dataset.head()

In [None]:
# saving the dataset into parquet

#combined_dataset.to_parquet(dataPATH + 'combined_parq_df.parquet', index = False)

In [None]:
# to check the available data in the local repository

#!ls '/Users/OLALYTICS/dsp-olanrewaju-adegoke/data/'

In [None]:
# reading data from the parquet

#combined_parq_df = pd.read_parquet(dataPATH + 'combined_parq_df.parquet')

#combined_parq_df.head()

#### testing for difference or bugs

In [None]:
# resetting the index of both dataset

In [None]:
#combined_parq_df = combined_parq_df.reset_index(drop = True)

#combined_dataset = combined_dataset.reset_index(drop = True)

In [None]:
# the testing code to be sure the output didn't change regardless of refactoring

#pd.testing.assert_frame_equal(combined_parq_df, combined_dataset)

In [None]:
# to ascertain the testinng is functional and working

#pd.testing.assert_frame_equal(combined_parq_df.drop(columns = ['YearBuilt']), combined_dataset)

