## Predicting the sales prices and practice feature engineering with Linear Regression

**House Prices - Advanced Regression Techniques**

##### data source: https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data?select=train.csv

`Adegoke Olanrewaju`

##### importing the necessary libraries

In [1]:
#!ls /Users/OLALYTICS/dsp-olanrewaju-adegoke/data/

In [2]:
import numpy as np

import pandas as pd

import sklearn

In [3]:
#file_path = '/Users/OLALYTICS/dsp-olanrewaju-adegoke/data/train.csv'

In [4]:
#def load_data(file_path):
    #data = pd.read_csv(file_path)
    #return data.head()

In [5]:
#load_data(file_path)

##### loading the dataset through its path from my machine

In [6]:
dataset_master = pd.read_csv('/Users/OLALYTICS/dsp-olanrewaju-adegoke/data/train.csv')

training_data_csv = dataset_master.copy()

training_data_csv.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [7]:
#testing_data_csv = pd.read_csv('/Users/OLALYTICS/dsp-olanrewaju-adegoke/data/test.csv')

#testing_data_csv.head()

#### Explanatory Data Analysis

In [8]:
training_data_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [9]:
#training_data_csv.describe()

In [10]:
#training_data_csv.corr()

In [11]:
# selecting useful dataset

useful_dataset_house = training_data_csv[['MSZoning','HouseStyle','YearBuilt','TotalBsmtSF','MiscVal', 'SalePrice']]

useful_dataset_house.head()
    

Unnamed: 0,MSZoning,HouseStyle,YearBuilt,TotalBsmtSF,MiscVal,SalePrice
0,RL,2Story,2003,856,0,208500
1,RL,1Story,1976,1262,0,181500
2,RL,2Story,2001,920,0,223500
3,RL,2Story,1915,756,0,140000
4,RL,2Story,2000,1145,0,250000


In [12]:
# no missing values

useful_dataset_house.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   MSZoning     1460 non-null   object
 1   HouseStyle   1460 non-null   object
 2   YearBuilt    1460 non-null   int64 
 3   TotalBsmtSF  1460 non-null   int64 
 4   MiscVal      1460 non-null   int64 
 5   SalePrice    1460 non-null   int64 
dtypes: int64(4), object(2)
memory usage: 68.6+ KB


In [13]:
# checking for duplicates

useful_dataset_house.duplicated(keep = 'first').sum()

1

In [14]:
# removing duplicates

useful_dataset_house = useful_dataset_house.drop_duplicates()

useful_dataset_house.duplicated(keep = 'first').sum()

0

In [15]:
# cleaned useful dataset

useful_dataset_house.head()

Unnamed: 0,MSZoning,HouseStyle,YearBuilt,TotalBsmtSF,MiscVal,SalePrice
0,RL,2Story,2003,856,0,208500
1,RL,1Story,1976,1262,0,181500
2,RL,2Story,2001,920,0,223500
3,RL,2Story,1915,756,0,140000
4,RL,2Story,2000,1145,0,250000


In [16]:
# extracting the dataset of interest

categorical_useful_features = useful_dataset_house[['MSZoning','HouseStyle']]

continuous_useful_features = useful_dataset_house[['YearBuilt','TotalBsmtSF','MiscVal']]

target_feature = useful_dataset_house['SalePrice']


In [17]:
# OneHotEncoder for categorical features

from sklearn.preprocessing import OneHotEncoder

onehot_encoder = OneHotEncoder()

categorical_useful_features_encoded = onehot_encoder.fit_transform(categorical_useful_features).toarray()

categorical_useful_features_encoded = pd.DataFrame(categorical_useful_features_encoded, columns = onehot_encoder.get_feature_names())

categorical_useful_features_encoded.head()


Unnamed: 0,x0_C (all),x0_FV,x0_RH,x0_RL,x0_RM,x1_1.5Fin,x1_1.5Unf,x1_1Story,x1_2.5Fin,x1_2.5Unf,x1_2Story,x1_SFoyer,x1_SLvl
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [18]:
# scaling of the continuous features

from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()

std_scaler.fit(continuous_useful_features)

StandardScaler()

In [19]:
continuous_useful_features_scaled = std_scaler.transform(continuous_useful_features)

continuous_useful_features_scaled[:6]


array([[ 1.05180032, -0.45928807, -0.08771809],
       [ 0.15748627,  0.4661749 , -0.08771809],
       [ 0.98555484, -0.31340228, -0.08771809],
       [-1.86300104, -0.68723461, -0.08771809],
       [ 0.95243209,  0.19947744, -0.08771809],
       [ 0.7205729 , -0.59605599,  1.32322599]])

In [20]:
continuous_useful_features_scaled = pd.DataFrame(data = continuous_useful_features_scaled, columns = ['YearBuilt','TotalBsmtSF','MiscVal'])

continuous_useful_features_scaled.head()


Unnamed: 0,YearBuilt,TotalBsmtSF,MiscVal
0,1.0518,-0.459288,-0.087718
1,0.157486,0.466175,-0.087718
2,0.985555,-0.313402,-0.087718
3,-1.863001,-0.687235,-0.087718
4,0.952432,0.199477,-0.087718


In [21]:
# combining to form the clean dataset

In [22]:
final_dataset = continuous_useful_features_scaled.join(categorical_useful_features_encoded).join(target_feature)

final_dataset.head()


Unnamed: 0,YearBuilt,TotalBsmtSF,MiscVal,x0_C (all),x0_FV,x0_RH,x0_RL,x0_RM,x1_1.5Fin,x1_1.5Unf,x1_1Story,x1_2.5Fin,x1_2.5Unf,x1_2Story,x1_SFoyer,x1_SLvl,SalePrice
0,1.0518,-0.459288,-0.087718,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,208500.0
1,0.157486,0.466175,-0.087718,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,181500.0
2,0.985555,-0.313402,-0.087718,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,223500.0
3,-1.863001,-0.687235,-0.087718,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,140000.0
4,0.952432,0.199477,-0.087718,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,250000.0


In [23]:
X = final_dataset.drop('SalePrice', axis = 1)

X.head()

Unnamed: 0,YearBuilt,TotalBsmtSF,MiscVal,x0_C (all),x0_FV,x0_RH,x0_RL,x0_RM,x1_1.5Fin,x1_1.5Unf,x1_1Story,x1_2.5Fin,x1_2.5Unf,x1_2Story,x1_SFoyer,x1_SLvl
0,1.0518,-0.459288,-0.087718,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.157486,0.466175,-0.087718,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.985555,-0.313402,-0.087718,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-1.863001,-0.687235,-0.087718,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.952432,0.199477,-0.087718,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [24]:
y = final_dataset[['SalePrice']]

y.head()

Unnamed: 0,SalePrice
0,208500.0
1,181500.0
2,223500.0
3,140000.0
4,250000.0


In [25]:
y['SalePrice'].fillna(y['SalePrice'].mean(), inplace = True) 

y.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


Unnamed: 0,SalePrice
0,208500.0
1,181500.0
2,223500.0
3,140000.0
4,250000.0


In [26]:
y.isna().value_counts()

SalePrice
False        1459
dtype: int64

# selecting the categorical features

categorical_features = useful_dataset_house[['MSZoning','HouseStyle']]

categorical_features.head()

In [27]:
#categorical_features['MSZoning'].unique()

#categorical_features['HouseStyle'].unique()

In [28]:
# Pre-processing and feature engineering

In [29]:
#categorical_columns = categorical_features[['MSZoning','HouseStyle']]

#categorical_columns.head()

##### selecting the features dataset having continuous and categorical variables

In [30]:
#features = training_data_csv[['MSZoning','HouseStyle','YearBuilt','TotalBsmtSF','MiscVal']]

#features.head()

In [31]:
# no missing values

#features.info()

In [32]:
#help(features.duplicated)

#features.duplicated(keep = 'first').sum()

#features.duplicated(keep = 'last').sum()


In [33]:
# removing duplicates

#features = features.drop_duplicates()

#features.head()

In [34]:
#features.duplicated(keep = 'first').sum()

##### selecting the target variable in this case is the SalePrice

In [35]:
#target = training_data_csv['SalePrice']

#target.head()

###### data preprocessing 

In [36]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.3, random_state = 42)



In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [38]:
#X_train.head()

In [39]:
#X_test.head()

In [40]:
#y_train.head()

##### performing encoding on the X_train dataset using the frequency_count

In [41]:
#freq_counts_zoning = X_train['MSZoning'].value_counts()

#freq_counts_zoning

In [42]:
#freq_counts_style = X_train['HouseStyle'].value_counts()

#freq_counts_style

In [43]:
#X_train['MSZoning'] = X_train['MSZoning'].map(freq_counts_zoning)


In [44]:
#X_train['HouseStyle'] = X_train['HouseStyle'].map(freq_counts_style)

In [45]:
#X_train.head()

###### performing encoding of the X_test dataset

In [46]:
#freq_counts_test_zoning = X_test['MSZoning'].value_counts()

In [47]:
#X_test['MSZoning'] = X_test['MSZoning'].map(freq_counts_test_zoning)

In [48]:
#freq_counts_test_style = X_test['HouseStyle'].value_counts()

In [49]:
#X_test['HouseStyle'] = X_test['HouseStyle'].map(freq_counts_test_style)

In [50]:
#X_test.head()

##### scaling the features dataset

from sklearn.preprocessing import StandardScaler

# instantiating the scaler
stdScaler = StandardScaler()

# fit the scaler on the training data set 
stdScaler.fit(X_train)

# transforming the training dataset
X_train_scaled = stdScaler.transform(X_train)

# transforming the test dataset
X_test_scaled = stdScaler.transform(X_test)


###### fitting of the model by using the LinearRegression

from sklearn.linear_model import LinearRegression

LinReg = LinearRegression()

LinReg.fit(X_train_scaled, y_train)

In [51]:
from sklearn.linear_model import LinearRegression

LinReg = LinearRegression()

LinReg.fit(X_train, y_train)

LinearRegression()

##### predicting the housing prices using the LinearRegression Model

y_predicted = LinReg.predict(X_test_scaled)

y_predicted[:5]

In [52]:
y_predicted = LinReg.predict(X_test)

y_predicted[:5]

array([[192318.23075716],
       [175537.3999941 ],
       [195621.19054166],
       [216256.30344516],
       [182494.54308594]])

##### evaluating the model using the mean_squared_log_error

In [53]:
from sklearn.metrics import mean_squared_log_error

def compute_rmsle(y_test: np.ndarray, y_predicted: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_predicted))
    return round(rmsle, precision)


In [54]:
print('The mean squared log error of the model is:', compute_rmsle(y_test, y_predicted))



The mean squared log error of the model is: 0.43


##### evalauting the model the mean_squared_error to see the difference

In [55]:
from sklearn.metrics import mean_squared_error

def compute_rmsle1(y_test: np.ndarray, y_predicted: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_error(y_test, y_predicted))
    return round(rmsle, precision)

In [56]:
print('The mean squared log error of the model is:', compute_rmsle1(y_test, y_predicted))

The mean squared log error of the model is: 80447.94
