# House Price Prediction Using Regularization for Surprise Housing

## Importing and Reading data

In [1]:
# Suppress warnings: ignore every warning category for the rest of the session
import warnings
warnings.filterwarnings('ignore')

In [2]:
# import python libraries
import pandas as pd
import numpy as np

In [3]:
# Load the training CSV into the `train_df` DataFrame and preview its first rows
train_df = pd.read_csv('Datasets/train.csv')
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Dimensions of the data as (rows, columns)
train_df.shape

(1460, 81)

In [5]:
# Summarise each column's non-null count and dtype to spot columns with missing values
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

We have 19 fields with Null values, let us print the field names.

In [6]:
# Names of the columns that contain at least one missing value
train_df.columns[train_df.isnull().any()].tolist()

['LotFrontage',
 'Alley',
 'MasVnrType',
 'MasVnrArea',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

### Fixing Null values in the Categorical fields
#### Updating Null values in the categorical field 'Alley'

Let us check the values in the field "Alley"

In [7]:
# Frequency of each non-null value in the "Alley" field
train_df['Alley'].value_counts()

Grvl    50
Pave    41
Name: Alley, dtype: int64

Based on Data Description, Let us fill value 'No alley access' in the field "Alley".

In [8]:
# Per the data description, label missing "Alley" entries as 'No alley access'
train_df['Alley'] = train_df['Alley'].fillna('No alley access')
train_df['Alley']

0       No alley access
1       No alley access
2       No alley access
3       No alley access
4       No alley access
             ...       
1455    No alley access
1456    No alley access
1457    No alley access
1458    No alley access
1459    No alley access
Name: Alley, Length: 1460, dtype: object

#### Updating Null values in the categorical fields 'BsmtQual' and 'BsmtCond'

In [9]:
# Frequency of each non-null value in the "BsmtQual" field
train_df['BsmtQual'].value_counts()

TA    649
Gd    618
Ex    121
Fa     35
Name: BsmtQual, dtype: int64

In [10]:
# Frequency of each non-null value in the "BsmtCond" field
train_df['BsmtCond'].value_counts()

TA    1311
Gd      65
Fa      45
Po       2
Name: BsmtCond, dtype: int64

Let us fill value 'NA' in the fields "BsmtQual" and "BsmtCond" to keep the consistency in the values.

In [11]:
# Per the data description, label missing "BsmtQual" entries as 'NA', then re-check the counts
train_df['BsmtQual'] = train_df['BsmtQual'].fillna('NA')
train_df['BsmtQual'].value_counts()

TA    649
Gd    618
Ex    121
NA     37
Fa     35
Name: BsmtQual, dtype: int64

In [12]:
# Per the data description, label missing "BsmtCond" entries as 'NA', then re-check the counts
train_df['BsmtCond'] = train_df['BsmtCond'].fillna('NA')
train_df['BsmtCond'].value_counts()

TA    1311
Gd      65
Fa      45
NA      37
Po       2
Name: BsmtCond, dtype: int64

#### Updating Null values in the categorical field 'BsmtExposure'

In [13]:
# Frequency of each non-null value in the "BsmtExposure" field
train_df['BsmtExposure'].value_counts()

No    953
Av    221
Gd    134
Mn    114
Name: BsmtExposure, dtype: int64

In [14]:
# Per the data description, 'NA' stands for 'No Basement': use it for missing "BsmtExposure" entries
train_df['BsmtExposure'] = train_df['BsmtExposure'].fillna('NA')
train_df['BsmtExposure'].value_counts()

No    953
Av    221
Gd    134
Mn    114
NA     38
Name: BsmtExposure, dtype: int64

#### Updating Null values in the categorical field 'BsmtFinType1'

In [15]:
# Frequency of each non-null value in the "BsmtFinType1" field
train_df['BsmtFinType1'].value_counts()

Unf    430
GLQ    418
ALQ    220
BLQ    148
Rec    133
LwQ     74
Name: BsmtFinType1, dtype: int64

In [16]:
# Per the data description, 'NA' stands for 'No Basement': use it for missing "BsmtFinType1" entries
train_df['BsmtFinType1'] = train_df['BsmtFinType1'].fillna('NA')
train_df['BsmtFinType1'].value_counts()

Unf    430
GLQ    418
ALQ    220
BLQ    148
Rec    133
LwQ     74
NA      37
Name: BsmtFinType1, dtype: int64

#### Updating Null values in the categorical field 'BsmtFinType2'

In [17]:
# Frequency of each non-null value in the "BsmtFinType2" field
train_df['BsmtFinType2'].value_counts()

Unf    1256
Rec      54
LwQ      46
BLQ      33
ALQ      19
GLQ      14
Name: BsmtFinType2, dtype: int64

In [18]:
# Per the data description, 'NA' stands for 'No Basement': use it for missing "BsmtFinType2" entries
train_df['BsmtFinType2'] = train_df['BsmtFinType2'].fillna('NA')
train_df['BsmtFinType2'].value_counts()

Unf    1256
Rec      54
LwQ      46
NA       38
BLQ      33
ALQ      19
GLQ      14
Name: BsmtFinType2, dtype: int64

#### Updating Null values in the categorical field 'FireplaceQu'

In [19]:
# Frequency of each non-null value in the "FireplaceQu" field
train_df['FireplaceQu'].value_counts()

Gd    380
TA    313
Fa     33
Ex     24
Po     20
Name: FireplaceQu, dtype: int64

In [20]:
# Per the data description, 'NA' stands for 'No Fireplace': use it for missing "FireplaceQu" entries
train_df['FireplaceQu'] = train_df['FireplaceQu'].fillna('NA')
train_df['FireplaceQu'].value_counts()

NA    690
Gd    380
TA    313
Fa     33
Ex     24
Po     20
Name: FireplaceQu, dtype: int64

#### Updating Null values in the categorical field 'GarageType'

In [21]:
# Frequency of each non-null value in the "GarageType" field
train_df['GarageType'].value_counts()

Attchd     870
Detchd     387
BuiltIn     88
Basment     19
CarPort      9
2Types       6
Name: GarageType, dtype: int64

In [22]:
# Per the data description, 'NA' stands for 'No Garage': use it for missing "GarageType" entries
train_df['GarageType'] = train_df['GarageType'].fillna('NA')
train_df['GarageType'].value_counts()

Attchd     870
Detchd     387
BuiltIn     88
NA          81
Basment     19
CarPort      9
2Types       6
Name: GarageType, dtype: int64

#### Updating Null values in the categorical field 'GarageFinish'

In [23]:
# Frequency of each non-null value in the "GarageFinish" field
train_df['GarageFinish'].value_counts()

Unf    605
RFn    422
Fin    352
Name: GarageFinish, dtype: int64

In [24]:
# Per the data description, a missing "GarageFinish" means there is no garage: label it 'NA'
train_df['GarageFinish'] = train_df['GarageFinish'].fillna('NA')
train_df['GarageFinish'].value_counts()

Unf    605
RFn    422
Fin    352
NA      81
Name: GarageFinish, dtype: int64

#### Updating Null values in the categorical field 'GarageQual'

In [25]:
# Frequency of each non-null value in the "GarageQual" field
train_df['GarageQual'].value_counts()

TA    1311
Fa      48
Gd      14
Ex       3
Po       3
Name: GarageQual, dtype: int64

In [26]:
# Per the data description, a missing "GarageQual" means there is no garage: label it 'NA'
train_df['GarageQual'] = train_df['GarageQual'].fillna('NA')
train_df['GarageQual'].value_counts()

TA    1311
NA      81
Fa      48
Gd      14
Ex       3
Po       3
Name: GarageQual, dtype: int64

#### Updating Null values in the categorical field 'GarageCond'

In [27]:
# Frequency of each non-null value in the "GarageCond" field
train_df['GarageCond'].value_counts()

TA    1326
Fa      35
Gd       9
Po       7
Ex       2
Name: GarageCond, dtype: int64

In [28]:
# Per the data description, a missing "GarageCond" means there is no garage: label it 'NA'
train_df['GarageCond'] = train_df['GarageCond'].fillna('NA')
train_df['GarageCond'].value_counts()

TA    1326
NA      81
Fa      35
Gd       9
Po       7
Ex       2
Name: GarageCond, dtype: int64

#### Updating Null values in the categorical field 'PoolQC'

In [29]:
# Frequency of each non-null value in the "PoolQC" field
train_df['PoolQC'].value_counts()

Gd    3
Ex    2
Fa    2
Name: PoolQC, dtype: int64

In [30]:
# Per the data description, 'NA' stands for 'No Pool': use it for missing "PoolQC" entries
train_df['PoolQC'] = train_df['PoolQC'].fillna('NA')
train_df['PoolQC'].value_counts()

NA    1453
Gd       3
Ex       2
Fa       2
Name: PoolQC, dtype: int64

#### Updating Null values in the categorical field 'Fence'

In [31]:
# Frequency of each non-null value in the "Fence" field
train_df['Fence'].value_counts()

MnPrv    157
GdPrv     59
GdWo      54
MnWw      11
Name: Fence, dtype: int64

In [32]:
# Per the data description, 'NA' stands for 'No Fence': use it for missing "Fence" entries
train_df['Fence'] = train_df['Fence'].fillna('NA')
train_df['Fence'].value_counts()

NA       1179
MnPrv     157
GdPrv      59
GdWo       54
MnWw       11
Name: Fence, dtype: int64

#### Updating Null values in the categorical field 'MiscFeature'

In [33]:
# Frequency of each non-null value in the "MiscFeature" field
train_df['MiscFeature'].value_counts()

Shed    49
Gar2     2
Othr     2
TenC     1
Name: MiscFeature, dtype: int64

In [34]:
# Per the data description, 'NA' stands for 'None': use it for missing "MiscFeature" entries
train_df['MiscFeature'] = train_df['MiscFeature'].fillna('NA')
train_df['MiscFeature'].value_counts()

NA      1406
Shed      49
Gar2       2
Othr       2
TenC       1
Name: MiscFeature, dtype: int64

#### Updating Null values in the categorical field 'MasVnrType'

In [35]:
# Frequency of each non-null value in the "MasVnrType" field
train_df['MasVnrType'].value_counts()

None       864
BrkFace    445
Stone      128
BrkCmn      15
Name: MasVnrType, dtype: int64

In [36]:
# Only 8 entries are missing, so label them 'NA'.
# NOTE(review): 'NA' stays a separate category from the existing 'None' value -- confirm this is intended.
train_df['MasVnrType'] = train_df['MasVnrType'].fillna('NA')
train_df['MasVnrType'].value_counts()

None       864
BrkFace    445
Stone      128
BrkCmn      15
NA           8
Name: MasVnrType, dtype: int64

#### Updating Null values in the categorical field 'Electrical'

In [37]:
# Frequency of each non-null value in the "Electrical" field
train_df['Electrical'].value_counts()

SBrkr    1334
FuseA      94
FuseF      27
FuseP       3
Mix         1
Name: Electrical, dtype: int64

In [38]:
# Only one entry is missing, so label it 'NA' and re-check the counts
train_df['Electrical'] = train_df['Electrical'].fillna('NA')
train_df['Electrical'].value_counts()

SBrkr    1334
FuseA      94
FuseF      27
FuseP       3
Mix         1
NA          1
Name: Electrical, dtype: int64

### Fixing Null values in the Numerical fields 
#### Updating Null values in the numerical field 'LotFrontage'

In [39]:
# Smallest recorded value in the field 'LotFrontage'.
# Series.min() skips missing values; the built-in min() compares NaN element by
# element, so its result depends on row order (it returns NaN if the first row is NaN).
train_df['LotFrontage'].min()

21.0

This means there is no 0 value in the field 'LotFrontage'. <br/>
Let us fill the Null values with the median of the field 'LotFrontage'.

In [40]:
# Median of the non-zero 'LotFrontage' values (median() skips missing entries)
LotFrontage_med = train_df.loc[train_df['LotFrontage'] != 0, 'LotFrontage'].median()
LotFrontage_med

69.0

In [41]:
# Mean of the non-zero 'LotFrontage' values, for comparison with the median
train_df.loc[train_df['LotFrontage'] != 0, 'LotFrontage'].mean()

70.04995836802665

In [42]:
# The median and the mean of 'LotFrontage' are close to each other,
# so impute the missing entries with the median computed above.
train_df['LotFrontage'] = train_df['LotFrontage'].fillna(LotFrontage_med)

#### Updating Null values in the numerical field 'MasVnrArea'

In [43]:
# Frequency of each "MasVnrArea" value, to see whether (and how often) 0 occurs
train_df['MasVnrArea'].sort_values().value_counts()

0.0       861
180.0       8
108.0       8
72.0        8
16.0        7
         ... 
254.0       1
255.0       1
258.0       1
259.0       1
1600.0      1
Name: MasVnrArea, Length: 327, dtype: int64

Thus, there are 861 records with a value of 0. <br/>
Let us check for the non-zero values.

In [44]:
# Frequency of each "MasVnrArea" value once the zero entries are excluded
train_df.loc[train_df['MasVnrArea'] != 0, 'MasVnrArea'].sort_values().value_counts()

180.0     8
72.0      8
108.0     8
120.0     7
16.0      7
         ..
297.0     1
81.0      1
299.0     1
67.0      1
1600.0    1
Name: MasVnrArea, Length: 326, dtype: int64

In [45]:
# Median of the non-zero 'MasVnrArea' values (median() skips missing entries)
MasVnrArea_med = train_df.loc[train_df['MasVnrArea'] != 0, 'MasVnrArea'].median()
MasVnrArea_med

203.0

In [46]:
# Impute the missing 'MasVnrArea' entries with the non-zero median computed above
train_df['MasVnrArea'] = train_df['MasVnrArea'].fillna(MasVnrArea_med)

#### Updating 'GarageYrBlt' field with value 0

In [47]:
# Frequency of each non-null value in the "GarageYrBlt" field
train_df['GarageYrBlt'].value_counts()

2005.0    65
2006.0    59
2004.0    53
2003.0    50
2007.0    49
          ..
1927.0     1
1900.0     1
1906.0     1
1908.0     1
1933.0     1
Name: GarageYrBlt, Length: 97, dtype: int64

In [48]:
# Replace missing "GarageYrBlt" entries with 0.
# NOTE(review): 0 lies far outside the real build years -- confirm later steps treat it as "no garage".
train_df['GarageYrBlt'] = train_df['GarageYrBlt'].fillna(0)

In [49]:
# Re-check the "GarageYrBlt" frequencies after the fill
train_df['GarageYrBlt'].value_counts()

0.0       81
2005.0    65
2006.0    59
2004.0    53
2003.0    50
          ..
1927.0     1
1900.0     1
1906.0     1
1908.0     1
1933.0     1
Name: GarageYrBlt, Length: 98, dtype: int64

In [50]:
# Re-check each column's non-null count and dtype after filling the missing values
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1460 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          1460 non-null   object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [51]:
# Names of the columns that still contain at least one missing value
train_df.columns[train_df.isnull().any()].tolist()

[]