# House Prices - Advanced Regression Techniques

## Import the relevant Libraries

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

## Importing Datasets

In [2]:
train = pd.read_csv("train.csv", index_col="Id")
test = pd.read_csv("test.csv")

In [3]:
test_ids = test["Id"].values
test = pd.read_csv("test.csv", index_col="Id")

### Dataset View

In [4]:
train.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


## OVERVIEW

### Dataset statistics
- **Number of rows: $\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;$ 1460**
- **Number of columns: $\;\;\;\;\;\;\;\;\;\;\;\;\;$ 80**
- **Missing values: $\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;$ 6965** 
- **Missing values (%): $\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;$ 5.96%**

### Number of Rows and Columns

In [5]:
train.shape

(1460, 80)

### Column Names

In [6]:
train.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

### Column INFO

In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

### Column Data Types

In [8]:
train.dtypes

MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 80, dtype: object

## Data Cleaning

### Checking for null values

In [9]:
train.isna().sum()

MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
Street             0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 80, dtype: int64

### Dealing with missing values

### For train dataset

#### Handling values for MSZoning

In [10]:
# Replacing 'C (all)' with 'C'

train["MSZoning"] = train["MSZoning"].replace("C (all)", "C")

#### Handling missing values for LotFrontage

In [11]:
# Replacing the null values with median

train["LotFrontage"].fillna(train["LotFrontage"].median(), inplace=True)

#### Handling missing values for Alley, BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2, FireplaceQu, GarageType, GarageFinish, GarageQual, GarageCond, PoolQC & Fence

In [12]:
# Replacing the missing values with 'NA'

obj_NA = ["Alley", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "FireplaceQu", "GarageType", "GarageFinish", "GarageQual", "GarageCond", "PoolQC", "Fence"]

for i in obj_NA:
    train[i].fillna("NA", inplace=True)

#### Handling missing values for MasVnrType

In [13]:
# Replacing the missing values with 'CBlock'

train["MasVnrType"].fillna("CBlock", inplace=True)

#### Handling the missing values for MasVnrArea & Electrical

In [14]:
drop_na = ["MasVnrArea", "Electrical"]

for i in drop_na:
    print (i, ":", round((train[i].isna().sum()/train.shape[0])*100, 2))

MasVnrArea : 0.55
Electrical : 0.07


Less than 1% of the data are missing in MasVnrArea & Electrical. So, drop only null values.

In [15]:
train = train.dropna(subset=drop_na, axis=0)

#### Handling the missing value for GarageYrBlt

In [16]:
# Replacing the null values with mode

train["GarageYrBlt"].fillna(train["GarageYrBlt"].mode()[0], inplace=True)

#### Handling the missing value for MiscFeature

In [17]:
round(train["MiscFeature"].isna().sum()/train.shape[0]*100, 2)

96.28

More than 90% of the data are missing. So, drop this column.

In [18]:
train.drop(columns=["MiscFeature"], inplace=True)

### For test dataset

#### Handling missing values for MSZoning, MasVnrType, BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, BsmtFullBath, BsmtHalfBath, KitchenQual, Functional, GarageYrBlt, GarageCars, GarageArea & SaleType

In [19]:
# Replacing 'C (all)' with 'C' and the null values with mode

test["MSZoning"] = test["MSZoning"].replace("C (all)", "C")

obj_mode = ["MSZoning", "MasVnrType", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "BsmtFullBath", "BsmtHalfBath", "KitchenQual", "Functional", "GarageYrBlt", "GarageCars", "GarageArea", "SaleType"]

for i in obj_mode:
    test[i].fillna(test[i].mode()[0], inplace=True)

#### Handling missing values for LotFrontage

In [20]:
# Replacing the null values with median

obj_median = ["LotFrontage", "MasVnrArea"]

for i in obj_median:
    test[i].fillna(test[i].median(), inplace=True)

#### Handling missing values for Alley, BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2, FireplaceQu, GarageType, GarageFinish, GarageQual, GarageCond, PoolQC & Fence

In [21]:
# Replacing the missing values with 'NA'

obj_NA = ["Alley", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "FireplaceQu", "GarageType", "GarageFinish", "GarageQual", "GarageCond", "PoolQC", "Fence"]

for i in obj_NA:
    test[i].fillna("NA", inplace=True)

#### Handling missing values for Utilities

In [22]:
# Replacing the null values with 'NoSeWa'

test["Utilities"].fillna("NoSeWa", inplace=True)

#### Handling missing values for Exterior1st

In [23]:
# Replacing the null values with 'Stone'

test["Exterior1st"].fillna("Stone", inplace=True)

#### Handling missing values for Exterior2nd

In [24]:
# Replacing the null values with 'Other'

test["Exterior2nd"].fillna("Other", inplace=True)

#### Handling the missing value for MiscFeature

In [25]:
round(test["MiscFeature"].isna().sum()/test.shape[0]*100, 2)

96.5

More than 90% of the data are missing. So, drop this column.

In [26]:
test.drop(columns=["MiscFeature"], inplace=True)

#### Converting numerical column dtype to 'int64'

In [27]:
int_list = ["LotFrontage", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "BsmtFullBath", "BsmtHalfBath", "GarageYrBlt", "GarageCars", "GarageArea"]

In [28]:
for i in int_list:
    train[i] = train[i].astype("int64")
    test[i] = test[i].astype("int64")

Now there are no null values. So, there is no need for further data cleaning.

## Model Implementation

In [29]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

### Label Encoding

#### For train dataset

In [30]:
label_encoder = preprocessing.LabelEncoder()

for i in train.columns:
    train[i] = label_encoder.fit_transform(train[i])

#### For test dataset

In [31]:
label_encoder = preprocessing.LabelEncoder()

for i in test.columns:
    test[i] = label_encoder.fit_transform(test[i])

### Define the Dependent and the Independent Variables

In [32]:
X = train.drop(columns=["SalePrice"])
y = train["SalePrice"]

### Scaling of Data using StandardScaler

In [33]:
X_scaled = StandardScaler()
X_scaled = X_scaled.fit_transform(X)

### Training the Data

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.1, random_state=36)

#### Adjusted $R^2$

$R^2_{adj.} = 1 - (1-R^2)*\frac{n-1}{n-p-1}$

In [35]:
def adj_R2(x):
    n = X.shape[0]
    p = X.shape[1]
    return (round((1-(1-x)*(n-1)/(n-p-1))*100, 3))

In [36]:
dt = DecisionTreeRegressor()
rf = RandomForestRegressor()
ada = AdaBoostRegressor()
gb = GradientBoostingRegressor()
xgb = XGBRegressor()
knn = KNeighborsRegressor(n_neighbors=9)
svr = SVR(C=10000.0, kernel="rbf")

regressor_array = [dt, rf, ada, gb, xgb, knn, svr]

In [37]:
r2, adj_r2 = [], []

for i in regressor_array:
    i.fit(X_train, y_train)
    y_pred = i.predict(X_test)
    
    r2.append(round(r2_score(y_test, y_pred)*100, 3))
    adj_r2.append(adj_R2(r2_score(y_test, y_pred)))
    
model = ["Decision Tree", "Random Forest", "AdaBoost", "Gradient Boosting", "XGBoost", "k-Nearest Neighbor", "Support Vector Regression"]

In [38]:
Evaluation = pd.DataFrame({"Model":model, "R2 Score":r2, "Adjusted R2":adj_r2})
Evaluation

Unnamed: 0,Model,R2 Score,Adjusted R2
0,Decision Tree,75.29,73.885
1,Random Forest,86.749,85.995
2,AdaBoost,82.169,81.156
3,Gradient Boosting,89.155,88.538
4,XGBoost,84.347,83.457
5,k-Nearest Neighbor,86.157,85.37
6,Support Vector Regression,88.774,88.136


## Conclusion

After comparing different machine learning models based on r2 score and it is found that **Random Forest** & **Gradient Boosting** regressor works efficiently for this dataset.