# Import required packages and datasets

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

In [2]:
# Directory holding the competition CSV files. Change DATA_DIR in this one
# place to run the notebook against a different copy of the data.
DATA_DIR = "/kaggle/input/house-prices-advanced-regression-techniques"
TRAIN_DATASET_PATH = f"{DATA_DIR}/train.csv"
TEST_DATASET_PATH = f"{DATA_DIR}/test.csv"

# Load and preview dataset

In [3]:
# Read the training set, then split the target column off the feature frame.
data = pd.read_csv(TRAIN_DATASET_PATH)
prices = data.pop('SalePrice')  # removes the column from `data` and returns it
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal


In [4]:
# Per-column summary of the feature frame: non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

# Data Preprocessing: Handling missing data
Upon [inspecting the dataset information](#Load-and-preview-dataset), we notice that some features contain `NaN` values.

First, let's identify which features contain `NaN` values and decide how to deal with them, depending on the proportion of missing values.

In [5]:
# Share of missing values per feature, largest first, restricted to features
# that actually have gaps and rendered as percentage strings for display.
null_features = (
    data.isna()
    .mean()
    .sort_values(ascending=False)
    .loc[lambda share: share > 0]
    .map(lambda share: str(round(share * 100, 4)) + "%")
)
null_features

PoolQC          99.5205%
MiscFeature     96.3014%
Alley           93.7671%
Fence           80.7534%
MasVnrType       59.726%
FireplaceQu     47.2603%
LotFrontage     17.7397%
GarageCond       5.5479%
GarageType       5.5479%
GarageYrBlt      5.5479%
GarageQual       5.5479%
GarageFinish     5.5479%
BsmtFinType2     2.6027%
BsmtExposure     2.6027%
BsmtFinType1     2.5342%
BsmtCond         2.5342%
BsmtQual         2.5342%
MasVnrArea       0.5479%
Electrical       0.0685%
dtype: object

Some features, like `PoolQC`, consist mostly of `NaN` values. We remove the six features with the highest share of missing values (about 47% or more) entirely, as they carry too little information for our analysis.

In addition, the `Id` column is irrelevant to the dataset - we will also remove this column.

In [6]:
# Columns removed before modelling: the row identifier plus the features with
# the largest share of missing values.
drop_features = [
    'Id',
    'PoolQC',
    'MiscFeature',
    'Alley',
    'Fence',
    'MasVnrType',
    'FireplaceQu',
]

In [7]:
# Remove the selected columns. The cell overwrites `data`, so running it a
# second time raises KeyError because the columns are already gone.
data = data.drop(drop_features, axis="columns")
data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,61,0,0,0,0,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,42,0,0,0,0,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,35,272,0,0,0,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,84,0,0,0,0,0,12,2008,WD,Normal


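Some columns appear to hold more than one Python type once missing values are taken into account. We detect and print these columns; in the same loop, every column is converted to integer labels with a `LabelEncoder`.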

In [8]:
# One encoder instance is reused; fit_transform refits it for every column.
label_encoder = LabelEncoder()
for col in list(data.columns):
    # Missing values are replaced by the string 'NaN' before the Python types
    # present in the column are collected, so a float column with gaps is
    # reported as [float, str].
    unique_data_types = data[col].apply(lambda x: x if pd.notnull(x) else 'NaN').apply(type).unique()
    if len(unique_data_types) > 1:
        print(col, unique_data_types)
    # Every column -- numeric ones included -- is overwritten with integer labels.
    data[col] = label_encoder.fit_transform(data[col])
data.head()

LotFrontage [<class 'float'> <class 'str'>]
MasVnrArea [<class 'float'> <class 'str'>]
GarageYrBlt [<class 'float'> <class 'str'>]


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,5,3,36,327,1,3,3,0,4,0,...,49,0,0,0,0,0,1,2,8,4
1,0,3,51,498,1,3,3,0,2,0,...,0,0,0,0,0,0,4,1,8,4
2,5,3,39,702,1,0,3,0,4,0,...,30,0,0,0,0,0,8,2,8,4
3,6,3,31,489,1,0,3,0,0,0,...,24,108,0,0,0,0,1,0,8,0
4,5,3,55,925,1,0,3,0,2,0,...,70,0,0,0,0,0,11,2,8,4


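For features with a relatively low proportion of `NaN` values, we simply replace the missing values with zeros. (Note that the label encoding in the previous cell has already mapped every value, including `NaN`, to an integer label, so this step leaves the data unchanged, as the identical preview below shows.)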

In [9]:
# Replace any remaining missing values with 0 and preview the result.
data = data.fillna(0)
data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,5,3,36,327,1,3,3,0,4,0,...,49,0,0,0,0,0,1,2,8,4
1,0,3,51,498,1,3,3,0,2,0,...,0,0,0,0,0,0,4,1,8,4
2,5,3,39,702,1,0,3,0,4,0,...,30,0,0,0,0,0,8,2,8,4
3,6,3,31,489,1,0,3,0,0,0,...,24,108,0,0,0,0,1,0,8,0
4,5,3,55,925,1,0,3,0,2,0,...,70,0,0,0,0,0,11,2,8,4


We wrap the steps above in our first pipeline step, a custom transformer class for processing `NaN` values:

In [10]:
class ProcessNullValues(TransformerMixin, BaseEstimator):
    """Drop sparse columns and label-encode every remaining column.

    Packages the exploratory preprocessing steps so they can run as a
    ``Pipeline`` step.

    NOTE(review): the transformer is stateless. Each ``transform`` call refits
    a ``LabelEncoder`` on the frame it receives, so the integer codes produced
    for two different frames (e.g. train vs. test) are not guaranteed to stand
    for the same original values.
    """

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` to satisfy the scikit-learn API."""
        return self

    def transform(self, X, y=None):
        """Return a new, fully integer-encoded frame without the sparse columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column named in ``drop_features``; ``drop``
            raises ``KeyError`` otherwise.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            The remaining columns of ``X``, each replaced by integer labels.
        """
        drop_features = ['Id', 'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'MasVnrType', 'FireplaceQu']
        # drop() returns a new frame, so the caller's frame is not modified below.
        X = X.drop(columns=drop_features)

        # The encoder is refit on each column in turn.
        label_encoder = LabelEncoder()
        for col in list(X.columns):
            X[col] = label_encoder.fit_transform(X[col])

        # Safety net only: the encoded columns should no longer contain NaN.
        X = X.fillna(0)
        return X

# Data Preprocessing: Encoding categorical features
Upon [inspecting the dataset information](#Load-and-preview-dataset), we notice that some features are categorical (non-numerical). These features must be encoded as numbers before they can be used to train a model.

Note that some features such as `KitchenQual` assign ordinal quality ratings (better quality = better rating). Some machine learning models can handle ordinal categorical data even if you don't encode them in a specific order. For instance, decision trees and random forests can naturally split and make decisions based on ordinal features without explicitly specifying an order.

Let's first identify all categorical columns.

In [11]:
# `data` was fully label-encoded in an earlier cell, so no object-dtype
# columns remain and the index shown below is empty.
categorical_columns = data.select_dtypes(include=['object']).columns
categorical_columns

Index([], dtype='object')

# Feature correlation analysis
In this section, we look for correlations between features, as well as for dominant features that are highly correlated with the housing sales prices.

## Correlations between features

In [12]:
# Disabled cell: heatmap of the pairwise feature correlations.
# feature_correlation = data.corr()
# plt.figure(figsize=(16, 10))
# sns.heatmap(data=feature_correlation, annot=True)

## Correlations between features and housing sales prices

In [13]:
# Disabled cell: ranking of features by their correlation with the sale price.
# NOTE: `data` no longer has a 'SalePrice' column (it was split off into
# `prices` when the data was loaded), so this code would raise KeyError if
# re-enabled as written.
# correlation_data = data.corr()['SalePrice']
# correlation_df = pd.DataFrame({'Feature': correlation_data.index, 'Correlation': correlation_data.values})
# correlation_df = correlation_df.sort_values(by='Correlation', ascending=False)
# correlation_df.reset_index(drop=True, inplace=True)
# correlation_df.head()

# Baseline model: linear regression
We will examine the performance of a simple linear regression (LR) model to serve as a baseline reference for model selection and optimization.

In [14]:
# Repeats the earlier fillna(0) step; `data` should contain no missing values
# at this point, so this is expected to leave it unchanged.
data = data.fillna(0)
data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,5,3,36,327,1,3,3,0,4,0,...,49,0,0,0,0,0,1,2,8,4
1,0,3,51,498,1,3,3,0,2,0,...,0,0,0,0,0,0,4,1,8,4
2,5,3,39,702,1,0,3,0,4,0,...,30,0,0,0,0,0,8,2,8,4
3,6,3,31,489,1,0,3,0,0,0,...,24,108,0,0,0,0,1,0,8,0
4,5,3,55,925,1,0,3,0,2,0,...,70,0,0,0,0,0,11,2,8,4


In [15]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(data, prices, test_size=0.2, random_state=42)

In [16]:
# Fit the baseline on the training split, then score it on the held-out rows.
# The reported "loss" is the mean squared error of the predictions.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)
print(f"Linear regression model loss: {mean_squared_error(y_test, lr_predictions)}")

Linear regression model loss: 1244618207.2241457


# Create pipeline for submission

In [17]:
# Single-step pipeline that applies the custom preprocessing transformer.
pipeline = Pipeline(steps=[('clean', ProcessNullValues())])

# Submission

In [18]:
test_dataset = pd.read_csv(TEST_DATASET_PATH)
# Take an independent copy of the Id column: the submission frame gets a new
# column later, and writing into a slice of test_dataset would trigger
# SettingWithCopyWarning.
submission = test_dataset[['Id']].copy()

In [19]:
# NOTE(review): fit_transform runs on the test frame itself, so the label
# encodings are derived from the test data rather than from the training data
# that lr_model was fitted on -- confirm this is intended.
# The cell overwrites test_dataset; running it a second time fails because the
# dropped columns (e.g. 'Id') are no longer present.
test_dataset = pipeline.fit_transform(test_dataset)

In [20]:
# Predict sale prices for the preprocessed test set with the baseline model.
test_predictions = lr_model.predict(test_dataset)
test_predictions

array([114371.65810669, 160359.99092427, 181255.7531222 , ...,
       173940.87755669, 113397.29450703, 264387.64720496])

In [21]:
# assign() returns a new frame instead of writing a column into what may be a
# slice of another frame (avoids SettingWithCopyWarning), and it makes the cell
# safe to re-run.
submission = submission.assign(SalePrice=test_predictions)
# Do not rebind `submission` to the result of to_csv: it returns None when a
# path is given. index=False writes exactly the columns Id,SalePrice.
submission.to_csv("submission.csv", index=False, encoding='utf-8')

In [22]:
# Show the first lines of the written file as a sanity check.
!head submission.csv

Id,SalePrice
1461,114371.65810669321
1462,160359.9909242739
1463,181255.75312220157
1464,190227.3454971336
1465,186725.9422570404
1466,174109.29690019842
1467,175302.66624869252
1468,153908.46726096174
1469,200611.9748537819
