# House Prices: Advanced Regression Techniques

A Kaggle competition on predicting sales prices and practice feature engineering, regression techniques such as random forests, and gradient boosting 

https://www.kaggle.com/c/house-prices-advanced-regression-techniques/overview

## 1- Load libraries

In [1]:
import numpy as np
import pandas as pd
import sklearn
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import tensorflow as tf
from tensorflow.keras import layers

print(tf.VERSION)
print(tf.keras.__version__)

  from ._conv import register_converters as _register_converters


1.12.0
2.1.6-tf


## 2- Import and preprocess data

In [2]:
data_df = pd.read_csv('data/train.csv')
test_data_df = pd.read_csv('data/test.csv')

Make a copy of data_df and test_data_df for preprocessing.

In [3]:
train_df = data_df.copy()
test_df = test_data_df.copy()

## 3- Exploratory Data Analysis

### Training data

In [4]:
train_df['PoolQC'].fillna('None', inplace=True)
test_df[(test_df['PoolArea'] != 0) & (test_df['PoolQC'].isnull())].loc[:, ['PoolQC']].fillna('Fa', inplace=True)
test_df['PoolQC'].fillna('None', inplace=True)

In [5]:
train_df.drop(columns=['MiscFeature'], inplace=True)
test_df.drop(columns=['MiscFeature'], inplace=True)

In [6]:
train_df['Alley'].fillna('None', inplace=True)
test_df['Alley'].fillna('None', inplace=True)

In [7]:
train_df.drop(columns=['Fence'], inplace=True)
test_df.drop(columns=['Fence'], inplace=True)

In [8]:
train_df['FireplaceQu'].fillna('None', inplace=True)
test_df['FireplaceQu'].fillna('None', inplace=True)

In [9]:
test_df['LotFrontage'].fillna(test_df['LotFrontage'].median(), inplace=True)
train_df['LotFrontage'].fillna(data_df['LotFrontage'].median(), inplace=True)

In [10]:
train_df['GarageCond'].fillna('None', inplace=True)
train_df['GarageType'].fillna('None', inplace=True)
train_df['GarageYrBlt'].fillna('None', inplace=True)
train_df['GarageFinish'].fillna('None', inplace=True)
train_df['GarageQual'].fillna('None', inplace=True)

In [11]:
test_df['GarageArea'].fillna(test_df['GarageArea'].median(), inplace=True)

In [12]:
test_df.at[666, 'GarageYrBlt'] = 1983
test_df.at[1116, 'GarageYrBlt'] = 1999

test_df.at[666, 'GarageFinish'] = 'Unf'
test_df.at[1116, 'GarageFinish'] = 'Unf'

test_df.at[666, 'GarageQual'] = 'TA'
test_df.at[1116, 'GarageQual'] = 'TA'

In [13]:
test_df['GarageCond'].fillna('None', inplace=True)
test_df['GarageType'].fillna('None', inplace=True)
test_df['GarageYrBlt'].fillna('None', inplace=True)
test_df['GarageFinish'].fillna('None', inplace=True)
test_df['GarageQual'].fillna('None', inplace=True)

In [14]:
train_df.at[332, 'BsmtFinType2'] = 'Unf'
train_df.at[948, 'BsmtExposure'] = 'No'

In [15]:
train_df['BsmtExposure'].fillna('None', inplace=True)
train_df['BsmtFinType2'].fillna('None', inplace=True)
train_df['BsmtFinType1'].fillna('None', inplace=True)
train_df['BsmtCond'].fillna('None', inplace=True)
train_df['BsmtQual'].fillna('None', inplace=True)

In [16]:
test_df.at[660, 'BsmtQual'] = None
test_df.at[660, 'BsmtCond'] = None
test_df.at[660, 'BsmtExposure'] = None
test_df.at[660, 'BsmtFinType1'] = None
test_df.at[660, 'BsmtFinSF1'] = 0
test_df.at[660, 'BsmtFinType2'] = None
test_df.at[660, 'BsmtFinSF2'] = 0
test_df.at[660, 'BsmtUnfSF'] = 0
test_df.at[660, 'TotalBsmtSF'] = 0
test_df.at[660, 'BsmtFullBath'] = None
test_df.at[660, 'BsmtHalfBath'] = None

In [17]:
test_df.at[27, 'BsmtExposure'] = 'No'
test_df.at[580, 'BsmtCond'] = 'TA'
test_df.at[725, 'BsmtCond'] = 'TA'
test_df.at[757, 'BsmtQual'] = 'TA'
test_df.at[758, 'BsmtQual'] = 'TA'
test_df.at[888, 'BsmtExposure'] = 'No'
test_df.at[1064, 'BsmtCond'] = 'TA'

In [18]:
test_df['BsmtExposure'].fillna('None', inplace=True)
test_df['BsmtFinType2'].fillna('None', inplace=True)
test_df['BsmtFinType1'].fillna('None', inplace=True)
test_df['BsmtCond'].fillna('None', inplace=True)
test_df['BsmtQual'].fillna('None', inplace=True)

In [19]:
train_df['MasVnrArea'].fillna(0, inplace=True)

In [20]:
train_df['MasVnrType'].fillna('None', inplace=True)

In [21]:
test_df.at[1150, 'MasVnrType'] = 'BrkFace'
test_df['MasVnrType'].fillna('None', inplace=True)
test_df['MasVnrArea'].fillna(test_df['MasVnrArea'].median(), inplace=True)

In [22]:
train_df['Electrical'].fillna('Sbrkr', inplace=True)

In [23]:
test_df['MSZoning'].fillna('RL', inplace=True)
test_df['Utilities'].fillna('AllPub', inplace=True)
test_df['Exterior1st'].fillna('VinylSd', inplace=True)
test_df['Exterior2nd'].fillna('VinylSd', inplace=True)
test_df['BsmtFullBath'].fillna(0, inplace=True)
test_df['BsmtHalfBath'].fillna(0, inplace=True)
test_df['KitchenQual'].fillna('TA', inplace=True)
test_df['Functional'].fillna('Typ', inplace=True)
test_df['GarageCars'].fillna(2, inplace=True)
test_df['SaleType'].fillna('WD', inplace=True)

In [24]:
test_df.isnull().sum().sum()

0

In [25]:
train_df.isnull().sum().sum()

0

### Drop Id, SalePrice

In [26]:
train_df.drop(columns=['Id', 'SalePrice'], inplace=True)
test_df.drop(columns=['Id'], inplace=True)

In [27]:
train_df.shape, test_df.shape

((1460, 77), (1459, 77))

### Process outliers

In [28]:
outliers = {934, 523, 1298, 691, 1182}
print('Current list of outliers ', list(outliers))

Current list of outliers  [934, 523, 1298, 691, 1182]


In [29]:
train_df.drop(index=list(outliers), inplace=True)

In [30]:
data_df.drop(index=list(outliers), inplace=True)

### Process categorical features

In [31]:
dummies = pd.get_dummies(pd.concat((train_df, test_df), axis=0))

In [32]:
dummies.shape

(2914, 395)

In [33]:
X = dummies.iloc[:train_df.shape[0]]
X_test = dummies.iloc[train_df.shape[0]:]

In [34]:
X.shape, X_test.shape

((1455, 395), (1459, 395))