# Housing Prices: Modeling and predicting

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load both competition files, using the 'Id' column as the row index
train = pd.read_csv('data/train.csv', index_col='Id')
test = pd.read_csv('data/test.csv', index_col='Id')

# Separate out the target values, then remove them from the feature frame.
# axis=1 (columns) is spelled out; the previous axis=True only worked
# because True == 1.
target = train['SalePrice']
train = train.drop('SalePrice', axis=1)

***
## Data preprocessing
### Convert 'MSSubClass' from int to str so that the column is treated categorically

In [2]:
# Cast the numeric class code to text in both frames so it is handled as a category
for frame in (train, test):
    frame['MSSubClass'] = frame['MSSubClass'].astype(str)

### Convert ordinal objects to integers

In [3]:
# Simplified mapping helper
def mapper(column, map_dictionary, df):
    """Overwrite ``df[column]`` with its values looked up in ``map_dictionary``.

    The frame is modified in place and nothing is returned. Values that have
    no key in ``map_dictionary`` come out as NaN (``Series.map`` behaviour).
    """
    translated = df[column].map(map_dictionary)
    df[column] = translated
    
# Function to map each column
def column_mapper(df):
    """Encode the ordinal / two-level text columns of ``df`` as integers, in place.

    Each listed column is passed through ``mapper`` with its own lookup table.
    Tables that contain an ``np.nan`` key also give missing entries a code;
    for the other columns, missing or unlisted values end up as NaN.

    Not idempotent: after one call the columns hold numbers, which are not
    keys of these tables, so a second call on the same frame would map the
    encoded values to NaN.
    """
    # Lookup tables shared by several columns
    quality = {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4}
    quality_or_absent = {np.nan: 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
    basement_finish = {np.nan: 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}

    # (column, lookup table) pairs, applied in this order
    encodings = [
        ('Street', {'Grvl': 0, 'Pave': 1}),
        ('Alley', {np.nan: 0, 'Grvl': 1, 'Pave': 2}),
        ('LotShape', {'IR3': 0, 'IR2': 1, 'IR1': 2, 'Reg': 3}),
        ('Utilities', {'ELO': 0, 'NoSeWa': 1, 'NoSewr': 2, 'AllPub': 3}),
        ('LandSlope', {'Gtl': 0, 'Mod': 1, 'Sev': 2}),
        ('ExterQual', quality),
        ('ExterCond', quality),
        ('BsmtQual', quality_or_absent),
        ('BsmtCond', quality_or_absent),
        ('BsmtExposure', {np.nan: 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}),
        ('BsmtFinType1', basement_finish),
        ('BsmtFinType2', basement_finish),
        ('HeatingQC', quality),
        ('CentralAir', {'N': 0, 'Y': 1}),
        # Missing 'Electrical' values share the code of 'Mix'
        ('Electrical', {'FuseP': 0, 'FuseF': 1, 'Mix': 2, np.nan: 2, 'FuseA': 3, 'SBrkr': 4}),
        ('KitchenQual', quality),
        ('Functional', {'Sal': 0, 'Sev': 1, 'Maj2': 2, 'Maj1': 3, 'Mod': 4,
                        'Min2': 5, 'Min1': 6, 'Typ': 7}),
        ('FireplaceQu', quality_or_absent),
        ('GarageFinish', {np.nan: 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}),
        ('GarageQual', quality_or_absent),
        ('GarageCond', quality_or_absent),
        ('PavedDrive', {'N': 0, 'P': 1, 'Y': 2}),
        ('PoolQC', {np.nan: 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4}),
        ('Fence', {np.nan: 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4}),
    ]

    for column_name, codes in encodings:
        mapper(column_name, map_dictionary=codes, df=df)

In [4]:
# Encode the ordinal columns of both the train and the test set (run once per fresh load)
for frame in (train, test):
    column_mapper(frame)

### Address null values

In [5]:
# Function to list all columns with null values
def null_columns_df(df):
    """Summarise the columns of ``df`` that contain at least one null value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by the names of the columns that have nulls, with two columns:
        ``null_count`` (number of null entries) and ``type`` (the column's dtype).
        Empty when ``df`` has no nulls.
    """
    null_counts = df.isnull().sum()
    has_nulls = null_counts > 0

    # Build the result in one step from aligned Series; this avoids assigning
    # a new column onto a filtered slice of another frame.
    return pd.DataFrame({
        'null_count': null_counts[has_nulls],
        'type': df.dtypes[has_nulls],
    })

We found in the exploratory data analysis that the test data set has more columns with null values, and that the null-value columns of the train data set completely overlap with those of the test data set. Therefore, we will use the test data set as a guide to deal with the null values.

#### Object nulls

In [6]:
# Null summary of the test set, narrowed to the object-dtype columns
df = null_columns_df(test)
df[df['type'] == 'object']

Unnamed: 0,null_count,type
MSZoning,4,object
Exterior1st,1,object
Exterior2nd,1,object
MasVnrType,16,object
GarageType,76,object
MiscFeature,1408,object
SaleType,1,object


In [7]:
def fillna_object(df):
    """Fill the missing values of the categorical (object) columns of ``df``, in place.

    * ``MSZoning``, ``Exterior1st``, ``Exterior2nd``, ``MasVnrType`` and
      ``SaleType`` are filled with their most frequent value, computed from
      the frame that is passed in.
    * ``GarageType`` and ``MiscFeature`` are filled with the placeholder
      categories ``'no_garage'`` and ``'no_feature'``.

    Returns None. Each column is assigned back to ``df`` rather than calling
    ``fillna(..., inplace=True)`` on ``df.<column>``: that chained in-place
    form is not guaranteed to update ``df`` and does nothing under pandas
    Copy-on-Write.
    """
    # Columns to be filled with the mode.
    mode_filled = ['MSZoning', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'SaleType']
    for column in mode_filled:
        most_common = df[column].value_counts().idxmax()
        df[column] = df[column].fillna(most_common)

    # Columns where null values are assumed to mean no feature is present
    df['GarageType'] = df['GarageType'].fillna('no_garage')
    df['MiscFeature'] = df['MiscFeature'].fillna('no_feature')

In [8]:
# Fill the categorical nulls of both data sets
for frame in (train, test):
    fillna_object(frame)

#### Numeric nulls

In [9]:
# Null summary of the test set, narrowed to the non-object (numeric) columns
df = null_columns_df(test)
df[df['type'] != 'object']

Unnamed: 0,null_count,type
LotFrontage,227,float64
Utilities,2,float64
MasVnrArea,15,float64
BsmtFinSF1,1,float64
BsmtFinSF2,1,float64
BsmtUnfSF,1,float64
TotalBsmtSF,1,float64
BsmtFullBath,2,float64
BsmtHalfBath,2,float64
KitchenQual,1,float64


In [10]:
def fillna_num(df):
    """Fill the missing values of the numeric columns of ``df``, in place.

    * ``Utilities``, ``KitchenQual``, ``Functional`` and ``GarageYrBlt`` are
      filled with their median, computed from the frame that is passed in.
    * The area / count columns in ``zero_filled`` are filled with 0, on the
      assumption that a null means the feature is not present.

    Returns None. Each column is assigned back to ``df`` rather than calling
    ``fillna(..., inplace=True)`` on ``df.<column>``: that chained in-place
    form is not guaranteed to update ``df`` and does nothing under pandas
    Copy-on-Write.
    """
    # Columns to be filled with the median.
    # GarageYrBlt is a problem, as a null value likely means no garage; using 0
    # would distort the data when scaled, so it is set to the median as well.
    median_filled = ['Utilities', 'KitchenQual', 'Functional', 'GarageYrBlt']
    for column in median_filled:
        df[column] = df[column].fillna(df[column].median())

    # Columns where null values are assumed to mean no feature is present (value=0)
    zero_filled = [
        'LotFrontage', 'MasVnrArea',
        'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
        'BsmtFullBath', 'BsmtHalfBath',
        'GarageCars', 'GarageArea',
    ]
    for column in zero_filled:
        df[column] = df[column].fillna(0)

In [11]:
# Fill the numeric nulls of both data sets
for frame in (train, test):
    fillna_num(frame)

### Address the outlier

In [12]:
# Garage build years in the test set that are later than 2016
test.loc[test['GarageYrBlt'] > 2016, 'GarageYrBlt']

Id
2593    2207.0
Name: GarageYrBlt, dtype: float64

In [13]:
# Row Id 2593 lists a garage year of 2207. The house was built in 2006,
# so the garage is likely to have been built in 2007.
test.at[2593, 'GarageYrBlt'] = 2007

***
## Get dummy variables and scale the data

In [14]:
# One-hot encode the remaining categorical columns of each frame
train = pd.get_dummies(train)
test = pd.get_dummies(test)

# Address the fact that not all categorical features are present in both
# frames, thus columns must be added
train_columns = set(train.columns)
test_columns = set(test.columns)

# These columns must be added as 0's.
# sorted() keeps the order of the added columns the same on every run; the
# iteration order of a set of strings can change between interpreter sessions.
test_columns_zero = sorted(train_columns - test_columns)
train_columns_zero = sorted(test_columns - train_columns)

# Create these columns and set their values to 0
def create_0_columns(df, column_list):
    """Set every column named in ``column_list`` to the constant 0 on ``df``.

    ``df`` is modified in place and nothing is returned. A name that is not
    yet a column is added; a name that already exists is overwritten.
    """
    for missing_name in column_list:
        df[missing_name] = 0

# Add to each frame the dummy columns that only the other frame produced
for frame, missing_columns in ((test, test_columns_zero), (train, train_columns_zero)):
    create_0_columns(frame, missing_columns)

# Give the test frame exactly the column order of the train frame
train_columns = train.columns
test = test.loc[:, train_columns]

# Scale the data: the scaler is fitted on train only, then applied to both sets.
# NOTE: from here on `train` and `test` hold the scaler's output rather than
# the original DataFrames.
from sklearn.preprocessing import StandardScaler
sclr = StandardScaler().fit(train)
train = sclr.transform(train)
test = sclr.transform(test)

Now we are ready to model the data
***
## Data fitting
To fit the data we will use a linear support vector regressor. The LinearSVR fits very quickly with this data set size, so it is easy to use a grid search cross validation on a wide range of parameters.

In [16]:
from sklearn.svm import LinearSVR
from sklearn.model_selection import GridSearchCV

In [17]:
# Search grid: C from 5000 to 7000 in steps of 100, both loss functions,
# epsilon from 5.0 to 6.8 in steps of 0.2
param = {
    'C': list(range(5000, 7010, 100)),
    'loss': ['epsilon_insensitive', 'squared_epsilon_insensitive'],
    'epsilon': np.arange(50, 70, 2) / 10,
}

svr = LinearSVR(random_state=2049)
gridCV = GridSearchCV(
    estimator=svr,
    param_grid=param,
    cv=3,
    verbose=0,
    return_train_score=True,
    n_jobs=-1,
)
gridCV.fit(train, target)

# Make and print results dataframe: the ten best-ranked parameter combinations
cv_result = pd.DataFrame(gridCV.cv_results_)
param_columns = ['param_' + name for name in param]
cv_columns = ['mean_test_score', 'std_test_score', 'rank_test_score'] + param_columns
cv_result.loc[:, cv_columns].sort_values('rank_test_score').head(10)

Unnamed: 0,mean_test_score,std_test_score,rank_test_score,param_epsilon,param_C,param_loss
144,0.732399,0.086663,1,5.4,5700,epsilon_insensitive
130,0.732364,0.086919,2,6.0,5600,epsilon_insensitive
272,0.732358,0.086514,3,6.2,6300,epsilon_insensitive
122,0.732358,0.0871,4,5.2,5600,epsilon_insensitive
252,0.732353,0.086859,5,6.2,6200,epsilon_insensitive
276,0.73231,0.086586,6,6.6,6300,epsilon_insensitive
248,0.732307,0.086928,7,5.8,6200,epsilon_insensitive
132,0.732303,0.087295,8,6.2,5600,epsilon_insensitive
376,0.732278,0.086669,9,6.6,6800,epsilon_insensitive
274,0.732271,0.086496,10,6.4,6300,epsilon_insensitive


### Check the estimated Kaggle score.
The scoring system, as listed on the competition site:
> Submissions are evaluated on Root-Mean-Squared-Error (RMSE) between the logarithm of the predicted value and the logarithm of the observed sales price. (Taking logs means that errors in predicting expensive houses and cheap houses will affect the result equally.)

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training rows to estimate the competition metric
X_train, X_test, y_train, y_test = train_test_split(
    train, target, test_size=.1, random_state=2049
)

# Parameters of the top-ranked combination from the grid search
svr = LinearSVR(random_state=2049, C=5700, loss='epsilon_insensitive', epsilon=5.4)
svr.fit(X_train, y_train)

prediction = svr.predict(X_test)

from sklearn.metrics import log_loss, mean_squared_error
# RMSE between log(actual) and log(predicted).
# NOTE: np.nan_to_num turns a NaN log (from a negative prediction) into 0
# instead of failing, so such rows are scored rather than reported.
log_actual = np.log(y_test.values)
log_predicted = np.nan_to_num(np.log(prediction))
score = mean_squared_error(log_actual, log_predicted)
print('Estimated score:', np.sqrt(score))

Estimated score: 0.12961080050401005


***
## Prepare final submission

In [20]:
# Refit on the full training set with the selected parameters
svr = LinearSVR(random_state=2049, C=5700, loss='epsilon_insensitive', epsilon=5.4)
svr.fit(train, target)

prediction = svr.predict(test)

# `test` was overwritten by the scaler output earlier, so the file is read
# again here to recover the Id index for the submission.
test_df = pd.read_csv('data/test.csv', index_col='Id')
submission_df = pd.DataFrame({'SalePrice': prediction}, index=test_df.index)
submission_df.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,124604.101042
1462,152308.866356
1463,185831.99682
1464,192284.529473
1465,220566.600546


In [None]:
# Write the submission file; the Id index is written as the first column
submission_df.to_csv('submission.csv', index=True)