In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from IPython.display import display
%matplotlib inline

In [2]:
df = pd.read_csv('../data/train.csv', index_col='Id', low_memory=False)

In [3]:
# Function for displaying all columns
def display_all(df):
    with pd.option_context("display.max_rows", 1000): 
        with pd.option_context("display.max_columns", 1000): 
            display(df)

display_all(df.tail().transpose())

Id,1456,1457,1458,1459,1460
MSSubClass,60,20,70,20,20
MSZoning,RL,RL,RL,RL,RL
LotFrontage,62,85,66,68,75
LotArea,7917,13175,9042,9717,9937
Street,Pave,Pave,Pave,Pave,Pave
Alley,,,,,
LotShape,Reg,Reg,Reg,Reg,Reg
LandContour,Lvl,Lvl,Lvl,Lvl,Lvl
Utilities,AllPub,AllPub,AllPub,AllPub,AllPub
LotConfig,Inside,Inside,Inside,Inside,Inside


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-null object
Exterior2nd      1460 non-

## Missing values and encoding categorical variables

From looking at variable descriptions we can assume that all null values are just feature values that certain houses don't have. For example basement quality of house with no basement is marked as missing value, whereas in reality it information that can help predict the house price. Therefore I'm not going to drop na values but replace them.

I will replace fields with NA with 0 in case of numerical variables and with "NA" string in case of categorical variables, in order to use OneHotEncoding and LabelEncoding for categorical variables.

In [5]:
# Get list of float variables
floats = (df.dtypes == 'float64')
float_cols = list(floats[floats].index)

# Get list of integer variables
integers = (df.dtypes == 'int64')
integer_cols = list(integers[integers].index)

# List of numerical variables
numerical_cols = float_cols + integer_cols
print(numerical_cols)

['LotFrontage', 'MasVnrArea', 'GarageYrBlt', 'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice']


In [6]:
# Replace na values in every numerical variable with 0
for col in numerical_cols:
    df[col] = df[col].fillna(0)

As we can see from above there are lot of categorical variables (objects)

Pandas has a concept of a category data type, but by default it would not turn anything into a category for you. It is important that validation and test sets will use the same category mappings (in other words, if you used 1 for “high” for a training dataset, then 1 should also be for “high” in validation and test datasets).

In [7]:
# Get list of categorical variables
cat = (df.dtypes == 'object')
cat_cols = list(cat[cat].index)
print(cat_cols)

['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']


In [9]:
# Replacing missing values in categorical variables with "NA" string
for col in cat_cols:
    df[col] = df[col].fillna("NA")

In [10]:
df.isna().sum().sum()

0

Now there are no missing values and we can apply encoding to categorical variables.

When looking at variable decriptions in data_description.txt file, we can see that following columns are ordinal: ExterQual, ExterCond, BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2, HeatingQC, KitchenQual, Functional, FireplaceQu, GarageQual, GarageCond, PoolQC, Fence

In [11]:
ordinal = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence']

In [12]:
nominal =  [x for x in cat_cols if x not in ordinal]

In [14]:
print(len(ordinal), len(nominal), len(cat_cols))

15 28 43


In [15]:
display_all(df.tail().transpose())

Id,1456,1457,1458,1459,1460
MSSubClass,60,20,70,20,20
MSZoning,RL,RL,RL,RL,RL
LotFrontage,62,85,66,68,75
LotArea,7917,13175,9042,9717,9937
Street,Pave,Pave,Pave,Pave,Pave
Alley,,,,,
LotShape,Reg,Reg,Reg,Reg,Reg
LandContour,Lvl,Lvl,Lvl,Lvl,Lvl
Utilities,AllPub,AllPub,AllPub,AllPub,AllPub
LotConfig,Inside,Inside,Inside,Inside,Inside


In [17]:
# Getting dummy variables of all categorical variables
df = pd.get_dummies(df)

In [18]:
display_all(df.tail().transpose())

Id,1456,1457,1458,1459,1460
MSSubClass,60.0,20.0,70.0,20.0,20.0
LotFrontage,62.0,85.0,66.0,68.0,75.0
LotArea,7917.0,13175.0,9042.0,9717.0,9937.0
OverallQual,6.0,6.0,7.0,5.0,5.0
OverallCond,5.0,6.0,9.0,6.0,6.0
YearBuilt,1999.0,1978.0,1941.0,1950.0,1965.0
YearRemodAdd,2000.0,1988.0,2006.0,1996.0,1965.0
MasVnrArea,0.0,119.0,0.0,0.0,0.0
BsmtFinSF1,0.0,790.0,275.0,49.0,830.0
BsmtFinSF2,0.0,163.0,0.0,1029.0,290.0


## Training model

I'm going to use RandomForestRegressor as the baseline model as it has only few statistical assumptions and doesn't assume anything about the data. Since the training set is quite small, I'm going to use cross validation with 5 folds for assessing model performance.

In [22]:
# Splitting df to target and features
trainY = df.pop('SalePrice')
trainX = df

Submissions are evaluated on Root-Mean-Squared-Error (RMSE) between the logarithm of the predicted value and the logarithm of the observed sales price. (Taking logs means that errors in predicting expensive houses and cheap houses will affect the result equally.)

In [29]:
from sklearn.metrics import mean_squared_log_error

def rmsle(testY, predY):
    np.sqrt(mean_squared_log_error(testY, predY))

def print_score(m):
    res = [rmsle(m.predict(trainX, trainY)), rmse(m.predict(testX, testY)),
           m.score(trainX, trainY), m.score(testX, testY)]
    print(res)
    
model = RandomForestRegressor(n_jobs=-1)
%time model.fit(trainX, trainY)
model.score



Wall time: 398 ms


NameError: name 'rmse' is not defined