Import the libraries needed for this machine-learning exercise: predicting housing prices.

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

Import training and test datasets

In [2]:
# Load the competition CSV files; the 'Id' column becomes the row index of each frame.
train, test = (
    pd.read_csv(csv_name, index_col='Id')
    for csv_name in ('train.csv', 'test.csv')
)

Without data in the training dataset the model will not be able to use the variable to train the model, and without data in the test dataset the model cannot use the variable to predict anything. The model should be as or more accurate if we drop variables that don't have at least some values filled in on both the training and test datasets. 

Identify the columns with categorical data; from those, identify the columns whose set of distinct values is the same in both the training and test datasets.

In [3]:
# All categorical (object-dtype) columns of the training data
object_cols = [col for col in train.columns if train[col].dtype == "object"]

# Columns whose set of distinct values is identical in the training and test data
good_label_cols = [col for col in object_cols if 
                   set(train[col]) == set(test[col])]

# Problematic columns that will be dropped from the dataset.
# Built by filtering object_cols instead of taking a set difference, so the
# result follows the dataset's column order and is identical on every run
# (the iteration order of a set of strings changes between kernel restarts).
_good_label_lookup = set(good_label_cols)
bad_label_cols = [col for col in object_cols if col not in _good_label_lookup]

print('Categorical columns that will be label encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)

Categorical columns that will be label encoded: ['Street', 'Alley', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'BldgType', 'RoofStyle', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'CentralAir', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageCond', 'PavedDrive', 'Fence', 'SaleCondition']

Categorical columns that will be dropped from the dataset: ['Functional', 'MSZoning', 'GarageQual', 'Heating', 'SaleType', 'RoofMatl', 'Utilities', 'Exterior1st', 'HouseStyle', 'PoolQC', 'KitchenQual', 'Exterior2nd', 'MiscFeature', 'Electrical', 'Condition2']


Separate SalePrice from training dataset. Split training dataset into a train and validate data-subsets. 

In [4]:
# Separate the prediction target from the features.
# The guard makes this cell safe to re-run: once 'SalePrice' has been removed
# from `train`, running the cell again leaves `y` and `train` as they are
# instead of raising because the column no longer exists.
if 'SalePrice' in train.columns:
    y = train['SalePrice']
    train.drop(columns=['SalePrice'], inplace=True)

In [5]:
# Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable.
split_options = dict(train_size=0.8, test_size=0.2, random_state=1)
X_train, X_valid, y_train, y_valid = train_test_split(train, y, **split_options)

For the same reasons listed above, there is a need to identify columns with no data present in our new train and validate data-subsets. The original bad columns should also be subtracted, as they will never be of use in predicting the SalePrice in the test dataset. 

In [7]:

# Categorical columns whose set of distinct values is identical in the
# training and validation subsets
good_label_cols2 = [col for col in object_cols if 
                   set(X_train[col]) == set(X_valid[col])]

# Problematic columns: categorical columns whose values differ between the two subsets.
# Filtering object_cols (rather than a set difference) keeps the original column order.
_good_label_lookup2 = set(good_label_cols2)
bad_label_cols2 = [col for col in object_cols if col not in _good_label_lookup2]

# Columns that are valid between train and valid and also between full train and test.
# This list becomes part of my_cols and of the ColumnTransformer's column list, so it is
# built as an ordered filter: a set difference would hand the model its categorical
# features in a different order after every kernel restart.
_bad_label_lookup = set(bad_label_cols)
good_good_cols = [col for col in good_label_cols2 if col not in _bad_label_lookup]

print('Categorical columns that will be label encoded:', good_label_cols2)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols2)

print('\nCategorical columns left after dropping all bad columns:', good_good_cols)

Categorical columns that will be label encoded: ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'BldgType', 'HouseStyle', 'ExterQual', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'CentralAir', 'KitchenQual', 'FireplaceQu', 'GarageType', 'GarageFinish', 'PavedDrive', 'Fence', 'SaleCondition']

Categorical columns that will be dropped from the dataset: ['GarageCond', 'Functional', 'HeatingQC', 'ExterCond', 'GarageQual', 'Foundation', 'Heating', 'SaleType', 'RoofMatl', 'Utilities', 'Exterior1st', 'Condition1', 'Neighborhood', 'PoolQC', 'Exterior2nd', 'MasVnrType', 'RoofStyle', 'MiscFeature', 'Electrical', 'Condition2']

Categorical columns left after dropping all bad columns: ['BsmtFinType1', 'Street', 'Alley', 'LotShape', 'BsmtQual', 'FireplaceQu', 'BsmtExposure', 'CentralAir', 'ExterQual', 'BsmtFinType2', 'LotConfig', 'Fence', 'GarageType', 'BsmtCond', 'GarageFinish', 'BldgType', 'SaleCondition', 'PavedDrive', 'LandContour

Identify columns with numerical data, add those to columns with categorical data present in train, validate, and test datasets, then make a copy of train and validate data-subsets with only those columns present.

In [8]:
# Numerical features: every column of X_train stored as int64 or float64
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Final feature list: the retained categorical columns followed by the numerical ones
my_cols = [*good_good_cols, *numerical_cols]

In [9]:
# Independent copies of both subsets, restricted to the selected feature columns
X_train2, X_valid2 = (subset[my_cols].copy() for subset in (X_train, X_valid))

Missing numerical data will be replaced with a constant value (0) using SimpleImputer. Missing categorical data will likewise be filled with a constant (SimpleImputer's default placeholder string for non-numeric columns), and OneHotEncoder will then transform the categories into numerical data. These steps will be packaged together as `preprocessor`. XGBRegressor will be used as the model. The model and preprocessor will be packaged as a pipeline.

In [10]:
# Preprocessing for numerical data: fill missing values with a constant
# (no fill_value is given, so SimpleImputer's default for the column type is used)
numerical_transformer = SimpleImputer(strategy='constant')

# The dense-output flag of OneHotEncoder was renamed from `sparse` to `sparse_output`
# in scikit-learn 1.2 and the old name was removed in 1.4. Try the current name first
# and fall back to the legacy one so the cell runs on both old and new releases.
try:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Preprocessing for categorical data: fill missing values, then one-hot encode.
# Categories not seen during fit are ignored (encoded as all zeros) at predict time.
categorical_transformer = Pipeline(steps=[('impute', SimpleImputer(strategy='constant')),
                                          ('label', one_hot_encoder)])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, good_good_cols)
    ])

# Define model. `random_state` fixes the seed so training is repeatable
# (it was previously misspelled, so no seed was actually being set).
model = XGBRegressor(max_depth=3, random_state=1, n_estimators=1000, learning_rate=0.07,
                     objective='reg:squarederror')

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])



Fit model to training subset and corresponding SalePrice.

In [11]:
# Train the whole pipeline (preprocessing followed by the model) on the training subset
my_pipeline.fit(X=X_train2, y=y_train)

  if getattr(data, 'base', None) is not None and \


Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='constant',
                                                                verbose=0),
                                                  ['MSSubClass', 'LotFrontage',
                                                   'LotArea', 'OverallQual',
                                                   'OverallCo

Use the trained model to predict SalePrice on the validate subset, then calculate how accurate those predictions were on average.

In [12]:
# Predict SalePrice for the held-out validation rows
preds = my_pipeline.predict(X_valid2)

# Mean absolute error between the true and the predicted validation prices
validation_mae = mean_absolute_error(y_true=y_valid, y_pred=preds)
print('MAE:', validation_mae)

MAE: 14819.280581121575


Use the model as is to predict SalePrice from test data.

In [16]:
# Predict SalePrice for the test set using the same feature columns the pipeline was trained on
test_features = test[my_cols]
test_preds = my_pipeline.predict(test_features)

# Assemble the submission table (one row per test Id) and write it to disk
submission_columns = {'Id': test.index, 'SalePrice': test_preds}
output = pd.DataFrame(submission_columns)
output.to_csv('submission7.csv', index=False)

Fit model to full training dataset, then predict price on test dataset

In [17]:
# Refit the pipeline on every labelled row (training + validation) before the final prediction
my_pipeline.fit(train[my_cols], y)

# Keep the refitted model's predictions under their own name. They must be the values
# written below: reusing `test_preds` (from the model fitted on the training subset only)
# would make submission8.csv an exact duplicate of submission7.csv.
test_preds_full = my_pipeline.predict(test[my_cols])

output2 = pd.DataFrame({'Id': test.index, 'SalePrice': test_preds_full})
output2.to_csv('submission8.csv', index=False)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \
