In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split


In [2]:
# Load the training and test sets, indexed by the 'Id' column
X_full = pd.read_csv('train.csv', index_col='Id')
X_test_full = pd.read_csv('test.csv', index_col='Id')

# Keep only rows that have a target, then split the target off from the predictors
X_full = X_full.dropna(subset=['SalePrice'])
y = X_full['SalePrice']
X_full = X_full.drop(columns=['SalePrice'])

# Partition the predictor columns by dtype: text columns vs. int64/float64 columns
categorical_cols = [col for col, col_dtype in X_full.dtypes.items() if col_dtype == "object"]
numerical_cols = [col for col, col_dtype in X_full.dtypes.items() if col_dtype in ['int64', 'float64']]

In [3]:
# Shape of the predictor frame as (rows, columns)
X_full.shape

(1460, 79)

In [4]:
# Count the NaNs in each categorical column and show only the columns that have any
mis_cat = X_full[categorical_cols].isna().sum()
mis_cat.loc[mis_cat > 0]

Alley           1369
MasVnrType         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [5]:
# Count the NaNs in each continuous (numerical) column and show only the columns that have any
mis_num = X_full[numerical_cols].isna().sum()
mis_num.loc[mis_num > 0]

LotFrontage    259
MasVnrArea       8
GarageYrBlt     81
dtype: int64

# Treatment of missing features

- Missing categorical features (except MiscFeature and Electrical) seem to be caused simply by the house not having that part. In that case, a 'NaN' for feature X can be interpreted as 'the house does not have feature X', which deserves a separate category, so it makes sense to impute these NaNs with a new category.
- MiscFeature: mostly empty; however, 49 of the values are Shed, which prompted me to create a new feature, HasShed (named `Has_shed` in the code), out of it.
- Electrical: Impute by most common category (SBrkr)

- Missing continuous features: we use the median for LotFrontage and GarageYrBlt, and 0 for MasVnrArea.

In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Create an explicit binary feature 'Has_shed' from 'MiscFeature' (1 where the value
# is 'Shed', 0 otherwise, including NaN), then drop the source column.
# Built with assign() so that X_full itself is not modified: re-running this cell, or
# inspecting X_full afterwards, still shows the frame exactly as it was loaded.
X_full_final = (
    X_full
    .assign(Has_shed=np.where(X_full['MiscFeature'] == 'Shed', 1, 0))
    .drop(columns='MiscFeature')
)

# Transformers implementing the imputation choices described above.

# Numerical data: fill gaps with the column median, or with a constant 0
numerical_transformer_median = SimpleImputer(strategy='median')
numerical_transformer_constant = SimpleImputer(strategy='constant', fill_value=0)


def _impute_then_onehot(imputer):
    """Chain `imputer` with a one-hot encoder that ignores categories unseen during fit."""
    return Pipeline(steps=[
        ('imputer', imputer),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])


# Categorical data: NaN becomes its own 'NotPresentInTheHouse' category ...
categorical_transformer_constant = _impute_then_onehot(
    SimpleImputer(strategy='constant', fill_value='NotPresentInTheHouse')
)

# ... or is replaced by the most frequent category.
categorical_transformer_mode = _impute_then_onehot(SimpleImputer(strategy='most_frequent'))

# Remaining categorical columns get the same most-frequent treatment (separate instance).
categorical_transformer_other = _impute_then_onehot(SimpleImputer(strategy='most_frequent'))


# Column groups, one per preprocessing strategy.
# The order inside each list is kept as-is because it fixes the order of the
# transformed output columns.

# Numerical columns imputed with the median
num_col_median = ['LotFrontage', 'GarageYrBlt']

# Numerical columns imputed with a constant
num_col_constant = ['MasVnrArea']

# Categorical columns whose NaN is imputed with a constant category
cat_col_constant = [
    'Alley', 'MasVnrType',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'PoolQC', 'Fence',
]

# Categorical columns imputed with the most frequent category
cat_col_mode = ['Electrical']

# All remaining categorical columns
cat_col_other = [
    'MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
    'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
    'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond',
    'Foundation', 'Heating', 'HeatingQC', 'CentralAir',
    'KitchenQual', 'Functional', 'PavedDrive',
    'SaleType', 'SaleCondition',
]

In [14]:
# Pair each imputation/encoding strategy with the columns it applies to
column_strategies = [
    ('num_median', numerical_transformer_median, num_col_median),
    ('num_constant', numerical_transformer_constant, num_col_constant),
    ('cat_constant', categorical_transformer_constant, cat_col_constant),
    ('cat_mode', categorical_transformer_mode, cat_col_mode),
    ('cat_other', categorical_transformer_other, cat_col_other),
]

# Bundle them into one preprocessor; columns not listed above are passed through unchanged
preprocessor = ColumnTransformer(transformers=column_strategies, remainder='passthrough')

In [15]:
# Break off a validation set from the training data: 80% train / 20% validation,
# with a fixed random_state so the split is the same on every run
split_options = dict(train_size=0.8, test_size=0.2, random_state=0)
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full_final, y, **split_options
)

# Fit the preprocessing on the training split only, then apply it to the validation split
X_train_processed = preprocessor.fit_transform(X_train_full)
X_valid_processed = preprocessor.transform(X_valid_full)

In [9]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

import warnings
# NOTE: this filter is global; it silences every warning for the rest of the
# kernel session, not only the ones raised in this cell.
warnings.filterwarnings('ignore')

# Small grid over the number of boosting rounds and the learning rate.
# Each combination is trained on the training split and scored (MAE) on the
# validation split.
n_estimators = [500,1000,2000]
learning_rate = [0.1,0.05,0.025,0.01]

for n in n_estimators:
    for lr in learning_rate:

        model = XGBRegressor(n_estimators=n, learning_rate=lr)
        model.fit(X_train_processed, y_train, verbose=False)

        predictions = model.predict(X_valid_processed)

        # scikit-learn metrics take (y_true, y_pred), in that order
        mae = mean_absolute_error(y_valid, predictions)
        print('n_estimators: ', n, 'lr: ', lr, 'mae:', mae)

n_estimators:  500 lr:  0.1 mae: 16249.015825663528
n_estimators:  500 lr:  0.05 mae: 15881.692543343323
n_estimators:  500 lr:  0.025 mae: 16436.840967465752
n_estimators:  500 lr:  0.01 mae: 17689.934904751713
n_estimators:  1000 lr:  0.1 mae: 16100.47886344178
n_estimators:  1000 lr:  0.05 mae: 15497.979505565068
n_estimators:  1000 lr:  0.025 mae: 16047.054526969177
n_estimators:  1000 lr:  0.01 mae: 16696.99918396832
n_estimators:  2000 lr:  0.1 mae: 16128.148892337329
n_estimators:  2000 lr:  0.05 mae: 15441.344753317637
n_estimators:  2000 lr:  0.025 mae: 15753.070740582192
n_estimators:  2000 lr:  0.01 mae: 16178.143354023972


In [16]:
# Creating final model
# n_estimators=2000 with learning_rate=0.05 is the combination that printed the
# lowest validation MAE in the grid search above.
model = XGBRegressor(n_estimators=2000, learning_rate=0.05)

# Training preprocessing: refit the preprocessor on all labelled rows
X_full_processed = preprocessor.fit_transform(X_full_final)

# Test preprocessing: apply the same 'Has_shed' feature engineering as for the
# training frame. assign() returns a new frame, so X_test_full is left unmodified.
X_test_full_final = (
    X_test_full
    .assign(Has_shed=np.where(X_test_full['MiscFeature'] == 'Shed', 1, 0))
    .drop(columns='MiscFeature')
)

# The fitted preprocessor expects the test frame to mirror the training frame
assert list(X_test_full_final.columns) == list(X_full_final.columns), \
    "train and test frames must have the same columns in the same order"
X_test_processed = preprocessor.transform(X_test_full_final)

# Train on all labelled data and predict the test set
model.fit(X_full_processed, y)
predictions = model.predict(X_test_processed)


In [20]:
# Submitting
# file_name and message are also handed to the submission cell below
file_name = "solution.csv"
message = "XGBoost"
header = ['Id','SalePrice']

# One row per test house: its Id and the prediction truncated to a whole number
test_ids = X_test_full.index.tolist()
sale_prices = list(map(int, predictions.tolist()))

submission = pd.DataFrame(data=list(zip(test_ids, sale_prices)))
submission.to_csv(file_name, index=False, header=header)

In [21]:
%%bash -s "$file_name" "$message"
# $1 is the submission file and $2 the submission message, both supplied on the
# magic line above. Each is quoted so that a value containing spaces or glob
# characters reaches the kaggle CLI as a single argument.
kaggle competitions submit -c home-data-for-ml-course -f "$1" -m "$2"

Successfully submitted to Housing Prices Competition for Kaggle Learn Users

  0%|          | 0.00/17.0k [00:00<?, ?B/s]100%|██████████| 17.0k/17.0k [00:06<00:00, 2.89kB/s]
