In [1]:
import numpy as np 
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import norm, skew

---
## Data Input

In [2]:
## Reading data
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

#Save the 'Id' column
train_ID = train['Id']
test_ID = test['Id']

#Now drop the  'Id' colum since it's unnecessary for  the prediction process.
train.drop("Id", axis = 1, inplace = True)
test.drop("Id", axis = 1, inplace = True)

---
## Data Preprocessing and Feature Engineering

In [3]:
## Log Transformation to the target variable
train["SalePrice"] = np.log1p(train["SalePrice"])

In [4]:
## Extra Abnormal List
abnormal_list = []
abnormal_list.extend(list(train[train['GrLivArea'] > 4500].index))
#abnormal_list.extend(list(train[train['LotFrontage'] > 300].index))
#abnormal_list.extend(list(train[(train['LotFrontage'] > 150) & (train['SalePrice'] < 11)].index))
#abnormal_list.extend(list(train[(train['GarageArea'] > 1200) & (train['SalePrice'] < 12.5)].index))
#abnormal_list.extend(list(train[(train['OpenPorchSF'] > 500) & (train['SalePrice'] < 11)].index))
#abnormal_list.extend(list(train[train['MasVnrArea'] > 1500].index))
#abnormal_list.extend(list(train[(train['PoolArea'] > 500) & (train['SalePrice'] > 13)].index))

abnormal_list = list(set(abnormal_list))
train = train.drop(abnormal_list)

In [5]:
ntrain = train.shape[0]
ntest = test.shape[0]
y_train = train.SalePrice.values
all_data = pd.concat((train, test)).reset_index(drop=True)
all_data.drop(['SalePrice'], axis=1, inplace=True)
print("all_data size is : {}".format(all_data.shape))

all_data size is : (2917, 79)


In [6]:
## Replace Nan Value to the features
# None
fillna_col_list = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 
                   'GarageType', 'GarageFinish', 'GarageQual', 
                   'GarageCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 
                   'BsmtFinType1', 'BsmtFinType2', 'MasVnrType', 'MSSubClass']
for col in fillna_col_list:
    all_data[col] = all_data[col].fillna('None')
    
#Group by neighborhood and fill in missing value by the median LotFrontage of all the neighborhood
all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))

# 0 
fillna_col_list = ['GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 
                   'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 
                   'BsmtHalfBath', 'MasVnrArea']
for col in fillna_col_list:
    all_data[col] = all_data[col].fillna(0)
    
# Mode
fillna_col_list = ['Electrical', 'Exterior1st', 'Exterior2nd', 
                   'KitchenQual', 'MSZoning', 'SaleType']
for col in fillna_col_list:
    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])

## Special Handling based on the requirement
all_data["Functional"] = all_data["Functional"].fillna("Typ")

## Dropping the useless column
all_data = all_data.drop(['Utilities'], axis=1)

## Creating New Features
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

In [7]:
## Changing Numeric Features into Categorical Features
all_data['MSSubClass'] = all_data['MSSubClass'].apply(str)
all_data['OverallCond'] = all_data['OverallCond'].astype(str)
all_data['YrSold'] = all_data['YrSold'].astype(str)
all_data['MoSold'] = all_data['MoSold'].astype(str)

In [8]:
from sklearn import preprocessing

## Type1 of quality (Ex, Gd, TA, Fa, Po, None)
quality_col1 = ['ExterQual', 'HeatingQC', 'KitchenQual', 'FireplaceQu',
                'GarageQual', 'PoolQC', 'BsmtCond', 'BsmtQual', 'ExterCond',
                'GarageCond']
quality_dict1 = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
for col in quality_col1:
    all_data[col] = all_data[col].apply(lambda x: quality_dict1[x])

## Type2 of quality (GLQ, ALQ, ...)
quality_col2 = ['BsmtFinType1', 'BsmtFinType2']
quality_dict2 = {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6} 
for col in quality_col2:
    all_data[col] = all_data[col].apply(lambda x: quality_dict2[x])

## Type 3 of quality (GdPrv, MnPrv, ...)
quality_col3 = ['Fence']
quality_dict3 = {'None': 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4} 
for col in quality_col3:
    all_data[col] = all_data[col].apply(lambda x: quality_dict3[x])

## Type 4 of quality (Gd, Av, Mn, No, None)
quality_col4 = ['BsmtExposure']
quality_dict4 = {'None': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4} 
for col in quality_col4:
    all_data[col] = all_data[col].apply(lambda x: quality_dict4[x])

## Type 5 of quality (Fin, Rfn, Unf, None)
quality_col5 = ['GarageFinish']
quality_dict5 = {'None': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3} 
for col in quality_col5:
    all_data[col] = all_data[col].apply(lambda x: quality_dict5[x])

## Type 6 of quality based on Observation()
quality_col6 = ['MSZoning']
quality_dict6 = {'C (all)': 1, 'RH': 2, 'RM': 3, 'RL': 4, 'FV': 5} 
for col in quality_col6:
    all_data[col] = all_data[col].apply(lambda x: quality_dict6[x])

## Type 7 of quality (Pave, Grvl, None)
quality_col7 = ['Alley']
quality_dict7 = {'None': 0, 'Grvl': 1, 'Pave': 2} 
for col in quality_col7:
    all_data[col] = all_data[col].apply(lambda x: quality_dict7[x])

## Type 8 of quality (SBrkr, FuseA, ...)
quality_col8 = ['Electrical']
quality_dict8 = {'Mix': 1, 'FuseP': 2, 'FuseF': 3, 'FuseA': 4, 'SBrkr': 5} 
for col in quality_col8:
    all_data[col] = all_data[col].apply(lambda x: quality_dict8[x])

## Type 9 of quality (Typ, Min1, Min2, ...)
quality_col9 = ['Functional']
quality_dict9 = {'Sev': 0, 'Maj2': 1, 'Maj1': 2, 'Mod': 3, 'Min2': 4, 'Min1': 5, 'Typ': 6} 
for col in quality_col9:
    all_data[col] = all_data[col].apply(lambda x: quality_dict9[x])

## Type 10 of quality 
quality_col10 = ['MiscFeature']
quality_dict10 = {'None':0, 'Othr': 1, 'Shed': 2, 'Gar2': 3, 'TenC': 4} 
for col in quality_col10:
    all_data[col] = all_data[col].apply(lambda x: quality_dict10[x])

## Final Type of quality can be encoded by LabelEncoder
le_col = ['PavedDrive', 'CentralAir', 'YrSold', 'MoSold', 'OverallCond']
for col in le_col:
    all_data[col] = preprocessing.LabelEncoder().fit_transform(all_data[col])

In [9]:
## Last Modify problematic rows
all_data.loc[2590, 'GarageYrBlt'] = 2007

In [10]:
#cols = ('LandSlope', 'LotShape', 'Street', 'MSSubClass')

In [11]:
# Check the skew of all numerical features
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness = skewness[abs(skewness) > 0.75]

from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    all_data[feat] = boxcox1p(all_data[feat], lam)

In [12]:
## Transforming the rest of the categorical features into numeric one-hot-encoding format 
all_data = pd.get_dummies(all_data)
print(all_data.shape)

(2917, 229)


In [13]:
train = all_data[:ntrain]
test = all_data[ntrain:]

---
## Modelling

In [14]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

In [15]:
#Validation function
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train.values)
    rmse= np.sqrt(-cross_val_score(model, train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [16]:
## Model Initialization
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)

In [17]:
print("Model Result:")
score = rmsle_cv(lasso)
print("Lasso score: {:.4f} ({:.4f})".format(score.mean(), score.std()))
score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})".format(score.mean(), score.std()))
score = rmsle_cv(KRR)
print("Kernel Ridge score: {:.4f} ({:.4f})".format(score.mean(), score.std()))
score = rmsle_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})".format(score.mean(), score.std()))
score = rmsle_cv(model_lgb)
print("LGBM score: {:.4f} ({:.4f})" .format(score.mean(), score.std()))

Model Result:
Lasso score: 0.1144 (0.0053)
ElasticNet score: 0.1144 (0.0054)
Kernel Ridge score: 0.1165 (0.0049)
Gradient Boosting score: 0.1156 (0.0062)
LGBM score: 0.1151 (0.0058)


In [18]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
   
    # We again fit the data on clones of the original models
    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)
        
        # Train cloned base models then create out-of-fold predictions
        # that are needed to train the cloned meta-model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred
                
        # Now train the cloned  meta-model using the out-of-fold predictions as new feature
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self
   
    #Do the predictions of all base models on the test data and use the averaged predictions as 
    #meta-features for the final prediction which is done by the meta-model
    def predict(self, X):
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_ ])
        return self.meta_model_.predict(meta_features)

In [19]:
stacked_averaged_models = StackingAveragedModels(base_models = (ENet, GBoost, KRR),
                                                 meta_model = lasso)

score = rmsle_cv(stacked_averaged_models)
print("Stacking Averaged models score: {:.4f} ({:.4f})".format(score.mean(), score.std()))

Stacking Averaged models score: 0.1087 (0.0054)


In [20]:
def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

In [21]:
stacked_averaged_models.fit(train.values, y_train)
stacked_train_pred = stacked_averaged_models.predict(train.values)
stacked_pred = np.expm1(stacked_averaged_models.predict(test.values))
print(rmsle(y_train, stacked_train_pred))

0.07556924359712457


In [22]:
model_lgb.fit(train, y_train)
lgb_train_pred = model_lgb.predict(train)
lgb_pred = np.expm1(model_lgb.predict(test.values))
print(rmsle(y_train, lgb_train_pred))

0.07179558782160174


In [23]:
'''RMSE on the entire Train data when averaging'''

print('RMSLE score on train data:')
print(rmsle(y_train,stacked_train_pred*0.9 + lgb_train_pred*0.1 ))

RMSLE score on train data:
0.07443163990612538


In [24]:
ensemble = stacked_pred*0.9 + lgb_pred*0.1
#ensemble = stacked_pred*0.95 + lgb_pred*0.05

In [25]:
sub = pd.DataFrame()
sub['Id'] = test_ID
sub['SalePrice'] = ensemble
sub.to_csv('submission.csv',index=False)