## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import math

from datetime import datetime
from scipy.stats import skew
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
# from mlxtend.regressor import StackingCVRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as stats
import sklearn.linear_model as linear_model
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

sns.set()

## Scoring Metric

In [2]:
def root_mean_squared_log_error(y_valid, y_preds):
    """Calculate root mean squared error of log(y_true) and log(y_pred)"""
    if len(y_preds)!=len(y_valid): return 'error_mismatch'
    y_preds_new = [math.log(x) for x in y_preds]
    y_valid_new = [math.log(x) for x in y_valid]
    return mean_squared_error(y_valid_new, y_preds_new, squared=False)

## Loading data

In [3]:
train = pd.read_csv('data/train.csv')

In [4]:
print(train.shape)
train.head()

(1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
train[train['Fence'] == 'NA']

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice


## Missing values

In [6]:
missing = train.isna().sum()
missing_df = pd.DataFrame(missing[missing > 0].sort_values(ascending=False))
missing_df.columns = ['missing']
missing_df['percent'] = round(missing_df['missing'] / train.shape[0], 3)

missing_df

Unnamed: 0,missing,percent
PoolQC,1453,0.995
MiscFeature,1406,0.963
Alley,1369,0.938
Fence,1179,0.808
FireplaceQu,690,0.473
LotFrontage,259,0.177
GarageYrBlt,81,0.055
GarageType,81,0.055
GarageFinish,81,0.055
GarageQual,81,0.055


In [7]:
high_missing_features = list(missing_df[missing_df['percent'] > 0.4].index)
high_missing_features

['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']

In [8]:
train.drop(high_missing_features, axis=1, inplace=True)

## Outliers

In [9]:
train = train.drop(train[train['Id'] == 1299].index)
train = train.drop(train[train['Id'] == 524].index)
train = train.drop(train[train['Id'] == 332].index)

In [10]:
# train['SalePrice'] = np.log(train['SalePrice'])

train['GrLivArea'] = np.log(train['GrLivArea'])

train['HasBsmt'] = pd.Series(len(train['TotalBsmtSF']), index=train.index)
train['HasBsmt'] = 0 
train.loc[train['TotalBsmtSF']>0, 'HasBsmt'] = 1
train.loc[train['HasBsmt']==1,'TotalBsmtSF'] = np.log(train['TotalBsmtSF'])

In [11]:
features = [x for x in train.columns if x not in ['SalePrice', 'Id']]
X = train[features]
y = train['SalePrice']

## Feature Engineering

In [12]:
X['years_since_update'] = X['YearRemodAdd'] - X['YearBuilt']
X['geometry'] = X['LotArea'] / X['LotFrontage']
X['land_topology'] = X['LandSlope'] + '_' + X['LandContour']
X['TotalSF']=X['TotalBsmtSF'] + X['1stFlrSF'] + X['2ndFlrSF']

X['Total_sqr_footage'] = (X['BsmtFinSF1'] + X['BsmtFinSF2'] +
                                 X['1stFlrSF'] + X['2ndFlrSF'])

X['Total_Bathrooms'] = (X['FullBath'] + (0.5 * X['HalfBath']) +
                               X['BsmtFullBath'] + (0.5 * X['BsmtHalfBath']))

X['Total_porch_sf'] = (X['OpenPorchSF'] + X['3SsnPorch'] +
                              X['EnclosedPorch'] + X['ScreenPorch'] +
                              X['WoodDeckSF'])

In [13]:
X['haspool'] = X['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
X['has2ndfloor'] = X['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
X['hasgarage'] = X['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
X['hasbsmt'] = X['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
X['hasfireplace'] = X['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)

In [14]:
X.shape

(1457, 87)

## Preprocessing

In [15]:
X['Functional'] = X['Functional'].fillna('Typ') 
X['Electrical'] = X['Electrical'].fillna("SBrkr") 
X['KitchenQual'] = X['KitchenQual'].fillna("TA")

In [16]:
X['MSSubClass'] = X['MSSubClass'].apply(str)
X['YrSold'] = X['YrSold'].astype(str)
X['MoSold'] = X['MoSold'].astype(str)

In [17]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3)

numerical_cols = [cname for cname in X_train.columns if 
                X_train[cname].dtype in ['int64', 'float64']]

categorical_cols = [cname for cname in X_train.columns if
                    X_train[cname].nunique() < 13 and 
                    X_train[cname].dtype == "object"]


numerical_transformer = SimpleImputer(strategy='constant')

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
])

## Model Selection

In [35]:
from sklearn.linear_model import LinearRegression

linear_model = LinearRegression()

linear_clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('linear_model', linear_model)
                     ])

linear_clf.fit(X_train, y_train)

linear_clf.fit(X_train, y_train)

linear_preds = linear_clf.predict(X_valid)

linear_rmsle = root_mean_squared_log_error(y_valid, linear_preds)
print(linear_rmsle)

0.15662274583352523


### DecisionTreeRegressor

In [19]:
from sklearn.tree import DecisionTreeRegressor

tree_model = DecisionTreeRegressor()

tree_clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('tree_model', tree_model)
                     ])

tree_clf.fit(X_train, y_train)

tree_clf.fit(X_train, y_train)

tree_preds = tree_clf.predict(X_valid)

tree_rmsle = root_mean_squared_log_error(y_valid, tree_preds)
print(tree_rmsle)

0.20280408579390322


### RandomForestRegressor

In [20]:
from sklearn.ensemble import RandomForestRegressor

random_model = RandomForestRegressor()

random_clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('random_model', random_model)
                     ])

random_clf.fit(X_train, y_train)

random_clf.fit(X_train, y_train)

random_preds = random_clf.predict(X_valid)

random_rmsle = root_mean_squared_log_error(y_valid, random_preds)
random_rmsle

0.1327041980925737

### XGBRegressor

In [21]:
xgb_model = XGBRegressor(n_estimators=1000, learning_rate=0.02, random_state=0)

xgb_clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('xgb_model', xgb_model)
                     ])

xgb_clf.fit(X_train, y_train, xgb_model__verbose=False)

xgb_clf.fit(X_train, y_train)

xgb_preds = xgb_clf.predict(X_valid)

xgb_rmsle = root_mean_squared_log_error(y_valid, xgb_preds)
xgb_rmsle

0.12195740178197341

### XGBRegressor (hyper parameters tuning)

In [22]:
hp_model = XGBRegressor(learning_rate=0.01,n_estimators=3460,
                                     max_depth=3, min_child_weight=0,
                                     gamma=0, subsample=0.7,
                                     colsample_bytree=0.7, nthread=-1,
                                     scale_pos_weight=1, seed=27,
                                     reg_alpha=0.00006)

hp_clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('hp_model', hp_model)
                     ])

hp_clf.fit(X_train, y_train, hp_model__verbose=False)

hp_preds = hp_clf.predict(X_valid)

hp_rmsle = root_mean_squared_log_error(y_valid, hp_preds)
hp_rmsle

0.11416272740786974

### Light GBM

In [23]:
lightgbm_model = LGBMRegressor()
lightgbm_clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('lightgbm_model', lightgbm_model)
                     ])

lightgbm_clf.fit(X_train, y_train, lightgbm_model__verbose=False)

lightgbm_preds = lightgbm_clf.predict(X_valid)
lightgbm_rmsle = root_mean_squared_log_error(y_valid, lightgbm_preds)
lightgbm_rmsle

0.12940040814273202

In [24]:
lightgbm_hp_model = LGBMRegressor(objective='regression', 
                                       num_leaves=4,
                                       learning_rate=0.01, 
                                       n_estimators=5000,
                                       max_bin=200, 
                                       bagging_fraction=0.75,
                                       bagging_freq=5, 
                                       bagging_seed=7,
                                       feature_fraction=0.2,
                                       feature_fraction_seed=7,
                                       verbose=-1,
                                       )
lightgbm_hp_clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('lightgbm_hp_model', lightgbm_hp_model)
                     ])

lightgbm_hp_clf.fit(X_train, y_train, lightgbm_hp_model__verbose=False)

lightgbm_hp_preds = lightgbm_hp_clf.predict(X_valid)
lightgbm_hp_rmsle = root_mean_squared_log_error(y_valid, lightgbm_hp_preds)
lightgbm_hp_rmsle



0.11480390713487151

In [25]:
models_selection = pd.DataFrame({
    'model': ['Linear Regression', 'Decision Tree Regressor', 'Random Forest Regressor', 'XGBRegressor', 'XGBRegressor (after HT)', 'LightGBM', 'LightGBM (after HT)'],
    'RMSLE': [linear_rmsle, tree_rmsle, random_rmsle, xgb_rmsle, hp_rmsle, lightgbm_rmsle, lightgbm_hp_rmsle]
}).sort_values(by='RMSLE')

In [26]:
models_selection

Unnamed: 0,model,RMSLE
4,XGBRegressor (after HT),0.114163
6,LightGBM (after HT),0.114804
3,XGBRegressor,0.121957
5,LightGBM,0.1294
2,Random Forest Regressor,0.132704
1,Decision Tree Regressor,0.156623
0,Linear Regression,0.202804


## Predict

In [27]:
X_test = pd.read_csv('./data/test.csv')

In [28]:
Ids = X_test.Id

In [29]:
X_test.drop(high_missing_features, axis=1, inplace=True)
X_test.drop(['Id'], axis=1, inplace=True)

In [30]:
X_test['GrLivArea'] = np.log(X_test['GrLivArea'])

X_test['HasBsmt'] = pd.Series(len(X_test['TotalBsmtSF']), index=X_test.index)
X_test['HasBsmt'] = 0 
X_test.loc[X_test['TotalBsmtSF']>0, 'HasBsmt'] = 1
X_test.loc[X_test['HasBsmt']==1,'TotalBsmtSF'] = np.log(X_test['TotalBsmtSF'])

In [31]:
X_test['years_since_update'] = X_test['YearRemodAdd'] - X_test['YearBuilt']
X_test['geometry'] = X_test['LotArea'] / X_test['LotFrontage']
X_test['land_topology'] = X_test['LandSlope'] + '_' + X_test['LandContour']
X_test['TotalSF']=X['TotalBsmtSF'] + X_test['1stFlrSF'] + X_test['2ndFlrSF']

X_test['Total_sqr_footage'] = (X_test['BsmtFinSF1'] + X_test['BsmtFinSF2'] +
                                 X['1stFlrSF'] + X['2ndFlrSF'])

X_test['Total_Bathrooms'] = (X_test['FullBath'] + (0.5 * X_test['HalfBath']) +
                               X_test['BsmtFullBath'] + (0.5 * X_test['BsmtHalfBath']))

X_test['Total_porch_sf'] = (X_test['OpenPorchSF'] + X_test['3SsnPorch'] +
                              X_test['EnclosedPorch'] + X_test['ScreenPorch'] +
                              X_test['WoodDeckSF'])
X_test['haspool'] = X_test['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
X_test['has2ndfloor'] = X_test['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
X_test['hasgarage'] = X_test['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
X_test['hasbsmt'] = X_test['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
X_test['hasfireplace'] = X_test['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)

In [34]:
hp_clf.fit(X, y)
preds = hp_clf.predict(X_test)
output = pd.DataFrame({'Id': Ids,
                       'SalePrice': preds})
output.to_csv('submission.csv', index=False)