AIM: To implement in a single pipeline the eight transformations defined in Exploratory Data Analysis notebook, before training the ML model on the given data.

## Importing necessary libraries to build the pipeline

In [1]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import FunctionTransformer

#ML libraries
import xgboost
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

#Basic Libraries to have
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import csv as csv


## Loading the data

In [2]:
train_df=pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
test_df=pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

## Custom Functions

In [3]:
def grp_chk(df,grp_dict):
    bool_chk = {}
    #Basement
    bool_chk['Bsmt'] = ((df['BsmtQual'].isnull() & df['BsmtCond'].isnull() & df['BsmtExposure'].isnull() 
                 & df['BsmtFinType1'].isnull() & df['BsmtFinType2'].isnull()) & 
                 (((df['BsmtFinSF1']==0)| df['BsmtFinSF1'].isnull()) & 
                  ((df['BsmtFinSF2']==0) | df['BsmtFinSF2'].isnull()) & 
                  ((df['BsmtUnfSF']==0)| df['BsmtUnfSF'].isnull() ) & 
                  ((df['TotalBsmtSF']==0)) | df['TotalBsmtSF'].isnull() ) )
    
    #Fireplace
    bool_chk['Fireplace'] = ((df['FireplaceQu'].isnull()) & #categorical
                  ((df['Fireplaces']==0)| df['Fireplaces'].isnull())) #numerical
    
    #Pool
    bool_chk['Pool'] = ((df['PoolQC'].isnull()) & #categorical
                  ((df['PoolArea']==0)| df['PoolArea'].isnull())) #numerical
    
    #Garage
    bool_chk['Garage'] = (#categorical
    ((df['GarageType'].isnull()) & (df['GarageFinish'].isnull()) & 
                (df['GarageQual'].isnull()) & (df['GarageCond'].isnull())) & 
               #numerical
                  ((df['GarageCars']==0)| df['GarageCars'].isnull()))
    
    #Masonry Veneer
    bool_chk['Mas_veneer'] = (((df['MasVnrType'].isnull())| df['MasVnrType']=='None') & #categorical
                  ((df['MasVnrArea']==0)| df['MasVnrArea'].isnull())) #numerical
    
    
    for key in grp_dict:
        for feat in grp_dict[key]:
            if df[feat].dtype == 'object':
                df.loc[bool_chk[key],feat]='NA' 
            else:
                df.loc[bool_chk[key],feat]=0
    return df

In [4]:
def preproc_lvl1(dataframe):
    global astype_dict, grp_dict, drop_vars, ord_qc_vars
    
    #Typecasting
    dataframe = dataframe.astype(astype_dict)
    
    #To fill missing values for features in group
    dataframe = grp_chk(dataframe, grp_dict)
    
    #To fill missing values in other features
    dataframe.fillna({'Fence':'NA','MiscFeature':'NA'},inplace=True)
    
    #Dropping columns
    dataframe.drop(drop_vars, axis=1, inplace=True)
    
    #To replace strings with numerical values for quality-related ordinal variables 
    qual_dict = {'NA':0,'Po' : 1, 'Fa' : 2, 'TA' : 3, 'Gd' : 4, 'Ex' : 5}
    cat_dict = dict.fromkeys(ord_qc_vars,qual_dict)
    
    #To replace other ordinal variables
    #Custom cases
    gar_fin = {'None':0, 'NA':0, 'Unf':1, 'RFn':2, 'Fin':3}
    bsmt_exp = {'None':0,'NA':0, 'No':1, 'Mn':2, 'Av':3, 'Gd':4}
    bsmt_fin = {'None':0,'NA':0, 'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6}
    home_func = {'Sal':0, 'Sev':1, 'Maj2':2, 'Maj1':3, 'Mod':4, 'Min2':5, 'Min1':6, 'Typ':7}
    strt = {'Grvl':0, 'Pave':1}
    pvd_drv = {'N':0,'P':1,'Y':2}
    oth_ord_dict = {'GarageFinish':gar_fin, 'BsmtExposure':bsmt_exp,'BsmtFinType1':bsmt_fin, 'BsmtFinType2':bsmt_fin, 'Functional': home_func,'Street':strt,'PavedDrive':pvd_drv}
    cat_dict.update(oth_ord_dict)
    
    #Replacing
    dataframe.replace(cat_dict,inplace=True)
     
    return dataframe




## Custom transformers

In [5]:
class fillNATransformer(TransformerMixin, BaseEstimator):
    def __init__( self,medv,modev):
        #print('\n>>>>>>>>fillNA_init() called.\n')
        self.medv = medv
        self.modev = modev
        # define transformers
        self.siMode = SimpleImputer(strategy='most_frequent')
        self.siMed = SimpleImputer(strategy='median')
        
    def fit(self, X, y=None):
        #print('\n>>>>>>>>fillNA_fit() called.\n')
        self.siMode.fit(X[self.modev])
        self.siMed.fit(X[self.medv])
        return self
        
    def transform(self, X):
        #print('\n>>>>>>>>fillNA_transform() called.\n')
        X[self.modev] = self.siMode.transform(X[self.modev])
        X[self.medv] = self.siMed.transform(X[self.medv])
        if X.isna().sum().sum():
            for col in X.columns:
                chk = X[col].isna().sum()
                if chk:
                    print(col, chk)
                    
        return X
    
    def fit_transform(self, X, y=None):
        self = self.fit(X, y)
        return self.transform(X)
    

In [6]:
# Custom transformers

class encodeTransformer(TransformerMixin, BaseEstimator):
    def __init__( self, catv):
        #print('\n>>>>>>>>encode_init() called.\n')
        self.catv = catv
        # define transformers
        self.ohe = OneHotEncoder(handle_unknown='ignore')#drop='first'
        
    def fit(self, X, y=None):
        #print('\n>>>>>>>>encode_fit() called.\n')
        self.ohe.fit(X[self.catv])
        return self
        
    def transform(self, X):
        #print('\n>>>>>>>>encode_transform() called.\n')
        encoded_arr = self.ohe.transform(X[self.catv]).toarray()
        encoded_col = list(self.ohe.get_feature_names())
        encoded_df = pd.DataFrame(encoded_arr,columns=encoded_col)
        X_1 = X.drop(self.catv, axis=1)
        X_encoded = pd.concat([X_1,encoded_df],axis=1)
        return X_encoded
    
    def fit_transform(self, X, y=None):
        self = self.fit(X, y)
        return self.transform(X)
    

In [7]:
def reset_index(df):
    df = df.reset_index(inplace = False)
    return df
get_reset_index = FunctionTransformer(reset_index, validate=False)

In [8]:

#For pre-processing, independent of train/test data
astype_dict={'MSSubClass': 'object', 'MoSold': 'object', 'YrSold': 'object', 
             'YearRemodAdd': 'object', 'YearBuilt': 'object', 'GarageYrBlt': 'object'}

grp_dict={'Bsmt': ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 
                   'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'], 
          'Fireplace': ['Fireplaces', 'FireplaceQu'], 
          'Garage': ['GarageType', 'GarageFinish', 'GarageCars', 'GarageQual', 'GarageCond', 'GarageYrBlt'], 
          'Pool': ['PoolArea', 'PoolQC'], 
          'Mas_veneer': ['MasVnrType', 'MasVnrArea']}

drop_vars=['Utilities', 'Street', 'GarageCars', 'TotRmsAbvGrd', 'TotalBsmtSF', 'GarageYrBlt']

ord_qc_vars = ['ExterQual','ExterCond','BsmtQual','BsmtCond','HeatingQC','KitchenQual','FireplaceQu',
               'GarageQual','GarageCond','PoolQC']

#For pre-processing, dependent of train/test data

num_vars = ['Id', 'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 
            '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF', 
            'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']

cat_vars = ['ExterQual','ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2','HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'GarageFinish', 'PoolQC', 
            'Functional', 'PavedDrive', 'OverallQual', 'OverallCond', 'BsmtFullBath', 'BsmtHalfBath', 
            'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'Fireplaces', 
            'MSSubClass', 'MoSold', 'YrSold', 'YearRemodAdd', 'YearBuilt', 'MSZoning', 'Alley', 
            'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 
            'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 
            'MasVnrType', 'Foundation', 'Heating', 'CentralAir', 'Electrical', 'GarageType',  
             'SaleType', 'SaleCondition']

encd_vars = ['MSSubClass', 'MSZoning', 'Alley', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 
             'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'YearBuilt', 
             'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 
             'Foundation', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'CentralAir', 
             'Electrical', 'GarageType', 'Fence', 'MiscFeature', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition']



In [12]:
#for tuning parameters
parameters_for_testing = {
    'colsample_bytree':[0.4,0.6,0.8],
    'gamma':[0,0.03,0.1],
    'min_child_weight':[1,3], 
    'learning_rate':[0.01,0.07],
    'max_depth':[3,5],
    'subsample':[0.6,0.95]  
}

                    
xgb_model = xgboost.XGBRegressor(n_jobs=5,seed=27)

gsearch = GridSearchCV(estimator = xgb_model, param_grid = parameters_for_testing, n_jobs=6, verbose=10,scoring='neg_mean_squared_error')

# define model parameters
#model = XGBRegressor(learning_rate=0.07, 
#                       max_depth=5,
#                       min_child_weight= 3,
#                       gamma=0
#                       subsample=0.95,
#                       colsample_bytree= 0.8, 
#                       objective='reg:squarederror')
#                       seed=27,
#                       random_state=42)

# set up pipeline
pl = Pipeline(memory=None,
    steps=[
        ('reset_index', get_reset_index),
        ('fillNa',fillNATransformer(medv=num_vars,modev=cat_vars)),
        ('encode',encodeTransformer(encd_vars)),
        ('model', gsearch) #model
    ], verbose=False)

In [10]:
# Setup cross validation folds
kf = KFold(n_splits=12, random_state=42, shuffle=True)

def cv_rmse(model, X,y):
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kf))
    return (rmse)


In [14]:
preproc_train = preproc_lvl1(train_df)


y_train = np.log(preproc_train['SalePrice'])
#applying log transformation

X_train = preproc_train.drop(['SalePrice'],axis=1)

col = preproc_train.columns

#score = cv_rmse(pl,X_train,y_train)
#print("XGB: {:.4f} ({:.4f})".format(score.mean(), score.std()))

In [15]:
pl.fit(X_train, y_train)
preproc_test = preproc_lvl1(test_df)
y_pred = pl.predict(preproc_test)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   1 tasks      | elapsed:    4.3s
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed:    5.8s
[Parallel(n_jobs=6)]: Done  13 tasks      | elapsed:    7.3s
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    8.9s
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:   10.9s
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   13.1s
[Parallel(n_jobs=6)]: Done  49 tasks      | elapsed:   17.1s
[Parallel(n_jobs=6)]: Done  60 tasks      | elapsed:   20.7s
[Parallel(n_jobs=6)]: Done  73 tasks      | elapsed:   26.6s
[Parallel(n_jobs=6)]: Done  86 tasks      | elapsed:   30.2s
[Parallel(n_jobs=6)]: Done 101 tasks      | elapsed:   34.2s
[Parallel(n_jobs=6)]: Done 116 tasks      | elapsed:   37.4s
[Parallel(n_jobs=6)]: Done 133 tasks      | elapsed:   43.2s
[Parallel(n_jobs=6)]: Done 150 tasks      | elapsed:   50.4s
[Parallel(n_jobs=6)]: Done 169 tasks      | elapsed:   55.9s
[Parallel(

In [16]:
print('best params')
print (gsearch.best_params_)
print('best score')
print (gsearch.best_score_)


best params
{'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.07, 'max_depth': 5, 'min_child_weight': 3, 'subsample': 0.95}
best score
-0.016579834508768154


In [17]:
final_df = pd.DataFrame(columns=['Id','SalePrice'])
final_df['Id'] = test_df["Id"]
final_df['SalePrice'] = np.exp(y_pred)

In [18]:
# import the modules we'll need
from IPython.display import HTML
import base64

# function that takes in a dataframe and creates a text link to  
# download it (will only work for files < 2MB or so)
def create_download_link(df, title = "Download CSV file", filename = "submission2.csv"):  
    csv = df.to_csv(sep=',',header=True, index=False)
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload,title=title,filename=filename)
    return HTML(html)

# create a link to download the dataframe
create_download_link(final_df)
