In [1]:
#### Machine Learning Pipeline
## Aim here is to create a list of function, meta functions that can work together 
## to streamline the process of data cleaning, feature selection and training 
## our models. This will help into creating more results to focus more time on 
## optimising models and feature selection than the tiny gritty details of coding



######### importing python modules 

import numpy as np
import pandas as pd 
import pylab as plt 
import seaborn as sns
from sklearn.preprocessing import StandardScaler
%matplotlib inline

import sklearn
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb


def df_importer(name):
    # import csv from pandas
    dat = pd.read_csv(name)
    return dat
    
def convert_cat_2_num(indat,param):
    #convert different categories into a numerical order than can be highly correlated
    param_lst = ['Po','Fa','TA','Gd','Ex']
    for i in range (len(param_lst)):
        try:
            indat.loc[indat[param] == param_lst[i], param] = i+1
        except:
            dum = 1
    try:
        indat.loc[indat[param] == 'NA', param] = 0
    except:
        dum = 1  
    try:
        indat[param].fillna(0,inplace=True)
    except:
        dum = 1
    return indat

def convert_GarageFinish(indat,param='GarageFinish'):
    #Convert column GarageFinish to a numerically sensible order
    try:
        indat.loc[indat[param] == 'Fin', param] = 3
    except:
        dum = 1
    try:
        indat.loc[indat[param] == 'RFn', param] = 2
    except:
        dum = 1
    try:
        indat.loc[indat[param] == 'Unf', param] = 1
    except:
        dum = 1
    try:
        indat.loc[indat[param] == 'NA', param] = 0
    except:
        dum = 1        
    try:
        indat[param].fillna(0,inplace=True)
    except:
        dum = 1
    return indat


def convert_GarageType(indat,param='GarageType'):
    #Convert column GarageType to an order related to median prices
    garagetypelst = ['CarPort','Detchd','Basment','2Types','Attchd','BuiltIn']
    for i in range (len(garagetypelst)):
        try:
            indat.loc[indat[param] == garagetypelst[i], param] = i+1
        except:
            dum = 1
    try:
        indat.loc[indat[param] == 'NA', param] = 0
    except:
        dum = 1        
    try:
        indat[param].fillna(0,inplace=True)
    except:
        dum = 1
    return indat

def convert_Neighborhood(indat,param='Neighborhood'):
    #Convert column Neighborhood to a numerically to an order related to median prices
    neighborlst = ['MeadowV','IDOTRR','BrDale','OldTown','Edwards','BrkSide','Sawyer','Blueste',
                   'SWISU','NAmes','NPkVill','Mitchel','SawyerW','Gilbert','NWAmes','Blmngtn',
                    'CollgCr','ClearCr','Crawfor','Veenker','Somerst','Timber','StoneBr','NoRidge','NridgHt']
    for i in range (len(neighborlst)):
        try:
            indat.loc[indat[param] == neighborlst[i], param] = i+1
        except:
            dum = 1
    try:
        indat.loc[indat[param] == 'NA', param] = 0
    except:
        dum = 1        
    try:
        indat[param].fillna(0,inplace=True)
    except:
        dum = 1

    return indat

def convert_MSZoning(indat,param='MSZoning'):
    #Convert column MSZoning to an order related to median prices
    param_lst = ['C (all)','RM','RH','RL','FV']
    for i in range (len(param_lst)):
        try:
            indat.loc[indat[param] == param_lst[i], param] = i+1
        except:
            dum = 1
    try:
        indat.loc[indat[param] == 'NA', param] = 0
    except:
        dum = 1  
    try:
        indat[param].fillna(0,inplace=True)
    except:
        dum = 1
    return indat

def convert_LotShape(indat,param='LotShape'):
    #Convert column LotShape to an order related to median prices
    param_lst = ['Reg','IR1','IR3','IR2']
    for i in range (len(param_lst)):
        try:
            indat.loc[indat[param] == param_lst[i], param] = i+1
        except:
            dum = 1
    try:
        indat.loc[indat[param] == 'NA', param] = 0
    except:
        dum = 1  
    try:
        indat[param].fillna(0,inplace=True)
    except:
        dum = 1
    return indat

def convert_BsmtExposure(indat,param='BsmtExposure'):
    #Convert column BsmtExposure to an order related to median prices
    param_lst = ['No','Mn','Av','Gd']
    for i in range (len(param_lst)):
        try:
            indat.loc[indat[param] == param_lst[i], param] = i+1
        except:
            dum = 1
    try:
        indat.loc[indat[param] == 'NA', param] = 0
    except:
        dum = 1  
    try:
        indat[param].fillna(0,inplace=True)
    except:
        dum = 1
    return indat


def convert_all_categories(indat):
    ## meta converter. if used, the following columns of any dataframe will be converted to numerical column:
    # 'ExterQual','ExterCond','BsmtQual','BsmtCond','HeatingQC','KitchenQual','FireplaceQu','GarageQual','GarageCond'
    # 'Neighborhood', 'GarageFinish', 'GarageType', 'Neighborhood', 'MSZoning', 'LotShape', 'BsmtExposure'
    list_cat_num = ['ExterQual','ExterCond','BsmtQual','BsmtCond','HeatingQC','KitchenQual','FireplaceQu','GarageQual','GarageCond']
    
    for i in range(len(list_cat_num)):
        convert_cat_2_num(indat,list_cat_num[i])

    indat = convert_Neighborhood(indat)
    indat = convert_GarageFinish(indat)
    indat = convert_GarageType(indat)
    indat = convert_Neighborhood(indat)
    indat = convert_MSZoning(indat)
    indat = convert_LotShape(indat)
    indat = convert_BsmtExposure(indat)
    return indat


def load_converted_df(name):
    # load dataframe and convert categorical features into meaningful number.
    df = df_importer(name)
    df1 = convert_all_categories(df)
    return df1

def create_spearman_corr_plot(df_in):
    # creates the spearman correlation plot for all numerical parameters
    plt.style.use('ggplot')
    spearman_corr = df_in.corr(method='spearman')
    fig = plt.figure(figsize=(10,17))
    plt.title('Spearman correlation with sale prices')
    spearman_corr['SalePrice'].plot.barh()
    plt.tight_layout()
    plt.show()

In [2]:
def rmsle_cv(model,df_in,x_arr,y_arr):
    n_folds = 5
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(df_in.values)
    rmse= np.sqrt(-cross_val_score(model, x_arr, y_arr, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

def print_res(selec_mod,mod,df_in,x_arr,y_arr):
    score = rmsle_cv(mod,df_in,x_arr,y_arr)
    print("\n "+selec_mod+" score: {:.4f}% ({:.4f})\n".format(score.mean()*100, score.std()*100))
    return

def ML_models(selec_mod,df_in,x_arr,y_arr,x_tes):
    # selec_mod: select model
    # df_in - input dataframe - train DF
    # x_arr = X_train
    # y_arr = Y_train
    # x_tes = X_test

    if selec_mod == 'lasso':
        model_lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
        print_res(selec_mod,model_lasso,df_in,x_arr,y_arr)
        model_lasso.fit(x_arr,y_arr)
        prediction = model_lasso.predict(x_arr)
        prediction1 = model_lasso.predict(x_tes)
        print ((1-model_lasso.score(x_arr,y_arr))*100)
        return prediction1,prediction,model_lasso
    
    if selec_mod == 'Enet':
        model_ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
        print_res(selec_mod,model_ENet,df_in,x_arr,y_arr)
        model_ENet.fit(x_arr,y_arr)
        prediction = model_ENet.predict(x_arr)
        prediction1 = model_ENet.predict(x_tes)
        print ((1-model_ENet.score(x_arr,y_arr))*100)
        return prediction1,prediction,model_ENet
    
    if selec_mod == 'KRR':
        model_KRR = make_pipeline(RobustScaler(), KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5))
        print_res(selec_mod,model_KRR,df_in,x_arr,y_arr)
        model_KRR.fit(x_arr,y_arr)
        prediction = model_KRR.predict(x_arr)
        prediction1 = model_KRR.predict(x_tes)
        print ((1-model_KRR.score(x_arr,y_arr))*100)
        return prediction1,prediction,model_KRR
    
    if selec_mod == 'GBoost':
        model_GBoost = make_pipeline(RobustScaler(), GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5))

        print_res(selec_mod,model_GBoost,df_in,x_arr,y_arr)
        model_GBoost.fit(x_arr,y_arr)
        prediction = model_GBoost.predict(x_arr)
        prediction1 = model_GBoost.predict(x_tes)
        print ((1-model_GBoost.score(x_arr,y_arr))*100)
        return prediction1,prediction,model_GBoost
    
    if selec_mod == 'xbg':
        model_xgb = make_pipeline(RobustScaler(), xgb.XGBRegressor(colsample_bytree=0.2, gamma=0.0, 
                             learning_rate=0.01, max_depth=6, 
                             min_child_weight=1.5, n_estimators=7200,
                             reg_alpha=0.9, reg_lambda=0.6,
                             subsample=0.2, silent=42,
                             random_state =7, nthread = -1))

        
        print_res(selec_mod,model_xgb,df_in,x_arr,y_arr)
        model_xgb.fit(x_arr,y_arr)
        prediction = model_xgb.predict(x_arr)
        prediction1 = model_xgb.predict(x_tes)
        print ((1-model_xgb.score(x_arr,y_arr))*100)
        return prediction1,prediction,model_xgb
    
    if selec_mod == 'lgb':
        model_lgb = make_pipeline(RobustScaler(),lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11))
        
        print_res(selec_mod,model_lgb,df_in,x_arr,y_arr)
        model_lgb.fit(x_arr,y_arr)
        prediction = model_lgb.predict(x_arr)
        prediction1 = model_lgb.predict(x_tes)
        print ((1-model_lgb.score(x_arr,y_arr))*100)
        return prediction1,prediction,model_lgb

    #if selec_mod == 'randomforrest':
    #    model_randomfor = make_pipeline(RobustScaler(), RandomForestRegressor(n_estimators=100,criterion='mse'))

    #    print_res(selec_mod,model_randomfor,df_in,x_arr,y_arr)
    #    model_randomfor1 = model_randomfor.fit(x_arr,y_arr)
    #    prediction = model_randomfor.predict(x_arr)
    #    prediction1 = model_randomfor.predict(x_tes)
    #    print ((1-model_randomfor.score(x_arr,y_arr))*100)  
    #    return prediction1,prediction,model_randomfor1


    if selec_mod == 'randomforrest':
        model_randomfor =  RandomForestRegressor(n_estimators=1000,criterion='mse')

        print_res(selec_mod,model_randomfor,df_in,x_arr,y_arr)
        model_randomfor1 = model_randomfor.fit(x_arr,y_arr)
        prediction = model_randomfor.predict(x_arr)
        prediction1 = model_randomfor.predict(x_tes)
        print ((1-model_randomfor.score(x_arr,y_arr))*100)  
        return prediction1,prediction,model_randomfor1    
    

In [5]:
def label_encoding(df_in):
    #function to use label encoder to encode string columns - returns numerical columns 
    dfin_str = df_in.select_dtypes(include=['object'])

    for col in dfin_str:
        lbl = LabelEncoder() 
        lbl.fit(list(df_in[col].values)) 
        df_in[col] = lbl.transform(list(df_in[col].values))
        #pd.get_dummies(df_in,prefix=[col], drop_first=True)
        df_in = pd.concat([df_in,pd.get_dummies(df_in[col], prefix=col,drop_first=True)],axis=1)
        df_in.drop([col],axis=1, inplace=True)
    return df_in


feature_drop = ['Id','LotFrontage','Alley','MasVnrType','MasVnrArea','BsmtQual','BsmtCond','BsmtExposure',
                'BsmtFinType1','BsmtFinType2','FireplaceQu','GarageType','GarageYrBlt','GarageFinish','GarageQual',
                'GarageCond','PoolQC','Fence','MiscFeature']


### next part does not require coding - just go to the feature_importance csv file ... 
### decide a threshold on which to select columns - in my case I selected everything everything
### above 0.1 and created a list from this threshold. This feature list is shown below

feature_lst = ['SaleCondition_4','FullBath','BedroomAbvGr','YrSold','ExterQual','HeatingQC','BsmtFullBath',
               'EnclosedPorch','MSSubClass','ExterCond','Fireplaces','WoodDeckSF','KitchenQual','TotRmsAbvGrd',
               'MoSold','2ndFlrSF','OpenPorchSF','MSZoning','BsmtUnfSF','YearRemodAdd','CentralAir_1',
               'OverallCond','YearBuilt','LotArea','GarageCars','BsmtFinSF1','1stFlrSF','GarageArea',
               'TotalBsmtSF','Neighborhood','GrLivArea','OverallQual','SalePrice']
    
    
#### using the same old code to load the merged dataset 

train_test = load_converted_df('train_test_merge.csv')
train_test_set = train_test.drop(feature_drop,axis=1)
train_test_set = label_encoding(train_test_set)
train_test_set = train_test_set.fillna(0)


#### but this line is added to only select the columns that met the threshold we selected
train_test_set = train_test_set[feature_lst]

train = train_test_set.iloc[:1460,:]
test = train_test_set.iloc[1460:,:]

train.to_csv('new_train1.csv')
test.to_csv('new_test1.csv')

train.drop(train[(train['OverallQual']<5) & (train['SalePrice']>200000)].index, inplace=True)
train.drop(train[(train['GrLivArea']>4000) & (train['SalePrice']<300000)].index, inplace=True)
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)


Y_train = np.log(train['SalePrice'].values)
train.drop(['SalePrice'],axis=1, inplace=True)
X_train = train.values

X_test = test.iloc[:,:-1].values

In [9]:
#simple root mean square
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# they got the best alpha at that value
best_alpha = 0.00099

#### simple lasso regression based on that alpha value
regr = Lasso(alpha=best_alpha, max_iter=50000)
regr.fit(X_train,Y_train)
y_predl = regr.predict(X_train)
y_test = Y_train
print("Lasso score on training set: ", rmse(y_test, y_predl))

y_pred_lasso = regr.predict(X_test)

('Lasso score on training set: ', 0.11713443231885926)


In [8]:
regr = xgb.XGBRegressor(
                 colsample_bytree=0.2,
                 gamma=0.0,
                 learning_rate=0.01,
                 max_depth=10,
                 min_child_weight=1.5,
                 n_estimators=7200,                                                                  
                 reg_alpha=0.9,
                 reg_lambda=0.6,
                 subsample=0.2,
                 seed=10,
                 silent=1)

regr.fit(X_train,Y_train)

# Run prediction on training set to get a rough idea of how well it does.
y_predx = regr.predict(X_train)
y_test = Y_train
print("XGBoost score on training set: ", rmse(y_test, y_predx))

# Run prediction on the Kaggle test set.
y_pred_xgb = regr.predict(X_test)

('XGBoost score on training set: ', 0.08845751339590811)


In [10]:
model_name = 'GBoost'
out1G, out2G, out3G = ML_models(model_name,train,X_train,Y_train,X_test)

new_array = np.zeros((len(X_test),2),int)
testid = pd.read_csv('test.csv')
new_array[:,0] = testid['Id'].values
new_array[:,1] = np.exp(out1G)
np.savetxt(model_name+'4.csv',new_array,delimiter=',',header='Id,SalePrice',comments='',fmt='%d')


 GBoost score: 12.2182% (0.7961)

1.3905718849816062


In [11]:
model_name = 'Enet'
out1E, out2E, out3E = ML_models(model_name,train,X_train,Y_train,X_test)

new_array = np.zeros((len(X_test),2),int)
testid = pd.read_csv('test.csv')
new_array[:,0] = testid['Id'].values
new_array[:,1] = np.exp(out1E)
np.savetxt(model_name+'4.csv',new_array,delimiter=',',header='Id,SalePrice',comments='',fmt='%d')


 Enet score: 12.0840% (0.5001)

8.56828798413508


In [12]:
model_name = 'KRR'
out1K, out2K, out3K = ML_models(model_name,train,X_train,Y_train,X_test)

new_array = np.zeros((len(X_test),2),int)
testid = pd.read_csv('test.csv')
new_array[:,0] = testid['Id'].values
new_array[:,1] = np.exp(out1K)
np.savetxt(model_name+'4.csv',new_array,delimiter=',',header='Id,SalePrice',comments='',fmt='%d')


 KRR score: 12.5016% (1.1074)

5.839758321346178


In [13]:
################# Adding all the models we ran together to get an average


y_pred = (y_pred_xgb + (out1G) + y_pred_lasso  + out1E + out1K) / 5.
y_pred = np.exp(y_pred)

testid = pd.read_csv('test.csv')
pred_df = pd.DataFrame(y_pred, index=testid["Id"], columns=["SalePrice"])
pred_df.to_csv('gboost_xgb_lasso_Enet_KRR1.csv', header=True, index_label='Id')