In [4]:
#### Machine Learning Pipeline
## Aim here is to create a list of function, meta functions that can work together 
## to streamline the process of data cleaning, feature selection and training 
## our models. This will help into creating more results to focus more time on 
## optimising models and feature selection than the tiny gritty details of coding



######### importing python modules 

import numpy as np
import pandas as pd 
import pylab as plt 
import seaborn as sns
from sklearn.preprocessing import StandardScaler
%matplotlib inline

import sklearn
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb


def df_importer(name):
    # import csv from pandas
    dat = pd.read_csv(name)
    return dat
    
def convert_cat_2_num(indat,param):
    #convert different categories into a numerical order than can be highly correlated
    param_lst = ['Po','Fa','TA','Gd','Ex']
    for i in range (len(param_lst)):
        try:
            indat.loc[indat[param] == param_lst[i], param] = i+1
        except:
            dum = 1
    try:
        indat.loc[indat[param] == 'NA', param] = 0
    except:
        dum = 1  
    try:
        indat[param].fillna(0,inplace=True)
    except:
        dum = 1
    return indat

def convert_GarageFinish(indat,param='GarageFinish'):
    #Convert column GarageFinish to a numerically sensible order
    try:
        indat.loc[indat[param] == 'Fin', param] = 3
    except:
        dum = 1
    try:
        indat.loc[indat[param] == 'RFn', param] = 2
    except:
        dum = 1
    try:
        indat.loc[indat[param] == 'Unf', param] = 1
    except:
        dum = 1
    try:
        indat.loc[indat[param] == 'NA', param] = 0
    except:
        dum = 1        
    try:
        indat[param].fillna(0,inplace=True)
    except:
        dum = 1
    return indat


def convert_GarageType(indat,param='GarageType'):
    #Convert column GarageType to an order related to median prices
    garagetypelst = ['CarPort','Detchd','Basment','2Types','Attchd','BuiltIn']
    for i in range (len(garagetypelst)):
        try:
            indat.loc[indat[param] == garagetypelst[i], param] = i+1
        except:
            dum = 1
    try:
        indat.loc[indat[param] == 'NA', param] = 0
    except:
        dum = 1        
    try:
        indat[param].fillna(0,inplace=True)
    except:
        dum = 1
    return indat

def convert_Neighborhood(indat,param='Neighborhood'):
    #Convert column Neighborhood to a numerically to an order related to median prices
    neighborlst = ['MeadowV','IDOTRR','BrDale','OldTown','Edwards','BrkSide','Sawyer','Blueste',
                   'SWISU','NAmes','NPkVill','Mitchel','SawyerW','Gilbert','NWAmes','Blmngtn',
                    'CollgCr','ClearCr','Crawfor','Veenker','Somerst','Timber','StoneBr','NoRidge','NridgHt']
    for i in range (len(neighborlst)):
        try:
            indat.loc[indat[param] == neighborlst[i], param] = i+1
        except:
            dum = 1
    try:
        indat.loc[indat[param] == 'NA', param] = 0
    except:
        dum = 1        
    try:
        indat[param].fillna(0,inplace=True)
    except:
        dum = 1

    return indat

def convert_MSZoning(indat,param='MSZoning'):
    #Convert column MSZoning to an order related to median prices
    param_lst = ['C (all)','RM','RH','RL','FV']
    for i in range (len(param_lst)):
        try:
            indat.loc[indat[param] == param_lst[i], param] = i+1
        except:
            dum = 1
    try:
        indat.loc[indat[param] == 'NA', param] = 0
    except:
        dum = 1  
    try:
        indat[param].fillna(0,inplace=True)
    except:
        dum = 1
    return indat

def convert_LotShape(indat,param='LotShape'):
    #Convert column LotShape to an order related to median prices
    param_lst = ['Reg','IR1','IR3','IR2']
    for i in range (len(param_lst)):
        try:
            indat.loc[indat[param] == param_lst[i], param] = i+1
        except:
            dum = 1
    try:
        indat.loc[indat[param] == 'NA', param] = 0
    except:
        dum = 1  
    try:
        indat[param].fillna(0,inplace=True)
    except:
        dum = 1
    return indat

def convert_BsmtExposure(indat,param='BsmtExposure'):
    #Convert column BsmtExposure to an order related to median prices
    param_lst = ['No','Mn','Av','Gd']
    for i in range (len(param_lst)):
        try:
            indat.loc[indat[param] == param_lst[i], param] = i+1
        except:
            dum = 1
    try:
        indat.loc[indat[param] == 'NA', param] = 0
    except:
        dum = 1  
    try:
        indat[param].fillna(0,inplace=True)
    except:
        dum = 1
    return indat


def convert_all_categories(indat):
    ## meta converter. if used, the following columns of any dataframe will be converted to numerical column:
    # 'ExterQual','ExterCond','BsmtQual','BsmtCond','HeatingQC','KitchenQual','FireplaceQu','GarageQual','GarageCond'
    # 'Neighborhood', 'GarageFinish', 'GarageType', 'Neighborhood', 'MSZoning', 'LotShape', 'BsmtExposure'
    list_cat_num = ['ExterQual','ExterCond','BsmtQual','BsmtCond','HeatingQC','KitchenQual','FireplaceQu','GarageQual','GarageCond']
    
    for i in range(len(list_cat_num)):
        convert_cat_2_num(indat,list_cat_num[i])

    indat = convert_Neighborhood(indat)
    indat = convert_GarageFinish(indat)
    indat = convert_GarageType(indat)
    indat = convert_Neighborhood(indat)
    indat = convert_MSZoning(indat)
    indat = convert_LotShape(indat)
    indat = convert_BsmtExposure(indat)
    return indat


def load_converted_df(name):
    # load dataframe and convert categorical features into meaningful number.
    df = df_importer(name)
    df1 = convert_all_categories(df)
    return df1

def create_spearman_corr_plot(df_in):
    # creates the spearman correlation plot for all numerical parameters
    plt.style.use('ggplot')
    spearman_corr = df_in.corr(method='spearman')
    fig = plt.figure(figsize=(10,17))
    plt.title('Spearman correlation with sale prices')
    spearman_corr['SalePrice'].plot.barh()
    plt.tight_layout()
    plt.show()
    

In [5]:
def rmsle_cv(model,df_in,x_arr,y_arr):
    n_folds = 5
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(df_in.values)
    rmse= np.sqrt(-cross_val_score(model, x_arr, y_arr, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

def print_res(selec_mod,mod,df_in,x_arr,y_arr):
    score = rmsle_cv(mod,df_in,x_arr,y_arr)
    print("\n "+selec_mod+" score: {:.4f}% ({:.4f})\n".format(score.mean()*100, score.std()*100))
    return

def ML_models(selec_mod,df_in,x_arr,y_arr,x_tes):
    # selec_mod: select model
    # df_in - input dataframe - train DF
    # x_arr = X_train
    # y_arr = Y_train
    # x_tes = X_test

    if selec_mod == 'lasso':
        model_lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
        print_res(selec_mod,model_lasso,df_in,x_arr,y_arr)
        model_lasso.fit(x_arr,y_arr)
        prediction = model_lasso.predict(x_arr)
        print (1-model_lasso.score(x_arr,y_arr))*100
        prediction1 = model_lasso.predict(x_tes)
        return prediction1,model_lasso
    
    if selec_mod == 'Enet':
        model_ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
        print_res(selec_mod,model_ENet,df_in,x_arr,y_arr)
        model_ENet.fit(x_arr,y_arr)
        prediction = model_ENet.predict(x_arr)
        print (1-model_ENet.score(x_arr,y_arr))*100
        prediction1 = model_ENet.predict(x_tes)
        return prediction1,model_ENet
    
    if selec_mod == 'KRR':
        model_KRR = make_pipeline(RobustScaler(), KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5))
        print_res(selec_mod,model_KRR,df_in,x_arr,y_arr)
        model_KRR.fit(x_arr,y_arr)
        prediction = model_KRR.predict(x_arr)
        print (1-model_KRR.score(x_arr,y_arr))*100
        prediction1 = model_KRR.predict(x_tes)
        return prediction1,model_KRR
    
    if selec_mod == 'GBoost':
        model_GBoost = make_pipeline(RobustScaler(), GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5))

        print_res(selec_mod,model_GBoost,df_in,x_arr,y_arr)
        model_GBoost.fit(x_arr,y_arr)
        prediction = model_KRR.predict(x_arr)
        print (1-model_KRR.score(x_arr,y_arr))*100
        prediction1 = model_KRR.predict(x_tes)
        return prediction1,model_GBoost
    
    if selec_mod == 'xbg':
        model_xgb = make_pipeline(RobustScaler(), xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1))
        
        print_res(selec_mod,model_xgb,df_in,x_arr,y_arr)
        model_xgb.fit(x_arr,y_arr)
        prediction = model_xgb.predict(x_arr)
        print (1-model_xgb.score(x_arr,y_arr))*100
        prediction1 = model_xgb.predict(x_tes)
        return prediction1,model_xgb
    
    if selec_mod == 'lgb':
        model_lgb = make_pipeline(RobustScaler(),lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11))
        
        print_res(selec_mod,model_lgb,df_in,x_arr,y_arr)
        model_lgb.fit(x_arr,y_arr)
        prediction = model_lgb.predict(x_arr)
        print (1-model_lgb.score(x_arr,y_arr))*100
        prediction1 = model_lgb.predict(x_tes)
        return prediction1,model_lgb

    if selec_mod == 'randomforrest':
        model_randomfor = make_pipeline(RobustScaler(), RandomForestRegressor(n_estimators=45,criterion='mse'))

        print_res(selec_mod,model_randomfor,df_in,x_arr,y_arr)
        model_randomfor.fit(x_arr,y_arr)
        prediction = model_randomfor.predict(x_arr)
        print (1-model_randomfor.score(x_arr,y_arr))*100  
        prediction1 = model_randomfor.predict(x_tes)
        return prediction1,model_randomfor



In [6]:


feature_lst = ['HeatingQC','Foundation','Fireplaces','TotRmsAbvGrd','YearRemodAdd','1stFlrSF',
            'GarageType','TotalBsmtSF','GarageFinish','FullBath','GarageArea','YearBuilt',
            'KitchenQual','BsmtQual','ExterQual','GarageCars','GrLivArea','Neighborhood','OverallQual','SalePrice']

feature_drop = ['Id','Alley','FireplaceQu','PoolQC','Fence','MiscFeature']

def feature_selection(df_in,feature_lst,feature_drop):
    #function to select and drop features from a dataframe. 
    df_in = df_in.drop(feature_drop,axis=1)
    df_in = df_in[feature_lst]
    return df_in

def label_encoding(df_in):
    #function to use label encoder to encode string columns - returns numerical columns 
    dfin_str = df_in.select_dtypes(include=['object'])

    for col in dfin_str:
        lbl = LabelEncoder() 
        lbl.fit(list(df_in[col].values)) 
        df_in[col] = lbl.transform(list(df_in[col].values))
    return df_in


### Loading csv file and converting specific categorical data into meaningful name
train = load_converted_df('train.csv')
test = load_converted_df('test.csv')

#encoding remaining categorical data with one-hot-encoder
training = label_encoding(train)
#feature selection of the training set
training_set = feature_selection(training,feature_lst,feature_drop)

testing = label_encoding(test)
testing_set = feature_selection(testing,feature_lst[:-1],feature_drop)
testing_set = testing_set.fillna(0)

#dividing training data into independent variable (X) and dependent variable (Y)
X_train = training_set.iloc[:,:-1].values
Y_train = np.log(training_set.iloc[:,-1].values)

X_test = testing_set.iloc[:,:].values


In [7]:
#calling an ML model to fit to the data

ML_models('lasso',training_set,X_train,Y_train,X_test)


 lasso score: 15.5279% (1.8523)

14.1448054635


(array([ 11.67488275,  11.85699186,  12.0378066 , ...,  11.9281727 ,
         11.61574068,  12.2940295 ]), Pipeline(memory=None,
      steps=[('robustscaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
        with_scaling=True)), ('lasso', Lasso(alpha=0.0005, copy_X=True, fit_intercept=True, max_iter=1000,
    normalize=False, positive=False, precompute=False, random_state=1,
    selection='cyclic', tol=0.0001, warm_start=False))]))