In [16]:
#### Machine Learning Pipeline
## The aim here is to create a set of functions and meta-functions that work together
## to streamline the process of data cleaning, feature selection and training
## our models. This frees up more time to focus on optimising models and
## feature selection rather than on the nitty-gritty details of coding.



######### importing python modules 

import numpy as np
import pandas as pd 
import pylab as plt 
import seaborn as sns
from sklearn.preprocessing import StandardScaler
%matplotlib inline

import sklearn
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb



def df_importer(name):
    """Read a CSV source into a pandas DataFrame.

    Parameters
    ----------
    name : str or file-like
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, using ``read_csv`` defaults.
    """
    return pd.read_csv(name)
    
def convert_cat_2_num(indat,param):
    """Encode a quality-rating column as ordinal integers, in place.

    The ratings 'Po', 'Fa', 'TA', 'Gd', 'Ex' become 1..5; the literal string
    'NA' and missing values become 0. Any other value is left untouched.

    Parameters
    ----------
    indat : pandas.DataFrame
        Frame holding the column; it is modified in place.
    param : str
        Name of the column to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``indat`` object. If ``param`` is not one of its columns the
        frame is returned unchanged.
    """
    if param not in indat.columns:
        # Best effort: a data set without this column is simply skipped.
        return indat

    codes = {'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
    # Single pass, assigned back to the frame: a chained
    # ``indat[param].fillna(..., inplace=True)`` does not reach the frame
    # under pandas Copy-on-Write, and a fully encoded column gets a numeric
    # dtype instead of staying ``object``.
    indat[param] = indat[param].map(lambda v: 0 if pd.isna(v) else codes.get(v, v))
    return indat

def convert_GarageFinish(indat,param='GarageFinish'):
    """Encode the garage-finish column as ordinal integers, in place.

    'Unf' -> 1, 'RFn' -> 2, 'Fin' -> 3; the literal string 'NA' and missing
    values become 0. Any other value is left untouched.

    Parameters
    ----------
    indat : pandas.DataFrame
        Frame holding the column; it is modified in place.
    param : str, default 'GarageFinish'
        Name of the column to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``indat`` object. If ``param`` is not one of its columns the
        frame is returned unchanged.
    """
    if param not in indat.columns:
        # Best effort: a data set without this column is simply skipped.
        return indat

    codes = {'NA': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}
    # Map once and assign the column back; errors are no longer swallowed by
    # bare ``except`` clauses and the write does not rely on a chained
    # in-place ``fillna``.
    indat[param] = indat[param].map(lambda v: 0 if pd.isna(v) else codes.get(v, v))
    return indat


def convert_GarageType(indat,param='GarageType'):
    """Encode the garage-type column as ordinal integers, in place.

    The labels 'CarPort', 'Detchd', 'Basment', '2Types', 'Attchd', 'BuiltIn'
    become 1..6 in that order (per the original author's note, the order is
    related to median prices). The literal string 'NA' and missing values
    become 0. Any other value is left untouched.

    Parameters
    ----------
    indat : pandas.DataFrame
        Frame holding the column; it is modified in place.
    param : str, default 'GarageType'
        Name of the column to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``indat`` object. If ``param`` is not one of its columns the
        frame is returned unchanged.
    """
    if param not in indat.columns:
        # Best effort: a data set without this column is simply skipped.
        return indat

    garagetypelst = ['CarPort','Detchd','Basment','2Types','Attchd','BuiltIn']
    codes = {label: rank for rank, label in enumerate(garagetypelst, start=1)}
    codes['NA'] = 0
    # Map once and assign the column back instead of chaining an in-place
    # ``fillna`` behind bare ``except`` clauses.
    indat[param] = indat[param].map(lambda v: 0 if pd.isna(v) else codes.get(v, v))
    return indat

def convert_Neighborhood(indat,param='Neighborhood'):
    """Encode the neighbourhood column as ordinal integers, in place.

    Each label in ``neighborlst`` is replaced by its 1-based position in that
    list (per the original author's note, the order is related to median
    prices). The literal string 'NA' and missing values become 0. Any other
    value is left untouched.

    Parameters
    ----------
    indat : pandas.DataFrame
        Frame holding the column; it is modified in place.
    param : str, default 'Neighborhood'
        Name of the column to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``indat`` object. If ``param`` is not one of its columns the
        frame is returned unchanged.
    """
    if param not in indat.columns:
        # Best effort: a data set without this column is simply skipped.
        return indat

    neighborlst = ['MeadowV','IDOTRR','BrDale','OldTown','Edwards','BrkSide','Sawyer','Blueste',
                   'SWISU','NAmes','NPkVill','Mitchel','SawyerW','Gilbert','NWAmes','Blmngtn',
                   'CollgCr','ClearCr','Crawfor','Veenker','Somerst','Timber','StoneBr','NoRidge','NridgHt']
    codes = {label: rank for rank, label in enumerate(neighborlst, start=1)}
    codes['NA'] = 0
    # One dictionary lookup per row replaces 25 full-column scans, and the
    # result is assigned back rather than relying on a chained in-place
    # ``fillna`` behind bare ``except`` clauses.
    indat[param] = indat[param].map(lambda v: 0 if pd.isna(v) else codes.get(v, v))
    return indat

def convert_MSZoning(indat,param='MSZoning'):
    """Encode the zoning column as ordinal integers, in place.

    The labels 'C (all)', 'RM', 'RH', 'RL', 'FV' become 1..5 in that order
    (per the original author's note, the order is related to median prices).
    The literal string 'NA' and missing values become 0. Any other value is
    left untouched.

    Parameters
    ----------
    indat : pandas.DataFrame
        Frame holding the column; it is modified in place.
    param : str, default 'MSZoning'
        Name of the column to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``indat`` object. If ``param`` is not one of its columns the
        frame is returned unchanged.
    """
    if param not in indat.columns:
        # Best effort: a data set without this column is simply skipped.
        return indat

    param_lst = ['C (all)','RM','RH','RL','FV']
    codes = {label: rank for rank, label in enumerate(param_lst, start=1)}
    codes['NA'] = 0
    # Map once and assign the column back instead of chaining an in-place
    # ``fillna`` behind bare ``except`` clauses.
    indat[param] = indat[param].map(lambda v: 0 if pd.isna(v) else codes.get(v, v))
    return indat

def convert_LotShape(indat,param='LotShape'):
    """Encode the lot-shape column as ordinal integers, in place.

    'Reg' -> 1, 'IR1' -> 2, 'IR3' -> 3, 'IR2' -> 4. The literal string 'NA'
    and missing values become 0. Any other value is left untouched.

    Parameters
    ----------
    indat : pandas.DataFrame
        Frame holding the column; it is modified in place.
    param : str, default 'LotShape'
        Name of the column to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``indat`` object. If ``param`` is not one of its columns the
        frame is returned unchanged.
    """
    if param not in indat.columns:
        # Best effort: a data set without this column is simply skipped.
        return indat

    # NOTE(review): 'IR3' ranks below 'IR2' here. The original comment says
    # the order is related to median prices, so this is kept as written;
    # confirm against the data.
    param_lst = ['Reg','IR1','IR3','IR2']
    codes = {label: rank for rank, label in enumerate(param_lst, start=1)}
    codes['NA'] = 0
    # Map once and assign the column back instead of chaining an in-place
    # ``fillna`` behind bare ``except`` clauses.
    indat[param] = indat[param].map(lambda v: 0 if pd.isna(v) else codes.get(v, v))
    return indat

def convert_BsmtExposure(indat,param='BsmtExposure'):
    """Encode the basement-exposure column as ordinal integers, in place.

    'No' -> 1, 'Mn' -> 2, 'Av' -> 3, 'Gd' -> 4 (per the original author's
    note, the order is related to median prices). The literal string 'NA'
    and missing values become 0. Any other value is left untouched.

    Parameters
    ----------
    indat : pandas.DataFrame
        Frame holding the column; it is modified in place.
    param : str, default 'BsmtExposure'
        Name of the column to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``indat`` object. If ``param`` is not one of its columns the
        frame is returned unchanged.
    """
    if param not in indat.columns:
        # Best effort: a data set without this column is simply skipped.
        return indat

    param_lst = ['No','Mn','Av','Gd']
    codes = {label: rank for rank, label in enumerate(param_lst, start=1)}
    codes['NA'] = 0
    # Map once and assign the column back instead of chaining an in-place
    # ``fillna`` behind bare ``except`` clauses.
    indat[param] = indat[param].map(lambda v: 0 if pd.isna(v) else codes.get(v, v))
    return indat


def convert_all_categories(indat):
    """Apply every categorical-to-numeric converter in this notebook to ``indat``.

    Columns handled:

    * via ``convert_cat_2_num``: 'ExterQual', 'ExterCond', 'BsmtQual',
      'BsmtCond', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual',
      'GarageCond'
    * via their dedicated converters (default column names): 'Neighborhood',
      'GarageFinish', 'GarageType', 'MSZoning', 'LotShape', 'BsmtExposure'

    Parameters
    ----------
    indat : pandas.DataFrame
        Frame to convert; it is handed to each converter in turn.

    Returns
    -------
    pandas.DataFrame
        The frame returned by the last converter.
    """
    list_cat_num = ['ExterQual','ExterCond','BsmtQual','BsmtCond','HeatingQC','KitchenQual','FireplaceQu','GarageQual','GarageCond']
    for column in list_cat_num:
        indat = convert_cat_2_num(indat, column)

    # Each dedicated converter runs exactly once; the earlier version called
    # convert_Neighborhood twice, the second call being redundant.
    dedicated_converters = (
        convert_Neighborhood,
        convert_GarageFinish,
        convert_GarageType,
        convert_MSZoning,
        convert_LotShape,
        convert_BsmtExposure,
    )
    for converter in dedicated_converters:
        indat = converter(indat)
    return indat


def load_converted_df(name):
    """Load a CSV and return it with its categorical features numerically encoded.

    ``name`` is forwarded to ``df_importer``; the resulting frame is then
    passed through ``convert_all_categories`` and that result is returned.
    """
    return convert_all_categories(df_importer(name))

def create_spearman_corr_plot(df_in):
    """Draw a horizontal bar chart of each numeric column's Spearman correlation with 'SalePrice'.

    Non-numeric columns are excluded before the correlation is computed, so
    string columns in the frame do not make ``DataFrame.corr`` fail. Object
    columns that contain only numbers are promoted to a numeric dtype first
    so that they are included. ``df_in`` itself is not modified.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame containing a numeric 'SalePrice' column.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.

    Raises
    ------
    KeyError
        If there is no numeric 'SalePrice' column.
    """
    # Note: this switches the global matplotlib style for the whole session.
    plt.style.use('ggplot')

    # infer_objects() returns a converted copy; select_dtypes keeps only the
    # columns a rank correlation can be computed on.
    numeric_columns = df_in.infer_objects().select_dtypes(include=['number', 'bool'])
    spearman_corr = numeric_columns.corr(method='spearman')

    fig, ax = plt.subplots(figsize=(10,17))
    spearman_corr['SalePrice'].plot.barh(ax=ax)
    ax.set_title('Spearman correlation with sale prices')
    fig.tight_layout()
    plt.show()
    



In [19]:
# Load 'train.csv' (a path relative to the current working directory) and
# encode its categorical columns through load_converted_df.
training_set = load_converted_df('train.csv')
# Optional: uncomment to show the Spearman correlation bar chart for the loaded frame.
#create_spearman_corr_plot(training_set)

In [7]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC

