In [1]:
import pandas as pd
import numpy as np
import opendatasets
import os
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler, FunctionTransformer, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import root_mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV, train_test_split
from custom_transformers import ReplaceNoFeatureTransformer
import joblib

In [2]:
# Load both competition CSVs in one pass; `train` is later split into features and target.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

In [3]:
# Features are every column except the target; the target is the SalePrice column itself.
x, y = train.drop(columns='SalePrice'), train['SalePrice']

In [4]:
# Single seed passed as `random_state` to the PCA step and the gradient-boosting
# regressors in this notebook, so repeated runs give the same results.
RANDOM_STATE = 42

In [5]:
def pipe_func(cat_cols, num_cols, cat_fill_value='No Feature'):
    """
    Assemble the unfitted preprocessing -> PCA -> regression pipeline.

    Parameters
    ----------
    cat_cols : list
        Columns routed to the categorical branch of the ColumnTransformer.
    num_cols : list
        Columns routed to the numerical branch of the ColumnTransformer.
    cat_fill_value : str, default 'No Feature'
        Constant used to impute missing categorical values; the same value is
        forwarded to ReplaceNoFeatureTransformer.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with the steps 'preprocessor', 'dim_reduction' and 'regressor'.
        The 'scaler' and 'transf' steps of the numerical branch start out as
        'passthrough' placeholders so they can be swapped via set_params or a
        parameter grid.
    """
    # Categorical branch: constant imputation followed by the project-specific
    # transformer (its exact output is defined in custom_transformers).
    categorical_branch = Pipeline(steps=[
        ('cat_imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=cat_fill_value)),
        ('cat_indicator', ReplaceNoFeatureTransformer(fill_value=cat_fill_value)),
    ])

    # Numerical branch: median imputation plus two tunable placeholder steps.
    numerical_branch = Pipeline(steps=[
        ('numerical_imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('scaler', 'passthrough'),
        ('transf', 'passthrough'),
    ])

    # sparse_threshold=0 makes the ColumnTransformer always return a dense array.
    column_stage = ColumnTransformer(
        transformers=[
            ('cat_preprocessor', categorical_branch, cat_cols),
            ('num_preprocessor', numerical_branch, num_cols),
        ],
        sparse_threshold=0,
    )

    # The regressor is trained on log1p(target); predictions are mapped back with expm1.
    model_stage = TransformedTargetRegressor(
        # best model and hyperparameters found before
        regressor=GradientBoostingRegressor(max_depth=5, n_estimators=300, random_state=RANDOM_STATE),
        transformer=FunctionTransformer(func=np.log1p, inverse_func=np.expm1),
    )

    return Pipeline(steps=[
        ('preprocessor', column_stage),
        # best n_components for pca found before
        ('dim_reduction', PCA(n_components=50, random_state=RANDOM_STATE)),
        ('regressor', model_stage),
    ])

# Metric used to build the model-selection scorer
def rmse_of_log(y_true, y_pred):
    """
    Return the root mean squared error measured on the log1p scale.

    Both `y_true` and `y_pred` are passed through np.log1p, and the two
    transformed arrays are then compared with root_mean_squared_error.
    """
    log_true = np.log1p(y_true)
    log_pred = np.log1p(y_pred)
    return root_mean_squared_error(log_true, log_pred)

In [6]:
# %%capture
# Split the feature columns by dtype so each group is sent to its own preprocessing branch.
cat_cols = x.select_dtypes('object').columns.tolist()
num_cols = x.select_dtypes('number').columns.tolist()

pipe_model = pipe_func(cat_cols=cat_cols, num_cols=num_cols, cat_fill_value='No Feature')

# Search space. Keys follow the pipeline's step names; commented-out entries are
# alternatives that are currently excluded from the search.
param_grid = {
    # 'preprocessor__num_preprocessor__transf': [
    #     None, 
    #     FunctionTransformer(func=np.log1p, inverse_func=np.expm1), 
    #     FunctionTransformer(func=np.sqrt, inverse_func=np.square),
    #     PowerTransformer(method='box-cox'),
    #     PowerTransformer(method='yeo-johnson')
    # ],
    'preprocessor__num_preprocessor__scaler': [
        StandardScaler(), 
        # MinMaxScaler(feature_range=(0,1))
    ],
    'dim_reduction__n_components': [50],
    # Target transformation: None (no transformation) vs. square root.
    'regressor__transformer': [
        None, 
        # FunctionTransformer(func=np.log1p, inverse_func=np.expm1), 
        FunctionTransformer(func=np.sqrt, inverse_func=np.square),
        # PowerTransformer(method='box-cox'),
        # PowerTransformer(method='yeo-johnson'),
    ],
    'regressor__regressor': [    
        # LinearRegression(),
        # DecisionTreeRegressor(),
        # RandomForestRegressor(random_state=RANDOM_STATE),
        GradientBoostingRegressor(random_state=RANDOM_STATE)
    ],
    # NOTE(review): 'quantile' runs with the regressor's default alpha, so it targets
    # an upper quantile rather than a central estimate - confirm it belongs in a
    # search that is scored with RMSE.
    'regressor__regressor__loss': [
        'squared_error', 
        'absolute_error', 
        'huber', 
        'quantile'
    ],
    'regressor__regressor__learning_rate': [
        0.001,
        0.01,
        0.05,
        0.1
    ],
    'regressor__regressor__n_estimators': [
        10, 
        50, 
        100,
        200
    ],
    'regressor__regressor__max_depth': [
        1,
        3,
        5,
        10,
        None
    ]
}

# greater_is_better=False makes the scorer return the negated RMSE, so the
# search still maximises the score while minimising the error.
rmse_log_scorer = make_scorer(rmse_of_log, greater_is_better=False)

# The grid expands to 640 candidates x 5 folds = 3200 fits; n_jobs=-1 spreads
# them over all available cores instead of running them one after another.
grid_search = GridSearchCV(
    pipe_model,
    param_grid,
    cv=5,
    scoring=rmse_log_scorer,
    n_jobs=-1,
    verbose=1,
)

# Fit GridSearchCV to your data
grid_search.fit(X=x, y=y)

Fitting 5 folds for each of 640 candidates, totalling 3200 fits


In [7]:
# Ten best-ranked candidates from the search (rank 1 = best mean test score).
results = (
    pd.DataFrame(grid_search.cv_results_)
    .sort_values('rank_test_score')
    .head(10)
)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_dim_reduction__n_components,param_preprocessor__num_preprocessor__scaler,param_regressor__regressor,param_regressor__regressor__learning_rate,param_regressor__regressor__loss,param_regressor__regressor__max_depth,...,param_regressor__transformer,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
495,5.691777,0.263059,0.011666,0.008532,50,StandardScaler(),GradientBoostingRegressor(random_state=42),0.1,squared_error,3,...,"FunctionTransformer(func=<ufunc 'sqrt'>, inver...","{'dim_reduction__n_components': 50, 'preproces...",-0.125564,-0.157455,-0.144266,-0.129705,-0.148319,-0.141062,0.011839,1
581,7.230914,0.166097,0.015895,0.00223,50,StandardScaler(),GradientBoostingRegressor(random_state=42),0.1,huber,5,...,"FunctionTransformer(func=<ufunc 'sqrt'>, inver...","{'dim_reduction__n_components': 50, 'preproces...",-0.130164,-0.154728,-0.145555,-0.129383,-0.147562,-0.141478,0.010035,2
342,7.45268,0.096403,0.011906,0.006259,50,StandardScaler(),GradientBoostingRegressor(random_state=42),0.05,squared_error,5,...,,"{'dim_reduction__n_components': 50, 'preproces...",-0.130646,-0.153722,-0.143274,-0.129963,-0.150542,-0.141629,0.00985,3
423,10.167685,0.762067,0.01731,0.004106,50,StandardScaler(),GradientBoostingRegressor(random_state=42),0.05,huber,5,...,"FunctionTransformer(func=<ufunc 'sqrt'>, inver...","{'dim_reduction__n_components': 50, 'preproces...",-0.128691,-0.155526,-0.148092,-0.128003,-0.148719,-0.141806,0.011297,4
343,7.863654,0.209281,0.009312,0.008949,50,StandardScaler(),GradientBoostingRegressor(random_state=42),0.05,squared_error,5,...,"FunctionTransformer(func=<ufunc 'sqrt'>, inver...","{'dim_reduction__n_components': 50, 'preproces...",-0.135995,-0.153373,-0.144483,-0.126526,-0.149211,-0.141918,0.009623,5
583,15.252547,0.557796,0.016829,0.002752,50,StandardScaler(),GradientBoostingRegressor(random_state=42),0.1,huber,5,...,"FunctionTransformer(func=<ufunc 'sqrt'>, inver...","{'dim_reduction__n_components': 50, 'preproces...",-0.131404,-0.155004,-0.146269,-0.129011,-0.148209,-0.14198,0.010068,6
493,3.144322,0.43483,0.016738,0.005731,50,StandardScaler(),GradientBoostingRegressor(random_state=42),0.1,squared_error,3,...,"FunctionTransformer(func=<ufunc 'sqrt'>, inver...","{'dim_reduction__n_components': 50, 'preproces...",-0.126081,-0.158481,-0.143088,-0.13556,-0.148394,-0.142321,0.011028,7
422,10.044186,0.43841,0.008744,0.008334,50,StandardScaler(),GradientBoostingRegressor(random_state=42),0.05,huber,5,...,,"{'dim_reduction__n_components': 50, 'preproces...",-0.131942,-0.154769,-0.145355,-0.129963,-0.150171,-0.14244,0.009861,8
575,8.844876,0.179947,0.015574,0.001691,50,StandardScaler(),GradientBoostingRegressor(random_state=42),0.1,huber,3,...,"FunctionTransformer(func=<ufunc 'sqrt'>, inver...","{'dim_reduction__n_components': 50, 'preproces...",-0.127019,-0.158519,-0.144245,-0.125107,-0.158074,-0.142593,0.014452,9
335,4.81513,0.15771,0.010428,0.005963,50,StandardScaler(),GradientBoostingRegressor(random_state=42),0.05,squared_error,3,...,"FunctionTransformer(func=<ufunc 'sqrt'>, inver...","{'dim_reduction__n_components': 50, 'preproces...",-0.125722,-0.16062,-0.142467,-0.13525,-0.148991,-0.14261,0.01187,10


In [8]:
# Winning configuration and its mean CV score. The score is negative because the
# scorer was created with greater_is_better=False (it reports the negated RMSE).
print(grid_search.best_params_, grid_search.best_score_, sep='\n')

{'dim_reduction__n_components': 50, 'preprocessor__num_preprocessor__scaler': StandardScaler(), 'regressor__regressor': GradientBoostingRegressor(random_state=42), 'regressor__regressor__learning_rate': 0.1, 'regressor__regressor__loss': 'squared_error', 'regressor__regressor__max_depth': 3, 'regressor__regressor__n_estimators': 200, 'regressor__transformer': FunctionTransformer(func=<ufunc 'sqrt'>, inverse_func=<ufunc 'square'>)}
-0.14106171631319192


In [9]:
# Persist the fitted GridSearchCV object. Create the target directory first so a
# missing '../models' folder cannot make the dump fail after the long search.
# NOTE: the pipeline contains ReplaceNoFeatureTransformer, so `custom_transformers`
# presumably has to be importable wherever this file is loaded again.
os.makedirs('../models', exist_ok=True)
joblib.dump(grid_search, '../models/pipeline_indicator.pkl')

['../models/pipeline_indicator.pkl']