In [1]:
import opendatasets
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor, make_column_selector
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, FunctionTransformer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import root_mean_squared_error, make_scorer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
import joblib

In [2]:
# Reading the database
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [3]:
x = train.drop(['SalePrice', 'Id'], axis=1)
y = train['SalePrice']

In [4]:
def pipe_func(**kwargs):
    
    random_state = kwargs.get('random_state')
    
    ohe_pipe = Pipeline(
        steps=[
            ('cat_imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='No Feature')),
            ('one_hot_encoder', OneHotEncoder(handle_unknown='infrequent_if_exist', sparse_output=False))
        ]
    )
    
    num_pipe = Pipeline(
        steps=[
            ('numerical_imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
            ('scaler', StandardScaler())
        ]
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', ohe_pipe, make_column_selector(dtype_include='object')),
            ('num', num_pipe, make_column_selector(dtype_include='number')),
        ]
    )

    target_trans = TransformedTargetRegressor(
        regressor=GradientBoostingRegressor(max_depth=5, n_estimators=300, random_state=random_state),
        transformer=FunctionTransformer(func=np.log1p, inverse_func=np.expm1)
    )
    
    full_pipe = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('dim_reduction', PCA(n_components=50, random_state=random_state)),
            ('regressor', target_trans)
        ]
    )

    return full_pipe

def rmse_of_log(y_true, y_pred):
        """
        This scorer function will calculate the RMSE of the log of predictions and true values
        """
        return root_mean_squared_error(np.log1p(y_true), np.log1p(y_pred))

# building the scorer function
scorer = make_scorer(rmse_of_log, greater_is_better=False)

In [5]:
pipe_func()

In [None]:
# %%capture
param_grid = {
    'dim_reduction__n_components': [100, 200, 300],
    'regressor__regressor__learning_rate': [ 0.001, 0.01, 0.1],
    'regressor__regressor__n_estimators': [10, 50, 100,200],
    'regressor__regressor__max_depth': [1, 3, 5, 10]
}

rmse_log_scorer = scorer()

pipe_model = pipe_func()

grid_search = GridSearchCV(pipe_model, param_grid, cv=5, scoring=rmse_log_scorer, verbose=2)

# Fit GridSearchCV to your data
grid_search.fit(X=x, y=y)

In [49]:
results = pd.DataFrame(grid_search.cv_results_).sort_values(by='rank_test_score', ascending=True).head(10)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_dim_reduction__n_components,param_regressor__regressor__learning_rate,param_regressor__regressor__max_depth,param_regressor__regressor__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
87,21.697638,1.254661,0.020211,0.006571,200,0.1,3,200,"{'dim_reduction__n_components': 200, 'regresso...",-0.122998,-0.148295,-0.141668,-0.118643,-0.128492,-0.132019,0.011232,1
39,10.559028,1.079854,0.018083,0.010581,100,0.1,3,200,"{'dim_reduction__n_components': 100, 'regresso...",-0.127292,-0.141511,-0.14047,-0.123591,-0.135817,-0.133736,0.007135,2
86,11.180309,0.327427,0.016039,0.005085,200,0.1,3,100,"{'dim_reduction__n_components': 200, 'regresso...",-0.122298,-0.144845,-0.144426,-0.121289,-0.137203,-0.134012,0.010345,3
38,5.451565,0.679676,0.011678,0.006028,100,0.1,3,100,"{'dim_reduction__n_components': 100, 'regresso...",-0.130778,-0.14703,-0.141949,-0.125377,-0.134114,-0.13585,0.007754,4
91,28.742033,1.173142,0.014876,0.001489,200,0.1,5,200,"{'dim_reduction__n_components': 200, 'regresso...",-0.1352,-0.150353,-0.142455,-0.121595,-0.134187,-0.136758,0.009552,5
43,18.560619,1.680811,0.016507,0.005334,100,0.1,5,200,"{'dim_reduction__n_components': 100, 'regresso...",-0.133684,-0.153244,-0.141266,-0.121424,-0.136089,-0.137141,0.010358,6
90,14.05326,0.069565,0.013396,0.000976,200,0.1,5,100,"{'dim_reduction__n_components': 200, 'regresso...",-0.134324,-0.156866,-0.141007,-0.119804,-0.133931,-0.137186,0.012023,7
42,8.37834,0.394593,0.01351,0.007584,100,0.1,5,100,"{'dim_reduction__n_components': 100, 'regresso...",-0.131214,-0.152816,-0.141066,-0.121644,-0.139431,-0.137234,0.010407,8
89,7.080708,0.053673,0.013232,0.00036,200,0.1,5,50,"{'dim_reduction__n_components': 200, 'regresso...",-0.136919,-0.158918,-0.139191,-0.123695,-0.137541,-0.139253,0.011288,9
41,4.554523,0.421889,0.013772,0.007303,100,0.1,5,50,"{'dim_reduction__n_components': 100, 'regresso...",-0.133695,-0.15337,-0.141241,-0.126823,-0.143515,-0.139729,0.009007,10


In [50]:
print(grid_search.best_params_)
print(grid_search.best_score_)

{'dim_reduction__n_components': 200, 'regressor__regressor__learning_rate': 0.1, 'regressor__regressor__max_depth': 3, 'regressor__regressor__n_estimators': 200}
-0.1320192458258847


In [57]:
best_model = grid_search.best_estimator_
best_model.fit(X=x, y=y)

In [58]:
joblib.dump(best_model, '../models/pipeline_gbr_all_ohe.pkl')

['../models/pipeline_gbr_all_ohe.pkl']

In [59]:
best_model = joblib.load('../models/pipeline_gbr_all_ohe.pkl')

In [64]:
test_predictions = pd.DataFrame(
    {
        'Id': test['Id'],
        'SalePrice': best_model.predict(test.drop('Id', axis=1))
    }
)
test_predictions.head()

Unnamed: 0,Id,SalePrice
0,1461,127227.234956
1,1462,162721.210292
2,1463,186455.090517
3,1464,191893.786001
4,1465,171830.971662


In [65]:
test_predictions.to_csv('../predictions/gbr_predictions_all_ohe.csv', index=False)

In [66]:
# create the .kaggle directory
!mkdir -p ~/.kaggle

The syntax of the command is incorrect.


In [67]:
# copy the kaggle.json file to thehis kaggle directory
!cp kaggle.json ~/.kaggle/

'cp' is not recognized as an internal or external command,
operable program or batch file.


In [68]:
# change permission to kaggle.json file
!chmod 600 ~/.kaggle/kaggle.json

'chmod' is not recognized as an internal or external command,
operable program or batch file.


In [None]:
!kaggle competitions submit -c house-prices-advanced-regression-techniques -f /content/gradient_boosting_regressor_predictions.csv -m "Gradient Boosting Regressor model, using Pearson correlation, ANOVA and Kruskal-Wallis for feature decision and grid search for tuning hyperparameters"