In [12]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from xgboost import XGBRegressor, DMatrix

import matplotlib.pyplot as plt
%matplotlib inline            
import seaborn as sns

from time import time
import pprint
import joblib
from functools import partial

# Suppressing warnings because of skopt verbosity
import warnings
warnings.filterwarnings("ignore")

# Skopt functions
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, DeltaYStopper
from skopt.space import Real, Categorical, Integer

# Model selection
from sklearn.model_selection import KFold

import optuna

In [13]:
df = pd.read_csv("../input/train_folds.csv")
df_test = pd.read_csv("../input/test.csv")
sample_submission = pd.read_csv("../input/sample_submission.csv")

df = df.drop(df[df['target'].lt(6)].index)
print(df.shape)



useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
numerical_cols = [col for col in useful_features if 'cont' in col]
df_test = df_test[useful_features]

# target encoding
for col in object_cols:
    temp_df = []
    temp_test_feat = None
    for fold in range(5):
        xtrain =  df[df.kfold != fold].reset_index(drop=True)
        xvalid = df[df.kfold == fold].reset_index(drop=True)
        feat = xtrain.groupby(col)["target"].agg("mean")
        feat = feat.to_dict()
        xvalid.loc[:, f"tar_enc_{col}"] = xvalid[col].map(feat)
        temp_df.append(xvalid)
        if temp_test_feat is None:
            temp_test_feat = df_test[col].map(feat)
        else:
            temp_test_feat += df_test[col].map(feat)
    
    temp_test_feat /= 5
    df_test.loc[:, f"tar_enc_{col}"] = temp_test_feat
    df = pd.concat(temp_df)

useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if col.startswith("cat")]
df_test = df_test[useful_features]




(299628, 27)


In [14]:
def run(trial):

    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
    reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)




    final_predictions = []
    scores=[]
    for fold in range(5):
        xtrain =  df[df.kfold != fold].reset_index(drop=True)
        xvalid = df[df.kfold == fold].reset_index(drop=True)
        #xtest = df_test.copy()


        ytrain = xtrain.target
        yvalid = xvalid.target
        
        xtrain = xtrain[useful_features]
        xvalid = xvalid[useful_features]

    
        # standarization

        scaler = preprocessing.StandardScaler()
        xtrain[numerical_cols] = scaler.fit_transform(xtrain[numerical_cols])
        xvalid[numerical_cols] = scaler.transform(xvalid[numerical_cols])
        #xtest[numerical_cols] = scaler.transform(xtest[numerical_cols])

        # categorical features
        high_cardinality_cols = [col for col in object_cols if xtrain[col].nunique()>=9]
        low_cardinality_cols = [col for col in object_cols if xtrain[col].nunique()<9]
        
        
        # label encode columns with high cardinality 
        ordinal_encoder = preprocessing.OrdinalEncoder()
        xtrain[high_cardinality_cols] = ordinal_encoder.fit_transform(xtrain[high_cardinality_cols])
        xvalid[high_cardinality_cols] = ordinal_encoder.transform(xvalid[high_cardinality_cols])
        #xtest[high_cardinality_cols] = ordinal_encoder.transform(xtest[high_cardinality_cols])
        
        # One hot encode columns with low cardinality 
        OH_encoder = preprocessing.OneHotEncoder(handle_unknown='ignore', sparse=False)

        OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(xtrain[low_cardinality_cols]))
        OH_cols_valid = pd.DataFrame(OH_encoder.transform(xvalid[low_cardinality_cols]))
        #OH_cols_test = pd.DataFrame(OH_encoder.transform(xtest[low_cardinality_cols]))

        # codificador one-hot elimina; ponerlo de nuevo 
        OH_cols_train.index = xtrain.index
        OH_cols_valid.index = xvalid.index
        #OH_cols_test.index = xtest.index

        # Eliminar columnas categóricas (se reemplazarán con codificación one-hot) 
        num_X_train = xtrain.drop(low_cardinality_cols, axis=1)
        num_X_valid = xvalid.drop(low_cardinality_cols, axis=1)
        #num_X_test= xtest.drop(low_cardinality_cols, axis=1)

        #  añadir columnas codificadas one-hot a variables numéricas 
        after_OH_xtrain = pd.concat([num_X_train, OH_cols_train], axis=1)
        after_OH_valid= pd.concat([num_X_valid, OH_cols_valid], axis=1)
        #after_OH_test= pd.concat([num_X_test, OH_cols_test], axis=1) 


        model = XGBRegressor(random_state=42, #fold, 
                            n_jobs=-1,
                            n_estimators= 1000,
                            tree_method='gpu_hist',
                            learning_rate= learning_rate,
                            subsample= subsample,
                            max_depth= max_depth,
                            colsample_bytree= colsample_bytree,
                            reg_alpha = reg_alpha,
                            eval_metric='rmse',
                            reg_lambda = reg_lambda,
                            gpu_id=0,predictor='gpu_predictor',
                            objective='reg:squarederror')
        
        
        model.fit(after_OH_xtrain, ytrain, early_stopping_rounds=300, eval_set=[(after_OH_valid,yvalid)], verbose=1000)
        preds_valid = model.predict(after_OH_valid)
        #test_preds = model.predict(after_OH_test)
        #final_predictions.append(test_preds)
        rmse = mean_squared_error(yvalid, preds_valid, squared=False)
        print(fold,rmse)
        scores.append(rmse)
        
    return np.mean(scores)


In [15]:
study = optuna.create_study(direction="minimize")
study.optimize(run, n_trials=10)



[32m[I 2021-08-25 05:03:43,332][0m A new study created in memory with name: no-name-d81e1e8a-05e6-4e43-aa02-2f78a31a6eb2[0m
[0]	validation_0-rmse:7.47053
[999]	validation_0-rmse:0.70901
0 0.7090084490402568
[0]	validation_0-rmse:7.46632
[999]	validation_0-rmse:0.71058
1 0.7105713193408968
[0]	validation_0-rmse:7.46399
[999]	validation_0-rmse:0.71290
2 0.7128872798443765
[0]	validation_0-rmse:7.46666
[999]	validation_0-rmse:0.71240
3 0.7123904908297215
[0]	validation_0-rmse:7.47234
[999]	validation_0-rmse:0.71551
[32m[I 2021-08-25 05:05:08,692][0m Trial 0 finished with value: 0.7120516094280731 and parameters: {'learning_rate': 0.04064605037326259, 'reg_lambda': 2.3814916399284364e-07, 'reg_alpha': 3.4891088445954557, 'subsample': 0.9961031153315419, 'colsample_bytree': 0.8432094408082822, 'max_depth': 6}. Best is trial 0 with value: 0.7120516094280731.[0m
4 0.7154005080851145
[0]	validation_0-rmse:6.96256
[829]	validation_0-rmse:0.71048
0 0.7099017702381413
[0]	validation_0-rmse:

In [16]:
study.best_params  # E.g. {'x': 2.002108042}

{'learning_rate': 0.217078318724281,
 'reg_lambda': 26.933916936470073,
 'reg_alpha': 1.5955715249724064e-05,
 'subsample': 0.9847861450099923,
 'colsample_bytree': 0.8738913579766451,
 'max_depth': 3}

0.7289161215950625  ordinal + stand   <br>
0.7359939686055645  ordinal + normalizer <br>
0.7339300282189536  ordinal + standard + normalizer <br>
0.7359933943841589  ordinal + normalizer +standard   <br>
0.7288775872910201  ohe + stand   <br>
0.7359376887794242   ohe + normalizer   <br>
0.7289403434752502 (ohe+ 3 ordinal) + stand   <br>
0.7291070689887855  poly3 (T,F) (ohe+ 1 ordinal) + stand   <br>
0.72914235959686  poly3 (F,F) (ohe+ 1 ordinal) + stand   <br>
0.728907008813998  poly2 (F,F) (ohe+ 1 ordinal) + stand   <br>
0.7289321289479873   poly2 (F,T) (ohe+ 1 ordinal) + stand   <br>
0.7289501787229472   poly2 (T,T) (ohe+ 1 ordinal) + stand   <br>
0.7289416327232601   poly2 (T,F) (ohe+ 1 ordinal) + stand   <br>

0.7288644838881868  (ohe+ 1 ordinal) + stand   <br>
-0.7189543356528036   T_outliers+ (ohe+ 1 ordinal) + stand <br>
0.7205793549092518    T_encoding + T_outliers+ (ohe+ 1 ordinal) + stand <br>
