In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from xgboost import XGBRegressor, DMatrix

import matplotlib.pyplot as plt
%matplotlib inline            
import seaborn as sns

from time import time
import pprint
import joblib
from functools import partial

# Suppressing warnings because of skopt verbosity
import warnings
warnings.filterwarnings("ignore")

# Skopt functions
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, DeltaYStopper
from skopt.space import Real, Categorical, Integer

# Model selection
from sklearn.model_selection import KFold

import optuna


from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


In [2]:
df = pd.read_csv("../input/train_folds.csv")
df_test = pd.read_csv("../input/test.csv")
sample_submission = pd.read_csv("../input/sample_submission.csv")

df = df.drop(df[df['target'].lt(6)].index)
print(df.shape)

useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
numerical_cols = [col for col in useful_features if 'cont' in col]
df_test = df_test[useful_features]

# target encoding
for col in object_cols:
    temp_df = []
    temp_test_feat = None
    for fold in range(5):
        xtrain =  df[df.kfold != fold].reset_index(drop=True)
        xvalid = df[df.kfold == fold].reset_index(drop=True)
        feat = xtrain.groupby(col)["target"].agg("mean")
        feat = feat.to_dict()
        xvalid.loc[:, f"tar_enc_{col}"] = xvalid[col].map(feat)
        temp_df.append(xvalid)
        if temp_test_feat is None:
            temp_test_feat = df_test[col].map(feat)
        else:
            temp_test_feat += df_test[col].map(feat)
    
    temp_test_feat /= 5
    df_test.loc[:, f"tar_enc_{col}"] = temp_test_feat
    df = pd.concat(temp_df)

useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if col.startswith("cat")]
df_test = df_test[useful_features]

# Processing categoricals with SVD encoding
X_seq = df[object_cols].apply(lambda x: " ".join(list([str(y) + str(i) for i, y in enumerate(x)])), axis=1)
X_test_seq = df_test[object_cols].apply(lambda x: " ".join(list([str(y) + str(i) for i, y in enumerate(x)])), axis=1)


latent_dims = 24

svd_feats = ['svd_'+str(l) for l in range(latent_dims)]
vectorizer = TfidfVectorizer()

dim_reductio = TruncatedSVD(n_components=24, random_state=0)
df[svd_feats] =  dim_reductio.fit_transform(vectorizer.fit_transform(X_seq))
df_test[svd_feats] = dim_reductio.transform(vectorizer.transform(X_test_seq))


    
# Processing categoricals with frequency encoding
object_cols = [item for item in df.columns if 'cat' in item]

for cat in object_cols:
    counts = dict(df[cat].value_counts() / len(df))
    df[cat+'_freq'] = df[cat].replace(counts)
    df_test[cat+'_freq'] = df_test[cat].replace(counts)

frequencies = [cat+'_freq' for cat in object_cols]


useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]



(299628, 27)


In [11]:
def run(trial):

    n_estimators = trial.suggest_int("n_estimators", 1000, 8000)
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
    reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)




    final_predictions = []
    scores=[]
    for fold in range(5):
        xtrain =  df[df.kfold != fold].reset_index(drop=True)
        xvalid = df[df.kfold == fold].reset_index(drop=True)
        #xtest = df_test.copy()


        ytrain = xtrain.target
        yvalid = xvalid.target
        
        xtrain = xtrain[useful_features]
        xvalid = xvalid[useful_features]

    
        # standarization

        scaler = preprocessing.StandardScaler()
        xtrain[numerical_cols] = scaler.fit_transform(xtrain[numerical_cols])
        xvalid[numerical_cols] = scaler.transform(xvalid[numerical_cols])
        #xtest[numerical_cols] = scaler.transform(xtest[numerical_cols])

        
 
        # label encode columns 
        ordinal_encoder = preprocessing.OrdinalEncoder()
        xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
        xvalid[object_cols] = ordinal_encoder.fit_transform(xvalid[object_cols])
        #xtest[object_cols] = ordinal_encoder.fit_transform(xtest[object_cols])

        model = XGBRegressor(random_state=42, #fold, 
                            n_jobs=-1,
                            n_estimators= n_estimators,
                            tree_method='gpu_hist',
                            learning_rate= learning_rate,
                            subsample= subsample,
                            max_depth= max_depth,
                            colsample_bytree= colsample_bytree,
                            reg_alpha = reg_alpha,
                            eval_metric='rmse',
                            reg_lambda = reg_lambda,
                            gpu_id=0,predictor='gpu_predictor',
                            objective='reg:squarederror')
        
        
        model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid,yvalid)], verbose=1000)
        preds_valid = model.predict(xvalid)
        #test_preds = model.predict(after_OH_test)
        #final_predictions.append(test_preds)
        rmse = mean_squared_error(yvalid, preds_valid, squared=False)
        print(fold,rmse)
        scores.append(rmse)
        
    return np.mean(scores)


In [12]:
study = optuna.create_study(direction="minimize")
study.optimize(run, n_trials=30)



43166167, 'reg_lambda': 9.958442360634781e-05, 'reg_alpha': 1.3585634532238833e-05, 'subsample': 0.8239696252566668, 'colsample_bytree': 0.6826662271904085, 'max_depth': 5}. Best is trial 5 with value: 0.7144889134825391.[0m
4 0.7158508040373192
[0]	validation_0-rmse:7.67363
[1000]	validation_0-rmse:0.71653
[2000]	validation_0-rmse:0.71376
[2518]	validation_0-rmse:0.71905
0 0.7133948836491598
[0]	validation_0-rmse:7.67013
[1000]	validation_0-rmse:0.71835
[1811]	validation_0-rmse:0.71676
1 0.7159879792672085
[0]	validation_0-rmse:7.66782
[1000]	validation_0-rmse:0.72143
[1337]	validation_0-rmse:0.72225
2 0.7212496285758292
[0]	validation_0-rmse:7.67048
[1000]	validation_0-rmse:0.72019
[1704]	validation_0-rmse:0.71927
3 0.7186024119176905
[0]	validation_0-rmse:7.67568
[1000]	validation_0-rmse:0.71964
[1993]	validation_0-rmse:0.71797
[32m[I 2021-08-25 13:18:51,740][0m Trial 9 finished with value: 0.7172776059001358 and parameters: {'n_estimators': 2660, 'learning_rate': 0.0142620156794

0.7289161215950625  ordinal + stand   <br>
0.7359939686055645  ordinal + normalizer <br>
0.7339300282189536  ordinal + standard + normalizer <br>
0.7359933943841589  ordinal + normalizer +standard   <br>
0.7288775872910201  ohe + stand   <br>
0.7359376887794242   ohe + normalizer   <br>
0.7289403434752502 (ohe+ 3 ordinal) + stand   <br>
0.7291070689887855  poly3 (T,F) (ohe+ 1 ordinal) + stand   <br>
0.72914235959686  poly3 (F,F) (ohe+ 1 ordinal) + stand   <br>
0.728907008813998  poly2 (F,F) (ohe+ 1 ordinal) + stand   <br>
0.7289321289479873   poly2 (F,T) (ohe+ 1 ordinal) + stand   <br>
0.7289501787229472   poly2 (T,T) (ohe+ 1 ordinal) + stand   <br>
0.7289416327232601   poly2 (T,F) (ohe+ 1 ordinal) + stand   <br>

0.7288644838881868  (ohe+ 1 ordinal) + stand   <br>
-0.7189543356528036   T_outliers+ (ohe+ 1 ordinal) + stand <br>
0.7205793549092518    T_encoding + T_outliers+ (ohe+ 1 ordinal) + stand <br>
         opt + SVD encoding + T_encoding + T_outliers+ (ohe+ 1 ordinal) + stand <br>


In [13]:
study.best_params

{'n_estimators': 4185,
 'learning_rate': 0.02150695086603378,
 'reg_lambda': 0.0025184847547236947,
 'reg_alpha': 3.343542448729039,
 'subsample': 0.9391276589890923,
 'colsample_bytree': 0.5250522983048288,
 'max_depth': 5}