In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import optuna

In [2]:
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if col.startswith("cat")]
df_test = df_test[useful_features]

for col in object_cols:
    temp_df = []
    temp_test_feat = None
    for fold in range(5):
        xtrain =  df[df.kfold != fold].reset_index(drop=True)
        xvalid = df[df.kfold == fold].reset_index(drop=True)
        feat = xtrain.groupby(col)["target"].agg("mean")
        feat = feat.to_dict()
        xvalid.loc[:, f"tar_enc_{col}"] = xvalid[col].map(feat)
        temp_df.append(xvalid)
        if temp_test_feat is None:
            temp_test_feat = df_test[col].map(feat)
        else:
            temp_test_feat += df_test[col].map(feat)
    
    temp_test_feat /= 5
    df_test.loc[:, f"tar_enc_{col}"] = temp_test_feat
    df = pd.concat(temp_df)
    

useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if col.startswith("cat")]
df_test = df_test[useful_features]

In [6]:
def run(trial):
    fold = 0
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
    reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)

    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])

    model = XGBRegressor(
        random_state=1,
        tree_method="gpu_hist",
        gpu_id=1,
        predictor="gpu_predictor",
        n_estimators=7000,
        learning_rate=learning_rate,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
    )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
    preds_valid = model.predict(xvalid)
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    return rmse

In [None]:
study = optuna.create_study(direction="minimize")
study.optimize(run, n_trials=20)

[32m[I 2021-08-31 20:24:18,613][0m A new study created in memory with name: no-name-02ef21b6-e409-4f47-a5f7-c12ae47c4e06[0m


[0]	validation_0-rmse:6.42606
[819]	validation_0-rmse:0.72254


[32m[I 2021-08-31 20:24:24,242][0m Trial 0 finished with value: 0.7213835125554788 and parameters: {'learning_rate': 0.17596830002500205, 'reg_lambda': 13.731907662472675, 'reg_alpha': 0.06238251960575841, 'subsample': 0.2334730434956424, 'colsample_bytree': 0.10523036968287411, 'max_depth': 4}. Best is trial 0 with value: 0.7213835125554788.[0m


[0]	validation_0-rmse:6.95002
[1000]	validation_0-rmse:0.72141
[2000]	validation_0-rmse:0.71891
[3000]	validation_0-rmse:0.71819
[3487]	validation_0-rmse:0.71815


[32m[I 2021-08-31 20:24:36,299][0m Trial 1 finished with value: 0.7180840032669827 and parameters: {'learning_rate': 0.10774417504372416, 'reg_lambda': 0.00032788048988744025, 'reg_alpha': 22.336766671382403, 'subsample': 0.8013077426974943, 'colsample_bytree': 0.41056829411803075, 'max_depth': 2}. Best is trial 1 with value: 0.7180840032669827.[0m


[0]	validation_0-rmse:6.76478
[453]	validation_0-rmse:0.72952


[32m[I 2021-08-31 20:24:41,613][0m Trial 2 finished with value: 0.7248453353487938 and parameters: {'learning_rate': 0.13185540584192004, 'reg_lambda': 8.164435887807497e-06, 'reg_alpha': 0.8917062400368072, 'subsample': 0.28341767542260443, 'colsample_bytree': 0.35628894672821443, 'max_depth': 6}. Best is trial 1 with value: 0.7180840032669827.[0m


[0]	validation_0-rmse:6.73680
[1000]	validation_0-rmse:0.71889
[1433]	validation_0-rmse:0.71880


[32m[I 2021-08-31 20:24:48,692][0m Trial 3 finished with value: 0.7187743237264778 and parameters: {'learning_rate': 0.13550586743956933, 'reg_lambda': 62.24729364790009, 'reg_alpha': 2.2145368571383772e-08, 'subsample': 0.8347712931235336, 'colsample_bytree': 0.324338690457297, 'max_depth': 3}. Best is trial 1 with value: 0.7180840032669827.[0m


[0]	validation_0-rmse:7.68338
[1000]	validation_0-rmse:0.73194
[2000]	validation_0-rmse:0.72827
[3000]	validation_0-rmse:0.72593
[4000]	validation_0-rmse:0.72423
[5000]	validation_0-rmse:0.72314
[6000]	validation_0-rmse:0.72226
[6999]	validation_0-rmse:0.72156


[32m[I 2021-08-31 20:25:08,741][0m Trial 4 finished with value: 0.7215566071585475 and parameters: {'learning_rate': 0.012518277900469651, 'reg_lambda': 4.794006796219808e-06, 'reg_alpha': 0.9752204590265471, 'subsample': 0.3750696240949184, 'colsample_bytree': 0.9013322277030361, 'max_depth': 2}. Best is trial 1 with value: 0.7180840032669827.[0m


[0]	validation_0-rmse:7.02402
[1000]	validation_0-rmse:0.72023
[1144]	validation_0-rmse:0.72034


[32m[I 2021-08-31 20:25:15,221][0m Trial 5 finished with value: 0.7201122221536624 and parameters: {'learning_rate': 0.09840166052288259, 'reg_lambda': 0.7141025046438402, 'reg_alpha': 1.1576604541942345e-05, 'subsample': 0.5998404733886153, 'colsample_bytree': 0.617536653505792, 'max_depth': 4}. Best is trial 1 with value: 0.7180840032669827.[0m


[0]	validation_0-rmse:7.01075
[560]	validation_0-rmse:0.72422


[32m[I 2021-08-31 20:25:23,858][0m Trial 6 finished with value: 0.722339020127762 and parameters: {'learning_rate': 0.09983430514260011, 'reg_lambda': 0.001614925920058345, 'reg_alpha': 0.00029651475873981485, 'subsample': 0.8817065629423246, 'colsample_bytree': 0.3625898715627277, 'max_depth': 7}. Best is trial 1 with value: 0.7180840032669827.[0m


[0]	validation_0-rmse:7.11471
[1000]	validation_0-rmse:0.71895
[1522]	validation_0-rmse:0.71899


[32m[I 2021-08-31 20:25:32,180][0m Trial 7 finished with value: 0.7187401528437445 and parameters: {'learning_rate': 0.08633651100490385, 'reg_lambda': 1.1107264713948882e-05, 'reg_alpha': 1.3025142707026449e-07, 'subsample': 0.7807972072580716, 'colsample_bytree': 0.20061366230178368, 'max_depth': 4}. Best is trial 1 with value: 0.7180840032669827.[0m


[0]	validation_0-rmse:7.25905
[1000]	validation_0-rmse:0.72042
[2000]	validation_0-rmse:0.71929
[2108]	validation_0-rmse:0.71927


[32m[I 2021-08-31 20:25:40,722][0m Trial 8 finished with value: 0.7192224925322389 and parameters: {'learning_rate': 0.06760320864177612, 'reg_lambda': 9.674081125498352e-06, 'reg_alpha': 0.8653310379825596, 'subsample': 0.4570809955402034, 'colsample_bytree': 0.3889264551440994, 'max_depth': 3}. Best is trial 1 with value: 0.7180840032669827.[0m


[0]	validation_0-rmse:7.55879
[1000]	validation_0-rmse:0.72788
[2000]	validation_0-rmse:0.72401
[3000]	validation_0-rmse:0.72196
[4000]	validation_0-rmse:0.72074
[5000]	validation_0-rmse:0.72002
[6000]	validation_0-rmse:0.71947
[6999]	validation_0-rmse:0.71910


[32m[I 2021-08-31 20:26:00,413][0m Trial 9 finished with value: 0.7190900160716192 and parameters: {'learning_rate': 0.02869192835959732, 'reg_lambda': 2.92932997659859, 'reg_alpha': 7.688365906808927e-08, 'subsample': 0.4795682373724992, 'colsample_bytree': 0.6544324126545568, 'max_depth': 2}. Best is trial 1 with value: 0.7180840032669827.[0m


[0]	validation_0-rmse:7.50776
[1000]	validation_0-rmse:0.73326
[2000]	validation_0-rmse:0.73072
[3000]	validation_0-rmse:0.72945
[4000]	validation_0-rmse:0.72854
[5000]	validation_0-rmse:0.72782
[6000]	validation_0-rmse:0.72720
[6999]	validation_0-rmse:0.72670


[32m[I 2021-08-31 20:26:18,909][0m Trial 10 finished with value: 0.7267023235628097 and parameters: {'learning_rate': 0.03532191815916005, 'reg_lambda': 0.003098815387111055, 'reg_alpha': 97.79670733436187, 'subsample': 0.6930825335871904, 'colsample_bytree': 0.820742625479461, 'max_depth': 1}. Best is trial 1 with value: 0.7180840032669827.[0m


[0]	validation_0-rmse:7.38501
[1000]	validation_0-rmse:0.71880
[2000]	validation_0-rmse:0.71838
[2192]	validation_0-rmse:0.71854


[32m[I 2021-08-31 20:26:32,146][0m Trial 11 finished with value: 0.7183375415244628 and parameters: {'learning_rate': 0.05124083122002647, 'reg_lambda': 1.1837307926462121e-08, 'reg_alpha': 5.676828294422935e-06, 'subsample': 0.9961050018955511, 'colsample_bytree': 0.14513011642310247, 'max_depth': 5}. Best is trial 1 with value: 0.7180840032669827.[0m


[0]	validation_0-rmse:7.42624
[1000]	validation_0-rmse:0.71866
[1354]	validation_0-rmse:0.71878


[32m[I 2021-08-31 20:26:44,614][0m Trial 12 finished with value: 0.7186244953395851 and parameters: {'learning_rate': 0.04589032288066591, 'reg_lambda': 1.7185996701924753e-08, 'reg_alpha': 1.397257008642292e-05, 'subsample': 0.9781662824744688, 'colsample_bytree': 0.13123734668550302, 'max_depth': 6}. Best is trial 1 with value: 0.7180840032669827.[0m


[0]	validation_0-rmse:7.61811
[1000]	validation_0-rmse:0.72219
[2000]	validation_0-rmse:0.71978
[3000]	validation_0-rmse:0.71925
[3728]	validation_0-rmse:0.71928


[32m[I 2021-08-31 20:27:05,561][0m Trial 13 finished with value: 0.7192234842481869 and parameters: {'learning_rate': 0.020985053493842196, 'reg_lambda': 1.5078334105238647e-08, 'reg_alpha': 5.179625664103848e-06, 'subsample': 0.9618958861484932, 'colsample_bytree': 0.5190842763474741, 'max_depth': 5}. Best is trial 1 with value: 0.7180840032669827.[0m


[0]	validation_0-rmse:7.28892
[1000]	validation_0-rmse:0.71914
[1490]	validation_0-rmse:0.71936


[32m[I 2021-08-31 20:27:15,586][0m Trial 14 finished with value: 0.71907634248476 and parameters: {'learning_rate': 0.06371546041128873, 'reg_lambda': 0.0354309519373568, 'reg_alpha': 0.0029697135767132125, 'subsample': 0.9967529285066266, 'colsample_bytree': 0.2547215528867274, 'max_depth': 5}. Best is trial 1 with value: 0.7180840032669827.[0m


[0]	validation_0-rmse:5.98266
[1000]	validation_0-rmse:0.72679
[2000]	validation_0-rmse:0.72446
[3000]	validation_0-rmse:0.72326
[4000]	validation_0-rmse:0.72236
[5000]	validation_0-rmse:0.72173
[6000]	validation_0-rmse:0.72124
[6999]	validation_0-rmse:0.72083


[32m[I 2021-08-31 20:27:33,767][0m Trial 15 finished with value: 0.72080237739145 and parameters: {'learning_rate': 0.2335956345433736, 'reg_lambda': 5.589949505133686e-07, 'reg_alpha': 96.72557180440127, 'subsample': 0.7148016688109191, 'colsample_bytree': 0.4689355381038265, 'max_depth': 1}. Best is trial 1 with value: 0.7180840032669827.[0m


[0]	validation_0-rmse:7.39353
[1000]	validation_0-rmse:0.72177
[2000]	validation_0-rmse:0.71890
[3000]	validation_0-rmse:0.71803
[4000]	validation_0-rmse:0.71778
[4407]	validation_0-rmse:0.71778


In [5]:
study.best_params

{'learning_rate': 0.10060669515592832,
 'reg_lambda': 3.6268609366263594e-08,
 'reg_alpha': 0.5837933768291012,
 'subsample': 0.9420844926252625,
 'colsample_bytree': 0.6067936212382514,
 'max_depth': 2}