In [1]:
# Familiar imports
import numpy as np
import pandas as pd
import random
import os
import time
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error

#import lightgbm as lgb
#import xgboost as xgb
import catboost as ctb

import warnings
warnings.simplefilter('ignore')

# Parameters

In [2]:
target = 'loss'

DEBUG = False

if DEBUG:
    N_ESTIMATORS = 1
    N_SPLITS = 2
    SEED = 17
    CVSEED = 17
    EARLY_STOPPING_ROUNDS = 1
    VERBOSE = 100
    #N_ITERS = 2
else:
    N_SPLITS = 10
    N_ESTIMATORS = 10000
    EARLY_STOPPING_ROUNDS = 200
    VERBOSE = 1000
    SEED = 17
    CVSEED = 17
    #N_ITERS = 10

In [3]:
def set_seed(seed=17):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
set_seed(SEED)

# Load data

In [4]:
INPUT = Path("../input/tabular-playground-series-aug-2021")

train = pd.read_csv(INPUT / "train.csv")
test = pd.read_csv(INPUT / "test.csv")
submission = pd.read_csv(INPUT / "sample_submission.csv")

# Preprocessing

In [5]:
scale_features = [col for col in test.columns if 'f' in col]

ss = StandardScaler()
train[scale_features] = ss.fit_transform(train[scale_features])
test[scale_features] = ss.transform(test[scale_features])

In [6]:
# Swap noise

# Random
def apply_noise_rn(df, p=.75):
    should_not_swap = np.random.binomial(1, p, df.shape)
    corrupted_df = df.where(should_not_swap == 1, np.random.permutation(df))
    return corrupted_df

# Row-wise
def apply_noise_row(df, p=.75):
    should_not_swap = np.zeros(df.shape)
    for i in range(df.shape[0]):
        for j in np.random.choice(df.shape[1],int(p*df.shape[1]),replace=False):
            should_not_swap[i,j]=1 
    corrupted_df = df.where(should_not_swap == 1, np.random.permutation(df))
    return corrupted_df

# Pseudo Label

In [7]:
pseudo = pd.read_csv("../input/tps-aug-2021-lgbm-xgb-catboost/submission.csv")[target]
test_pseudo = pd.concat([test, pseudo], axis=1)
all_pseudo = pd.concat([train, test_pseudo]).reset_index(drop=True)

# Optuna

In [8]:
# Optuna for parameter search
!pip install -q optuna

import optuna
import pickle



In [9]:
# for the fixed learning rate, use the opt n iterations and tune the tree hyperparameters
def objective(trial, X=all_pseudo[scale_features], y=all_pseudo[target]):
  """
  """
  param_space = {
                  'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
            'loss_function': 'RMSE',
            'eval_metric': 'RMSE',
          #'task_type': 'GPU',
          'max_depth':trial.suggest_int('max_depth', 2, 16),
           'learning_rate':trial.suggest_uniform('learning_rate', 1e-3, 1e-2),
          'n_estimators':N_ESTIMATORS,
        'max_bin':trial.suggest_int('max_bin', 150, 450),
             'min_data_in_leaf':trial.suggest_int('min_data_in_leaf', 20, 100),
         'reg_lambda':trial.suggest_uniform('reg_lambda', 0.001, 1),
            'subsample': trial.suggest_uniform('subsample', 0.5, 1.0)
                 }
            

  seed_list=[SEED, SEED+1]
  #kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=CVSEED)
  ctb_oof = np.zeros(train.shape[0])
  #lgb_pred = np.zeros(test.shape[0])
  X_tr, X_va, y_tr, y_va = train_test_split(X,y,test_size=.2,random_state=CVSEED)
  trn_idx = X_tr.index
  val_idx = X_va.index
  
  
  #for (trn_idx, val_idx) in enumerate(kf.split(X, y)):
  #print(f"===== fold {fold} =====")
  oof_idx = np.array([idx for idx in val_idx if idx < train.shape[0]])
  preds_idx = np.array([idx for idx in val_idx if idx >= train.shape[0]])

  X_train, y_train = all_pseudo[scale_features].iloc[trn_idx], all_pseudo[target].iloc[trn_idx]
  #X_train = apply_noise_rn(X_train)

  X_valid, y_valid = all_pseudo[scale_features].iloc[oof_idx], all_pseudo[target].iloc[oof_idx]
  #X_test = all_pseudo[scale_features].iloc[preds_idx]

  #start = time.time()
  for inseed in seed_list:
    param_space['random_seed'] = inseed

    model =  ctb.CatBoostRegressor(**param_space)
    model.fit(
        X_train, 
        y_train,
        eval_set=[(X_valid, y_valid)],
        use_best_model=True,
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        verbose=VERBOSE,
    )


    ctb_oof[oof_idx] += model.predict(X_valid) / len(seed_list)
    #lgb_pred[preds_idx-train.shape[0]] += model.predict(X_test) / len(seed_list)

  #elapsed = time.time() - start
  rmse = mean_squared_error(y_valid, ctb_oof[oof_idx], squared=False)
  #print(f"fold {fold} - lgb rmse: {rmse:.6f}, elapsed time: {elapsed:.2f}sec\n")            
  
  #print(f"oof lgb_rmse = {mean_squared_error(train[target], lgb_oof, squared=False)}")

  
  return rmse

In [10]:
study = optuna.create_study(direction='minimize')
study.optimize(objective,n_trials= 30)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2021-08-25 02:41:41,512][0m A new study created in memory with name: no-name-c7b6442d-e291-4abf-9a1c-70b129221006[0m


0:	learn: 6.3404063	test: 7.8586497	best: 7.8586497 (0)	total: 230ms	remaining: 38m 19s
1000:	learn: 6.2534723	test: 7.8080086	best: 7.8080086 (1000)	total: 2m 32s	remaining: 22m 46s
2000:	learn: 6.2104190	test: 7.7924010	best: 7.7924010 (2000)	total: 5m 2s	remaining: 20m 10s
3000:	learn: 6.1752301	test: 7.7831036	best: 7.7831036 (3000)	total: 7m 28s	remaining: 17m 26s
4000:	learn: 6.1402068	test: 7.7767378	best: 7.7767378 (4000)	total: 9m 47s	remaining: 14m 41s
5000:	learn: 6.1062982	test: 7.7722918	best: 7.7722918 (5000)	total: 12m 7s	remaining: 12m 6s
6000:	learn: 6.0740634	test: 7.7691672	best: 7.7691604 (5985)	total: 14m 25s	remaining: 9m 36s
7000:	learn: 6.0425298	test: 7.7669103	best: 7.7669103 (7000)	total: 16m 46s	remaining: 7m 10s
8000:	learn: 6.0117264	test: 7.7651463	best: 7.7651423 (7996)	total: 19m 7s	remaining: 4m 46s
9000:	learn: 5.9813624	test: 7.7635592	best: 7.7635508 (8988)	total: 21m 27s	remaining: 2m 22s
9999:	learn: 5.9511586	test: 7.7620586	best: 7.7620586 (9999

[32m[I 2021-08-25 03:29:33,074][0m Trial 0 finished with value: 7.761804289405876 and parameters: {'bootstrap_type': 'MVS', 'max_depth': 8, 'learning_rate': 0.004513601852878248, 'max_bin': 268, 'min_data_in_leaf': 33, 'reg_lambda': 0.7642834462924214, 'subsample': 0.7719972787053753}. Best is trial 0 with value: 7.761804289405876.[0m


0:	learn: 6.3401227	test: 7.8584480	best: 7.8584480 (0)	total: 150ms	remaining: 25m 4s
1000:	learn: 6.2077778	test: 7.7915684	best: 7.7915684 (1000)	total: 2m 27s	remaining: 22m 8s
2000:	learn: 6.1351313	test: 7.7761166	best: 7.7761166 (2000)	total: 4m 45s	remaining: 19m 2s
3000:	learn: 6.0680065	test: 7.7687796	best: 7.7687552 (2997)	total: 7m 2s	remaining: 16m 24s
4000:	learn: 6.0045590	test: 7.7645673	best: 7.7645577 (3999)	total: 9m 19s	remaining: 13m 59s
5000:	learn: 5.9425355	test: 7.7618337	best: 7.7617977 (4988)	total: 11m 38s	remaining: 11m 37s
6000:	learn: 5.8814832	test: 7.7594862	best: 7.7594809 (5987)	total: 13m 57s	remaining: 9m 17s
7000:	learn: 5.8218613	test: 7.7577764	best: 7.7577345 (6984)	total: 16m 16s	remaining: 6m 58s
8000:	learn: 5.7634430	test: 7.7564454	best: 7.7564449 (7995)	total: 18m 36s	remaining: 4m 38s
9000:	learn: 5.7060210	test: 7.7553337	best: 7.7553157 (8941)	total: 20m 55s	remaining: 2m 19s
9999:	learn: 5.6496757	test: 7.7543421	best: 7.7543421 (9999

[32m[I 2021-08-25 04:16:24,919][0m Trial 1 finished with value: 7.753055265479657 and parameters: {'bootstrap_type': 'MVS', 'max_depth': 8, 'learning_rate': 0.009386202456558995, 'max_bin': 262, 'min_data_in_leaf': 36, 'reg_lambda': 0.8991571580559535, 'subsample': 0.7439917108329501}. Best is trial 1 with value: 7.753055265479657.[0m


0:	learn: 6.3401979	test: 7.8585049	best: 7.8585049 (0)	total: 128ms	remaining: 21m 19s
1000:	learn: 6.2140804	test: 7.7948852	best: 7.7948852 (1000)	total: 1m 59s	remaining: 17m 54s
2000:	learn: 6.1442388	test: 7.7797750	best: 7.7797750 (2000)	total: 3m 51s	remaining: 15m 25s
3000:	learn: 6.0772193	test: 7.7721570	best: 7.7721533 (2995)	total: 5m 41s	remaining: 13m 16s
4000:	learn: 6.0139655	test: 7.7674236	best: 7.7674220 (3999)	total: 7m 32s	remaining: 11m 17s
5000:	learn: 5.9527248	test: 7.7643852	best: 7.7643807 (4990)	total: 9m 23s	remaining: 9m 23s
6000:	learn: 5.8932580	test: 7.7618890	best: 7.7618646 (5994)	total: 11m 15s	remaining: 7m 30s
7000:	learn: 5.8345917	test: 7.7603189	best: 7.7603028 (6993)	total: 13m 8s	remaining: 5m 37s
8000:	learn: 5.7769191	test: 7.7588450	best: 7.7588395 (7996)	total: 15m 2s	remaining: 3m 45s
9000:	learn: 5.7202940	test: 7.7578485	best: 7.7578091 (8980)	total: 16m 55s	remaining: 1m 52s
9999:	learn: 5.6645210	test: 7.7569549	best: 7.7568786 (9918

[32m[I 2021-08-25 04:51:43,329][0m Trial 2 finished with value: 7.756616246813477 and parameters: {'bootstrap_type': 'Bernoulli', 'max_depth': 8, 'learning_rate': 0.008653102341471961, 'max_bin': 188, 'min_data_in_leaf': 92, 'reg_lambda': 0.032735034847530375, 'subsample': 0.7201190211064717}. Best is trial 1 with value: 7.753055265479657.[0m
[33m[W 2021-08-25 04:51:45,217][0m Trial 3 failed because of the following error: CatBoostError("catboost/private/libs/options/bootstrap_options.cpp:16: Error: bayesian bootstrap doesn't support taken fraction option")
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/optuna/_optimize.py", line 216, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-9-02d592099811>", line 51, in objective
    verbose=VERBOSE,
  File "/opt/conda/lib/python3.7/site-packages/catboost/core.py", line 5259, in fit
    save_snapshot, snapshot_file, snapshot_interval, init_model, callbacks, log_cout, log_cerr)
  Fil

CatBoostError: catboost/private/libs/options/bootstrap_options.cpp:16: Error: bayesian bootstrap doesn't support taken fraction option

In [None]:
study.best_params

# Log

7.768172118850735 row-wise noise ver4

7.776578973194598 random noise ver5

7.757181558352988 no noise ver7

7.756073089003697 no noise ver8 (narrow space)

7.756000258000434 no noise ver9 (narrow space) final

============= No GPU ================



