In [1]:
# Familiar imports
import numpy as np
import pandas as pd
import random
import os
import time
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error

#import lightgbm as lgb
import xgboost as xgb
#import catboost as ctb

import warnings
warnings.simplefilter('ignore')

# Parameters

In [2]:
# Name of the regression target column in train.csv.
target = 'loss'

# Flip to True for a quick smoke run with tiny training settings.
DEBUG = False

# Seeds are the same in debug and full runs.
SEED = 17
CVSEED = 17

# Training budget per mode, as
# (N_ESTIMATORS, N_SPLITS, EARLY_STOPPING_ROUNDS, VERBOSE).
N_ESTIMATORS, N_SPLITS, EARLY_STOPPING_ROUNDS, VERBOSE = (
    (1, 2, 1, 100) if DEBUG else (10000, 10, 200, 1000)
)

In [3]:
def set_seed(seed=17):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global RNG, and
    exports ``PYTHONHASHSEED`` into the process environment.

    Note: hash randomization is configured when the interpreter starts, so
    setting ``PYTHONHASHSEED`` here only affects processes launched afterwards.

    Parameters
    ----------
    seed : int, default 17
        Value passed to every seeding call.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Seed the global RNGs once with the configured SEED.
set_seed(SEED)

# Load data

In [4]:
# Directory holding the competition CSV files.
INPUT = Path("../input/tabular-playground-series-aug-2021")

# Read the three CSVs in one pass: training set, test set, submission template.
train, test, submission = (
    pd.read_csv(INPUT / csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# Preprocessing

In [5]:
# Columns to standardize: every test column whose name contains the letter 'f'.
scale_features = [name for name in test.columns if 'f' in name]

# Learn the scaling statistics on the training rows only, then apply the same
# transform to both frames.
ss = StandardScaler()
ss.fit(train[scale_features])
train[scale_features] = ss.transform(train[scale_features])
test[scale_features] = ss.transform(test[scale_features])

In [6]:
# Swap noise

# Random
def apply_noise_rn(df, p=.75):
    """Apply swap noise where every cell is independently kept or replaced.

    Each cell is kept with probability ``p``; otherwise it takes the value at
    the same position in a row-shuffled copy of ``df`` (i.e. a value from the
    same column of some row).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to corrupt; it is not modified.
    p : float, default 0.75
        Probability that a cell keeps its original value.

    Returns
    -------
    pandas.DataFrame
        Corrupted copy with the same shape, index and columns as ``df``.
    """
    # Draw the keep-mask first, then the row shuffle (global NumPy RNG).
    keep_mask = np.random.binomial(1, p, df.shape) == 1
    shuffled_rows = np.random.permutation(df)
    return df.where(keep_mask, shuffled_rows)

# Row-wise
def apply_noise_row(df, p=.75):
    """Apply swap noise with a fixed number of kept cells per row.

    For every row, ``int(p * n_columns)`` columns are drawn without
    replacement and kept; the remaining cells take the value at the same
    position in a row-shuffled copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to corrupt; it is not modified.
    p : float, default 0.75
        Fraction of columns kept in each row (rounded down).

    Returns
    -------
    pandas.DataFrame
        Corrupted copy with the same shape, index and columns as ``df``.
    """
    n_rows, n_cols = df.shape
    n_keep = int(p * n_cols)
    keep_mask = np.zeros((n_rows, n_cols), dtype=bool)
    for row in range(n_rows):
        # One draw per row, in row order, from the global NumPy RNG.
        kept_cols = np.random.choice(n_cols, n_keep, replace=False)
        keep_mask[row, kept_cols] = True
    shuffled_rows = np.random.permutation(df)
    return df.where(keep_mask, shuffled_rows)

# Optuna

In [7]:
# Optuna for parameter search
!pip install -q optuna

import optuna
import pickle



In [8]:
# Optuna objective: tunes the learning rate together with the tree
# hyperparameters; the number of boosting rounds is capped at N_ESTIMATORS and
# chosen by early stopping.
def objective(trial, X=train[scale_features], y=train[target]):
    """Return the hold-out RMSE of a seed-averaged XGBoost regressor.

    The data is split 80/20 with ``random_state=CVSEED``. One ``XGBRegressor``
    is trained per seed in ``[SEED, SEED + 1]`` with early stopping on the
    validation part, the validation predictions are averaged over the seeds,
    and the RMSE of that average is returned.

    Note that the same validation split drives early stopping and produces the
    returned score.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    X : pandas.DataFrame
        Feature matrix; defaults to the scaled training features captured when
        this function is defined.
    y : pandas.Series
        Target values aligned with ``X``.

    Returns
    -------
    float
        RMSE of the seed-averaged predictions on the validation split.
    """
    # Parameter names passed to `trial` are kept unchanged so results stay
    # comparable with earlier studies.
    param_space = {
        'objective': 'reg:squarederror',
        'learning_rate': trial.suggest_float('learning_rate', 0.65e-3, 2.65e-3),
        'n_estimators': N_ESTIMATORS,
        'subsample': trial.suggest_float('subsample', 0.52, 0.62),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.704, 0.904),
        'max_depth': trial.suggest_int('max_depth', 11, 15),
        'lambda': trial.suggest_float('lambda', 6.955, 11.955),
        'alpha': trial.suggest_float('alpha', 3.979, 8.979),
        'min_child_weight': trial.suggest_float('min_child_weight', 116, 216),
        'importance_type': 'total_gain',
        'tree_method': 'hist',
    }

    seed_list = [SEED, SEED + 1]

    X_tr, X_va, y_tr, y_va = train_test_split(
        X, y, test_size=.2, random_state=CVSEED
    )

    # Running mean of the validation predictions over all seeds.
    pred = np.zeros(y_va.shape[0])

    for inseed in seed_list:
        param_space['seed'] = inseed

        model = xgb.XGBRegressor(**param_space)
        # NOTE(review): newer XGBoost releases expect `eval_metric` and
        # `early_stopping_rounds` on the constructor instead of `fit()`;
        # confirm against the installed version before upgrading.
        model.fit(
            X_tr,
            y_tr,
            eval_set=[(X_va, y_va)],
            eval_metric='rmse',
            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            verbose=VERBOSE,
        )

        pred += model.predict(X_va) / len(seed_list)

    # Take the square root explicitly instead of relying on the `squared`
    # keyword, which recent scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(y_va, pred))
    return rmse

In [9]:
# Seed the TPE sampler (Optuna's default sampler for single-objective studies)
# so the sequence of suggested hyperparameters is reproducible, in line with
# the seeding done elsewhere in this notebook.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=30)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2021-08-22 10:57:54,559][0m A new study created in memory with name: no-name-b55e377a-4cc6-434d-82a0-1bb93914ea64[0m


[0]	validation_0-rmse:10.05525


[1000]	validation_0-rmse:8.16670


[2000]	validation_0-rmse:7.84509


[3000]	validation_0-rmse:7.78953


[4000]	validation_0-rmse:7.77543


[5000]	validation_0-rmse:7.76883


[6000]	validation_0-rmse:7.76479


[7000]	validation_0-rmse:7.76170


[8000]	validation_0-rmse:7.75951


[9000]	validation_0-rmse:7.75769


[9999]	validation_0-rmse:7.75659


[0]	validation_0-rmse:10.05531


[1000]	validation_0-rmse:8.16578


[2000]	validation_0-rmse:7.84466


[3000]	validation_0-rmse:7.78897


[4000]	validation_0-rmse:7.77505


[5000]	validation_0-rmse:7.76867


[6000]	validation_0-rmse:7.76471


[7000]	validation_0-rmse:7.76176


[8000]	validation_0-rmse:7.75944


[9000]	validation_0-rmse:7.75798


[9999]	validation_0-rmse:7.75713


[32m[I 2021-08-22 12:07:39,882][0m Trial 0 finished with value: 7.756359590132131 and parameters: {'learning_rate': 0.0009997080219980193, 'subsample': 0.551100816316413, 'colsample_bytree': 0.7897765605184331, 'max_depth': 14, 'lambda': 9.111270479736842, 'alpha': 5.130251596690405, 'min_child_weight': 126.82683780903136}. Best is trial 0 with value: 7.756359590132131.[0m


[0]	validation_0-rmse:10.05375


[1000]	validation_0-rmse:7.97705


[2000]	validation_0-rmse:7.79985


[3000]	validation_0-rmse:7.77732


[4000]	validation_0-rmse:7.76925


[5000]	validation_0-rmse:7.76427


[6000]	validation_0-rmse:7.76080


[7000]	validation_0-rmse:7.75866


[8000]	validation_0-rmse:7.75699


[9000]	validation_0-rmse:7.75578


[9999]	validation_0-rmse:7.75467


[0]	validation_0-rmse:10.05382


[1000]	validation_0-rmse:7.97689


[2000]	validation_0-rmse:7.79917


[3000]	validation_0-rmse:7.77721


[4000]	validation_0-rmse:7.76917


[5000]	validation_0-rmse:7.76385


[6000]	validation_0-rmse:7.76034


[7000]	validation_0-rmse:7.75817


[8000]	validation_0-rmse:7.75634


[9000]	validation_0-rmse:7.75542


[9485]	validation_0-rmse:7.75516


[32m[I 2021-08-22 13:07:22,245][0m Trial 1 finished with value: 7.754181378699815 and parameters: {'learning_rate': 0.0013752146552519868, 'subsample': 0.544803397754841, 'colsample_bytree': 0.8073763477651917, 'max_depth': 12, 'lambda': 11.30432804417353, 'alpha': 4.245082975548115, 'min_child_weight': 116.43026850946228}. Best is trial 1 with value: 7.754181378699815.[0m


[0]	validation_0-rmse:10.05100


[1000]	validation_0-rmse:7.83819


[2000]	validation_0-rmse:7.77405


[3000]	validation_0-rmse:7.76405


[4000]	validation_0-rmse:7.75991


[5000]	validation_0-rmse:7.75728


[6000]	validation_0-rmse:7.75636


[6164]	validation_0-rmse:7.75660


[0]	validation_0-rmse:10.05101


[1000]	validation_0-rmse:7.83837


[2000]	validation_0-rmse:7.77469


[3000]	validation_0-rmse:7.76489


[4000]	validation_0-rmse:7.76010


[5000]	validation_0-rmse:7.75724


[5825]	validation_0-rmse:7.75611


[32m[I 2021-08-22 13:51:40,174][0m Trial 2 finished with value: 7.754948562705274 and parameters: {'learning_rate': 0.0020968269458880853, 'subsample': 0.5305029332594213, 'colsample_bytree': 0.7627713834418416, 'max_depth': 15, 'lambda': 11.243226092419464, 'alpha': 8.643263262767594, 'min_child_weight': 130.16720863134395}. Best is trial 1 with value: 7.754181378699815.[0m


[0]	validation_0-rmse:10.05265


[1000]	validation_0-rmse:7.90205


[2000]	validation_0-rmse:7.78577


[3000]	validation_0-rmse:7.77109


[4000]	validation_0-rmse:7.76474


[5000]	validation_0-rmse:7.76073


[6000]	validation_0-rmse:7.75768


[7000]	validation_0-rmse:7.75606


[7829]	validation_0-rmse:7.75527


[0]	validation_0-rmse:10.05268


[1000]	validation_0-rmse:7.90234


[2000]	validation_0-rmse:7.78620


[3000]	validation_0-rmse:7.77111


[4000]	validation_0-rmse:7.76452


[5000]	validation_0-rmse:7.76047


[6000]	validation_0-rmse:7.75804


[7000]	validation_0-rmse:7.75641


[8000]	validation_0-rmse:7.75496


[9000]	validation_0-rmse:7.75418


[9943]	validation_0-rmse:7.75374


[32m[I 2021-08-22 14:42:19,840][0m Trial 3 finished with value: 7.75344924632747 and parameters: {'learning_rate': 0.0016509612156689498, 'subsample': 0.589707824875318, 'colsample_bytree': 0.7040768212621661, 'max_depth': 13, 'lambda': 10.988708718044395, 'alpha': 8.000101187284184, 'min_child_weight': 152.97060418096973}. Best is trial 3 with value: 7.75344924632747.[0m


[0]	validation_0-rmse:10.05185


[1000]	validation_0-rmse:7.86697


[2000]	validation_0-rmse:7.78332


[3000]	validation_0-rmse:7.77035


[4000]	validation_0-rmse:7.76402


[5000]	validation_0-rmse:7.75985


[6000]	validation_0-rmse:7.75696


[7000]	validation_0-rmse:7.75497


[8000]	validation_0-rmse:7.75390


[9000]	validation_0-rmse:7.75319


[9566]	validation_0-rmse:7.75294


[0]	validation_0-rmse:10.05187


[1000]	validation_0-rmse:7.86797


[2000]	validation_0-rmse:7.78375


[3000]	validation_0-rmse:7.77122


[4000]	validation_0-rmse:7.76455


[5000]	validation_0-rmse:7.76051


[6000]	validation_0-rmse:7.75765


[7000]	validation_0-rmse:7.75629


[8000]	validation_0-rmse:7.75502


[8230]	validation_0-rmse:7.75503


[32m[I 2021-08-22 15:22:52,084][0m Trial 4 finished with value: 7.753198149027122 and parameters: {'learning_rate': 0.0018331925260194747, 'subsample': 0.5999280340279187, 'colsample_bytree': 0.8309679692775827, 'max_depth': 11, 'lambda': 9.484816540848616, 'alpha': 5.787546831727187, 'min_child_weight': 203.58662168918866}. Best is trial 4 with value: 7.753198149027122.[0m


[0]	validation_0-rmse:10.05050


[1000]	validation_0-rmse:7.82552


[2000]	validation_0-rmse:7.77327


[3000]	validation_0-rmse:7.76426


[4000]	validation_0-rmse:7.75986


[5000]	validation_0-rmse:7.75798


[6000]	validation_0-rmse:7.75670


[6378]	validation_0-rmse:7.75657


[0]	validation_0-rmse:10.05049


[1000]	validation_0-rmse:7.82546


[2000]	validation_0-rmse:7.77331


[3000]	validation_0-rmse:7.76367


[4000]	validation_0-rmse:7.75953


[5000]	validation_0-rmse:7.75732


[5408]	validation_0-rmse:7.75725


[32m[I 2021-08-22 15:58:46,496][0m Trial 5 finished with value: 7.755594219963789 and parameters: {'learning_rate': 0.002209176304404727, 'subsample': 0.5490720203482734, 'colsample_bytree': 0.7141044710172328, 'max_depth': 15, 'lambda': 9.02164782126678, 'alpha': 7.430487224086697, 'min_child_weight': 171.33302762756372}. Best is trial 4 with value: 7.753198149027122.[0m


[0]	validation_0-rmse:10.05022


[1000]	validation_0-rmse:7.82600


[2000]	validation_0-rmse:7.77705


[3000]	validation_0-rmse:7.76679


[4000]	validation_0-rmse:7.76163


[5000]	validation_0-rmse:7.75808


[6000]	validation_0-rmse:7.75641


[7000]	validation_0-rmse:7.75530


[7203]	validation_0-rmse:7.75528


[0]	validation_0-rmse:10.05024


[1000]	validation_0-rmse:7.82613


[2000]	validation_0-rmse:7.77763


[3000]	validation_0-rmse:7.76770


[4000]	validation_0-rmse:7.76207


[5000]	validation_0-rmse:7.75909


[6000]	validation_0-rmse:7.75747


[7000]	validation_0-rmse:7.75635


[7887]	validation_0-rmse:7.75585


[32m[I 2021-08-22 16:37:55,546][0m Trial 6 finished with value: 7.754472841950368 and parameters: {'learning_rate': 0.0022477846434354578, 'subsample': 0.5842072878738096, 'colsample_bytree': 0.7731451881514582, 'max_depth': 11, 'lambda': 9.732328055287212, 'alpha': 6.706953889606423, 'min_child_weight': 127.93403595169634}. Best is trial 4 with value: 7.753198149027122.[0m


[0]	validation_0-rmse:10.05363


[1000]	validation_0-rmse:7.96455


[2000]	validation_0-rmse:7.79847


[3000]	validation_0-rmse:7.77756


[4000]	validation_0-rmse:7.76950


[5000]	validation_0-rmse:7.76445


[6000]	validation_0-rmse:7.76109


[7000]	validation_0-rmse:7.75846


[8000]	validation_0-rmse:7.75663


[9000]	validation_0-rmse:7.75555


[9829]	validation_0-rmse:7.75484


[0]	validation_0-rmse:10.05367


[1000]	validation_0-rmse:7.96487


[2000]	validation_0-rmse:7.79855


[3000]	validation_0-rmse:7.77784


[4000]	validation_0-rmse:7.76973


[5000]	validation_0-rmse:7.76484


[6000]	validation_0-rmse:7.76131


[7000]	validation_0-rmse:7.75901


[8000]	validation_0-rmse:7.75746


[9000]	validation_0-rmse:7.75656


[9999]	validation_0-rmse:7.75582


[32m[I 2021-08-22 17:31:18,404][0m Trial 7 finished with value: 7.75470624607062 and parameters: {'learning_rate': 0.0014024651658922538, 'subsample': 0.5360571182581438, 'colsample_bytree': 0.8297852969365358, 'max_depth': 11, 'lambda': 8.391900323894133, 'alpha': 6.913553682439553, 'min_child_weight': 123.3608529923481}. Best is trial 4 with value: 7.753198149027122.[0m


[0]	validation_0-rmse:10.05537


[1000]	validation_0-rmse:8.18350


[2000]	validation_0-rmse:7.85509


[3000]	validation_0-rmse:7.79639


[4000]	validation_0-rmse:7.78143


[5000]	validation_0-rmse:7.77438


[6000]	validation_0-rmse:7.76964


[7000]	validation_0-rmse:7.76595


[8000]	validation_0-rmse:7.76330


[9000]	validation_0-rmse:7.76109


[9999]	validation_0-rmse:7.75932


[0]	validation_0-rmse:10.05539


[1000]	validation_0-rmse:8.18327


[2000]	validation_0-rmse:7.85519


[3000]	validation_0-rmse:7.79677


[4000]	validation_0-rmse:7.78131


[5000]	validation_0-rmse:7.77408


[6000]	validation_0-rmse:7.76935


[7000]	validation_0-rmse:7.76601


[8000]	validation_0-rmse:7.76347


[9000]	validation_0-rmse:7.76121


[9999]	validation_0-rmse:7.75962


[32m[I 2021-08-22 18:27:09,749][0m Trial 8 finished with value: 7.759187414980873 and parameters: {'learning_rate': 0.0009629597709808566, 'subsample': 0.5353994887631974, 'colsample_bytree': 0.7123842569515274, 'max_depth': 11, 'lambda': 7.132313010016075, 'alpha': 4.072725133944426, 'min_child_weight': 149.23364976552293}. Best is trial 4 with value: 7.753198149027122.[0m


[0]	validation_0-rmse:10.04893


[1000]	validation_0-rmse:7.80703


[2000]	validation_0-rmse:7.77344


[3000]	validation_0-rmse:7.76460


[4000]	validation_0-rmse:7.76033


[5000]	validation_0-rmse:7.75782


[6000]	validation_0-rmse:7.75654


[6362]	validation_0-rmse:7.75624


[0]	validation_0-rmse:10.04890


[1000]	validation_0-rmse:7.80705


[2000]	validation_0-rmse:7.77299


[3000]	validation_0-rmse:7.76415


[4000]	validation_0-rmse:7.75965


[5000]	validation_0-rmse:7.75693


[6000]	validation_0-rmse:7.75477


[6182]	validation_0-rmse:7.75504


[32m[I 2021-08-22 18:58:39,722][0m Trial 9 finished with value: 7.7544974847152845 and parameters: {'learning_rate': 0.0025788423011414837, 'subsample': 0.6099594415045345, 'colsample_bytree': 0.8758109331066848, 'max_depth': 11, 'lambda': 9.232728512645036, 'alpha': 5.180356676154741, 'min_child_weight': 200.55080324398966}. Best is trial 4 with value: 7.753198149027122.[0m


[0]	validation_0-rmse:10.05218


[1000]	validation_0-rmse:7.87742


[2000]	validation_0-rmse:7.78273


[3000]	validation_0-rmse:7.77008


[4000]	validation_0-rmse:7.76361


[5000]	validation_0-rmse:7.75993


[6000]	validation_0-rmse:7.75770


[7000]	validation_0-rmse:7.75640


[8000]	validation_0-rmse:7.75577


[8676]	validation_0-rmse:7.75544


[0]	validation_0-rmse:10.05220


[1000]	validation_0-rmse:7.87707


[2000]	validation_0-rmse:7.78181


[3000]	validation_0-rmse:7.76900


[4000]	validation_0-rmse:7.76279


[5000]	validation_0-rmse:7.75845


[6000]	validation_0-rmse:7.75589


[6998]	validation_0-rmse:7.75455


[32m[I 2021-08-22 19:49:23,680][0m Trial 10 finished with value: 7.754114367996892 and parameters: {'learning_rate': 0.0017611202036189068, 'subsample': 0.6192014522206959, 'colsample_bytree': 0.9017795656542079, 'max_depth': 13, 'lambda': 10.33480455799122, 'alpha': 5.644892485497893, 'min_child_weight': 215.39670069031698}. Best is trial 4 with value: 7.753198149027122.[0m


[0]	validation_0-rmse:10.05219


[1000]	validation_0-rmse:7.87856


[2000]	validation_0-rmse:7.78132


In [None]:
# Display the hyperparameters of the best trial recorded by the study.
study.best_params

# Log

7.76776099796762 random noise ver3

7.767604136668598 row-wise noise ver4

7.753515220316349 no noise ver5

7.7532905240188965 no noise ver6 (narrow space)

kfold row-wise noise ver2
7.847220305678095