In [1]:
# Familiar imports
import numpy as np
import pandas as pd
import random
import os
import time
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error

#import lightgbm as lgb
import xgboost as xgb
#import catboost as ctb

import warnings
warnings.simplefilter('ignore')

# Parameters

In [2]:
# Name of the regression target column.
target = 'loss'

# Flip to True for a quick smoke run with minimal training budgets.
DEBUG = False

# Both modes use the same seeds, so they are assigned once up front.
SEED = 17
CVSEED = 17

if DEBUG:
    N_SPLITS, N_ESTIMATORS = 2, 1
    EARLY_STOPPING_ROUNDS, VERBOSE = 1, 100
else:
    N_SPLITS, N_ESTIMATORS = 10, 10000
    EARLY_STOPPING_ROUNDS, VERBOSE = 200, 1000

In [3]:
def set_seed(seed=17):
    """Seed the global random number generators used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both Python's ``random`` module and NumPy's legacy global RNG.

    Parameters
    ----------
    seed : int, default 17
        Value passed to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Seed Python's `random` and NumPy's global RNG with the configured SEED.
set_seed(SEED)

# Load data

In [4]:
# Directory holding the competition CSV files.
INPUT = Path("../input/tabular-playground-series-aug-2021")

# Read the three competition tables in one pass, in a fixed order.
train, test, submission = (
    pd.read_csv(INPUT / csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# Preprocessing

In [5]:
# Every test column whose name contains the letter 'f' is treated as a feature.
scale_features = [name for name in test.columns if 'f' in name]

# Fit the scaler on the training rows only, then apply the same fitted
# transform to both frames (the feature columns are overwritten in place).
ss = StandardScaler().fit(train[scale_features])
train[scale_features] = ss.transform(train[scale_features])
test[scale_features] = ss.transform(test[scale_features])

In [6]:
# Swap noise

# Random
def apply_noise_rn(df, p=.75):
    """Swap noise with an independent keep/replace decision for every cell.

    Each cell is kept with probability ``p``. Otherwise it is replaced by the
    value at the same position in a row-shuffled copy of ``df``, i.e. a value
    from the same column of a randomly chosen row (possibly the same row).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to corrupt; it is not modified.
    p : float, default 0.75
        Probability that a cell keeps its original value.

    Returns
    -------
    pandas.DataFrame
        New frame with the same shape, index and columns as ``df``.
    """
    keep_flags = np.random.binomial(1, p, df.shape)  # 1 = keep, 0 = replace
    shuffled_rows = np.random.permutation(df)
    keep_mask = keep_flags == 1
    return df.where(keep_mask, shuffled_rows)

# Row-wise
def apply_noise_row(df, p=.75):
    """Swap noise that keeps a fixed number of cells in every row.

    In each row exactly ``int(p * n_columns)`` randomly chosen cells keep
    their value. Every other cell is replaced by the value at the same
    position in a row-shuffled copy of ``df``, i.e. a value from the same
    column of a randomly chosen row (possibly the same row).

    The keep-mask is produced by one vectorised draw rather than a Python
    loop over every row, so the cost no longer scales with interpreter-level
    iterations over ``rows x columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to corrupt; it is not modified.
    p : float, default 0.75
        Fraction of the columns kept unchanged in each row.

    Returns
    -------
    pandas.DataFrame
        New frame with the same shape, index and columns as ``df``.

    Raises
    ------
    ValueError
        If ``int(p * n_columns)`` is negative or exceeds the column count.
    """
    n_rows, n_cols = df.shape
    n_keep = int(p * n_cols)
    if not 0 <= n_keep <= n_cols:
        raise ValueError(
            f"p={p!r} asks to keep {n_keep} of {n_cols} columns per row"
        )

    # argsort of i.i.d. uniforms is an independent random permutation of the
    # column indices for each row; the entries holding a value below n_keep
    # mark exactly n_keep uniformly chosen positions per row.
    col_ranks = np.random.rand(n_rows, n_cols).argsort(axis=1)
    should_not_swap = col_ranks < n_keep

    return df.where(should_not_swap, np.random.permutation(df))

# Pseudo Label

In [7]:
# Predictions from an earlier submission serve as pseudo labels for the test
# rows. NOTE(review): assumes that file lists rows in the same order as
# `test` -- confirm.
pseudo = pd.read_csv(
    "../input/tps-aug-2021-lgbm-xgb-catboost/submission.csv"
)[target]

# Attach the pseudo labels as the target column of the test frame.
test_pseudo = pd.concat([test, pseudo], axis=1)

# Stack real training rows first, pseudo-labelled test rows after them, and
# renumber the index so that labels below len(train) identify real rows.
all_pseudo = pd.concat([train, test_pseudo], ignore_index=True)

# Optuna

In [8]:
# Optuna for parameter search
!pip install -q optuna

import optuna
import pickle



In [9]:
# Search a narrow window of XGBoost hyper-parameters; the learning rate is
# tuned together with the tree and regularisation parameters.
def objective(trial, X=all_pseudo[scale_features], y=all_pseudo[target]):
    """Optuna objective: hold-out RMSE of a seed-averaged XGBoost regressor.

    ``X``/``y`` are split 80/20 with ``CVSEED``. The models are trained on the
    whole 80% part (which, with the default arguments, includes
    pseudo-labelled test rows) but are scored only on held-out rows whose
    index label is below ``train.shape[0]``, i.e. rows that carry a real
    target. One model is fitted per seed in ``[SEED, SEED + 1]`` and their
    predictions are averaged before the RMSE is computed.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    X : pandas.DataFrame
        Feature frame. Its index labels must follow the ``all_pseudo``
        convention (real training rows first, labelled ``0..len(train)-1``).
    y : pandas.Series
        Target aligned with ``X``.

    Returns
    -------
    float
        RMSE of the seed-averaged predictions on the real held-out rows.
    """
    param_space = {
        'objective': 'reg:squarederror',
        'learning_rate': trial.suggest_float('learning_rate', 3.119e-3, 5.119e-3),
        'n_estimators': N_ESTIMATORS,
        'subsample': trial.suggest_float('subsample', 0.68, 0.78),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4619, 0.5619),
        'max_depth': trial.suggest_int('max_depth', 13, 17),
        'lambda': trial.suggest_float('lambda', 9.93, 14.93),
        'alpha': trial.suggest_float('alpha', 21.9, 26.9),
        'min_child_weight': trial.suggest_float('min_child_weight', 340, 440),
        'importance_type': 'total_gain',
        'tree_method': 'hist',
    }
    seed_list = [SEED, SEED + 1]

    # Use the arguments that were passed in rather than re-reading the global
    # frame, so a caller-supplied X/y is actually honoured.
    X_train, X_va, y_train, y_va = train_test_split(
        X, y, test_size=.2, random_state=CVSEED
    )

    # Score only on rows with a real target: pseudo-labelled rows sit at
    # index labels >= len(train) and are dropped from the validation part.
    has_real_target = np.asarray(X_va.index < train.shape[0])
    X_valid, y_valid = X_va.loc[has_real_target], y_va.loc[has_real_target]

    oof_pred = np.zeros(len(X_valid))
    for inseed in seed_list:
        param_space['seed'] = inseed

        model = xgb.XGBRegressor(**param_space)
        # NOTE(review): passing `eval_metric` / `early_stopping_rounds` to
        # `fit` is deprecated in newer XGBoost releases (they belong on the
        # constructor there) -- confirm against the installed version before
        # upgrading.
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_valid, y_valid)],
            eval_metric='rmse',
            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            verbose=VERBOSE,
        )

        oof_pred += model.predict(X_valid) / len(seed_list)

    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn versions
    # deprecate; the value is the same.
    return float(np.sqrt(mean_squared_error(y_valid, oof_pred)))

In [10]:
# Trial budget; DEBUG keeps the search to a quick smoke run.
N_TRIALS = 2 if DEBUG else 30

# Seed the sampler explicitly: Optuna's TPE sampler (the default for a
# single-objective study) draws from its own RNG, so `set_seed` alone does not
# make the suggested parameters repeatable between runs.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)

# NOTE(review): the output recorded for this cell stops part-way through the
# tenth trial, so the full budget apparently did not complete in that session.
# Consider passing `timeout=<seconds>` here to end the search gracefully.
study.optimize(objective, n_trials=N_TRIALS)

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2021-08-22 01:10:16,294][0m A new study created in memory with name: no-name-bb591b35-2782-491f-b69b-85645241a7d9[0m


[0]	validation_0-rmse:10.03166


[1000]	validation_0-rmse:7.77885


[2000]	validation_0-rmse:7.76123


[3000]	validation_0-rmse:7.75428


[4000]	validation_0-rmse:7.75108


[5000]	validation_0-rmse:7.74935


[6000]	validation_0-rmse:7.74820


[6354]	validation_0-rmse:7.74805


[0]	validation_0-rmse:10.03167


[1000]	validation_0-rmse:7.77845


[2000]	validation_0-rmse:7.76076


[3000]	validation_0-rmse:7.75437


[4000]	validation_0-rmse:7.75100


[4923]	validation_0-rmse:7.74968


[32m[I 2021-08-22 01:51:48,243][0m Trial 0 finished with value: 7.747616592824789 and parameters: {'learning_rate': 0.004533207635022494, 'subsample': 0.7288284366111589, 'colsample_bytree': 0.46878160112634343, 'max_depth': 14, 'lambda': 12.348038900734842, 'alpha': 22.676261890480408, 'min_child_weight': 413.031771067915}. Best is trial 0 with value: 7.747616592824789.[0m


[0]	validation_0-rmse:10.03195


[1000]	validation_0-rmse:7.77545


[2000]	validation_0-rmse:7.75859


[3000]	validation_0-rmse:7.75195


[4000]	validation_0-rmse:7.74846


[5000]	validation_0-rmse:7.74638


[6000]	validation_0-rmse:7.74555


[6009]	validation_0-rmse:7.74553


[0]	validation_0-rmse:10.03196


[1000]	validation_0-rmse:7.77558


[2000]	validation_0-rmse:7.75881


[3000]	validation_0-rmse:7.75209


[4000]	validation_0-rmse:7.74866


[5000]	validation_0-rmse:7.74698


[5468]	validation_0-rmse:7.74659


[32m[I 2021-08-22 02:42:03,080][0m Trial 1 finished with value: 7.744636762159451 and parameters: {'learning_rate': 0.004456183434811274, 'subsample': 0.7193486774183349, 'colsample_bytree': 0.4976929879139919, 'max_depth': 16, 'lambda': 11.052431489457067, 'alpha': 25.908000014344125, 'min_child_weight': 342.6638701552546}. Best is trial 1 with value: 7.744636762159451.[0m


[0]	validation_0-rmse:10.03609


[1000]	validation_0-rmse:7.78748


[2000]	validation_0-rmse:7.76613


[3000]	validation_0-rmse:7.75809


[4000]	validation_0-rmse:7.75357


[5000]	validation_0-rmse:7.75103


[6000]	validation_0-rmse:7.74943


[7000]	validation_0-rmse:7.74823


[8000]	validation_0-rmse:7.74725


[8437]	validation_0-rmse:7.74726


[0]	validation_0-rmse:10.03612


[1000]	validation_0-rmse:7.78773


[2000]	validation_0-rmse:7.76561


[3000]	validation_0-rmse:7.75767


[4000]	validation_0-rmse:7.75262


[5000]	validation_0-rmse:7.75034


[6000]	validation_0-rmse:7.74846


[7000]	validation_0-rmse:7.74747


[8000]	validation_0-rmse:7.74653


[8914]	validation_0-rmse:7.74618


[32m[I 2021-08-22 03:54:48,831][0m Trial 2 finished with value: 7.745701703739794 and parameters: {'learning_rate': 0.003393192649830596, 'subsample': 0.7740307457039776, 'colsample_bytree': 0.4991968746700697, 'max_depth': 15, 'lambda': 14.063110091000771, 'alpha': 23.829507481182986, 'min_child_weight': 363.21842089435336}. Best is trial 1 with value: 7.744636762159451.[0m


[0]	validation_0-rmse:10.03510


[1000]	validation_0-rmse:7.78620


[2000]	validation_0-rmse:7.76609


[3000]	validation_0-rmse:7.75749


[4000]	validation_0-rmse:7.75298


[5000]	validation_0-rmse:7.74992


[6000]	validation_0-rmse:7.74777


[7000]	validation_0-rmse:7.74617


[7394]	validation_0-rmse:7.74589


[0]	validation_0-rmse:10.03510


[1000]	validation_0-rmse:7.78682


[2000]	validation_0-rmse:7.76635


[3000]	validation_0-rmse:7.75770


[4000]	validation_0-rmse:7.75321


[5000]	validation_0-rmse:7.75008


[6000]	validation_0-rmse:7.74815


[7000]	validation_0-rmse:7.74650


[8000]	validation_0-rmse:7.74538


[8322]	validation_0-rmse:7.74551


[32m[I 2021-08-22 04:49:51,380][0m Trial 3 finished with value: 7.744684653607536 and parameters: {'learning_rate': 0.0036494365074162894, 'subsample': 0.6968528068919326, 'colsample_bytree': 0.5017392762988903, 'max_depth': 13, 'lambda': 12.014148927345573, 'alpha': 26.551525126401565, 'min_child_weight': 413.1370489845744}. Best is trial 1 with value: 7.744636762159451.[0m


[0]	validation_0-rmse:10.03428


[1000]	validation_0-rmse:7.78075


[2000]	validation_0-rmse:7.76185


[3000]	validation_0-rmse:7.75436


[4000]	validation_0-rmse:7.74965


[5000]	validation_0-rmse:7.74772


[6000]	validation_0-rmse:7.74601


[7000]	validation_0-rmse:7.74533


[7241]	validation_0-rmse:7.74537


[0]	validation_0-rmse:10.03429


[1000]	validation_0-rmse:7.78123


[2000]	validation_0-rmse:7.76206


[3000]	validation_0-rmse:7.75470


[4000]	validation_0-rmse:7.75024


[5000]	validation_0-rmse:7.74801


[6000]	validation_0-rmse:7.74691


[6412]	validation_0-rmse:7.74659


[32m[I 2021-08-22 05:48:56,002][0m Trial 4 finished with value: 7.744758133482364 and parameters: {'learning_rate': 0.003851837900858955, 'subsample': 0.7769026590169555, 'colsample_bytree': 0.5135887621924576, 'max_depth': 15, 'lambda': 13.482626488291867, 'alpha': 24.81159836156926, 'min_child_weight': 351.6110925902816}. Best is trial 1 with value: 7.744636762159451.[0m


[0]	validation_0-rmse:10.03187


[1000]	validation_0-rmse:7.77605


[2000]	validation_0-rmse:7.75912


[3000]	validation_0-rmse:7.75273


[4000]	validation_0-rmse:7.74920


[5000]	validation_0-rmse:7.74725


[5668]	validation_0-rmse:7.74683


[0]	validation_0-rmse:10.03192


[1000]	validation_0-rmse:7.77648


[2000]	validation_0-rmse:7.75888


[3000]	validation_0-rmse:7.75190


[4000]	validation_0-rmse:7.74867


[5000]	validation_0-rmse:7.74706


[6000]	validation_0-rmse:7.74625


[6222]	validation_0-rmse:7.74641


[32m[I 2021-08-22 06:40:50,256][0m Trial 5 finished with value: 7.745076731997948 and parameters: {'learning_rate': 0.004472710564880304, 'subsample': 0.7377777934016643, 'colsample_bytree': 0.5165340342134916, 'max_depth': 17, 'lambda': 13.356730564725027, 'alpha': 24.70602951070347, 'min_child_weight': 386.88810854717013}. Best is trial 1 with value: 7.744636762159451.[0m


[0]	validation_0-rmse:10.03290


[1000]	validation_0-rmse:7.78165


[2000]	validation_0-rmse:7.76332


[3000]	validation_0-rmse:7.75585


[4000]	validation_0-rmse:7.75187


[5000]	validation_0-rmse:7.74956


[5727]	validation_0-rmse:7.74874


[0]	validation_0-rmse:10.03290


[1000]	validation_0-rmse:7.78147


[2000]	validation_0-rmse:7.76253


[3000]	validation_0-rmse:7.75482


[4000]	validation_0-rmse:7.75123


[5000]	validation_0-rmse:7.74876


[6000]	validation_0-rmse:7.74764


[6141]	validation_0-rmse:7.74766


[32m[I 2021-08-22 07:22:53,591][0m Trial 6 finished with value: 7.747278287192206 and parameters: {'learning_rate': 0.004213521164285296, 'subsample': 0.7534892661208958, 'colsample_bytree': 0.5007384452699637, 'max_depth': 13, 'lambda': 13.230835280732496, 'alpha': 26.52594469906762, 'min_child_weight': 420.0452448735341}. Best is trial 1 with value: 7.744636762159451.[0m


[0]	validation_0-rmse:10.03324


[1000]	validation_0-rmse:7.78224


[2000]	validation_0-rmse:7.76402


[3000]	validation_0-rmse:7.75633


[4000]	validation_0-rmse:7.75217


[5000]	validation_0-rmse:7.74945


[6000]	validation_0-rmse:7.74791


[7000]	validation_0-rmse:7.74704


[7617]	validation_0-rmse:7.74655


[0]	validation_0-rmse:10.03324


[1000]	validation_0-rmse:7.78220


[2000]	validation_0-rmse:7.76313


[3000]	validation_0-rmse:7.75550


[4000]	validation_0-rmse:7.75138


[5000]	validation_0-rmse:7.74890


[6000]	validation_0-rmse:7.74690


[6672]	validation_0-rmse:7.74649


[32m[I 2021-08-22 08:11:46,824][0m Trial 7 finished with value: 7.745453792630712 and parameters: {'learning_rate': 0.004119411950261611, 'subsample': 0.7027387489637449, 'colsample_bytree': 0.47318590037457403, 'max_depth': 13, 'lambda': 11.590808047486455, 'alpha': 25.3198087625169, 'min_child_weight': 401.26652828899836}. Best is trial 1 with value: 7.744636762159451.[0m


[0]	validation_0-rmse:10.03590


[1000]	validation_0-rmse:7.78807


[2000]	validation_0-rmse:7.76666


[3000]	validation_0-rmse:7.75814


[4000]	validation_0-rmse:7.75366


[5000]	validation_0-rmse:7.75063


[6000]	validation_0-rmse:7.74845


[7000]	validation_0-rmse:7.74678


[8000]	validation_0-rmse:7.74581


[8565]	validation_0-rmse:7.74575


[0]	validation_0-rmse:10.03592


[1000]	validation_0-rmse:7.78749


[2000]	validation_0-rmse:7.76554


[3000]	validation_0-rmse:7.75690


[4000]	validation_0-rmse:7.75225


[5000]	validation_0-rmse:7.74974


[6000]	validation_0-rmse:7.74800


[6216]	validation_0-rmse:7.74800


[32m[I 2021-08-22 09:07:33,433][0m Trial 8 finished with value: 7.745802588423409 and parameters: {'learning_rate': 0.0034460046654902037, 'subsample': 0.7343022070706486, 'colsample_bytree': 0.5058997094174641, 'max_depth': 14, 'lambda': 12.4375113988764, 'alpha': 25.818299282492667, 'min_child_weight': 422.13880456967246}. Best is trial 1 with value: 7.744636762159451.[0m


[0]	validation_0-rmse:10.03637


[1000]	validation_0-rmse:7.78654


[2000]	validation_0-rmse:7.76435


[3000]	validation_0-rmse:7.75627


[4000]	validation_0-rmse:7.75154


[5000]	validation_0-rmse:7.74928


[6000]	validation_0-rmse:7.74723


[6689]	validation_0-rmse:7.74696


[0]	validation_0-rmse:10.03637


[1000]	validation_0-rmse:7.78658


[2000]	validation_0-rmse:7.76437


[3000]	validation_0-rmse:7.75666


[4000]	validation_0-rmse:7.75236


[5000]	validation_0-rmse:7.75032


[6000]	validation_0-rmse:7.74860


[7000]	validation_0-rmse:7.74759


In [None]:
# Display the parameter values of the study's best trial.
study.best_params

# Log

7.758238638554282 row-wise noise ver1

7.758171791679289 random noise ver4

7.745837750406564 no noise ver5

7.744947212695728 no noise ver7 (narrow space)