In [1]:
# Familiar imports
import numpy as np
import pandas as pd
import random
import os
import time
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error

#import lightgbm as lgb
import xgboost as xgb
#import catboost as ctb

import warnings
warnings.simplefilter('ignore')

# Parameters

In [2]:
target = 'loss'

DEBUG = False

if DEBUG:
    N_ESTIMATORS = 1
    N_SPLITS = 2
    SEED = 17
    CVSEED = 17
    EARLY_STOPPING_ROUNDS = 1
    VERBOSE = 100
    #N_ITERS = 2
else:
    N_SPLITS = 10
    N_ESTIMATORS = 10000
    EARLY_STOPPING_ROUNDS = 300
    VERBOSE = 1000
    SEED = 17
    CVSEED = 17
    #N_ITERS = 10

In [3]:
def set_seed(seed=17):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
set_seed(SEED)

# Load data

In [4]:
INPUT = Path("../input/tabular-playground-series-aug-2021")

train = pd.read_csv(INPUT / "train.csv")
test = pd.read_csv(INPUT / "test.csv")
submission = pd.read_csv(INPUT / "sample_submission.csv")

# Preprocessing

In [5]:
scale_features = [col for col in test.columns if 'f' in col]

ss = StandardScaler()
train[scale_features] = ss.fit_transform(train[scale_features])
test[scale_features] = ss.transform(test[scale_features])

In [6]:
# Swap noise

# Random
def apply_noise_rn(df, p=.75):
    should_not_swap = np.random.binomial(1, p, df.shape)
    corrupted_df = df.where(should_not_swap == 1, np.random.permutation(df))
    return corrupted_df

# Row-wise
def apply_noise_row(df, p=.75):
    should_not_swap = np.zeros(df.shape)
    for i in range(df.shape[0]):
        for j in np.random.choice(df.shape[1],int(p*df.shape[1]),replace=False):
            should_not_swap[i,j]=1 
    corrupted_df = df.where(should_not_swap == 1, np.random.permutation(df))
    return corrupted_df

# Pseudo Label

In [7]:
pseudo = pd.read_csv("../input/blending-tool-tps-aug-2021/file1_7.84996_file2_7.85000_blend.csv")[target]
test_pseudo = pd.concat([test, pseudo], axis=1)
all_pseudo = pd.concat([train, test_pseudo]).reset_index(drop=True)

# Optuna

In [8]:
# Optuna for parameter search
!pip install -q optuna

import optuna
import pickle



In [9]:
# for the fixed learning rate, use the opt n iterations and tune the tree hyperparameters
def objective(trial, X=all_pseudo[scale_features], y=all_pseudo[target]):
  """
  """
  param_space = {
                  'objective': 'reg:squarederror',              
                  'n_estimators':N_ESTIMATORS,  
                  'importance_type': 'total_gain',      
                  'tree_method': 'hist',   
                  'booster': 'gbtree',
                  'n_jobs':-1,
     
                  'learning_rate':trial.suggest_uniform('learning_rate', 3.5e-3, 8.5e-3), 
                  'subsample': trial.suggest_uniform('subsample', 0.667, 0.867),        
                  'colsample_bytree':trial.suggest_uniform('colsample_bytree', 0.499, 0.699),       
                  'max_depth':trial.suggest_int('max_depth', 14, 22),           
                  'lambda':trial.suggest_uniform('lambda', 0.97, 10.97),           
                  'alpha':trial.suggest_uniform('alpha', 24.5, 34.5),         
                  'min_child_weight':trial.suggest_uniform('min_child_weight', 323, 523)  
                }
            

  seed_list=[SEED, SEED+1]
  #kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=CVSEED)
  xgb_oof = np.zeros(train.shape[0])
  #lgb_pred = np.zeros(test.shape[0])
  X_tr, X_va, y_tr, y_va = train_test_split(X,y,test_size=.2,random_state=CVSEED)
  trn_idx = X_tr.index
  val_idx = X_va.index
  
  
  #for (trn_idx, val_idx) in enumerate(kf.split(X, y)):
  #print(f"===== fold {fold} =====")
  oof_idx = np.array([idx for idx in val_idx if idx < train.shape[0]])
  preds_idx = np.array([idx for idx in val_idx if idx >= train.shape[0]])

  X_train, y_train = all_pseudo[scale_features].iloc[trn_idx], all_pseudo[target].iloc[trn_idx]
  #X_train = apply_noise_rn(X_train)

  X_valid, y_valid = all_pseudo[scale_features].iloc[oof_idx], all_pseudo[target].iloc[oof_idx]
  #X_test = all_pseudo[scale_features].iloc[preds_idx]

  #start = time.time()
  for inseed in seed_list:
    param_space['seed'] = inseed

    model = xgb.XGBRegressor(**param_space)
    model.fit(
        X_train, 
        y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='rmse',
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        verbose=VERBOSE,
    )


    xgb_oof[oof_idx] += model.predict(X_valid) / len(seed_list)
    #lgb_pred[preds_idx-train.shape[0]] += model.predict(X_test) / len(seed_list)

  #elapsed = time.time() - start
  rmse = mean_squared_error(y_valid, xgb_oof[oof_idx], squared=False)
  #print(f"fold {fold} - lgb rmse: {rmse:.6f}, elapsed time: {elapsed:.2f}sec\n")            
  
  #print(f"oof lgb_rmse = {mean_squared_error(train[target], lgb_oof, squared=False)}")

  
  return rmse

In [10]:
study = optuna.create_study(direction='minimize')
study.optimize(objective,n_trials= 30)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2021-08-30 08:27:44,485][0m A new study created in memory with name: no-name-2363d217-260a-4fdf-b2a6-7c4e5283f4e5[0m


[0]	validation_0-rmse:10.02121


[1000]	validation_0-rmse:7.76142


[2000]	validation_0-rmse:7.75222


[3000]	validation_0-rmse:7.74884


[3752]	validation_0-rmse:7.74863


[0]	validation_0-rmse:10.02130


[1000]	validation_0-rmse:7.76224


[2000]	validation_0-rmse:7.75296


[3000]	validation_0-rmse:7.75009


[3087]	validation_0-rmse:7.75025


[32m[I 2021-08-30 08:56:53,129][0m Trial 0 finished with value: 7.747334274490658 and parameters: {'learning_rate': 0.007168761726094947, 'subsample': 0.7538174664175544, 'colsample_bytree': 0.61295836339311, 'max_depth': 17, 'lambda': 1.8126587567692944, 'alpha': 32.807085563877365, 'min_child_weight': 452.40642196204703}. Best is trial 0 with value: 7.747334274490658.[0m


[0]	validation_0-rmse:10.03211


[1000]	validation_0-rmse:7.77285


[2000]	validation_0-rmse:7.75801


[3000]	validation_0-rmse:7.75247


[4000]	validation_0-rmse:7.74880


[5000]	validation_0-rmse:7.74721


[6000]	validation_0-rmse:7.74651


[7000]	validation_0-rmse:7.74591


[7529]	validation_0-rmse:7.74616


[0]	validation_0-rmse:10.03216


[1000]	validation_0-rmse:7.77379


[2000]	validation_0-rmse:7.75870


[3000]	validation_0-rmse:7.75274


[4000]	validation_0-rmse:7.74988


[5000]	validation_0-rmse:7.74858


[5564]	validation_0-rmse:7.74819


[32m[I 2021-08-30 09:56:50,585][0m Trial 1 finished with value: 7.745365670110334 and parameters: {'learning_rate': 0.004403077863993593, 'subsample': 0.7854414243901037, 'colsample_bytree': 0.6732533740765063, 'max_depth': 15, 'lambda': 3.6829220489150662, 'alpha': 32.16158630492214, 'min_child_weight': 338.699432756783}. Best is trial 1 with value: 7.745365670110334.[0m


[0]	validation_0-rmse:10.02709


[1000]	validation_0-rmse:7.76586


[2000]	validation_0-rmse:7.75299


[3000]	validation_0-rmse:7.74905


[3770]	validation_0-rmse:7.74852


[0]	validation_0-rmse:10.02711


[1000]	validation_0-rmse:7.76619


[2000]	validation_0-rmse:7.75307


[3000]	validation_0-rmse:7.74808


[4000]	validation_0-rmse:7.74625


[4310]	validation_0-rmse:7.74658


[32m[I 2021-08-30 10:33:45,241][0m Trial 2 finished with value: 7.745404534724204 and parameters: {'learning_rate': 0.005690425514715098, 'subsample': 0.7231636950516521, 'colsample_bytree': 0.5054992762390372, 'max_depth': 19, 'lambda': 3.070630537264991, 'alpha': 27.604315348845937, 'min_child_weight': 343.2082402003427}. Best is trial 1 with value: 7.745365670110334.[0m


[0]	validation_0-rmse:10.01805


[1000]	validation_0-rmse:7.76051


[2000]	validation_0-rmse:7.75131


[3000]	validation_0-rmse:7.74993


[3272]	validation_0-rmse:7.75023


[0]	validation_0-rmse:10.01807


[1000]	validation_0-rmse:7.76056


[2000]	validation_0-rmse:7.75122


[3000]	validation_0-rmse:7.74932


[3281]	validation_0-rmse:7.74945


[32m[I 2021-08-30 11:04:00,691][0m Trial 3 finished with value: 7.747006907064253 and parameters: {'learning_rate': 0.007974865247211147, 'subsample': 0.7342145800583244, 'colsample_bytree': 0.6305051286728602, 'max_depth': 18, 'lambda': 8.334462550821085, 'alpha': 25.33307251132437, 'min_child_weight': 372.46009340411183}. Best is trial 1 with value: 7.745365670110334.[0m


[0]	validation_0-rmse:10.03014


[1000]	validation_0-rmse:7.76776


[2000]	validation_0-rmse:7.75412


[3000]	validation_0-rmse:7.74913


[4000]	validation_0-rmse:7.74756


[5000]	validation_0-rmse:7.74667


[5745]	validation_0-rmse:7.74644


[0]	validation_0-rmse:10.03023


[1000]	validation_0-rmse:7.76827


[2000]	validation_0-rmse:7.75417


[3000]	validation_0-rmse:7.74996


[3402]	validation_0-rmse:7.74980


[32m[I 2021-08-30 11:47:20,547][0m Trial 4 finished with value: 7.745901799003799 and parameters: {'learning_rate': 0.004900773239012929, 'subsample': 0.7160856910250786, 'colsample_bytree': 0.6677913892618419, 'max_depth': 21, 'lambda': 5.325207146864641, 'alpha': 27.85780141205624, 'min_child_weight': 394.9736848104902}. Best is trial 1 with value: 7.745365670110334.[0m


[0]	validation_0-rmse:10.01675


[1000]	validation_0-rmse:7.75767


[2000]	validation_0-rmse:7.75057


[3000]	validation_0-rmse:7.74834


[3308]	validation_0-rmse:7.74877


[0]	validation_0-rmse:10.01678


[1000]	validation_0-rmse:7.75947


[2000]	validation_0-rmse:7.75096


[2238]	validation_0-rmse:7.75081


[32m[I 2021-08-30 12:17:24,475][0m Trial 5 finished with value: 7.746615254511395 and parameters: {'learning_rate': 0.008311674302505307, 'subsample': 0.8076674835738886, 'colsample_bytree': 0.5147474137321244, 'max_depth': 22, 'lambda': 10.159638850596705, 'alpha': 32.08071851807198, 'min_child_weight': 333.84182202059026}. Best is trial 1 with value: 7.745365670110334.[0m


[0]	validation_0-rmse:10.01792


[1000]	validation_0-rmse:7.76119


[2000]	validation_0-rmse:7.75149


[3000]	validation_0-rmse:7.74822


[4000]	validation_0-rmse:7.74701


[4396]	validation_0-rmse:7.74721


[0]	validation_0-rmse:10.01797


[1000]	validation_0-rmse:7.76195


[2000]	validation_0-rmse:7.75163


[3000]	validation_0-rmse:7.74899


[3907]	validation_0-rmse:7.74822


[32m[I 2021-08-30 12:47:50,070][0m Trial 6 finished with value: 7.745431104776354 and parameters: {'learning_rate': 0.008004283015946835, 'subsample': 0.8005614868603719, 'colsample_bytree': 0.568049363494123, 'max_depth': 14, 'lambda': 7.7274772666849465, 'alpha': 34.43281514763645, 'min_child_weight': 476.8083236676259}. Best is trial 1 with value: 7.745365670110334.[0m


[0]	validation_0-rmse:10.02887


[1000]	validation_0-rmse:7.77165


[2000]	validation_0-rmse:7.75761


[3000]	validation_0-rmse:7.75206


[4000]	validation_0-rmse:7.74913


[5000]	validation_0-rmse:7.74761


[6000]	validation_0-rmse:7.74671


[6465]	validation_0-rmse:7.74635


[0]	validation_0-rmse:10.02893


[1000]	validation_0-rmse:7.77249


[2000]	validation_0-rmse:7.75793


[3000]	validation_0-rmse:7.75294


[4000]	validation_0-rmse:7.75031


[5000]	validation_0-rmse:7.74869


[5257]	validation_0-rmse:7.74867


[32m[I 2021-08-30 13:28:01,282][0m Trial 7 finished with value: 7.74619529139142 and parameters: {'learning_rate': 0.005229873670200594, 'subsample': 0.8190145369436002, 'colsample_bytree': 0.5111553230820959, 'max_depth': 14, 'lambda': 4.866859193256249, 'alpha': 28.028462831351447, 'min_child_weight': 507.44293071650304}. Best is trial 1 with value: 7.745365670110334.[0m


[0]	validation_0-rmse:10.02254


[1000]	validation_0-rmse:7.76209


[2000]	validation_0-rmse:7.75132


[2990]	validation_0-rmse:7.74855


[0]	validation_0-rmse:10.02256


[1000]	validation_0-rmse:7.76328


[2000]	validation_0-rmse:7.75336


[3000]	validation_0-rmse:7.75028


[3695]	validation_0-rmse:7.74969


[32m[I 2021-08-30 13:58:29,970][0m Trial 8 finished with value: 7.747171019298125 and parameters: {'learning_rate': 0.006841062501293358, 'subsample': 0.8325229814017416, 'colsample_bytree': 0.549445113986017, 'max_depth': 21, 'lambda': 3.4280184818200263, 'alpha': 31.806884075540996, 'min_child_weight': 464.14956927142725}. Best is trial 1 with value: 7.745365670110334.[0m


[0]	validation_0-rmse:10.02225


[1000]	validation_0-rmse:7.76117


[2000]	validation_0-rmse:7.75185


[3000]	validation_0-rmse:7.74779


[3472]	validation_0-rmse:7.74790


[0]	validation_0-rmse:10.02223


[1000]	validation_0-rmse:7.76091


[2000]	validation_0-rmse:7.75062


[3000]	validation_0-rmse:7.74776


[4000]	validation_0-rmse:7.74646


[5000]	validation_0-rmse:7.74579


[5194]	validation_0-rmse:7.74624


[32m[I 2021-08-30 14:39:08,137][0m Trial 9 finished with value: 7.743730951037691 and parameters: {'learning_rate': 0.006926882143879833, 'subsample': 0.7665578211019899, 'colsample_bytree': 0.551350454117138, 'max_depth': 21, 'lambda': 2.7649143275388735, 'alpha': 26.355021643568286, 'min_child_weight': 417.1915305746212}. Best is trial 9 with value: 7.743730951037691.[0m


[0]	validation_0-rmse:10.02420


[1000]	validation_0-rmse:7.76247


[2000]	validation_0-rmse:7.75205


[3000]	validation_0-rmse:7.74812


[3734]	validation_0-rmse:7.74779


[0]	validation_0-rmse:10.02428


[1000]	validation_0-rmse:7.76344


[2000]	validation_0-rmse:7.75162


[3000]	validation_0-rmse:7.74847


[4000]	validation_0-rmse:7.74763


[4402]	validation_0-rmse:7.74800


[32m[I 2021-08-30 15:14:03,353][0m Trial 10 finished with value: 7.745363079335315 and parameters: {'learning_rate': 0.006424379150478126, 'subsample': 0.6950062059385218, 'colsample_bytree': 0.5653177980504662, 'max_depth': 20, 'lambda': 1.0223840718761803, 'alpha': 24.756454528149565, 'min_child_weight': 410.9889168418383}. Best is trial 9 with value: 7.743730951037691.[0m


[0]	validation_0-rmse:10.02341


[1000]	validation_0-rmse:7.76318


[2000]	validation_0-rmse:7.75285


[3000]	validation_0-rmse:7.74965


[3632]	validation_0-rmse:7.74924


[0]	validation_0-rmse:10.02355


[1000]	validation_0-rmse:7.76329


[2000]	validation_0-rmse:7.75378


[3000]	validation_0-rmse:7.75020


[3350]	validation_0-rmse:7.74987


[32m[I 2021-08-30 15:43:20,465][0m Trial 11 finished with value: 7.747375536337517 and parameters: {'learning_rate': 0.006615854985450881, 'subsample': 0.676616627117251, 'colsample_bytree': 0.5652101729529815, 'max_depth': 20, 'lambda': 1.0370902520148924, 'alpha': 24.65028571157717, 'min_child_weight': 415.95821003754077}. Best is trial 9 with value: 7.743730951037691.[0m


[0]	validation_0-rmse:10.02542


[1000]	validation_0-rmse:7.76279


[2000]	validation_0-rmse:7.75229


[3000]	validation_0-rmse:7.74890


[4000]	validation_0-rmse:7.74720


[4971]	validation_0-rmse:7.74698


[0]	validation_0-rmse:10.02552


[1000]	validation_0-rmse:7.76419


[2000]	validation_0-rmse:7.75250


[3000]	validation_0-rmse:7.74889


[3577]	validation_0-rmse:7.74801


[32m[I 2021-08-30 16:20:57,619][0m Trial 12 finished with value: 7.74501471358232 and parameters: {'learning_rate': 0.006109679024128444, 'subsample': 0.6698046070576467, 'colsample_bytree': 0.5863623370523208, 'max_depth': 22, 'lambda': 2.2342742837080074, 'alpha': 25.71023990518336, 'min_child_weight': 421.8925131088092}. Best is trial 9 with value: 7.743730951037691.[0m


[0]	validation_0-rmse:10.02056


[1000]	validation_0-rmse:7.75989


[2000]	validation_0-rmse:7.75031


[2916]	validation_0-rmse:7.74822


[0]	validation_0-rmse:10.02065


[1000]	validation_0-rmse:7.75956


[2000]	validation_0-rmse:7.75181


[2909]	validation_0-rmse:7.74995


[32m[I 2021-08-30 16:50:51,111][0m Trial 13 finished with value: 7.7471242545274155 and parameters: {'learning_rate': 0.007342087281701791, 'subsample': 0.8620302212868357, 'colsample_bytree': 0.597238203980089, 'max_depth': 22, 'lambda': 2.047812769127256, 'alpha': 26.102082617680942, 'min_child_weight': 440.09290828518414}. Best is trial 9 with value: 7.743730951037691.[0m


[0]	validation_0-rmse:10.02615


[1000]	validation_0-rmse:7.76454


[2000]	validation_0-rmse:7.75187


[3000]	validation_0-rmse:7.74885


[3838]	validation_0-rmse:7.74794


[0]	validation_0-rmse:10.02620


[1000]	validation_0-rmse:7.76512


[2000]	validation_0-rmse:7.75334


[3000]	validation_0-rmse:7.75002


[4000]	validation_0-rmse:7.74834


In [None]:
study.best_params

# Log

7.744636762159451 lv1 best

7.744741749510907 no noise ver3

narrow
