In [1]:
# Familiar imports
import numpy as np
import pandas as pd
import random
import os
import time
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

#import lightgbm as lgb
#import xgboost as xgb
#import catboost as ctb

import warnings
warnings.simplefilter('ignore')

# Parameters

In [2]:
target = 'loss'

DEBUG = False

if DEBUG:
    N_ESTIMATORS = 1
    N_SPLITS = 2
    SEED = 17
    CVSEED = 17
    EARLY_STOPPING_ROUNDS = 1
    VERBOSE = 100
    #N_ITERS = 2
else:
    N_SPLITS = 10
    N_ESTIMATORS = 500
    EARLY_STOPPING_ROUNDS = 200
    VERBOSE = 1000
    SEED = 17
    CVSEED = 17
    #N_ITERS = 10

In [3]:
def set_seed(seed=17):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
set_seed(SEED)

# Load data

In [4]:
INPUT = Path("../input/tabular-playground-series-aug-2021")

train = pd.read_csv(INPUT / "train.csv")
test = pd.read_csv(INPUT / "test.csv")
submission = pd.read_csv(INPUT / "sample_submission.csv")

In [5]:
train['lgb_pred'] = np.load("../input/tps-aug-2021-lgb/lgb_oof.npy")
test['lgb_pred'] = np.load("../input/tps-aug-2021-lgb/lgb_pred.npy")

train['xgb_pred'] = np.load("../input/tps-aug-2021-xgb/xgb_oof.npy")
test['xgb_pred'] = np.load("../input/tps-aug-2021-xgb/xgb_pred.npy")

train['xgb2_pred'] = np.load("../input/tps-aug-2021-xgb-pseudo2-pred/3xgb_oof.npy")+np.load("../input/tps-aug-2021-xgb-pseudo2-pred/4xgb_oof.npy")
test['xgb2_pred'] = np.load("../input/tps-aug-2021-xgb-pseudo2-pred/3xgb_pred.npy")+np.load("../input/tps-aug-2021-xgb-pseudo2-pred/4xgb_pred.npy")

train['ctb2_pred'] = np.load("../input/tps-aug-2021-catb-pseudo2/ctb_oof.npy")
test['ctb2_pred'] = np.load("../input/tps-aug-2021-catb-pseudo2/ctb_pred.npy")


# Preprocessing

In [6]:
scale_features = [col for col in test.columns if 'f' in col]

ss = StandardScaler()
train[scale_features] = ss.fit_transform(train[scale_features])
test[scale_features] = ss.transform(test[scale_features])

In [7]:
# Swap noise

# Random
def apply_noise_rn(df, p=.75):
    should_not_swap = np.random.binomial(1, p, df.shape)
    corrupted_df = df.where(should_not_swap == 1, np.random.permutation(df))
    return corrupted_df

# Row-wise
def apply_noise_row(df, p=.75):
    should_not_swap = np.zeros(df.shape)
    for i in range(df.shape[0]):
        for j in np.random.choice(df.shape[1],int(p*df.shape[1]),replace=False):
            should_not_swap[i,j]=1 
    corrupted_df = df.where(should_not_swap == 1, np.random.permutation(df))
    return corrupted_df

# Pseudo Label

In [8]:
pseudo = pd.read_csv("../input/blending-tool-tps-aug-2021/blend.csv/0.part")[target]
test_pseudo = pd.concat([test, pseudo], axis=1)
all_pseudo = pd.concat([train, test_pseudo]).reset_index(drop=True)

In [9]:
useful_features = scale_features + ['lgb_pred', 'xgb_pred','xgb2_pred','ctb2_pred']

# Optuna

In [10]:
# Optuna for parameter search
!pip install -q optuna

import optuna
import pickle



In [11]:
# for the fixed learning rate, use the opt n iterations and tune the tree hyperparameters
def objective(trial, X=all_pseudo[useful_features], y=all_pseudo[target]):
  """
  """
  param_space = {
              'n_estimators':N_ESTIMATORS,
                    #'objective': 'regression',
              'criterion':'mse',
                  #'metric':'rmse',
              'n_jobs':-1,
              'bootstrap':True,
              
              'max_depth' : trial.suggest_int("max_depth", 1, 7),
              'max_features' : trial.suggest_float("max_features", 0.1, 1.0),
              'max_samples' : trial.suggest_float("max_samples", 0.1, 1.0),
               'ccp_alpha' : trial.suggest_loguniform("ccp_alpha", 1e-8, 100.0)
               
                }
            

  seed_list=[SEED, SEED+1]
  #kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=CVSEED)
  lgb_oof = np.zeros(train.shape[0])
  #lgb_pred = np.zeros(test.shape[0])
  X_tr, X_va, y_tr, y_va = train_test_split(X,y,test_size=.2,random_state=CVSEED)
  trn_idx = X_tr.index
  val_idx = X_va.index
  
  
  #for (trn_idx, val_idx) in enumerate(kf.split(X, y)):
  #print(f"===== fold {fold} =====")
  oof_idx = np.array([idx for idx in val_idx if idx < train.shape[0]])
  preds_idx = np.array([idx for idx in val_idx if idx >= train.shape[0]])

  X_train, y_train = all_pseudo[useful_features].iloc[trn_idx], all_pseudo[target].iloc[trn_idx]
  #X_train = apply_noise_rn(X_train)

  X_valid, y_valid = all_pseudo[useful_features].iloc[oof_idx], all_pseudo[target].iloc[oof_idx]
  #X_test = all_pseudo[scale_features].iloc[preds_idx]

  #start = time.time()
  for inseed in seed_list:
    param_space['random_state'] = inseed

    model = RandomForestRegressor(**param_space, verbose=0)
    model.fit(
        X_train, 
        y_train,
        #eval_set=[(X_valid, y_valid)],
        #eval_metric='rmse',
        #early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        )


    lgb_oof[oof_idx] += model.predict(X_valid) / len(seed_list)
    #lgb_pred[preds_idx-train.shape[0]] += model.predict(X_test) / len(seed_list)

  #elapsed = time.time() - start
  rmse = mean_squared_error(y_valid, lgb_oof[oof_idx], squared=False)
  #print(f"fold {fold} - lgb rmse: {rmse:.6f}, elapsed time: {elapsed:.2f}sec\n")            
  
  #print(f"oof lgb_rmse = {mean_squared_error(train[target], lgb_oof, squared=False)}")

  
  return rmse

In [12]:
study = optuna.create_study(direction='minimize')
study.optimize(objective,n_trials= 30)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2021-08-29 13:57:38,603][0m A new study created in memory with name: no-name-6797c512-ce33-4c1b-a41c-9aa23bf6afa4[0m


[32m[I 2021-08-29 14:03:47,111][0m Trial 0 finished with value: 7.752292446773167 and parameters: {'max_depth': 2, 'max_features': 0.3470348028261788, 'max_samples': 0.24772906131729733, 'ccp_alpha': 7.270333222880626e-07}. Best is trial 0 with value: 7.752292446773167.[0m


[32m[I 2021-08-29 14:49:12,223][0m Trial 1 finished with value: 7.746409505166049 and parameters: {'max_depth': 3, 'max_features': 0.6746937289740315, 'max_samples': 0.8291813058247196, 'ccp_alpha': 2.191914404516864e-07}. Best is trial 1 with value: 7.746409505166049.[0m


[32m[I 2021-08-29 14:57:21,821][0m Trial 2 finished with value: 7.775605546775306 and parameters: {'max_depth': 1, 'max_features': 0.4304993265536826, 'max_samples': 0.6436267056937369, 'ccp_alpha': 1.1629845818491106e-07}. Best is trial 1 with value: 7.746409505166049.[0m


[32m[I 2021-08-29 15:23:31,450][0m Trial 3 finished with value: 7.746943861903528 and parameters: {'max_depth': 3, 'max_features': 0.3782093908902666, 'max_samples': 0.8621808224857016, 'ccp_alpha': 7.862020602038241e-08}. Best is trial 1 with value: 7.746409505166049.[0m


[32m[I 2021-08-29 16:43:12,772][0m Trial 4 finished with value: 7.745842389381208 and parameters: {'max_depth': 6, 'max_features': 0.9244341048225673, 'max_samples': 0.4938470905166652, 'ccp_alpha': 2.8153101495289973e-06}. Best is trial 4 with value: 7.745842389381208.[0m


[32m[I 2021-08-29 16:50:38,361][0m Trial 5 finished with value: 7.763658411281051 and parameters: {'max_depth': 2, 'max_features': 0.20177992650949622, 'max_samples': 0.6410655175958393, 'ccp_alpha': 2.367390287017467e-08}. Best is trial 4 with value: 7.745842389381208.[0m


[32m[I 2021-08-29 17:09:41,574][0m Trial 6 finished with value: 7.7754246039431205 and parameters: {'max_depth': 1, 'max_features': 0.9222730413613789, 'max_samples': 0.7387646229745919, 'ccp_alpha': 1.2638849089722112e-07}. Best is trial 4 with value: 7.745842389381208.[0m


[32m[I 2021-08-29 17:12:03,978][0m Trial 7 finished with value: 7.858947564025528 and parameters: {'max_depth': 1, 'max_features': 0.13217829706647571, 'max_samples': 0.5975796520714912, 'ccp_alpha': 94.52081053047536}. Best is trial 4 with value: 7.745842389381208.[0m


[32m[I 2021-08-29 17:13:02,118][0m Trial 8 finished with value: 7.800543944157951 and parameters: {'max_depth': 1, 'max_features': 0.14619181890147698, 'max_samples': 0.16969041147260683, 'ccp_alpha': 0.008384541641550437}. Best is trial 4 with value: 7.745842389381208.[0m


[32m[I 2021-08-29 17:28:32,205][0m Trial 9 finished with value: 7.746854839817412 and parameters: {'max_depth': 5, 'max_features': 0.24504112534797307, 'max_samples': 0.4156850403828799, 'ccp_alpha': 0.016297811924697096}. Best is trial 4 with value: 7.745842389381208.[0m


[32m[I 2021-08-29 18:51:57,498][0m Trial 10 finished with value: 7.745922655031892 and parameters: {'max_depth': 7, 'max_features': 0.9745292538030434, 'max_samples': 0.43016557502589203, 'ccp_alpha': 5.886872761932509e-05}. Best is trial 4 with value: 7.745842389381208.[0m


[32m[I 2021-08-29 20:09:46,021][0m Trial 11 finished with value: 7.745952263665505 and parameters: {'max_depth': 7, 'max_features': 0.9891832027991427, 'max_samples': 0.4002578528416588, 'ccp_alpha': 5.068200184218693e-05}. Best is trial 4 with value: 7.745842389381208.[0m


[32m[I 2021-08-29 21:09:49,936][0m Trial 12 finished with value: 7.746020876960884 and parameters: {'max_depth': 7, 'max_features': 0.8032956564372395, 'max_samples': 0.4080404148742466, 'ccp_alpha': 3.7040524109924905e-05}. Best is trial 4 with value: 7.745842389381208.[0m


[32m[I 2021-08-29 22:12:31,050][0m Trial 13 finished with value: 7.745862303574406 and parameters: {'max_depth': 6, 'max_features': 0.8341438818785393, 'max_samples': 0.47763414661361014, 'ccp_alpha': 5.2597615587363844e-05}. Best is trial 4 with value: 7.745842389381208.[0m


[32m[I 2021-08-29 22:45:01,560][0m Trial 14 finished with value: 7.774122310278008 and parameters: {'max_depth': 5, 'max_features': 0.7797662145192863, 'max_samples': 0.2935874441185176, 'ccp_alpha': 1.2509394907239884}. Best is trial 4 with value: 7.745842389381208.[0m


In [None]:
study.best_params

# Log

====== 4 preds ======

