In [1]:
# Familiar imports
import numpy as np
import pandas as pd
import random
import os
import time
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error
#from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

#import lightgbm as lgb
#import xgboost as xgb
#import catboost as ctb

import warnings
warnings.simplefilter('ignore')

# Parameters

In [2]:
target = 'loss'

DEBUG = False

if DEBUG:
    N_ESTIMATORS = 1
    N_SPLITS = 2
    SEED = 17
    CVSEED = 17
    EARLY_STOPPING_ROUNDS = 1
    VERBOSE = 100
    #N_ITERS = 2
else:
    N_SPLITS = 10
    N_ESTIMATORS = 500
    EARLY_STOPPING_ROUNDS = 200
    VERBOSE = 1000
    SEED = 17
    CVSEED = 17
    #N_ITERS = 10

In [3]:
def set_seed(seed=17):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
set_seed(SEED)

# Load data

In [4]:
INPUT = Path("../input/tabular-playground-series-aug-2021")

train = pd.read_csv(INPUT / "train.csv")
test = pd.read_csv(INPUT / "test.csv")
submission = pd.read_csv(INPUT / "sample_submission.csv")

In [5]:
train['lgb_pred'] = np.load("../input/tps-aug-2021-lgb/lgb_oof.npy")
test['lgb_pred'] = np.load("../input/tps-aug-2021-lgb/lgb_pred.npy")

train['xgb_pred'] = np.load("../input/tps-aug-2021-xgb/xgb_oof.npy")
test['xgb_pred'] = np.load("../input/tps-aug-2021-xgb/xgb_pred.npy")

train['xgb2_pred'] = np.load("../input/tps-aug-2021-xgb-pseudo2-pred/3xgb_oof.npy")+np.load("../input/tps-aug-2021-xgb-pseudo2-pred/4xgb_oof.npy")
test['xgb2_pred'] = np.load("../input/tps-aug-2021-xgb-pseudo2-pred/3xgb_pred.npy")+np.load("../input/tps-aug-2021-xgb-pseudo2-pred/4xgb_pred.npy")

train['ctb2_pred'] = np.load("../input/tps-aug-2021-catb-pseudo2/ctb_oof.npy")
test['ctb2_pred'] = np.load("../input/tps-aug-2021-catb-pseudo2/ctb_pred.npy")


# Preprocessing

In [6]:
scale_features = [col for col in test.columns if 'f' in col]

ss = StandardScaler()
train[scale_features] = ss.fit_transform(train[scale_features])
test[scale_features] = ss.transform(test[scale_features])

In [7]:
# Swap noise

# Random
def apply_noise_rn(df, p=.75):
    should_not_swap = np.random.binomial(1, p, df.shape)
    corrupted_df = df.where(should_not_swap == 1, np.random.permutation(df))
    return corrupted_df

# Row-wise
def apply_noise_row(df, p=.75):
    should_not_swap = np.zeros(df.shape)
    for i in range(df.shape[0]):
        for j in np.random.choice(df.shape[1],int(p*df.shape[1]),replace=False):
            should_not_swap[i,j]=1 
    corrupted_df = df.where(should_not_swap == 1, np.random.permutation(df))
    return corrupted_df

# Pseudo Label

In [8]:
pseudo = pd.read_csv("../input/blending-tool-tps-aug-2021/blend.csv/0.part")[target]
test_pseudo = pd.concat([test, pseudo], axis=1)
all_pseudo = pd.concat([train, test_pseudo]).reset_index(drop=True)

In [9]:
useful_features = scale_features + ['lgb_pred', 'xgb_pred','xgb2_pred','ctb2_pred']

# Optuna

In [10]:
# Optuna for parameter search
!pip install -q optuna

import optuna
import pickle



In [11]:
# for the fixed learning rate, use the opt n iterations and tune the tree hyperparameters
def objective(trial, X=all_pseudo[useful_features], y=all_pseudo[target]):
  """
  """
  n_layers= trial.suggest_int('n_layers', 1,3) # no. of hidden layers 
  layers = []
  for i in range(n_layers):
        layers.append(trial.suggest_int('n_units_{}'.format(i+1), 8,256)) 

  param_space = {
               'early_stopping':True,
               'solver':'adam',
                'shuffle':True,
                'n_iter_no_change': EARLY_STOPPING_ROUNDS,
                'max_iter':N_ESTIMATORS,
      
                'hidden_layer_sizes':(layers),
                'alpha' : trial.suggest_loguniform("alpha", 22.04, 28.04),
                'batch_size':trial.suggest_int("batch_size", 504, 528),
                'learning_rate_init': trial.suggest_loguniform("learning_rate_init", 0.56e-4, 2.56e-4)
                  }
        
  seed_list=[SEED, SEED+1]
  #kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=CVSEED)
  lgb_oof = np.zeros(train.shape[0])
  #lgb_pred = np.zeros(test.shape[0])
  X_tr, X_va, y_tr, y_va = train_test_split(X,y,test_size=.2,random_state=CVSEED)
  trn_idx = X_tr.index
  val_idx = X_va.index
  
  
  #for (trn_idx, val_idx) in enumerate(kf.split(X, y)):
  #print(f"===== fold {fold} =====")
  oof_idx = np.array([idx for idx in val_idx if idx < train.shape[0]])
  preds_idx = np.array([idx for idx in val_idx if idx >= train.shape[0]])

  X_train, y_train = all_pseudo[useful_features].iloc[trn_idx], all_pseudo[target].iloc[trn_idx]
  #X_train = apply_noise_rn(X_train)

  X_valid, y_valid = all_pseudo[useful_features].iloc[oof_idx], all_pseudo[target].iloc[oof_idx]
  #X_test = all_pseudo[scale_features].iloc[preds_idx]

  #start = time.time()
  for inseed in seed_list:
    param_space['random_state'] = inseed

    model = MLPRegressor(**param_space, verbose=0)
    model.fit(
        X_train, 
        y_train,
        #eval_set=[(X_valid, y_valid)],
        #eval_metric='rmse',
        #early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        )


    lgb_oof[oof_idx] += model.predict(X_valid) / len(seed_list)
    #lgb_pred[preds_idx-train.shape[0]] += model.predict(X_test) / len(seed_list)

  #elapsed = time.time() - start
  rmse = mean_squared_error(y_valid, lgb_oof[oof_idx], squared=False)
  #print(f"fold {fold} - lgb rmse: {rmse:.6f}, elapsed time: {elapsed:.2f}sec\n")            
  
  #print(f"oof lgb_rmse = {mean_squared_error(train[target], lgb_oof, squared=False)}")

  
  return rmse

In [12]:
study = optuna.create_study(direction='minimize')
study.optimize(objective,n_trials= 3)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2021-08-31 05:40:13,342][0m A new study created in memory with name: no-name-a9a6e8d5-0bc8-4811-9344-a416c80ef6d5[0m
[32m[I 2021-08-31 06:14:09,413][0m Trial 0 finished with value: 7.747759174829955 and parameters: {'n_layers': 2, 'n_units_1': 159, 'n_units_2': 38, 'alpha': 23.307899445462493, 'batch_size': 526, 'learning_rate_init': 9.998675425177034e-05}. Best is trial 0 with value: 7.747759174829955.[0m
[32m[I 2021-08-31 06:24:48,723][0m Trial 1 finished with value: 7.748336339599595 and parameters: {'n_layers': 1, 'n_units_1': 49, 'alpha': 22.84353427998821, 'batch_size': 525, 'learning_rate_init': 7.866378129313493e-05}. Best is trial 0 with value: 7.747759174829955.[0m
[32m[I 2021-08-31 07:17:28,887][0m Trial 2 finished with value: 7.7487094285866265 and parameters: {'n_layers': 3, 'n_units_1': 112, 'n_units_2': 68, 'n_units_3': 212, 'alpha': 24.03444679769322, 'batch_size': 504, 'learning_rate_init': 7.399992352456987e-05}. Best is trial 0 with value: 7.7477591

Number of finished trials: 3
Best trial: {'n_layers': 2, 'n_units_1': 159, 'n_units_2': 38, 'alpha': 23.307899445462493, 'batch_size': 526, 'learning_rate_init': 9.998675425177034e-05}


In [13]:
study.best_params

{'n_layers': 2,
 'n_units_1': 159,
 'n_units_2': 38,
 'alpha': 23.307899445462493,
 'batch_size': 526,
 'learning_rate_init': 9.998675425177034e-05}

# Log

====== 4 preds ======

7.752174909047353 no noise ver1

7.748076721633888 no noise ver2 (narrow space)

7.7479572494646005 no noise ver3 (narrow space)

7.747908079160598 no noise ver4 (narrow space)