In [None]:
# Familiar imports
import numpy as np
import pandas as pd
import random
import os
import time
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import ElasticNet

#import lightgbm as lgb
#import xgboost as xgb
#import catboost as ctb

import warnings
warnings.simplefilter('ignore')

# Parameters

In [None]:
# Column the models are trained to predict.
target = 'loss'

# Set to True for a quick smoke run with minimal settings.
DEBUG = False

# Both modes use the same seeds, so they are set once here.
SEED = 17
CVSEED = 17

# Mode-dependent settings (the N_ITERS setting stays disabled: 2 in debug, 10 otherwise).
if DEBUG:
    N_ESTIMATORS, N_SPLITS = 1, 2
    EARLY_STOPPING_ROUNDS, VERBOSE = 1, 100
else:
    N_ESTIMATORS, N_SPLITS = 10000, 10
    EARLY_STOPPING_ROUNDS, VERBOSE = 300, 1000

In [None]:
def set_seed(seed=17):
    """Seed the global `random` and NumPy generators and export PYTHONHASHSEED.

    Parameters
    ----------
    seed : int, default 17
        Value handed to `random.seed` and `np.random.seed`, and written
        (as a string) to the `PYTHONHASHSEED` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Seed Python's and NumPy's global generators with the configured SEED.
set_seed(SEED)

# Load data

In [None]:
# Directory holding the competition CSV files.
INPUT = Path("../input/tabular-playground-series-aug-2021")

# Read the three competition files in a fixed order: train, test, sample submission.
train, test, submission = (
    pd.read_csv(INPUT / csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
def _sum_npy(*paths):
    """Load every .npy file in `paths` and add the arrays element-wise, left to right."""
    total = np.load(paths[0])
    for extra_path in paths[1:]:
        total = total + np.load(extra_path)
    return total


# Each first-level model contributes one `*_pred` column: the `*_oof.npy` arrays go
# into `train`, the `*_pred.npy` arrays into `test`.
# NOTE(review): the MLP and XGB columns are sums (not means) of several files;
# presumably each file already carries its share of the average - confirm upstream.
train['lgb_pred'] = np.load("../input/tps-aug-2021-stacking/lgb_oof.npy")
test['lgb_pred'] = np.load("../input/tps-aug-2021-stacking/lgb_pred.npy")

train['rr_pred'] = np.load("../input/model-loading-tps-aug-2021-stacking-rf/rr_oof.npy")
test['rr_pred'] = np.load("../input/model-loading-tps-aug-2021-stacking-rf/rr_pred.npy")

train['mlp_pred'] = _sum_npy(
    "../input/tps-aug-stacking-mlp-pred/1mlp_oof.npy",
    "../input/tps-aug-stacking-mlp-pred/2mlp_oof.npy",
    "../input/tps-aug-stacking-mlp-pred/3mlp_oof.npy",
)
test['mlp_pred'] = _sum_npy(
    "../input/tps-aug-stacking-mlp-pred/1mlp_pred.npy",
    "../input/tps-aug-stacking-mlp-pred/2mlp_pred.npy",
    "../input/tps-aug-stacking-mlp-pred/3mlp_pred.npy",
)

train['xgbl2_pred'] = _sum_npy(
    "../input/tps-aug-xgb-lv2-pred/1xgb_oof.npy",
    "../input/tps-aug-xgb-lv2-pred/2xgb_oof.npy",
)
test['xgbl2_pred'] = _sum_npy(
    "../input/tps-aug-xgb-lv2-pred/1xgb_pred.npy",
    "../input/tps-aug-xgb-lv2-pred/2xgb_pred.npy",
)


# Preprocessing

In [None]:
# Every test column whose name contains 'pred' is a stacked feature to be standardised.
scale_features = list(filter(lambda name: 'pred' in name, test.columns))

# Learn mean/variance on the train rows only, then apply the same transform to both frames.
ss = StandardScaler()
ss.fit(train[scale_features])
train[scale_features] = ss.transform(train[scale_features])
test[scale_features] = ss.transform(test[scale_features])


In [None]:
# Swap noise

# Random
def apply_noise_rn(df, p=.75):
    """Swap-noise where every cell is independently kept with probability `p`.

    Cells that are not kept take the value found at the same position in a
    row-shuffled copy of `df`, so a replaced value always comes from the
    same column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to corrupt; it is not modified.
    p : float, default 0.75
        Probability that a cell keeps its original value.

    Returns
    -------
    pandas.DataFrame
        Corrupted copy with the same shape, index and columns as `df`.
    """
    # Draw order matters for reproducibility: the keep mask first, then the row shuffle.
    keep_mask = np.random.binomial(1, p, df.shape) == 1
    shuffled_rows = np.random.permutation(df)
    return df.where(keep_mask, shuffled_rows)

# Row-wise
def apply_noise_row(df, p=.75):
    """Swap-noise that keeps exactly `int(p * n_columns)` cells in every row.

    For each row a uniformly random subset of `int(p * df.shape[1])` columns
    is kept; every other cell takes the value found at the same position in a
    row-shuffled copy of `df`, so a replaced value always comes from the same
    column.

    The column subsets are drawn for the whole frame in one vectorised call
    rather than with one `np.random.choice` call per row, which keeps the
    function usable on frames with hundreds of thousands of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to corrupt; it is not modified.
    p : float, default 0.75
        Fraction of columns kept per row (rounded down to a whole column count).

    Returns
    -------
    pandas.DataFrame
        Corrupted copy with the same shape, index and columns as `df`.

    Raises
    ------
    ValueError
        If `int(p * n_columns)` is negative or larger than the column count.
    """
    n_rows, n_cols = df.shape
    n_keep = int(p * n_cols)
    if not 0 <= n_keep <= n_cols:
        raise ValueError(
            f"p={p} keeps {n_keep} columns per row, but the frame has {n_cols} columns"
        )

    # Arg-sorting i.i.d. uniforms gives an independent random permutation of
    # 0..n_cols-1 per row; the positions holding values below n_keep therefore
    # form a uniformly random n_keep-subset of the columns.
    col_order = np.random.random_sample((n_rows, n_cols)).argsort(axis=1)
    should_not_swap = col_order < n_keep

    corrupted_df = df.where(should_not_swap, np.random.permutation(df))
    return corrupted_df

# Pseudo Label

In [None]:
# The target column of this blend file is used as the label for the test rows
# (presumably an earlier blended submission - confirm the file's provenance).
blend = pd.read_csv("../input/blending-tool-tps-aug-2021/file1_7.84987_file2_7.84996_blend.csv")
pseudo = blend[target]

# Attach the pseudo-labels to the test features, column-wise.
test_pseudo = pd.concat((test, pseudo), axis='columns')

# Stack train on top of the pseudo-labelled test rows with a fresh 0..n-1 index,
# so every row whose index is below len(train) is a train row.
all_pseudo = pd.concat((train, test_pseudo), ignore_index=True)

In [None]:
# The model inputs are exactly the scaled prediction columns
# (this is the same list object as scale_features, not a copy).
useful_features = scale_features

# Optuna

In [None]:
# Optuna for parameter search
!pip install -q optuna

import optuna
import pickle

In [None]:
# Optuna objective: tune the ElasticNet meta-model (alpha, l1_ratio, selection)
# on a single hold-out split, with max_iter fixed to N_ESTIMATORS.
def objective(trial, X=None, y=None):
    """Return the hold-out RMSE of a seed-averaged ElasticNet for one Optuna trial.

    The rows are split 80/20 with `train_test_split(random_state=CVSEED)`.
    The model is fitted on the whole training part and scored only on the
    hold-out rows whose index is below `train.shape[0]`; hold-out rows at or
    above that index are ignored when computing the score.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample `alpha`, `l1_ratio` and `selection`.
    X : pandas.DataFrame, optional
        Feature frame; defaults to `all_pseudo[useful_features]`, looked up
        when the function is called.
    y : pandas.Series, optional
        Target aligned with `X`; defaults to `all_pseudo[target]`.

    Returns
    -------
    float
        RMSE between the hold-out targets and the prediction averaged over
        the seeds `SEED` and `SEED + 1`.

    Raises
    ------
    ValueError
        If the hold-out part contains no row with index below `train.shape[0]`.

    Notes
    -----
    Assumes the index of `X` is 0..n-1 with the train rows first, which is
    how `all_pseudo` is built.
    """
    # Resolve the defaults at call time so the objective always sees the
    # current frames rather than whatever existed when this cell was run.
    if X is None:
        X = all_pseudo[useful_features]
    if y is None:
        y = all_pseudo[target]

    # The sampling order (alpha, l1_ratio, selection) is part of the search
    # behaviour for a seeded sampler; keep it stable.
    param_space = {
        'max_iter': N_ESTIMATORS,
        'alpha': trial.suggest_float('alpha', 0.01, 50),
        'l1_ratio': trial.suggest_float('l1_ratio', 0, 1),
        'selection': trial.suggest_categorical('selection', ['cyclic', 'random']),
    }

    X_train, X_hold, y_train, y_hold = train_test_split(
        X, y, test_size=.2, random_state=CVSEED
    )

    # Score only on rows that come from `train` (index below len(train)).
    from_train = X_hold.index < train.shape[0]
    X_valid, y_valid = X_hold.loc[from_train], y_hold.loc[from_train]
    if len(X_valid) == 0:
        raise ValueError("hold-out split contains no rows from `train` to score on")

    # NOTE(review): random_state presumably only matters for selection='random';
    # with 'cyclic' both fits are likely identical - confirm against the sklearn docs.
    seed_list = [SEED, SEED + 1]
    valid_pred = np.zeros(len(X_valid))
    for inseed in seed_list:
        model = ElasticNet(**param_space, random_state=inseed)
        model.fit(X_train, y_train)
        valid_pred += model.predict(X_valid) / len(seed_list)

    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn no longer accepts.
    rmse = float(np.sqrt(mean_squared_error(y_valid, valid_pred)))
    return rmse

In [None]:
# Optuna's sampler keeps its own random generator, so it is seeded explicitly
# here; otherwise each run explores a different sequence of trials.
# TPESampler is Optuna's default single-objective sampler, so the search
# algorithm itself is unchanged.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=30)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# Show the hyperparameters of the best trial as the cell output.
study.best_params

# Log

====== 4 preds ======

7.7437431101440675 no noise ver2

7.743589517834239 {'alpha': 0.018982240369176395, 'l1_ratio': 0.8055028430485707, 'selection': 'random'} final
