In [None]:
import optuna
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, BayesianRidge, SGDRegressor, RidgeCV
from sklearn.svm import SVR
from category_encoders import LeaveOneOutEncoder
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor, GradientBoostingRegressor
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
import torch
import gc
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the competition train/test tables and the sample submission template.
train_df, test_df, submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/30-days-of-ml/train.csv",
        "../input/30-days-of-ml/test.csv",
        "../input/30-days-of-ml/sample_submission.csv",
    )
)

In [None]:
# Split the columns by naming convention: any column whose name contains
# "cat" is treated as categorical, any containing "cont" as continuous.
cat_features = [column for column in train_df.columns if "cat" in column]
cont_features = [column for column in train_df.columns if "cont" in column]

# Regression target as a pandas Series (index-aligned with train_df).
target = train_df["target"]

In [None]:
def competition_metric(y_true, y_pred):
    """Return the root-mean-squared error between two array-likes.

    Both arguments must support element-wise subtraction with each other
    (e.g. NumPy arrays or pandas Series); the mean is taken over all elements.
    """
    residual = y_true - y_pred
    return np.sqrt(np.mean(residual ** 2))

# Names of the encoded columns produced by each encoding scheme
# (label, leave-one-out, one-hot); filled in by the encoding loop below.
le_features, loo_features, oh_features = [], [], []

def label_encoding(feature):
    """Add an integer-coded copy of `feature` to train_df and test_df.

    The encoder is fitted on the values seen in either frame so that every
    test category has a code. The new column is named "<feature>_le"; that
    name is returned. Both module-level frames are modified in place.
    """
    encoded_name = f"{feature}_le"
    known_values = train_df[feature].unique().tolist() + test_df[feature].unique().tolist()
    encoder = LabelEncoder()
    encoder.fit(known_values)
    for frame in (train_df, test_df):
        frame[encoded_name] = encoder.transform(frame[feature])
    return encoded_name

def loo_encoding(feature):
    """Add a leave-one-out target-encoded copy of `feature` to both frames.

    The encoder is fitted on train_df[feature] against train_df["target"].
    The new column is named "<feature>_loo"; that name is returned. Both
    module-level frames are modified in place.
    """
    loo = LeaveOneOutEncoder()
    loo.fit(train_df[feature], train_df["target"])
    # The target must be passed when transforming the training rows: only then
    # does the encoder exclude each row's own target from its category mean.
    # Without `y` the result is a plain per-category target mean, which leaks
    # every training row's label into its own feature.
    train_df[f"{feature}_loo"] = loo.transform(train_df[feature], train_df["target"])
    # Test rows have no target, so they receive the full per-category mean.
    test_df[f"{feature}_loo"] = loo.transform(test_df[feature])
    return f"{feature}_loo"

def oh_encoding(feature):
    """Add one-hot indicator columns for `feature` to train_df and test_df.

    One column named "<feature>_<category>" is created per category seen in
    either frame. Returns the list of new column names. Both module-level
    frames are modified in place.
    """
    # OneHotEncoder requires 2-D input, so the column is always selected as a
    # single-column DataFrame (`[[feature]]`) rather than a Series or list.
    # The default (sparse) output is densified with .toarray(), which avoids
    # the `sparse=` / `sparse_output=` keyword that differs between
    # scikit-learn versions.
    oh = OneHotEncoder()
    oh.fit(pd.concat([train_df[[feature]], test_df[[feature]]]))
    new_features = [f"{feature}_{item}" for item in oh.categories_[0]]
    train_df[new_features] = oh.transform(train_df[[feature]]).toarray()
    test_df[new_features] = oh.transform(test_df[[feature]]).toarray()
    return new_features
    
# Encode every categorical column with both schemes. Label and leave-one-out
# columns are added per feature, in that order; one-hot encoding is not applied.
for feature in cat_features:
    le_features.append(label_encoding(feature))
    loo_features.append(loo_encoding(feature))

# Categorical inputs per model family. Each model gets its own list copy so
# that later edits to one list do not affect the others.
# Label-encoded inputs:
rf_cat_features = list(le_features)
xgb_cat_features = list(le_features)
cb_cat_features = list(le_features)
# Leave-one-out encoded inputs:
lgb_cat_features = list(loo_features)
hgbr_cat_features = list(loo_features)
ridge_cat_features = list(loo_features)
sgd_cat_features = list(loo_features)
bayesian_cat_features = list(loo_features)
svr_cat_features = list(loo_features)

In [None]:
# Level-1 inputs: out-of-fold (train) and test predictions of the base models,
# one column per base model.
l1_train = pd.read_csv("../input/oof-and-test-preds-of-base-models-for-30-days-ml/l1_train.csv")
l1_test = pd.read_csv("../input/oof-and-test-preds-of-base-models-for-30-days-ml/l1_test.csv")

# From here on `target` is a plain NumPy array so it can be indexed
# positionally with fold indices.
target = train_df["target"].values

# Report each base model's out-of-fold RMSE.
for column, oof_column in l1_train.items():
    print(f"{column} RMSE: ", competition_metric(target, oof_column))

xgb RMSE:  0.7159227073378349
xgb2 RMSE:  0.7159365228018582
lgb RMSE:  0.7168988110822859
cb RMSE:  0.7176344946606455
hgbr RMSE:  0.7195858565311585
ridge RMSE:  0.7222613494882295


In [None]:
# Stacking configuration, one entry per level ("level_<n>").
# Per level:
#   seeds    - random seeds; predictions are averaged over them
#   n_folds  - number of KFold splits
#   models   - model name -> spec, in the order the models are trained
# Per model spec:
#   features    - input columns taken from the previous level's predictions
#   has_eval    - whether the validation fold is passed to fit() as eval_set
#   fit_params  - extra keyword arguments for fit()
#   class       - estimator class (instantiated with random_state=seed)
#   init_params - constructor keyword arguments
stacking_dict = {
    "level_1": {
        "seeds": [42],
        "n_folds": 10,
        "models": {
            "ridge": {
                "features": ["xgb", "xgb2", "lgb", "cb", "hgbr", "ridge"],
                "has_eval": False,
                "fit_params": {},
                "class": Ridge,
                "init_params": {},
            },
            "xgb": {
                "features": ["xgb", "xgb2", "lgb", "cb", "hgbr", "ridge"],
                "has_eval": True,
                "fit_params": {"early_stopping_rounds": 300, "verbose": 0},
                "class": XGBRegressor,
                "init_params": {
                    "n_jobs": 4,
                    "booster": "gbtree",
                    "n_estimators": 7000,
                    "learning_rate": 0.030446409758071415,
                    "reg_lambda": 2.712520306440014,
                    "reg_alpha": 0.0007870899477561048,
                    "subsample": 0.3336667858453184,
                    "colsample_bytree": 0.9214478236363662,
                    "max_depth": 1,
                },
            },
            "lgb": {
                "features": ["xgb", "xgb2", "lgb", "cb", "hgbr", "ridge"],
                "has_eval": True,
                "fit_params": {"eval_metric": "rmse", "early_stopping_rounds": 500, "verbose": False},
                "class": LGBMRegressor,
                "init_params": {
                    "metric": "rmse",
                    "n_jobs": 6,
                    "reg_lambda": 8.74046749454152e-08,
                    "reg_alpha": 6.105482055823789e-08,
                    "colsample_bytree": 0.9,
                    "learning_rate": 0.035053004475190795,
                    "max_depth": 69,
                    "num_leaves": 13,
                    "min_child_samples": 129,
                    "n_estimators": 1600000,
                    "cat_smooth": 55,
                    "min_data_per_group": 117,
                    "bagging_freq": 1,
                    "bagging_fraction": 0.6709049555262285,
                    "cat_l2": 45.53922844566267,
                },
            },
            "cb": {
                "features": ["xgb", "xgb2", "lgb", "cb", "hgbr", "ridge"],
                "has_eval": True,
                "fit_params": {"early_stopping_rounds": 1000, "verbose": False, "use_best_model": True},
                "class": CatBoostRegressor,
                "init_params": {
                    "iterations": 11900,
                    "learning_rate": 0.06805013956644747,
                    "loss_function": "RMSE",
                    "eval_metric": "RMSE",
                    "verbose": 0,
                    "depth": 7,
                    "thread_count": 4,
                    "l2_leaf_reg": 4.934224184899785,
                },
            },
            "hgbr": {
                "features": ["xgb", "xgb2", "lgb", "cb", "hgbr", "ridge"],
                "has_eval": False,
                "fit_params": {},
                "class": HistGradientBoostingRegressor,
                "init_params": {
                    "learning_rate": 0.18975641592886155,
                    "max_iter": 13000,
                    "max_leaf_nodes": 60,
                    "max_depth": 3,
                    "l2_regularization": 7.393785635976974,
                },
            },
            "rf": {
                "features": ["xgb", "xgb2", "lgb", "cb", "hgbr", "ridge"],
                "has_eval": False,
                "fit_params": {},
                "class": RandomForestRegressor,
                "init_params": {"n_estimators": 500, "n_jobs": -1, "max_depth": 3},
            },
            "gbr": {
                "features": ["xgb", "xgb2", "lgb", "cb", "hgbr", "ridge"],
                "has_eval": False,
                "fit_params": {},
                "class": GradientBoostingRegressor,
                "init_params": {"n_estimators": 500, "max_depth": 3},
            },
        },
    },
    "level_2": {
        "seeds": [42],
        "n_folds": 10,
        "models": {
            "xgb": {
                "features": ["cb", "xgb", "ridge", "lgb", "gbr", "rf"],
                "has_eval": True,
                "fit_params": {"early_stopping_rounds": 300, "verbose": 0},
                "class": XGBRegressor,
                "init_params": {
                    "n_jobs": 4,
                    "booster": "gbtree",
                    "n_estimators": 6000,
                    "learning_rate": 0.054570441577205454,
                    "reg_lambda": 1.3476680322724826e-06,
                    "reg_alpha": 0.777847235568568,
                    "subsample": 0.31969304182639074,
                    "colsample_bytree": 0.7152496907340047,
                    "max_depth": 2,
                },
            },
            "ridge": {
                "features": ["cb", "xgb", "ridge"],
                "has_eval": False,
                "fit_params": {},
                "class": Ridge,
                "init_params": {"alpha": 10},
            },
            "lgb": {
                "features": ["cb", "xgb", "ridge", "rf", "gbr", "hgbr"],
                "has_eval": True,
                "fit_params": {"eval_metric": "rmse", "early_stopping_rounds": 500, "verbose": False},
                "class": LGBMRegressor,
                "init_params": {
                    "metric": "rmse",
                    "n_jobs": 6,
                    "reg_lambda": 0.02323969333810591,
                    "reg_alpha": 1.1403151293992749e-07,
                    "colsample_bytree": 0.8,
                    "learning_rate": 0.0504798086545976,
                    "max_depth": 26,
                    "num_leaves": 10,
                    "min_child_samples": 107,
                    "n_estimators": 1600000,
                    "cat_smooth": 92,
                    "min_data_per_group": 117,
                    "bagging_freq": 1,
                    "bagging_fraction": 0.6709049555262285,
                    "cat_l2": 10.74221611145318,
                },
            },
            "cb": {
                "features": ["cb", "xgb", "ridge", "rf", "gbr"],
                "has_eval": True,
                "fit_params": {"early_stopping_rounds": 1000, "verbose": False, "use_best_model": True},
                "class": CatBoostRegressor,
                "init_params": {
                    "iterations": 7400,
                    "learning_rate": 0.2040222333224308,
                    "loss_function": "RMSE",
                    "eval_metric": "RMSE",
                    "verbose": 0,
                    "depth": 10,
                    "thread_count": 4,
                    "l2_leaf_reg": 7.801027337596287,
                },
            },
        },
    },
}

In [None]:
# Train every stacking level in configuration order.
#
# Level n reads its inputs from the module-level frames `l{n}_train` (OOF
# predictions of the previous level) and `l{n}_test`, and publishes its own
# averaged OOF / test predictions as `l{n+1}_train` / `l{n+1}_test`.
# After the loop, `oof_preds_dict` and `test_preds_dict` hold the last level's
# predictions (model name -> 1-D array), in model-configuration order.
for name_level, info_level in stacking_dict.items():
    level_num = int(name_level.split("_")[-1])
    print(f"Stacking level {level_num}")
    seeds = info_level["seeds"]
    n_folds = info_level["n_folds"]
    # globals() makes the cross-cell lookup explicit; locals() only happens to
    # be the same mapping at module scope and is not a supported way to create
    # variables anywhere else.
    level_train_df = globals()[f"l{level_num}_train"]
    level_test_df = globals()[f"l{level_num}_test"]
    # One list entry per seed for every model.
    oof_preds_dict = {model_name: [] for model_name in info_level["models"]}
    test_preds_dict = {model_name: [] for model_name in info_level["models"]}
    for seed in seeds:
        for model_name, model_cfg in info_level["models"].items():
            features = model_cfg["features"]
            model_X_test = level_test_df[features]
            seed_oof_preds = np.zeros((level_train_df.shape[0],))
            seed_test_preds = np.zeros((level_test_df.shape[0],))
            kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
            for train_idx, val_idx in kf.split(level_train_df):
                model_X_train = level_train_df.iloc[train_idx][features]
                model_X_val = level_train_df.iloc[val_idx][features]
                y_train, y_val = target[train_idx], target[val_idx]
                # Copy so the per-fold eval_set never leaks into the shared
                # configuration dict.
                fit_params = dict(model_cfg["fit_params"])
                if model_cfg["has_eval"]:
                    # NOTE(review): the validation fold drives early stopping
                    # and is also the fold being scored, so OOF scores of
                    # these models are somewhat optimistic.
                    fit_params["eval_set"] = [(model_X_val, y_val)]
                stack_model = model_cfg["class"](random_state=seed, **model_cfg["init_params"])
                stack_model.fit(model_X_train, y_train, **fit_params)
                seed_oof_preds[val_idx] = stack_model.predict(model_X_val)
                # Test prediction is the mean over the fold models.
                seed_test_preds += stack_model.predict(model_X_test)/n_folds
            oof_preds_dict[model_name].append(seed_oof_preds)
            test_preds_dict[model_name].append(seed_test_preds)
            # Score this seed's OOF predictions only. Scoring the accumulated
            # list instead would mix the errors of all seeds trained so far.
            print(f"Level {level_num}, Model {model_name}, SEED {seed} RMSE: ", competition_metric(target, seed_oof_preds))
    # Average each model's predictions over the seeds.
    oof_preds_dict = {k: np.mean(v, axis=0) for k, v in oof_preds_dict.items()}
    test_preds_dict = {k: np.mean(v, axis=0) for k, v in test_preds_dict.items()}
    globals()[f"l{level_num+1}_train"] = pd.DataFrame(data=oof_preds_dict)
    globals()[f"l{level_num+1}_test"] = pd.DataFrame(data=test_preds_dict)

Stacking level 1
Level 1, Model ridge, SEED 42 RMSE:  0.7157111340594113
Level 1, Model xgb, SEED 42 RMSE:  0.715558913072304
Level 1, Model lgb, SEED 42 RMSE:  0.7156079652431893
Level 1, Model cb, SEED 42 RMSE:  0.7156062922471066
Level 1, Model hgbr, SEED 42 RMSE:  0.715693895088378
Level 1, Model rf, SEED 42 RMSE:  0.7159867231005268
Level 1, Model gbr, SEED 42 RMSE:  0.7160382159447694
Stacking level 2
Level 2, Model xgb, SEED 42 RMSE:  0.7155186773008262
Level 2, Model ridge, SEED 42 RMSE:  0.7155129366342742
Level 2, Model lgb, SEED 42 RMSE:  0.7154675593464758
Level 2, Model cb, SEED 42 RMSE:  0.7155794135522373


In [None]:
# Greedy forward selection of a weighted blend over the last level's models.
#
# Start from the single best model (lowest OOF RMSE). In each round, try to
# mix every remaining model into the current blend as
#     new = weight * candidate + (1 - weight) * blend
# over a grid of weights, keep the (model, weight) pair with the lowest OOF
# RMSE, and stop once the best improvement is no larger than TOL.
#
# Results consumed by later cells:
#   x - stacked OOF predictions, one row per model (configuration order)
#   m - model indices in the order they joined the blend
#   w - w[i] is the weight given to model m[i+1] when it joined
x = np.stack([v for k, v in oof_preds_dict.items()], axis=0)
# Named `model_rmse` rather than `all` so the builtin all() is not shadowed
# for the rest of the notebook.
model_rmse = np.array([competition_metric(x[k], target) for k in range(x.shape[0])])
for k, rmse in enumerate(model_rmse):
    print('Model %i has OOF RMSE = %.10f'%(k,rmse))
m = [int(np.argmin(model_rmse))]
w = []
RES = 5000          # number of grid steps; candidate weights are j/RES for j in [0, RES)
PATIENCE = 10000    # non-improving steps tolerated; larger than RES, so the scan never stops early
TOL = 0.000000001   # minimum RMSE improvement required to accept another model
DUPLICATES = False  # when False, a model may join the blend at most once
print(f'Ensemble RMSE = {model_rmse[m[0]]} by beginning with model {m[0]}')
print()
old = model_rmse[m[0]]
while True:
    # BUILD CURRENT ENSEMBLE by replaying the accepted (model, weight) steps
    md = x[m[0]]
    for i,k in enumerate(m[1:]):
        md = w[i]*x[k] + (1-w[i])*md

    # FIND MODEL TO ADD: best RMSE, model index and weight found this round
    mx = float("+inf"); mx_k = 0; mx_w = 0
    print('Searching for best model to add... ')

    # TRY ADDING EACH MODEL
    for k in range(x.shape[0]):
        print(k,', ',end='')
        if not DUPLICATES and (k in m): continue

        # EVALUATE ADDING MODEL K OVER THE WEIGHT GRID
        bst_j = 0; bst = float("+inf"); ct = 0
        for j in range(RES):
            tmp = j/RES*x[k] + (1-j/RES)*md
            rmse = competition_metric(tmp, target)
            if rmse < bst:
                bst = rmse
                bst_j = j/RES
            else: ct += 1
            if ct>PATIENCE: break
        if bst<mx:
            mx = bst
            mx_k = k
            mx_w = bst_j

    # STOP IF THE IMPROVEMENT IS NO LARGER THAN TOL
    # (also triggers when no candidate is left: mx stays +inf, so inc is -inf)
    inc = old - mx
    if inc<=TOL:
        print(); print('No increase. Stopping.')
        break

    # DISPLAY RESULTS ("Increase" is the reduction in RMSE)
    print()
    print('Ensemble RMSE = %.10f after adding model %i with weight %.10f. Increase of %.10f'%(mx,mx_k,mx_w,inc))
    print()

    old = mx; m.append(mx_k); w.append(mx_w)

Model 0 has OOF RMSE = 0.7155186773
Model 1 has OOF RMSE = 0.7155129366
Model 2 has OOF RMSE = 0.7154675593
Model 3 has OOF RMSE = 0.7155794136
Ensemble ACC = 0.7154675593464758 by beginning with model 2

Searching for best model to add... 
0 , 1 , 2 , 3 , 
Ensemble RMSE = 0.7154447083 after adding model 0 with weight 0.3572000000. Increase of 0.0000228511

Searching for best model to add... 
0 , 1 , 2 , 3 , 
Ensemble RMSE = 0.7154348258 after adding model 1 with weight 0.2624000000. Increase of 0.0000098825

Searching for best model to add... 
0 , 1 , 2 , 3 , 
Ensemble RMSE = 0.7154347813 after adding model 3 with weight 0.0172000000. Increase of 0.0000000445

Searching for best model to add... 
0 , 1 , 2 , 3 , 
No increase. Stopping.


In [None]:
# Show the selected model order and the weight used at each blending step.
print(m)
print(w)


def _replay_blend(stacked_preds):
    """Rebuild the blend from per-model rows using the selected order and weights.

    Starts from the row of the first selected model and folds in each later
    model as weight * model + (1 - weight) * blend-so-far.
    """
    blended = stacked_preds[m[0]]
    for step_weight, model_idx in zip(w, m[1:]):
        blended = step_weight*stacked_preds[model_idx] + (1-step_weight)*blended
    return blended


# Sanity check: replaying the blend on the OOF predictions reproduces the
# RMSE reported by the search.
md = _replay_blend(x)
print("Final RMSE: ", competition_metric(md, target))

# Apply the same blend to the test predictions (same model order as `x`).
test_preds = np.stack([preds for preds in test_preds_dict.values()])
final_test_preds = _replay_blend(test_preds)

[2, 0, 1, 3]
[0.3572, 0.2624, 0.0172]
Final RMSE:  0.7154347812542671


In [None]:
# Put the blended test predictions into the sample-submission frame and save
# it (without the index column) as the submission file.
# NOTE(review): assumes the test predictions are in the same row order as
# sample_submission.csv — confirm against the upstream l1_test.csv.
submission_df["target"] = final_test_preds
submission_df.to_csv("./submission.csv", index=False)