In [1]:
!pip install -q hillclimbers

In [2]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score
from hillclimbers import climb_hill, partial
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import pickle
import glob

warnings.filterwarnings('ignore')

In [3]:
# --- Modelling settings -----------------------------------------------------
target = 'Depression'      # name of the label column
n_folds = 5                # number of stratified CV folds
seed = 42                  # random_state for the fold splitter
time_limit = 8 * 3600      # 28800; presumably seconds (8 hours) - not used in the visible cells

# --- Input files ------------------------------------------------------------
train_path = '/kaggle/input/playground-series-s4e11/train.csv'
test_path = '/kaggle/input/playground-series-s4e11/test.csv'
sample_sub_path = '/kaggle/input/playground-series-s4e11/sample_submission.csv'
original_data_path = '/kaggle/input/depression-surveydataset-for-analysis/final_depression_dataset_1.csv'

In [4]:
%%time
# Read every input from the paths declared in the configuration cell so each
# location (and the target column name) is defined in exactly one place.
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
sample = pd.read_csv(sample_sub_path)
original = pd.read_csv(original_data_path)

# The original dataset stores the target as 'No'/'Yes'; encode it as 0/1.
# Any other value becomes NaN (pandas `Series.map` behaviour).
original[target] = original[target].map({
    'No': 0,
    'Yes': 1
})

# Drop the 'id' column from both competition frames.
train = train.drop('id', axis=1)
test = test.drop('id', axis=1)

# Append the original dataset to the competition training data; ignore_index
# gives the combined frame a fresh 0..n-1 index.
train = pd.concat(objs=[train, original], ignore_index=True)

# Feature matrix and target vector used for the fold splits below.
X = train.drop(target, axis=1)
y = train[target]

CPU times: user 718 ms, sys: 103 ms, total: 820 ms
Wall time: 1.06 s


In [5]:
def get_data(oof_path, tesst_oof_path):
    """Load one model's pickled predictions and score its OOF predictions per fold.

    Parameters
    ----------
    oof_path : str
        Path to the pickle holding the out-of-fold predicted probabilities.
    tesst_oof_path : str
        Path to the pickle holding the test-set predicted probabilities.
        The misspelled parameter name is kept so the signature is unchanged.

    Returns
    -------
    tuple
        ``(oof_pred_probs, test_pred_probs, scores)``, where ``scores`` is a
        list with one accuracy value per validation fold (``n_folds`` entries).

    Notes
    -----
    Relies on the notebook-level ``X``, ``y``, ``n_folds`` and ``seed``.
    """
    # NOTE(review): pickle.load can execute arbitrary code - only point this at
    # trusted prediction files.
    with open(oof_path, 'rb') as oof_file:
        oof_pred_probs = pickle.load(oof_file)
    # Read the function's own argument; previously the body referenced a
    # same-named global that merely happened to exist in the calling loop.
    with open(tesst_oof_path, 'rb') as test_file:
        test_pred_probs = pickle.load(test_file)

    scores = []
    skf = StratifiedKFold(n_splits=n_folds, random_state=seed, shuffle=True)
    for _, val_idx in skf.split(X, y):
        # The splitter yields positional indices, so select positionally.
        y_val = y.iloc[val_idx]
        # Threshold probabilities at 0.5 to obtain hard class labels.
        # Assumes oof_pred_probs can be indexed by an integer array and is
        # row-aligned with X/y - TODO confirm against the producing notebook.
        y_pred_labels = (oof_pred_probs[val_idx] > 0.5).astype(int)
        scores.append(accuracy_score(y_val, y_pred_labels))

    return oof_pred_probs, test_pred_probs, scores

In [6]:
# Per-model results, each keyed by model name.
oof_pred_probs = dict()    # out-of-fold predicted probabilities
test_pred_probs = dict()   # test-set predicted probabilities
scores = dict()            # list of per-fold accuracy scores

In [7]:
autogluon_dir = "/kaggle/input/playground-s04e11-autogluon"
# glob returns matches in arbitrary order, so sort both listings before
# zipping; otherwise one model's OOF file could be paired with another
# model's test file.
# NOTE(review): assumes both directories hold one file per model whose names
# sort in the same model order - TODO confirm against the dataset layout.
oof_paths = sorted(glob.glob(f'{autogluon_dir}/oof_pred_probs/**'))
test_oof_paths = sorted(glob.glob(f'{autogluon_dir}/test_pred_probs/**'))

for oof_path, test_oof_path in zip(oof_paths, test_oof_paths):
    # Strip the last 28 characters of the file name to get the model name.
    model_name = oof_path.split('/')[-1][:-28]
    # Keep only models whose name contains "L1".
    if "L1" in model_name:
        oof_pred_probs[model_name], test_pred_probs[model_name], scores[model_name] = get_data(oof_path, test_oof_path)

In [8]:
# One column per model: out-of-fold and test-set predicted probabilities.
oof_pred_df = pd.DataFrame(oof_pred_probs)
test_pred_df = pd.DataFrame(test_pred_probs)

# Search settings for the hill-climbing ensemble, collected in one place.
hill_climb_settings = dict(
    objective='maximize',
    eval_metric=partial(roc_auc_score),
    negative_weights=True,
    precision=0.001,
    plot_hill=True,
    plot_hist=True,
    return_oof_preds=True,
)

# With return_oof_preds=True the result is unpacked into the blended test
# predictions followed by the blended out-of-fold predictions.
hc_test_pred_probs, hc_oof_pred_probs = climb_hill(
    train=train,
    oof_pred_df=oof_pred_df,
    test_pred_df=test_pred_df,
    target=target,
    **hill_climb_settings,
)

[1m[34m   /\  
  /__\  hillclimbers[0m[1m 
 /    \
/______\ 
[0m
[1m[33mModels to be ensembled | (15 total):[0m 

[1m[32mLightGBM_r96_BAG_L1:      0.97595 (best solo model)[0m
[1mCatBoost_r177_BAG_L1:     0.97571[0m
[1mCatBoost_BAG_L1:          0.97560[0m
[1mLightGBMXT_BAG_L1:        0.97529[0m
[1mLightGBM_r131_BAG_L1:     0.97486[0m
[1mCatBoost_r137_BAG_L1:     0.97482[0m
[1mXGBoost_BAG_L1:           0.97471[0m
[1mLightGBM_BAG_L1:          0.97426[0m
[1mLightGBMLarge_BAG_L1:     0.97417[0m
[1mCatBoost_r9_BAG_L1:       0.97343[0m
[1mRandomForestEntr_BAG_L1:  0.97236[0m
[1mXGBoost_r33_BAG_L1:       0.97204[0m
[1mRandomForestGini_BAG_L1:  0.97193[0m
[1mRandomForest_r195_BAG_L1: 0.96920[0m
[1mCatBoost_r13_BAG_L1:      0.96812[0m

[1m[33m[Data preparation completed successfully] - [Initiate hill climbing][0m 

[1m[32mIteration: 1 | Model added: CatBoost_r177_BAG_L1 | Best weight: 0.433 | Best roc_auc_score: 0.97638[0m
[1m[32mIteration: 2 | Mod




In [9]:
# Turn the blended out-of-fold probabilities into hard labels at a 0.5 cut-off
# and measure accuracy against the full training target.
hc_oof_labels = (hc_oof_pred_probs > 0.5).astype(int)
hc_score = accuracy_score(y, hc_oof_labels)

# Store the single overall score once per fold so this entry has the same
# length as the per-model fold-score lists.
scores['Hill Climbing'] = [hc_score for _ in range(n_folds)]

In [10]:
# Hard class labels for the test set: blended probability above 0.5 -> 1.
hc_test_labels = (hc_test_pred_probs > 0.5).astype(int)

# Fill the sample submission's target column and write it out without the index.
sub = pd.read_csv(sample_sub_path)
sub[target] = hc_test_labels
sub.to_csv('submission.csv', index=False)
sub.head()

Unnamed: 0,id,Depression
0,140700,0
1,140701,0
2,140702,0
3,140703,1
4,140704,0
