In [1]:
import numpy as np
import pandas as pd
import random
import os
import time
import pickle
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer

#import lightgbm as lgb
#import xgboost as xgb
import catboost as ctb

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter('ignore')

Reference notebook (by dmitryuarov):
https://www.kaggle.com/dmitryuarov/tps-soft-voting-xgb-cb-lgbm

# Parameters

In [2]:
# Name of the label column.
target = 'claim'

# Set to True for a quick smoke run (one tree, two folds).
DEBUG = False

# Seeds are the same in both modes.
SEED = 2017      # global RNG seed, also used as the model seed
CVSEED = 2017    # seed for the cross-validation shuffle

# Each setting reads "<debug value> if DEBUG else <full-run value>".
N_SPLITS = 2 if DEBUG else 10
N_ESTIMATORS = 1 if DEBUG else 20000
EARLY_STOPPING_ROUNDS = 1 if DEBUG else 300
VERBOSE = 100 if DEBUG else 1000

In [3]:
def set_seed(seed=2017):
    """Seed the RNG sources this notebook relies on.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds both Python's
    ``random`` module and NumPy's legacy global generator with ``seed``.

    Note: CPython reads ``PYTHONHASHSEED`` at interpreter start-up, so the
    assignment here is only expected to influence child processes.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Seed Python's `random`, PYTHONHASHSEED and NumPy with the notebook-wide SEED.
set_seed(SEED)

# Datasets

In [4]:
# Directory that holds the competition CSV files.
INPUT = Path("../input/tabular-playground-series-sep-2021")

# Read the training table, the test table and the submission template.
train, test, submission = (
    pd.read_csv(INPUT / csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_solution.csv")
)

In [11]:
# Out-of-fold predictions saved by three earlier runs of this notebook
# (presumably each run covers a different subset of folds — see the log at the
# end of the notebook).
oof9, oof11, oof12 = map(
    np.load,
    (
        "../input/tps-sep-catb-dmitry-ver5791112/9ctb_oof.npy",
        "../input/tps-sep-catb-dmitry-ver5791112/11ctb_oof.npy",
        "../input/tps-sep-catb-dmitry-ver5791112/12ctb_oof.npy",
    ),
)

# AUC of the summed OOF vectors against the training labels.
roc_auc_score(train[target], oof9 + oof11 + oof12)

0.8156871132578625

In [12]:
# Test-set predictions saved by the same three earlier runs.
pred9, pred11, pred12 = map(
    np.load,
    (
        "../input/tps-sep-catb-dmitry-ver5791112/9ctb_pred.npy",
        "../input/tps-sep-catb-dmitry-ver5791112/11ctb_pred.npy",
        "../input/tps-sep-catb-dmitry-ver5791112/12ctb_pred.npy",
    ),
)

# Element-wise sum of the three prediction vectors (displayed as cell output).
pred9 + pred11 + pred12

array([0.58472404, 0.12299748, 0.62874762, ..., 0.76237508, 0.13212741,
       0.74951902])

In [13]:
# Persist the summed OOF and test predictions under the generic file names.
blend_outputs = {
    "ctb_oof.npy": oof9 + oof11 + oof12,
    "ctb_pred.npy": pred9 + pred11 + pred12,
}
for out_name, blended in blend_outputs.items():
    np.save(out_name, blended)

In [14]:
# Fill the submission template with the summed test predictions and write it out.
blend_pred = pred9 + pred11 + pred12
submission[target] = blend_pred
submission.to_csv("submission.csv", index=False)

submission

Unnamed: 0,id,claim
0,957919,0.584724
1,957920,0.122997
2,957921,0.628748
3,957922,0.125143
4,957923,0.140801
...,...,...
493469,1451388,0.829369
493470,1451389,0.117270
493471,1451390,0.762375
493472,1451391,0.132127


# Preprocessing

In [None]:
# Keep every training column whose name contains the letter 'f'.
# NOTE(review): this is a substring match, so any other column with an 'f' in
# its name would be selected as well — confirm against the actual schema.
features = list(filter(lambda name: 'f' in name, train.columns))

In [None]:
# Row-wise summary features: number of missing values and standard deviation
# across the feature columns.
#
# The engineered names are filtered out of `features` before computing, which
# makes this cell idempotent: re-running it neither duplicates entries in
# `features` nor feeds 'n_missing'/'std' back into their own computation.
ENGINEERED_FEATURES = ['n_missing', 'std']
base_features = [col for col in features if col not in ENGINEERED_FEATURES]

for frame in (train, test):
    frame['n_missing'] = frame[base_features].isna().sum(axis=1)
    frame['std'] = frame[base_features].std(axis=1)

features = base_features + ENGINEERED_FEATURES

In [None]:
# Median imputer; it is (re)fitted on the training part of each fold inside the CV loop.
imputer = SimpleImputer(strategy = 'median')

In [None]:
# Standard scaler; it is (re)fitted on the training part of each fold inside the CV loop.
ss = StandardScaler()


In [None]:
# Preview the test feature matrix (raw columns plus the engineered ones).
test[features]

In [None]:
# Preview the label column.
train[target]

# CatBoost

In [None]:
# Keyword arguments passed to ctb.CatBoostClassifier in the training loop.
ctb_params = dict(
    # Run configuration.
    n_estimators=N_ESTIMATORS,
    thread_count=-1,
    bootstrap_type='Bernoulli',
    eval_metric='AUC',
    # Model hyper-parameters (the long decimals presumably come from a
    # hyper-parameter search — not verifiable from this notebook).
    depth=3,
    learning_rate=0.014530866870832323,
    max_bin=265,
    min_data_in_leaf=14,
    reg_lambda=0.004427550682515904,
    subsample=0.5402586792667279,
    grow_policy='SymmetricTree',
    leaf_estimation_method='Gradient',
    loss_function='Logloss',
)

In [None]:
# Cross-validated two-stage CatBoost training.
#
# Only the folds listed in TRAIN_FOLDS are trained in this run; the remaining
# folds are skipped and their OOF rows stay at zero. The saved arrays are
# therefore *partial* and are meant to be summed with the arrays from the other
# runs (the blending cell near the top adds the saved arrays without rescaling).
TRAIN_FOLDS = [7, 8, 9]
# With a small N_SPLITS (e.g. DEBUG mode) none of the ids above exist; fall
# back to every fold so the run still trains something.
folds_to_run = [fold_id for fold_id in TRAIN_FOLDS if fold_id < N_SPLITS] or list(range(N_SPLITS))

ctb_oof = np.zeros(train.shape[0])
ctb_pred = np.zeros(test.shape[0])
oof_filled = np.zeros(train.shape[0], dtype=bool)  # rows that received an OOF prediction
importance_frames = []  # one frame per (fold, seed); concatenated once after the loop

kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=CVSEED)
seed_list = [SEED]


def _as_feature_frame(values, index):
    """Wrap a transformed array back into a DataFrame carrying the feature names."""
    return pd.DataFrame(values, columns=features, index=index)


for fold, (trn_idx, val_idx) in enumerate(kf.split(X=train[features], y=train[target])):
    print(f"===== fold {fold} =====")
    if fold not in folds_to_run:
        continue

    y_train = train[target].iloc[trn_idx]
    y_valid = train[target].iloc[val_idx]

    # Median-impute then standardise. Both transformers are fitted on the
    # training part of the fold only and applied to validation/test data.
    # A single fit over all columns is equivalent to one fit per column,
    # because the median strategy works column-wise.
    raw_train = train[features].iloc[trn_idx]
    raw_valid = train[features].iloc[val_idx]
    X_train = _as_feature_frame(ss.fit_transform(imputer.fit_transform(raw_train)), raw_train.index)
    X_valid = _as_feature_frame(ss.transform(imputer.transform(raw_valid)), raw_valid.index)
    X_test = _as_feature_frame(ss.transform(imputer.transform(test[features])), test.index)

    start = time.time()
    for inseed in seed_list:
        # Copy instead of mutating the shared ctb_params dict.
        stage1_params = {**ctb_params, 'random_state': inseed}

        # Stage 1: train with the base learning rate.
        pre_model = ctb.CatBoostClassifier(**stage1_params)
        pre_model.fit(
            X_train,
            y_train,
            eval_set=[(X_valid, y_valid)],
            use_best_model=True,
            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            verbose=VERBOSE,
        )

        # Stage 2: continue from the stage-1 model with a 10x smaller learning
        # rate and slightly weaker L2 regularisation.
        stage2_params = {
            **stage1_params,
            'reg_lambda': stage1_params['reg_lambda'] * 0.9,
            'learning_rate': stage1_params['learning_rate'] * 0.1,
        }
        model = ctb.CatBoostClassifier(**stage2_params)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_valid, y_valid)],
            use_best_model=True,
            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            verbose=VERBOSE,
            init_model=pre_model,
        )

        with open(f"ctb_model{fold}_seed{inseed}.pkl", 'wb') as model_file:
            pickle.dump(model, model_file)

        importance_frames.append(
            pd.DataFrame(
                {
                    'feature': features,
                    'importance': model.feature_importances_,
                    'fold': fold,
                    'seed': inseed,
                }
            )
        )

        # Average over seeds; the test prediction is additionally averaged over
        # folds after the loop.
        ctb_oof[val_idx] += model.predict_proba(X_valid)[:, -1] / len(seed_list)
        ctb_pred += model.predict_proba(X_test)[:, -1] / len(seed_list)

    oof_filled[val_idx] = True
    elapsed = time.time() - start
    auc = roc_auc_score(y_valid, ctb_oof[val_idx])
    print(f"fold {fold} - ctb auc: {auc:.6f}, elapsed time: {elapsed:.2f}sec\n")

# DataFrame.append was removed in pandas 2.0; build the table with one concat.
ctb_importances = pd.concat(importance_frames, ignore_index=True)

# Divide by the total number of folds even when only some were trained, so the
# partial test predictions of separate runs add up to the full-CV mean.
ctb_pred /= N_SPLITS

# Score only the rows that actually received a prediction; untouched rows are
# still zero and would distort the AUC of a partial run.
y_all = train[target].to_numpy()
print(f"oof ctb_auc = {roc_auc_score(y_all[oof_filled], ctb_oof[oof_filled])}")

# NOTE(review): these file names are also written by the blending cell near the
# top of the notebook; a full top-to-bottom run overwrites those files here.
np.save("ctb_oof.npy", ctb_oof)
np.save("ctb_pred.npy", ctb_pred)

# OOF predictions

In [None]:
# Diagonal reference line (label vs. label) with the OOF predictions scattered on top.
fig, ax = plt.subplots()
ax.plot(train[target], train[target])
ax.scatter(train[target], ctb_oof)

# Feature importances

In [None]:
# Mean importance per feature over every trained (fold, seed) model, sorted
# from most to least important. Computed once and reused for both the bar
# order and the plotted data.
mean_importance = (
    ctb_importances
    .groupby('feature', as_index=False)['importance']
    .mean()
    .sort_values('importance', ascending=False)
)
order = list(mean_importance['feature'])

fig = plt.figure(figsize=(16, 16), tight_layout=True)
sns.barplot(x="importance", y="feature", data=mean_importance, order=order)
# The model trained above is CatBoost, not LightGBM.
plt.title("CatBoost feature importances")

# Submission

In [None]:
# Write the predictions of this run's CatBoost models as the submission.
# NOTE(review): this overwrites the "submission.csv" written by the blending
# cell near the top of the notebook, and ctb_pred only contains contributions
# from the folds trained in this run (already divided by N_SPLITS) — confirm
# which of the two files is meant to be submitted.
submission[target] = ctb_pred
submission.to_csv("submission.csv", index=False)

submission

# Log

- mean: ver5 (folds 0-3) + ver6 (folds 4-7) + ver7 (folds 8-9) — OOF AUC 0.8143534337974737
- median: ver9 (folds 0-3) + ver11 (folds 4-6) + ver12 (folds 7-9) — OOF AUC 0.8156871132578625
- ver14: aggregate