In [129]:
import os
import math
import pickle
import random

import pandas as pd
import numpy as np

from kaggle import KaggleApi

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier

import warnings
warnings.simplefilter('ignore')


def print_div(fold):
    """Print a three-line banner announcing the given fold.

    The middle line is ``" fold {fold} "`` padded on both sides with the
    same number of ``=`` characters (aiming at a width of about 40); the
    lines above and below are rules of ``=`` with the same length.

    Args:
        fold: Value interpolated into the banner label.
    """
    label = f" fold {fold} "
    # Same padding on each side; a label longer than 40 chars gets none.
    pad = "=" * ((40 - len(label)) // 2)
    banner = f"{pad}{label}{pad}"
    rule = "=" * len(banner)
    print(rule, banner, rule, sep="\n")


def kaggle_authentication():
    """Build a ``KaggleApi`` client, authenticate it, and return it.

    Returns:
        The ``KaggleApi`` instance after ``authenticate()`` has been called.
    """
    client = KaggleApi()
    client.authenticate()
    return client

def download_dataset(api, competition, path):
    """Download a competition's files through the Kaggle API client.

    Args:
        api: Object exposing ``competition_download_files`` (the client
            returned by ``kaggle_authentication``).
        competition: Competition identifier passed to the API.
        path: Destination forwarded to the API as ``path``. Previously this
            argument was ignored and ``None`` was always sent.
    """
    # force/quiet keep the values the original call used.
    api.competition_download_files(competition, path=path, force=False, quiet=False)

def submit_predict(api, competition, fpath, message):
    """Submit a prediction file to a competition via the Kaggle API client.

    Args:
        api: Object exposing ``competition_submit``.
        competition: Competition identifier.
        fpath: Path of the submission file.
        message: Message attached to the submission.
    """
    # The client is called positionally as (file, message, competition).
    submit_args = (fpath, message, competition)
    api.competition_submit(*submit_args)

def seed_everything(seed=42):
    """Seed Python's ``random``, NumPy's global RNG, and set PYTHONHASHSEED.

    Args:
        seed: Seed value applied to both generators and written (as a
            string) to the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

def save_model(model, fpath):
    """Pickle ``model`` and write it to ``fpath`` (opened in binary write mode).

    Args:
        model: Any picklable object.
        fpath: Destination file path; an existing file is overwritten.
    """
    with open(fpath, 'wb') as sink:
        pickle.dump(model, sink)

def load_datasets(train_fpath, test_fpath, solution_fpath):
    """Read the three CSV files with ``pd.read_csv``.

    Args:
        train_fpath: Path of the training CSV.
        test_fpath: Path of the test CSV.
        solution_fpath: Path of the sample-submission CSV.

    Returns:
        Tuple ``(train, test, submission)`` of DataFrames, in that order.
    """
    csv_paths = (train_fpath, test_fpath, solution_fpath)
    train, test, submission = (pd.read_csv(csv_path) for csv_path in csv_paths)
    return train, test, submission

def get_row_statistics(train, test, features):
    """Add per-row ``n_missing`` and ``std`` columns to both frames.

    ``n_missing`` counts NaNs across ``features`` in each row and ``std`` is
    the row-wise standard deviation over the same columns. Both statistics
    are computed from the original feature columns only.

    The frames are modified in place (the new columns are assigned onto
    them). The ``features`` argument is NOT modified: an extended copy is
    returned instead, so calling code keeps its original list intact.

    Args:
        train: Training DataFrame containing the ``features`` columns.
        test: Test DataFrame containing the ``features`` columns.
        features: Iterable of feature column names.

    Returns:
        Tuple ``(train, test, features, n_missing)`` where ``features`` is a
        new list with ``'n_missing'`` and ``'std'`` appended and
        ``n_missing`` is a copy of ``train['n_missing']``.
    """
    base_features = list(features)

    for frame in (train, test):
        # Compute both statistics before assigning so neither new column
        # can influence the other.
        missing_count = frame[base_features].isna().sum(axis=1)
        row_std = frame[base_features].std(axis=1)
        frame['n_missing'] = missing_count
        frame['std'] = row_std

    # Build a new list rather than `+=`, which would mutate the caller's list.
    extended_features = base_features + ['n_missing', 'std']
    n_missing = train['n_missing'].copy()
    return train, test, extended_features, n_missing

def fill_and_scale(train, test, features):
    """Mean-impute and standardize the feature columns of both frames.

    Missing values in ``train`` and ``test`` are both filled with the
    per-column means of ``train``. A ``StandardScaler`` is then fit on the
    training features and applied to both frames. Using training statistics
    for imputation as well as scaling keeps the two preprocessing steps
    consistent (previously the test frame was imputed with its own means
    while being scaled with training statistics).

    Both frames are modified in place and also returned.

    Args:
        train: Training DataFrame containing the ``features`` columns.
        test: Test DataFrame containing the ``features`` columns.
        features: List of column names to impute and scale.

    Returns:
        Tuple ``(train, test)``.
    """
    # Column means computed once from the training data, before any filling.
    train_means = train[features].mean()
    train[features] = train[features].fillna(train_means)
    test[features] = test[features].fillna(train_means)

    scaler = StandardScaler()
    train[features] = scaler.fit_transform(train[features])
    test[features] = scaler.transform(test[features])
    return train, test

## Download Dataset

In [None]:
# Authenticate against Kaggle and download the competition files.
api = kaggle_authentication()
competition = 'tabular-playground-series-sep-2021'
destination = '.'
# Pass the configured destination instead of repeating the literal, so
# changing `destination` above actually takes effect.
download_dataset(api, competition, destination)

## Upload Solution

In [None]:
# Submission settings: which competition, which file, and the message
# attached to the submission.
competition = 'tabular-playground-series-sep-2021'
submit_file = 'xgb_submission.csv'
message = 'Submit via python script'

# Authenticate, then hand the file to the Kaggle API.
api = kaggle_authentication()
submit_predict(api, competition, submit_file, message)

In [None]:
# Run configuration.
SEED = 2021                   # seed for the RNGs and the fold shuffling
N_SPLITS = 5                  # number of cross-validation folds
N_ESTIMATORS = 20_000         # upper bound on boosting rounds
EARLY_STOPPING_ROUNDS = 200   # passed to fit() as early_stopping_rounds
VERBOSE = 10                  # passed to fit() as verbose

# Seed Python's and NumPy's global RNGs before anything stochastic runs.
seed_everything(SEED)

In [83]:
# Input CSV locations, relative to the current working directory.
train_fpath = "train.csv"
test_fpath = "test.csv"
solution_fpath = "sample_solution.csv"
train, test, submission = load_datasets(train_fpath, test_fpath, solution_fpath)

# Feature columns: every test column whose name contains the letter 'f'.
# NOTE(review): this is a substring match, so any non-feature column with an
# 'f' in its name would be picked up too — confirm against the CSV header.
features = [col for col in test.columns if 'f' in col]
TARGET = 'claim'

# Keep a separate copy of the labels before the frames are modified below.
target = train[TARGET].copy()

# Order matters: the row statistics are computed on the raw features (so
# n_missing reflects the original NaNs), then everything in the extended
# feature list is imputed and standardized.
train, test, features, n_missing = get_row_statistics(train, test, features)
train, test = fill_and_scale(train, test, features)

In [131]:
# XGBoost hyper-parameters shared by every fold.
Xgb_params = {
    'objective': 'binary:logistic',
    'n_estimators': N_ESTIMATORS,
    'learning_rate': 5e-3,
    'min_split_loss': 0,
    'max_depth': 6,
    'min_child_weight': 256,
    'max_delta_step': 0,
    'subsample': 0.6,
    'colsample_bytree': 0.4,
    'reg_lambda': 1e-1,
    'reg_alpha': 10.0,
    # Row/column subsampling is random; tie it to SEED so the value recorded
    # in the importances table below is the one the model actually used.
    'random_state': SEED,
}

# Out-of-fold predictions for train, fold-averaged predictions for test.
# (The `lgb_` prefix is kept as-is because later cells may rely on these names.)
lgb_oof = np.zeros(train.shape[0])
lgb_pred = np.zeros(test.shape[0])
lgb_importances = pd.DataFrame()
fold_importances = []

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# The test feature matrix is the same for every fold; slice it once.
X_test = test[features]

# Folds are stratified on the per-row missing-value count (n_missing),
# not on the target itself.
for fold, (trn_idx, val_idx) in enumerate(skf.split(X=train, y=n_missing)):
    print_div(fold)

    X_train = train[features].iloc[trn_idx]
    y_train = target.iloc[trn_idx]
    X_valid = train[features].iloc[val_idx]
    y_valid = target.iloc[val_idx]

    model = XGBClassifier(**Xgb_params)
    # NOTE(review): newer XGBoost releases expect eval_metric and
    # early_stopping_rounds on the constructor rather than fit() — confirm
    # against the installed version before upgrading.
    model.fit(X_train,
              y_train,
              eval_set=[(X_valid, y_valid)],
              eval_metric='auc',
              early_stopping_rounds=EARLY_STOPPING_ROUNDS,
              verbose=VERBOSE)

    # Collect this fold's importances; they are concatenated once after the
    # loop (DataFrame.append is gone in pandas 2.x and was quadratic anyway).
    fi_tmp = pd.DataFrame({
        'feature': X_train.columns,
        'importance': model.feature_importances_,
        'fold': fold,
        'seed': SEED,
    })
    fold_importances.append(fi_tmp)

    # Probability of the last class column, i.e. the positive class.
    lgb_oof[val_idx] = model.predict_proba(X_valid)[:, -1]
    lgb_pred += model.predict_proba(X_test)[:, -1] / N_SPLITS

    auc = roc_auc_score(y_valid, lgb_oof[val_idx])
    print(f"fold {fold} - lgb auc: {auc:.6f}\n")

# Same layout as repeated append: per-fold frames stacked, indices not reset.
lgb_importances = pd.concat(fold_importances)

print(f"oof lgb roc = {roc_auc_score(target, lgb_oof)}")

[0]	validation_0-auc:0.79960
