In [1]:
import os
import gc
from joblib import dump, load
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb

# load files

In [2]:
# Feature table used for training; it must contain the label column and
# every column named in the selected-feature list used further down.
TRAIN_FEATURES_PATH = "../input/train_full_features.parquet"
train = pd.read_parquet(TRAIN_FEATURES_PATH)

# define loss and metric

In [3]:
def amex_metric(y_true, y_pred):
    """Return the average of a normalized weighted Gini and a top-4% capture rate.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels. Rows whose label equals 0 are weighted 20,
        all other rows are weighted 1.
    y_pred : array-like
        Predicted scores, same length as ``y_true``; higher means "more
        likely positive".

    Returns
    -------
    float
        ``0.5 * (gini_of_predictions / gini_of_perfect_ranking + top_four)``.
    """
    pairs = np.transpose(np.array([y_true, y_pred]))

    def ranked_by(column):
        # Rows ordered by the chosen column, largest value first.
        return pairs[pairs[:, column].argsort()[::-1]]

    def row_weights(rows):
        # Label-0 rows count 20 times, everything else once.
        return np.where(rows[:, 0] == 0, 20, 1)

    def weighted_gini(column):
        rows = ranked_by(column)
        w = row_weights(rows)
        random_share = np.cumsum(w / np.sum(w))
        weighted_pos = rows[:, 0] * w
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz - random_share) * w)

    # Share of all positives found within the first 4% of cumulative weight
    # when rows are ranked by predicted score.
    by_score = ranked_by(1)
    score_weights = row_weights(by_score)
    budget = int(0.04 * np.sum(score_weights))
    captured = by_score[np.cumsum(score_weights) <= budget]
    top_four = np.sum(captured[:, 0]) / np.sum(by_score[:, 0])

    # Column 1 ranks by prediction, column 0 by the true label (ideal ranking).
    normalized_gini = weighted_gini(1) / weighted_gini(0)

    return 0.5 * (normalized_gini + top_four)

In [4]:
def lgb_amex_metric(y_pred, y_true):
    """Adapter exposing ``amex_metric`` as a LightGBM custom eval function.

    Parameters
    ----------
    y_pred : array-like
        Scores produced by the booster.
    y_true : object with a ``get_label()`` method
        Despite the name this is not a label array: the labels are pulled
        out of it with ``get_label()`` (it is passed as ``feval`` to
        ``lgb.train``, so presumably an ``lgb.Dataset``).

    Returns
    -------
    tuple
        ``("amex_metric", score, True)``; the trailing ``True`` marks the
        metric as higher-is-better.
    """
    labels = y_true.get_label()
    score = amex_metric(labels, y_pred)
    return "amex_metric", score, True

# define training config

In [5]:
# Reproducibility and cross-validation settings.
seed = 42
n_folds = 5

# Names of the model input columns, read from a joblib pickle.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
features = load("selected_features.pkl")

# Name of the label column in the training frame.
target = "target"

# Raw categorical column names; the model uses their "<name>_last" variants.
cat_features_base = [
    "B_30", "B_38",
    "D_114", "D_116", "D_117", "D_120", "D_126",
    "D_63", "D_64", "D_66", "D_68",
]
# Keep only the "_last" columns that are actually among the selected features.
cat_features = [
    name
    for name in map("{}_last".format, cat_features_base)
    if name in features
]

# LightGBM training parameters (binary objective, DART boosting).
params = dict(
    objective="binary",
    metric="binary_logloss",
    boosting="dart",
    seed=seed,
    num_leaves=100,
    learning_rate=0.01,
    feature_fraction=0.20,
    bagging_freq=10,
    bagging_fraction=0.50,
    n_jobs=-1,
    lambda_l2=2,
    min_data_in_leaf=40,
)

def seed_everything(seed):
    """Seed Python's ``random`` and NumPy's global RNG, and export PYTHONHASHSEED.

    Parameters
    ----------
    seed : int
        Value applied to every seeding call and written (as a string) to
        the ``PYTHONHASHSEED`` environment variable.
    """
    # NOTE(review): setting PYTHONHASHSEED at runtime presumably only affects
    # interpreters started afterwards, not the current process; confirm.
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Seed the global RNGs once, before the fold split and training that follow.
seed_everything(seed)

In [6]:
# Index (0-based) of the single cross-validation fold trained by this notebook.
target_fold = 2

kfold = StratifiedKFold(
    n_splits=n_folds,
    shuffle=True,
    random_state=seed
)

# Walk the seeded splits until the requested fold is reached. The `else`
# branch only runs when the loop finishes without `break`, i.e. when
# target_fold is not a valid fold number; without it the last fold would be
# used silently.
for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train[target])):

    if fold == target_fold:
        break
else:
    raise ValueError(
        "target_fold={} is out of range for {} folds".format(target_fold, n_folds)
    )

# kfold.split yields positional row numbers, so rows are selected with .iloc;
# label-based .loc is only equivalent when `train` has a default RangeIndex.
x_train, x_val = train.iloc[trn_ind][features], train.iloc[val_ind][features]
y_train, y_val = train[target].iloc[trn_ind], train[target].iloc[val_ind]

# Wrap both partitions as LightGBM datasets with the same categorical columns.
lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=cat_features)
lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature=cat_features)

In [7]:
# Drop the notebook-level names for the raw frame and the per-fold slices;
# training below only refers to lgb_train / lgb_valid.
del train
del x_train, x_val
del y_train, y_val
gc.collect()

0

In [8]:
# Checkpoints go to ../ckpt/lgbm_seed_<seed>/fold_<target_fold>/.
save_folder = "../ckpt/lgbm_seed_{}".format(seed)
save_path = os.path.join(save_folder, "fold_{}".format(target_fold))

# makedirs creates any missing parent (including ../ckpt itself) and, with
# exist_ok=True, is a no-op when the directory is already there. This avoids
# both the failure of os.mkdir on a missing parent and the gap between an
# exists() check and the mkdir call.
os.makedirs(save_path, exist_ok=True)

In [None]:
n_selected = len(features)
print("#" * 50)
print("Training fold {} with {} features...".format(target_fold, n_selected))

# Best validation score seen so far, shared with the checkpoint callback.
# It starts at 0.793, so no model is written to disk until the validation
# metric exceeds that value.
# At notebook top level the `global` declaration is redundant; it is kept so
# the assignment still lands in module globals if this cell is ever executed
# with a separate local namespace.
global max_score
max_score = 0.793

def save_model(data_name="valid_1", metric_name="amex_metric"):
    """Build a LightGBM callback that checkpoints the booster on each new best score.

    Parameters
    ----------
    data_name : str, default "valid_1"
        Name of the evaluation dataset whose score is tracked.
    metric_name : str, default "amex_metric"
        Name of the metric whose score is tracked.

    Returns
    -------
    callable
        A callback taking the LightGBM callback environment. Every call it
        reads the tracked score, prints it when ``env.iteration`` is a
        multiple of 100, and, when the score beats the module-level
        ``max_score``, updates ``max_score`` and dumps ``env.model`` to
        ``<save_path>/<score>.pkl`` (score formatted with 5 decimals, so
        equal rounded scores overwrite the same file).

    Raises
    ------
    KeyError
        Raised by the callback when no evaluation result matches
        ``(data_name, metric_name)``.
    """
    def callback(env):
        global max_score
        iteration = env.iteration

        # Find the score by (dataset, metric) name rather than by a fixed
        # position in the list: a positional index silently picks a different
        # metric as soon as valid_sets or the metric list changes.
        score = None
        for entry in env.evaluation_result_list:
            if entry[0] == data_name and entry[1] == metric_name:
                score = entry[2]
                break
        if score is None:
            raise KeyError(
                "no evaluation result for metric {!r} on dataset {!r}".format(
                    metric_name, data_name
                )
            )

        if iteration % 100 == 0:
            print("iteration {}, score= {:.05f}".format(iteration, score))
        if score > max_score:
            max_score = score
            print("High Score: iteration {}, score={:.05f}".format(iteration, score))
            dump(env.model, os.path.join(save_path, "{:.05f}.pkl".format(score)))

    # Low order value so this callback is scheduled ahead of higher-order ones.
    callback.order = 0
    return callback

# Train the single selected fold. Both the training set and the hold-out set
# are evaluated every round; the hold-out set is the second entry of
# valid_sets and shows up as "valid_1" in the log.
# NOTE(review): with boosting="dart" the early_stopping_rounds setting does
# not appear to take effect (the captured log keeps running while the
# validation loss gets worse); confirm against the LightGBM documentation.
# Checkpointing of the best rounds is handled by the save_model() callback.
model = lgb.train(
    params=params,
    train_set=lgb_train,
    valid_sets=[lgb_train, lgb_valid],
    feval=lgb_amex_metric,
    callbacks=[save_model()],
    num_boost_round=20000,
    early_stopping_rounds=100,
    verbose_eval=500,
)

##################################################
Training fold 2 with 1950 features...




[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 211535
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 1950




[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051523
[LightGBM] [Info] Start training from score -1.051523
iteration 0, score= 0.68221




iteration 100, score= 0.75586
iteration 200, score= 0.75780
iteration 300, score= 0.76081
iteration 400, score= 0.76212
[500]	training's binary_logloss: 0.33714	training's amex_metric: 0.777132	valid_1's binary_logloss: 0.340912	valid_1's amex_metric: 0.7647
iteration 500, score= 0.76471
iteration 600, score= 0.76656
iteration 700, score= 0.76991
iteration 800, score= 0.77232
iteration 900, score= 0.77497
[1000]	training's binary_logloss: 0.24614	training's amex_metric: 0.794756	valid_1's binary_logloss: 0.254017	valid_1's amex_metric: 0.776409
iteration 1000, score= 0.77641
iteration 1100, score= 0.77882
iteration 1200, score= 0.78038
iteration 1300, score= 0.78156
iteration 1400, score= 0.78203
[1500]	training's binary_logloss: 0.222052	training's amex_metric: 0.8082	valid_1's binary_logloss: 0.233728	valid_1's amex_metric: 0.782898
iteration 1500, score= 0.78299
iteration 1600, score= 0.78458
iteration 1700, score= 0.78491
iteration 1800, score= 0.78622
iteration 1900, score= 0.7864

High Score: iteration 4485, score=0.79678
High Score: iteration 4487, score=0.79679
High Score: iteration 4488, score=0.79681
High Score: iteration 4489, score=0.79681
High Score: iteration 4494, score=0.79689
High Score: iteration 4495, score=0.79689
High Score: iteration 4497, score=0.79690
[4500]	training's binary_logloss: 0.177403	training's amex_metric: 0.869575	valid_1's binary_logloss: 0.216605	valid_1's amex_metric: 0.796812
iteration 4500, score= 0.79671
High Score: iteration 4508, score=0.79690
High Score: iteration 4509, score=0.79694
High Score: iteration 4511, score=0.79694
High Score: iteration 4513, score=0.79694
iteration 4600, score= 0.79681
iteration 4700, score= 0.79642
iteration 4800, score= 0.79620
iteration 4900, score= 0.79639
[5000]	training's binary_logloss: 0.172081	training's amex_metric: 0.878253	valid_1's binary_logloss: 0.216004	valid_1's amex_metric: 0.796484
iteration 5000, score= 0.79652
iteration 5100, score= 0.79668
iteration 5200, score= 0.79675
High

iteration 11800, score= 0.79711
iteration 11900, score= 0.79728
[12000]	training's binary_logloss: 0.117741	training's amex_metric: 0.963447	valid_1's binary_logloss: 0.214302	valid_1's amex_metric: 0.797759
iteration 12000, score= 0.79776
iteration 12100, score= 0.79770
iteration 12200, score= 0.79786
iteration 12300, score= 0.79777
iteration 12400, score= 0.79780
[12500]	training's binary_logloss: 0.115167	training's amex_metric: 0.966702	valid_1's binary_logloss: 0.21433	valid_1's amex_metric: 0.797589
iteration 12500, score= 0.79776
iteration 12600, score= 0.79736
iteration 12700, score= 0.79759
iteration 12800, score= 0.79751
iteration 12900, score= 0.79746
[13000]	training's binary_logloss: 0.111838	training's amex_metric: 0.97049	valid_1's binary_logloss: 0.214402	valid_1's amex_metric: 0.796972
iteration 13000, score= 0.79697
iteration 13100, score= 0.79706
iteration 13200, score= 0.79755
iteration 13300, score= 0.79780
iteration 13400, score= 0.79752
[13500]	training's binary_

In [None]:
# def training(train):
    
#     # create a numpy array to store out of folds predictions
#     oof_predictions = np.zeros(len(train))
    
#     kfold = StratifiedKFold(
#         n_splits=n_folds, 
#         shuffle=True, 
#         random_state=seed
#     )
    
#     for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train[target])):
        
#         print("#" * 50)
#         print("Training fold {} with {} features...".format(fold, len(features)))
        
#         x_train, x_val = train.loc[trn_ind, features], train.loc[val_ind, features]
#         y_train, y_val = train.loc[trn_ind, target], train.loc[val_ind, target]
        
#         lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=cat_features)
#         lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature=cat_features)
#         model = lgb.train(
#             params=params,
#             train_set=lgb_train,
#             num_boost_round=10500,
#             valid_sets = [lgb_train, lgb_valid],
#             early_stopping_rounds=100,
#             verbose_eval = 500,
#             feval = lgb_amex_metric
#         )
#         # save best model
#         dump(model, "../ckpt/lgbm_{}_{}.pkl".format(fold, seed))
        
#         # predict validation
#         val_pred = model.predict(x_val)
        
#         # add to out of folds array
#         oof_predictions[val_ind] = val_pred
        
#         # compute fold metric
#         score = amex_metric(y_val, val_pred)
#         print("fold {} score is {}".format(fold, score))
        
#         del x_train, x_val, y_train, y_val, lgb_train, lgb_valid
#         gc.collect()
        
#     # compute oof
#     score = amex_metric(train[target], oof_predictions)
#     print("oof score is {}".format(score))
    
#     # create a dataframe to store out of folds predictions
#     oof_df = pd.DataFrame({"customer_ID": train["customer_ID"], "target": train[target], "prediction": oof_predictions})
#     oof_df.to_parquet("lgbm_oof_{}.parquet".format(seed))
    
#     return oof_df