In [6]:
import os
import gc
from joblib import dump
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb

# load files

In [12]:
train = pd.read_parquet("../train.parquet")

# define loss and metric

In [3]:
def amex_metric(y_true, y_pred):
    
    labels = np.transpose(np.array([y_true, y_pred]))
    labels = labels[labels[:, 1].argsort()[::-1]]
    
    weights = np.where(labels[:,0]==0, 20, 1)
    cut_vals = labels[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
    top_four = np.sum(cut_vals[:,0]) / np.sum(labels[:,0])
    gini = [0,0]
    
    for i in [1, 0]:
        labels = np.transpose(np.array([y_true, y_pred]))
        labels = labels[labels[:, i].argsort()[::-1]]
        weight = np.where(labels[:,0]==0, 20, 1)
        weight_random = np.cumsum(weight / np.sum(weight))
        total_pos = np.sum(labels[:, 0] *  weight)
        cum_pos_found = np.cumsum(labels[:, 0] * weight)
        lorentz = cum_pos_found / total_pos
        gini[i] = np.sum((lorentz - weight_random) * weight)
        
    return 0.5 * (gini[1]/gini[0] + top_four)

In [4]:
def lgb_amex_metric(y_pred, y_true):
    y_true = y_true.get_label()
    return "amex_metric", amex_metric(y_true, y_pred), True

# define training config

In [7]:
seed = 42
n_folds = 5

features = [col for col in train.columns if col not in ["customer_ID", "target"]]
target = "target"
cat_features_base = [
    "B_30",
    "B_38",
    "D_114",
    "D_116",
    "D_117",
    "D_120",
    "D_126",
    "D_63",
    "D_64",
    "D_66",
    "D_68"
] 
cat_features = [
    "{}_last".format(feature) for feature in cat_features_base
]

params = {
    "objective": "binary",
    "metric": "binary_logloss",
    "boosting": "dart",
    "seed": seed,
    "num_leaves": 100,
    "learning_rate": 0.01,
    "feature_fraction": 0.20,
    "bagging_freq": 10,
    "bagging_fraction": 0.50,
    "n_jobs": -1,
    "lambda_l2": 2,
    "min_data_in_leaf": 40
}

def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    
seed_everything(seed)

In [13]:
def training(train):
    
    # round last float features to 2 decimal place
    num_cols = list(train.dtypes[(train.dtypes == "float32") | (train.dtypes == "float64")].index)
    num_last_cols = [col for col in num_cols if "last" in col]
    for col in num_last_cols:
        train[col + "_round2"] = train[col].round(2)
    
    # create a numpy array to store out of folds predictions
    oof_predictions = np.zeros(len(train))
    
    kfold = StratifiedKFold(
        n_splits=n_folds, 
        shuffle=True, 
        random_state=seed
    )
    
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train[target])):
        
        print("#" * 50)
        print("Training fold {} with {} features...".format(fold, len(features)))
        
        x_train, x_val = train[features].iloc[trn_ind], train[features].iloc[val_ind]
        y_train, y_val = train[target].iloc[trn_ind], train[target].iloc[val_ind]
        
        lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=cat_features)
        lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature=cat_features)
        model = lgb.train(
            params=params,
            train_set=lgb_train,
            num_boost_round=10500,
            valid_sets = [lgb_train, lgb_valid],
            early_stopping_rounds=100,
            verbose_eval = 500,
            feval = lgb_amex_metric
        )
        # save best model
        dump(model, "../ckpt/lgbm_{}_{}.pkl".format(fold, seed))
        
        # predict validation
        val_pred = model.predict(x_val)
        
        # add to out of folds array
        oof_predictions[val_ind] = val_pred
        
        # compute fold metric
        score = amex_metric(y_val, val_pred)
        print("fold {} score is {}".format(fold, score))
        
        del x_train, x_val, y_train, y_val, lgb_train, lgb_valid
        gc.collect()
        
    # compute oof
    score = amex_metric(train[target], oof_predictions)
    print("oof score is {}".format(score))
    
    # create a dataframe to store out of folds predictions
    oof_df = pd.DataFrame({"customer_ID": train["customer_ID"], "target": train[target], "prediction": oof_predictions})
    oof_df.to_parquet("lgbm_oof_{}.parquet".format(seed))
    
    return

In [14]:
training(train)

##################################################
Training fold 0 with 918 features...




[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 148522
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 909




[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051523
[LightGBM] [Info] Start training from score -1.051523




[500]	training's binary_logloss: 0.339401	training's amex_metric: 0.776587	valid_1's binary_logloss: 0.342842	valid_1's amex_metric: 0.760569
[1000]	training's binary_logloss: 0.24805	training's amex_metric: 0.793409	valid_1's binary_logloss: 0.255314	valid_1's amex_metric: 0.771594
[1500]	training's binary_logloss: 0.223958	training's amex_metric: 0.806084	valid_1's binary_logloss: 0.234848	valid_1's amex_metric: 0.778971
[2000]	training's binary_logloss: 0.210676	training's amex_metric: 0.818282	valid_1's binary_logloss: 0.226035	valid_1's amex_metric: 0.783681
[2500]	training's binary_logloss: 0.204051	training's amex_metric: 0.827902	valid_1's binary_logloss: 0.223059	valid_1's amex_metric: 0.78678
[3000]	training's binary_logloss: 0.197475	training's amex_metric: 0.837146	valid_1's binary_logloss: 0.2208	valid_1's amex_metric: 0.787978
[3500]	training's binary_logloss: 0.19142	training's amex_metric: 0.846478	valid_1's binary_logloss: 0.219273	valid_1's amex_metric: 0.788952
[4000

KeyboardInterrupt: 