In [12]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from numba import jit
from scipy.stats import skew
from scipy.stats import kurtosis
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

train_df = pd.read_csv('train.csv.zip')
test_df = pd.read_csv('test.csv.zip')
random_state = 42
np.random.seed(random_state)

In [50]:
features = [col for col in train_df.columns if col not in ['target', 'ID_code']]
features_score=np.array([ 1,  1,  1, 54, 40,  1,  1, 83,  5,  1, 80,  5,  1,  1, 28, 23, 62,
       34,  1, 48,  8,  1,  1,  1,  1, 54,  1, 77,  1, 59, 68,  1,  1,  1,
        1,  1,  1, 75, 82, 66,  1, 86, 72,  1,  1,  1, 71, 51,  1,  1, 36,
        1,  1,  1, 26, 16,  1, 26,  1, 28, 85, 44, 22, 14, 28, 31,  5,  1,
       81, 73,  1,  1, 39, 77,  9,  1,  1, 10,  1, 61,  1,  1,  1, 12, 38,
        1,  1,  1, 12,  1,  1,  1,  1,  1,  1,  1, 68, 31, 56,  1, 45, 78,
       15, 35,  1,  1,  1,  1,  1,  1,  1,  1,  1, 42,  1,  1,  1, 74,  1,
        1, 49,  1,  1,  1, 46,  1, 56,  1,  1, 79,  1,  1,  2,  1, 31,  1,
       84,  1, 21,  1, 24,  1, 18, 51,  8,  1,  1,  1,  1,  1,  1,  1, 43,
       60,  1,  1,  5,  1, 64, 18, 58, 52,  1,  1,  1,  1,  1,  1, 66,  1,
        1, 42,  1,  1,  1, 14, 69,  1, 18,  1,  1, 47, 37, 70,  1, 58,  1,
       20,  1, 63,  1,  1,  1, 33,  2,  1,  1,  1,  1,  1])

In [62]:
features=np.take(features,np.where(features_score<50))[0]

In [69]:
%%time
@jit
def augment(x,y,t=2):
    xs,xn = [],[]
    for i in range(t):
        mask = y>0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xs.append(x1)

    for i in range(t//2):
        mask = y==0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xn.append(x1)

    xs = np.vstack(xs); xn = np.vstack(xn)
    ys = np.ones(xs.shape[0]);yn = np.zeros(xn.shape[0])
    x = np.vstack([x,xs,xn]); y = np.concatenate([y,ys,yn])
    return x,y
    
params = {
    "objective" : "binary", "metric" : "auc", "boosting": 'gbdt', "max_depth" : -1, "num_leaves" : 13,
    "learning_rate" : 0.01, "bagging_freq": 5, "bagging_fraction" : 0.4, "feature_fraction" : 0.05,
    "min_data_in_leaf": 80, "min_sum_heassian_in_leaf": 10, "tree_learner": "serial", "boost_from_average": "false",
    "bagging_seed" : random_state, "verbosity" : 1, "seed": random_state,'num_threads': 4,
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
oof = train_df[['ID_code', 'target']]
oof['predict'] = 0
predictions = test_df[['ID_code']]
val_aucs = []


X_test = test_df[features].values

for fold, (trn_idx, val_idx) in enumerate(skf.split(train_df, train_df['target'])):
    print("fold nbr",fold)
    X_train, y_train = train_df.iloc[trn_idx][features], train_df.iloc[trn_idx]['target']
    X_valid, y_valid = train_df.iloc[val_idx][features], train_df.iloc[val_idx]['target']
    
    N = 3
    p_valid,yp = 0,0
    for i in range(N):
        X_t, y_t = augment(X_train.values, y_train.values)
        X_t = pd.DataFrame(X_t)
        X_t = X_t.add_prefix('var_')
    
        trn_data = lgb.Dataset(X_t, label=y_t)
        val_data = lgb.Dataset(X_valid, label=y_valid)
        evals_result = {}
        lgb_clf = lgb.train(params,trn_data,100000,valid_sets = [trn_data, val_data],early_stopping_rounds=1000,verbose_eval = 5000,evals_result=evals_result)
        p_valid += lgb_clf.predict(X_valid)
        yp += lgb_clf.predict(X_test)
    
    oof['predict'][val_idx] = p_valid/N
    val_score = roc_auc_score(y_valid, p_valid)
    val_aucs.append(val_score)
    predictions['fold{}'.format(fold+1)] = yp/N
    
mean_auc = np.mean(val_aucs)
std_auc = np.std(val_aucs)
all_auc = roc_auc_score(oof['target'], oof['predict'])
print("Mean auc: %.9f, std: %.9f. All auc: %.9f." % (mean_auc, std_auc, all_auc))

predictions['target'] = np.mean(predictions[[col for col in predictions.columns if col not in ['ID_code', 'target']]].values, axis=1)
submission = pd.DataFrame({"ID_code":test_df["ID_code"].values})
submission["target"] = predictions['target']
submission.to_csv("lgb_data_augmentation_boruta_features_selection_submission.csv", index=False)

fold nbr 0
Training until validation scores don't improve for 1000 rounds.
[5000]	training's auc: 0.916126	valid_1's auc: 0.897451
[10000]	training's auc: 0.927496	valid_1's auc: 0.899232
Early stopping, best iteration is:
[9665]	training's auc: 0.926829	valid_1's auc: 0.899282
Training until validation scores don't improve for 1000 rounds.
[5000]	training's auc: 0.915075	valid_1's auc: 0.897904
[10000]	training's auc: 0.926568	valid_1's auc: 0.89985
Early stopping, best iteration is:
[9447]	training's auc: 0.925458	valid_1's auc: 0.899875
Training until validation scores don't improve for 1000 rounds.
[5000]	training's auc: 0.915631	valid_1's auc: 0.897731
[10000]	training's auc: 0.927221	valid_1's auc: 0.89949
Early stopping, best iteration is:
[10380]	training's auc: 0.927994	valid_1's auc: 0.899524
fold nbr 1
Training until validation scores don't improve for 1000 rounds.
[5000]	training's auc: 0.915544	valid_1's auc: 0.898531
[10000]	training's auc: 0.926923	valid_1's auc: 0.90027