In [9]:
import numpy as np
import pandas as pd
import lightgbm as lgb

In [10]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

In [11]:
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
import os

### load data

In [69]:
# Project root. Set the SANTRANSAKT_MAIN environment variable to run the
# notebook on another machine; when it is unset the original author's local
# path is used, so existing setups keep working unchanged.
main_folder = os.environ.get(
    "SANTRANSAKT_MAIN",
    r"C:\Users\Alexandre Boulenger\santransakt_main",
)
# All CSV inputs and generated artefacts are read from / written to <root>/data.
data_folder = os.path.join(main_folder, 'data')

In [14]:
# Read the two competition CSV files from the data folder into DataFrames.
train_csv_path = os.path.join(data_folder, "train.csv")
test_csv_path = os.path.join(data_folder, "test.csv")
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

### prep data

In [15]:
# Give `test` a placeholder `target` column (all NaN) and move it to the
# second position, right after the first column.
# NOTE(review): presumably this mirrors the column layout of `train` so the
# two frames can be stacked later — confirm against train.csv.
test['target'] = np.nan
# Build the order from every column except `target` rather than slicing off
# the last column: the slice only works the first time the cell runs. On a
# re-run `target` is already in second position, so the slice would drop the
# last feature column and list `target` twice.
cols = [col for col in test.columns if col != 'target']
cols = [cols[0], 'target'] + cols[1:]
test = test.loc[:, cols]

In [16]:
# Stack train on top of test with a fresh 0..n-1 index.
traintest = pd.concat([train, test], ignore_index=True)
# Tag every row with its origin: default everything to 'test', then relabel
# the rows whose index labels match `train.index`.
# NOTE(review): this relies on `train` carrying a default 0..n-1 index so that
# its labels address the first len(train) rows of the stacked frame.
traintest['set'] = 'test'
traintest.loc[train.index, 'set'] = 'train'
# Sanity check: number of rows per origin.
traintest['set'].value_counts()

test     200000
train    200000
Name: set, dtype: int64

In [17]:
# Feature matrices: each source frame minus the label and identifier columns.
# `.copy()` detaches the results from the frames they were derived from.
X_tr = train.drop(['target', 'ID_code'], axis=1).copy()
X_te = test.drop(['target', 'ID_code'], axis=1).copy()
X = traintest.drop(['target', 'ID_code'], axis=1).copy()

In [18]:
# Label vectors matching X_tr / X_te / X, copied so they are independent of
# the source frames.
y_tr = train['target'].copy()
y_te = test['target'].copy()
y = traintest['target'].copy()

### select part of the training data for a less greedy exploration

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Disabled: optional down-sampling for quicker experiments.
# If re-enabled, the first split keeps 10% of the training rows as X_tr / y_tr
# (test_size=0.9 sends the rest to X_tr_2 / y_tr_2); the second split then
# takes 10% of that remainder and assigns it to X_te / y_te, which would
# overwrite the real test-set features and labels defined earlier.
#X_tr, X_tr_2, y_tr, y_tr_2 = train_test_split(X_tr, y_tr, test_size=0.9, random_state=42)
#X_te, X_tr_2, y_te, y_tr_2 = train_test_split(X_tr_2, y_tr_2, test_size=0.9, random_state=42)

In [21]:
# Disabled alternative: a single 90/10 split of the training rows, where the
# 10% part would replace X_te / y_te (i.e. act as a labelled hold-out set).
#X_tr, X_te, y_tr, y_te = train_test_split(X_tr, y_tr, test_size=0.1, random_state=42)

### train GBM model

In [22]:
# 12-fold stratified splitter over the rows in their existing order.
# `random_state` is deliberately not passed: it has no effect when
# shuffle=False, and current scikit-learn releases raise a ValueError for that
# combination. The resulting folds are the same as before.
folds = StratifiedKFold(n_splits=12, shuffle=False)

In [23]:
# Zero-filled buffers, one slot per row:
#   oof       - out-of-fold predictions for the training rows
#   y_te_pred - test-set predictions, accumulated across folds
oof = np.zeros(X_tr.shape[0])
y_te_pred = np.zeros(X_te.shape[0])

#### create model

In [24]:
# LightGBM training parameters, one per line and grouped by purpose.
param = {
    # row / column subsampling
    'bagging_freq': 5,
    'bagging_fraction': 0.38,
    'boost_from_average': 'false',
    # booster type and learning setup
    'boost': 'gbdt',
    'feature_fraction': 0.04,
    'learning_rate': 0.0085,
    # tree shape and leaf constraints
    'max_depth': -1,
    'metric': 'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    # execution and objective
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary',
    # L1 / L2 regularisation and logging level
    'reg_alpha': 0.1302650970728192,
    'reg_lambda': 0.3603427518866501,
    'verbosity': 1,
}

#### perform cross-validation

In [25]:
%%time
# Start from fresh accumulators every time this cell runs. `y_te_pred` is
# updated with `+=` below, so re-running the cell on top of an earlier run
# would otherwise add a second set of fold predictions to it.
oof = np.zeros(len(X_tr))
y_te_pred = np.zeros(len(X_te))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_tr.values, y_tr.values)):

    print("Fold {}".format(fold_))

    # set train and validation data for this fold
    trn_data = lgb.Dataset(X_tr.iloc[trn_idx], label=y_tr.iloc[trn_idx])
    val_data = lgb.Dataset(X_tr.iloc[val_idx], label=y_tr.iloc[val_idx])

    # train model: up to 1,000,000 boosting rounds, with early stopping after
    # 2000 rounds without improvement and a progress line every 5000 rounds.
    # NOTE(review): LightGBM 4.x removed the `verbose_eval` and
    # `early_stopping_rounds` keyword arguments in favour of callbacks
    # (lgb.early_stopping / lgb.log_evaluation) — confirm which LightGBM
    # version this notebook is pinned to before upgrading.
    clf = lgb.train(
        param,
        trn_data,
        num_boost_round=1000000,
        valid_sets=[trn_data, val_data],
        verbose_eval=5000,
        early_stopping_rounds=2000,
    )

    # record predictions at the best iteration: out-of-fold scores for the
    # held-out rows, plus this fold's share of the averaged test prediction
    oof[val_idx] = clf.predict(X_tr.iloc[val_idx], num_iteration=clf.best_iteration)
    y_te_pred += clf.predict(X_te, num_iteration=clf.best_iteration) / folds.n_splits

Fold 0
Training until validation scores don't improve for 2000 rounds.
[5000]	training's auc: 0.925151	valid_1's auc: 0.898164
[10000]	training's auc: 0.940839	valid_1's auc: 0.901261
Early stopping, best iteration is:
[10313]	training's auc: 0.941697	valid_1's auc: 0.901416
Fold 1
Training until validation scores don't improve for 2000 rounds.
[5000]	training's auc: 0.925388	valid_1's auc: 0.897582
[10000]	training's auc: 0.941018	valid_1's auc: 0.899296
[15000]	training's auc: 0.953868	valid_1's auc: 0.899497
Early stopping, best iteration is:
[13782]	training's auc: 0.950925	valid_1's auc: 0.899681
Fold 2
Training until validation scores don't improve for 2000 rounds.
[5000]	training's auc: 0.925878	valid_1's auc: 0.889935
[10000]	training's auc: 0.941418	valid_1's auc: 0.892076
Early stopping, best iteration is:
[11513]	training's auc: 0.945485	valid_1's auc: 0.892175
Fold 3
Training until validation scores don't improve for 2000 rounds.
[5000]	training's auc: 0.924659	valid_1's au

#### estimate test score from cross-validated score

In [36]:
# AUC of the out-of-fold predictions against the training labels.
cv_auc = roc_auc_score(y_tr, oof)
print("CV score: {:<8.5f}".format(cv_auc))

# Submission frame: one row per test ID with the fold-averaged prediction.
sub = pd.DataFrame({
    "ID_code": test["ID_code"].values,
    "target": y_te_pred,
})

CV score: 0.90062 


#### save predictions

In [58]:
from datetime import datetime

# Current local date as a compact YYYYMMDD string, used to stamp output files.
date_today = datetime.today().strftime("%Y%m%d")

In [55]:
# Write the submission CSV into the data folder as submission_<name>_<date>.csv.
# The template needs two placeholders: with a single `{}` the second argument
# (`date_today`) was silently dropped from the file name.
# NOTE(review): `name` is not assigned in any visible cell — confirm it is
# defined (e.g. as a run label) before this cell executes.
submission_path = os.path.join(
    data_folder,
    "submission_{}_{}.csv".format(name, date_today),
)
sub.to_csv(submission_path, index=False)

#### save trained model

In [57]:
# Dump the booster held in `clf` to a JSON-serialisable structure.
# `clf` is rebound on every pass of the cross-validation loop, so this is the
# model trained on the last fold only, not an ensemble of all folds.
json_model = clf.dump_model()

In [61]:
import json

# Persist the dumped model as model_gbm_<name>_<date>.json in the data folder.
# The template needs two placeholders: with a single `{}` the second argument
# (`date_today`) was silently dropped from the file name.
# NOTE(review): `name` is not assigned in any visible cell — confirm it is
# defined before this cell executes.
model_path = os.path.join(
    data_folder,
    'model_gbm_{}_{}.json'.format(name, date_today),
)
# Plain write mode is enough; the file is never read back through this handle.
with open(model_path, 'w') as f:
    json.dump(json_model, f, indent=4)

In [None]:
# https://www.kaggle.com/jesucristo/30-lines-starter-solution-fast
# https://lightgbm.readthedocs.io/en/latest/Python-Intro.html