In [3]:
import numpy as np
import pandas as pd
import lightgbm as lgb

In [4]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
import os

### load data

In [7]:
main_folder = r"C:\Users\Alexandre Boulenger\santransakt_main"
data_folder = os.path.join(main_folder, 'data')

In [8]:
train = pd.read_csv(os.path.join(data_folder,"train.csv"))
test = pd.read_csv(os.path.join(data_folder,"test.csv"))

### prep data

In [9]:
test['target'] = np.nan
cols = test.columns[:-1].tolist()
cols = [cols[0], 'target']+cols[1:]
test = test.loc[:,cols]

In [10]:
traintest = pd.concat([train, test])
traintest.reset_index(drop=True, inplace=True)
traintest['set'] = 'test'
traintest.loc[train.index,'set'] = 'train'
traintest.set.value_counts()

test     200000
train    200000
Name: set, dtype: int64

In [42]:
X_tr = train.drop(columns=['target', 'ID_code']).copy()
X_te = test.drop(columns=['target', 'ID_code']).copy()
X = traintest.drop(columns=['target', 'ID_code']).copy()

In [43]:
y_tr = train.target.copy()
y_te = test.target.copy()
y = traintest.target.copy()

### select part of the training data for a less greedy exploration

In [32]:
from sklearn.model_selection import train_test_split

In [33]:
#X_tr, X_tr_2, y_tr, y_tr_2 = train_test_split(X_tr, y_tr, test_size=0.9, random_state=42)
#X_te, X_tr_2, y_te, y_tr_2 = train_test_split(X_tr_2, y_tr_2, test_size=0.9, random_state=42)

In [44]:
X_tr, X_te, y_tr, y_te = train_test_split(X_tr, y_tr, test_size=0.1, random_state=42)

### train GBM model

In [45]:
folds = StratifiedKFold(n_splits=12, shuffle=False, random_state=10)

In [46]:
oof = np.zeros(len(X_tr))
y_te_pred = np.zeros(len(X_te))

#### create model

In [47]:
param = {
    'bagging_freq': 5,          
    'bagging_fraction': 0.38,   'boost_from_average':'false',   
    'boost': 'gbdt',             'feature_fraction': 0.04,     'learning_rate': 0.0085,
    'max_depth': -1,             'metric':'auc',                'min_data_in_leaf': 80,     'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,            'num_threads': 8,              'tree_learner': 'serial',   'objective': 'binary',
    'reg_alpha': 0.1302650970728192, 'reg_lambda': 0.3603427518866501,'verbosity': 1
}

#### perform cross-validation

In [48]:
%%time
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_tr.values, y_tr.values)):
    
    print("Fold {}".format(fold_))
    
    # set train and validation data
    trn_data = lgb.Dataset(
        X_tr.iloc[trn_idx]
        ,label=y_tr.iloc[trn_idx]
    )
    val_data = lgb.Dataset(
        X_tr.iloc[val_idx]
        ,label=y_tr.iloc[val_idx]
    )
    
    # train model
    clf = lgb.train(
        param
        ,trn_data
        ,1000000
        ,valid_sets = [trn_data, val_data]
        ,verbose_eval=5000
        ,early_stopping_rounds = 2000
    )
    
    # record predictions
    oof[val_idx] = clf.predict(X_tr.iloc[val_idx], num_iteration=clf.best_iteration)
    y_te_pred += clf.predict(X_te, num_iteration=clf.best_iteration) / folds.n_splits

Fold 0
Training until validation scores don't improve for 2000 rounds.
[5000]	training's auc: 0.927855	valid_1's auc: 0.894857
[10000]	training's auc: 0.944527	valid_1's auc: 0.896006
Early stopping, best iteration is:
[10490]	training's auc: 0.945961	valid_1's auc: 0.896166
Fold 1
Training until validation scores don't improve for 2000 rounds.
[5000]	training's auc: 0.927771	valid_1's auc: 0.893617
[10000]	training's auc: 0.944402	valid_1's auc: 0.896037
[15000]	training's auc: 0.957891	valid_1's auc: 0.896251
Early stopping, best iteration is:
[14383]	training's auc: 0.956349	valid_1's auc: 0.896404
Fold 2
Training until validation scores don't improve for 2000 rounds.
[5000]	training's auc: 0.927447	valid_1's auc: 0.899639
[10000]	training's auc: 0.944143	valid_1's auc: 0.902454
[15000]	training's auc: 0.957573	valid_1's auc: 0.902792
Early stopping, best iteration is:
[14281]	training's auc: 0.95581	valid_1's auc: 0.902919
Fold 3
Training until validation scores don't improve for 2

#### estimate test error from cross-validated validation error

In [None]:
print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))
sub = pd.DataFrame({"ID_code": test_df.ID_code.values})
sub["target"] = predictions
sub.to_csv("submission.csv", index=False)

In [None]:
# https://www.kaggle.com/jesucristo/30-lines-starter-solution-fast
# https://lightgbm.readthedocs.io/en/latest/Python-Intro.html

### train light model

In [74]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, balanced_accuracy_score, make_scorer
from sklearn.model_selection import cross_val_score, GridSearchCV

In [75]:
roc_auc_weighted = make_scorer(roc_auc_score, average='weighted')

#### create model

In [76]:
# set default parameters
cv = 5
n_estimators = 100
max_depth = 20
max_features = np.sqrt(200)
param_grid = {
    'max_depth':[4,10,20]
    ,'n_estimators':[25,100,150]
}

In [77]:
# instanciate model with default parameters
clf = RandomForestClassifier(
    n_estimators=n_estimators
    ,max_depth=max_depth
    #,max_features=max_features
    ,class_weight='balanced'
    ,random_state=0
)

#### hyperparameter tuning and cross-validated test error

In [78]:
# instanciate CV object
gscv = GridSearchCV(
    clf
    ,param_grid=param_grid
    ,scoring=roc_auc_weighted
    ,cv=cv
)

In [80]:
%%time
# run CV
gscv.fit(X_tr, y_tr)

Wall time: 2h 6min 8s


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=20, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=None, oob_score=False, random_state=0,
            verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': [4, 10, 20], 'n_estimators': [25, 100, 150]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score, average=weighted), verbose=0)

In [81]:
# display out-of-sample performance 
print("estimated test ROC AUC score: {:.2%}".format(pred_score))
best_params = gscv.best_params_
print("best parameters: {}".format(best_params))

estimated test ROC AUC score: 69.63%
best parameters: {'max_depth': 4, 'n_estimators': 150}


#### performance on test set

In [82]:
y_te_pr = gscv.predict(X_te)
print("ROC AUC score on test set: {:.2%}".format(roc_auc_score(y_te, y_te_pr)))
y_te_pr = gscv.predict(X_tr_2)
print("ROC AUC score on test set: {:.2%}".format(roc_auc_score(y_tr_2, y_te_pr)))

ROC AUC score on test set: 70.75%
ROC AUC score on test set: 71.44%
