In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import lightgbm as lgb

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from time import gmtime, strftime

import optuna

In [None]:
def cv(param, X, y, X_test=None):
    """Cross-validate a LightGBM model and optionally predict a test set.

    The function has two modes, selected by ``X_test``:

    * ``X_test is None`` (tuning): 5 stratified folds, a fixed 300 boosting
      rounds per fold, no early stopping.
    * ``X_test`` given (final fit): 10 stratified folds, up to 1,000,000
      rounds with early stopping (patience 1000, progress logged every 1000
      rounds) against the validation sets handed to ``lgb.train``; the test
      predictions of all folds are averaged.

    In both modes the out-of-fold predictions are scored with ROC AUC and
    the score is printed.

    Args:
        param: LightGBM parameter dict, passed to ``lgb.train``.
        X: Training features; must support ``.iloc`` and ``.values``.
        y: Training labels; must support ``.iloc`` and ``.values``.
        X_test: Optional test features to predict.

    Returns:
        The out-of-fold ROC AUC when ``X_test`` is None, otherwise a numpy
        array of length ``len(X_test)`` with the fold-averaged predictions.
    """
    print(param)
    predict_mode = X_test is not None
    if predict_mode:
        n_splits = 10
        predictions = np.zeros(len(X_test))
        # Newer LightGBM releases dropped the `verbose_eval` /
        # `early_stopping_rounds` keywords of lgb.train in favour of
        # callbacks; older releases have no `log_evaluation`. Pick whichever
        # spelling the installed version understands.
        if hasattr(lgb, 'log_evaluation'):
            fit_kwargs = {'callbacks': [lgb.early_stopping(1000), lgb.log_evaluation(1000)]}
        else:
            fit_kwargs = {'verbose_eval': 1000, 'early_stopping_rounds': 1000}
    else:
        n_splits = 5
        predictions = None
    folds = StratifiedKFold(n_splits=n_splits, random_state=7485, shuffle=True)
    oof = np.zeros(len(X))

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X.values, y.values)):
        print("Fold {}".format(fold_))
        trn_data = lgb.Dataset(X.iloc[trn_idx], label=y.iloc[trn_idx])
        if predict_mode:
            # The validation Dataset is only needed when early stopping is used.
            val_data = lgb.Dataset(X.iloc[val_idx], label=y.iloc[val_idx])
            num_round = 1000000
            clf = lgb.train(param, trn_data, num_round,
                            valid_sets=[trn_data, val_data], **fit_kwargs)
            predictions += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits
        else:
            num_round = 300
            clf = lgb.train(param, trn_data, num_round)
        oof[val_idx] = clf.predict(X.iloc[val_idx], num_iteration=clf.best_iteration)

    score = roc_auc_score(y, oof)
    print("CV score: {:<8.5f}".format(score))
    if predict_mode:
        return predictions
    return score
    

In [None]:
# Read the training and test tables from CSV files in the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Base LightGBM settings (random-forest booster, AUC metric).
# num_leaves, min_data_in_leaf, min_sum_hessian_in_leaf, feature_fraction and
# bagging_fraction are deliberately absent: `objective` sets them per trial.
param = dict(
    objective='binary',
    boosting_type='rf',
    metric='auc',
    num_threads=8,
    verbosity=0,
    bagging_freq=5,
    boost_from_average='false',
    learning_rate=0.01,
    max_depth=-1,
    tree_learner='serial',
)

In [None]:
# Shape of the raw training frame. num_feature counts every column of
# train_df, not only the columns later used as model inputs.
num_data, num_feature = train_df.shape
# Upper bounds of the Optuna search ranges used by `objective`.
# `num_leaves` is searched from 2 upwards, so the upper bound is clamped to
# at least 2; otherwise a narrow frame (< 20 columns) yields an empty range.
MAX_NUM_LEAVES = max(2, min(100, num_feature // 10))
MAX_MIN_DATA_IN_LEAF = 100
print(num_data, num_feature)

In [None]:
def objective(trial):
    """Optuna objective: sample LightGBM hyper-parameters and score them.

    The sampled values are layered on top of a copy of the module-level
    ``param`` dict, so the shared base configuration is not altered by a
    trial. The candidate is evaluated with ``cv`` on the module-level ``X``
    and ``y`` (tuning mode, no test set).

    Args:
        trial: The Optuna trial used to draw parameter suggestions.

    Returns:
        ``1 - score`` where ``score`` is the value returned by ``cv``, so
        that a minimising study maximises the score.
    """
    # Work on a copy: writing into the shared dict would leak one trial's
    # values into whatever reads `param` next.
    trial_param = dict(param)

    trial_param['num_leaves'] = trial.suggest_int('num_leaves', 2, MAX_NUM_LEAVES)
    trial_param['min_data_in_leaf'] = trial.suggest_int('min_data_in_leaf', 0, MAX_MIN_DATA_IN_LEAF)
    trial_param['min_sum_hessian_in_leaf'] = trial.suggest_loguniform('min_sum_hessian_in_leaf', 1e-5, 20)
    trial_param['feature_fraction'] = trial.suggest_uniform('feature_fraction', 0.01, 1.0)

    # bagging_fraction is only sampled when bagging is switched on.
    if trial_param['bagging_freq'] > 0:
        trial_param['bagging_fraction'] = trial.suggest_uniform('bagging_fraction', 0.01, 1.0)

    # Booster-specific parameters.
    if trial_param['boosting_type'] == 'dart':
        trial_param['drop_rate'] = trial.suggest_loguniform('drop_rate', 1e-8, 1.0)
        trial_param['skip_drop'] = trial.suggest_loguniform('skip_drop', 1e-8, 1.0)
    if trial_param['boosting_type'] == 'goss':
        trial_param['top_rate'] = trial.suggest_uniform('top_rate', 0.0, 1.0)
        # Upper bound keeps top_rate + other_rate at or below 1.
        trial_param['other_rate'] = trial.suggest_uniform('other_rate', 0.0, 1.0 - trial_param['top_rate'])

    score = cv(trial_param, X, y)
    return 1 - score

In [None]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

In [None]:
# Relative frequency of each value in the `target` column.
train_df['target'].value_counts(normalize=True)

In [None]:
# Preview the first five rows of the test frame.
test_df.head(n=5)

In [None]:
# Show the dtype of every column in the training frame.
train_df.dtypes

In [None]:
# Missing-value count per column, largest first, top ten only.
train_df.isna().sum().sort_values(ascending=False).head(10)

In [None]:
# Compare the training columns (label excluded) with the test columns.
# Nothing is dropped here; `dif` only records the columns missing from test.
train_feature_cols = train_df.drop('target', axis=1).columns
only_in_train = set(train_feature_cols).difference(set(test_df.columns))

print('\nTrain and Test Datasets have the same columns?:',
      train_feature_cols.tolist() == test_df.columns.tolist())
print("\nVariables not in test but in train : ", only_in_train)
dif = list(only_in_train)

In [None]:
# Split the raw frames into model inputs and the label:
# the identifier column is removed from both, the label from train only.
y = train_df['target']
X = train_df.drop(columns=['ID_code', 'target'])
X_test = test_df.drop(columns=['ID_code'])
print(len(X), len(X_test))

In [None]:
# Parameter tuning: run 100 Optuna trials of `objective` (which returns
# 1 - CV score, so the study's default minimisation maximises the score).
# The sampler is seeded, matching the seeded CV folds, so that the sequence
# of suggested parameters does not change from one run to the next as long
# as the objective values themselves are repeatable.
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=7485))
study.optimize(objective, n_trials=100)

print('Number of finished trials: {}'.format(len(study.trials)))

print('Best trial:')
trial = study.best_trial
best_params = study.best_params  # reused by the final-fit cell

# Note: the reported value is 1 - score, i.e. lower is better.
print('  Value: {}'.format(trial.value))

print('  Params: ')
for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

In [None]:
# Final fit: overlay the tuned values on the base settings, turn LightGBM
# logging up, then let `cv` train per fold and predict the test data.
print("randomforest_model ...")
param.update(best_params, verbosity=1)
prediction = cv(param, X, y, X_test)
print("...Done")

In [None]:
# Save
# Build the submission in its own frame instead of adding a `target` column
# to test_df: mutating test_df would make a later re-run of the data
# preparation cell carry `target` into X_test.
submission = test_df[['ID_code']].assign(target=prediction)
# Timestamp (UTC) without spaces or colons so the file name is valid on
# every common file system (':' is rejected in Windows file names).
submission_string = 'randomforest_' + strftime("%Y-%m-%d_%H-%M-%S", gmtime()) + '.csv'
submission.to_csv(submission_string, index=False)