In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import lightgbm as lgb
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from time import gmtime, strftime


import optuna

In [None]:
def cv(param, X, y, X_test=None):
    """Cross-validate a TruncatedSVD + LightGBM model; optionally predict a test set.

    The function runs in one of two modes, selected by ``X_test``:

    * ``X_test is None`` (tuning mode): 5 stratified folds, 300 boosting
      rounds, no early stopping. Returns the ROC AUC of the out-of-fold
      predictions.
    * ``X_test`` given (final-fit mode): 10 stratified folds, up to 1,000,000
      boosting rounds with early stopping (patience 1000). Returns the test
      predictions averaged over the folds.

    In both modes the SVD is fitted on the training part of each fold only,
    and the out-of-fold ROC AUC is printed.

    Parameters
    ----------
    param : dict
        Must contain ``'n_components'`` (used for the SVD). ``'eval_metric'``
        is required in final-fit mode. Every other entry is forwarded to
        ``lgb.LGBMModel``. The dict is not modified.
    X : pandas.DataFrame
        Training features.
    y : pandas.Series
        Binary target aligned with ``X``.
    X_test : pandas.DataFrame, optional
        Test features; switches the function to final-fit mode.

    Returns
    -------
    float or numpy.ndarray
        Out-of-fold ROC AUC in tuning mode, averaged test predictions in
        final-fit mode.
    """
    print(param)
    final_fit = X_test is not None
    if final_fit:
        n_splits, n_estimators = 10, 1000000
        predictions = np.zeros(len(X_test))
        X_test_arr = X_test.values
    else:
        n_splits, n_estimators = 5, 300
        predictions = None

    # 'n_components' configures the SVD and 'eval_metric' is passed at fit
    # time; keep both out of the LightGBM constructor arguments.
    model_param = {
        key: value for key, value in param.items()
        if key not in ('n_components', 'eval_metric')
    }

    # Work on plain ndarrays throughout so fit/transform/predict all see the
    # same input type.
    X_arr, y_arr = X.values, y.values
    folds = StratifiedKFold(n_splits=n_splits, random_state=7485, shuffle=True)
    oof = np.zeros(len(X))

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_arr, y_arr)):
        print("Fold {}".format(fold_))
        y_trn, y_val = y_arr[trn_idx], y_arr[val_idx]

        # Fit the decomposition once per fold, on the training rows only.
        svd = TruncatedSVD(n_components=param['n_components'], random_state=7485)
        trn_svd = svd.fit_transform(X_arr[trn_idx])
        val_svd = svd.transform(X_arr[val_idx])

        model = lgb.LGBMModel(n_estimators=n_estimators, **model_param)
        if final_fit:
            # The training pair reuses the exact objects passed to fit().
            # NOTE(review): LightGBM's sklearn wrapper is expected to recognise
            # such a pair as the training set, so that only the validation
            # fold drives early stopping -- confirm for the installed version.
            model.fit(
                trn_svd, y_trn,
                eval_set=[(trn_svd, y_trn), (val_svd, y_val)],
                eval_metric=param['eval_metric'],
                early_stopping_rounds=1000,
                verbose=1000,
            )
            fold_pred = model.predict(
                svd.transform(X_test_arr), num_iteration=model.best_iteration_)
            predictions += fold_pred / folds.n_splits
        else:
            model.fit(trn_svd, y_trn)
        oof[val_idx] = model.predict(val_svd, num_iteration=model.best_iteration_)

    score = roc_auc_score(y, oof)
    print("CV score: {:<8.5f}".format(score))
    if final_fit:
        return predictions
    return score
    

In [None]:
# Read the train and test CSV files from the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Fixed settings shared by every run. The keys n_components, num_leaves,
# min_child_samples, min_child_weight, colsample_bytree and subsample are
# not set here; `objective` fills them in per trial.
# (Values that used to be commented out in this dict: n_components=75,
# num_leaves=10, min_child_samples=80, colsample_bytree=0.05, subsample=0.4,
# min_child_weight=10.0.)
param = dict(
    objective='binary',
    boosting_type='gbdt',
    eval_metric='auc',
    n_jobs=-1,
    verbosity=0,
    subsample_freq=5,
    learning_rate=0.01,
    max_depth=-1,
)

In [None]:
# Bounds for the hyper-parameter search space used by `objective`.
num_data = len(train_df)
# Count only real feature columns: 'ID_code' and 'target' are excluded.
num_feature = len(train_df.columns.difference(['ID_code', 'target']))
# `objective` samples num_leaves from [2, MAX_NUM_LEAVES], so never let the
# upper bound drop below 2 on narrow datasets.
MAX_NUM_LEAVES = max(2, min(100, num_feature // 10))
MAX_MIN_DATA_IN_LEAF = 100
print(num_data, num_feature)

In [None]:
def objective(trial):
    """Optuna objective: sample hyper-parameters and return ``1 - CV AUC``.

    Starts from a copy of the module-level ``param`` dict, adds the values
    suggested by ``trial``, and evaluates them with ``cv`` in tuning mode
    (no test set) on the module-level ``X`` and ``y``.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample ``n_components``, ``num_leaves``, ``min_child_samples``,
        ``min_child_weight``, ``colsample_bytree`` and, depending on the base
        settings, ``subsample`` / dart / goss parameters.

    Returns
    -------
    float
        ``1 - score`` where ``score`` is the value returned by ``cv``, so that
        a minimising study maximises the score.
    """
    # Work on a per-trial copy so the shared base settings are never mutated
    # and conditional keys cannot leak from one trial into the next.
    trial_param = dict(param)

    trial_param['n_components'] = trial.suggest_int('n_components', 1, X.shape[1]-1)
    trial_param['num_leaves'] = trial.suggest_int('num_leaves', 2, MAX_NUM_LEAVES)
    trial_param['min_child_samples'] = trial.suggest_int('min_child_samples', 0, MAX_MIN_DATA_IN_LEAF)
    trial_param['min_child_weight'] = trial.suggest_loguniform('min_child_weight', 1e-5, 20)
    trial_param['colsample_bytree'] = trial.suggest_uniform('colsample_bytree', 0.01, 1.0)
    #trial_param['subsample_freq'] = trial.suggest_int('subsample_freq', 0, 5)
    #trial_param['learning_rate'] = trial.suggest_loguniform('learning_rate', 0.001, 0.3)

    # Row subsampling is only sampled when bagging is switched on.
    if trial_param['subsample_freq'] > 0:
        trial_param['subsample'] = trial.suggest_uniform('subsample', 0.01, 1.0)

    # Booster-specific parameters, sampled only for the matching boosting type.
    if trial_param['boosting_type'] == 'dart':
        trial_param['drop_rate'] = trial.suggest_loguniform('drop_rate', 1e-8, 1.0)
        trial_param['skip_drop'] = trial.suggest_loguniform('skip_drop', 1e-8, 1.0)
    if trial_param['boosting_type'] == 'goss':
        trial_param['top_rate'] = trial.suggest_uniform('top_rate', 0.0, 1.0)
        trial_param['other_rate'] = trial.suggest_uniform(
            'other_rate', 0.0, 1.0 - trial_param['top_rate'])

    score = cv(trial_param, X, y)
    return 1-score

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Relative frequency of each target value in the training data.
train_df['target'].value_counts(normalize=True)

In [None]:
# Preview the first rows of the test frame.
test_df.head()

In [None]:
# Column dtypes of the training frame.
train_df.dtypes

In [None]:
# The ten training columns with the most missing values, largest first.
missing_per_column = train_df.isna().sum()
missing_per_column.sort_values(ascending=False).head(10)

In [None]:
# Compare the train columns (without 'target') against the test columns and
# collect the names that exist only in train. Nothing is dropped here.
train_feature_cols = train_df.drop('target', axis=1).columns
same_columns = train_feature_cols.tolist() == test_df.columns.tolist()
print('\nTrain and Test Datasets have the same columns?:', same_columns)

only_in_train = set(train_feature_cols) - set(test_df.columns)
print("\nVariables not in test but in train : ", only_in_train)
dif = list(only_in_train)

In [None]:
# Prepare data: split the target off and remove the identifier column from
# both feature frames.
y = train_df['target']
X = train_df.drop(columns=['ID_code', 'target'])
X_test = test_df.drop(columns=['ID_code'])
print(len(X), len(X_test))

In [None]:
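# Disabled exploration: explained-variance plot for a 200-component PCA.
# To re-enable, uncomment the lines below and import PCA first
# (`from sklearn.decomposition import PCA`); it is not imported in this notebook.
#pca = PCA(n_components=200)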
#pca.fit(X.values)
#plt.bar([n for n in range(1, len(pca.explained_variance_ratio_)+1)], pca.explained_variance_ratio_)
#ev_ratio = pca.explained_variance_ratio_
#ev_ratio = np.hstack([0,ev_ratio.cumsum()])
#plt.plot(ev_ratio)
#plt.show()

In [None]:
# Parameter Tuning
# Seed the sampler (same seed as the CV splitter and the SVD) so that a
# Restart & Run All proposes the same sequence of trials.
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=7485))
study.optimize(objective, n_trials=100)

print('Number of finished trials: {}'.format(len(study.trials)))

print('Best trial:')
trial = study.best_trial
# Kept as a separate name: the final-fit cell merges it into `param`.
best_params = study.best_params

# The objective returns 1 - AUC, so lower is better.
print('  Value: {}'.format(trial.value))

print('  Params: ')
for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

In [None]:
# Building model using BEST parameters, then predict test data
print("svd_lgb_model ...")
# Merge the tuned values into the base settings and raise verbosity to 1 for
# the final fit; passing X_test makes `cv` return test predictions.
param.update(best_params, verbosity=1)
prediction = cv(param, X, y, X_test)
print("...Done")

In [None]:
# Save
# Build the submission in its own frame instead of adding a column to
# `test_df`: mutating `test_df` would make a re-run of the "Prepare data" cell
# put the predictions into X_test.
submission = pd.DataFrame({
    'ID_code': test_df['ID_code'].values,
    'target': prediction,
})
# UTC timestamp without spaces or colons so the name is valid on every
# common filesystem (colons are not allowed in Windows file names).
submission_string = 'svd_gbm_' + strftime("%Y-%m-%d_%H-%M-%S", gmtime()) + '.csv'
submission.to_csv(submission_string, index=False)