# Pipeline with LGBM using SMOTE (Oversampling)

* Kaggle Score: 0.99999

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import requests
import warnings
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt
import seaborn as sns

from IPython.display import Image
warnings.filterwarnings('ignore')

In [2]:
train_df = pd.read_csv("dev.csv")
test_df = pd.read_csv("compete.csv")

In [3]:
train_df = train_df.drop(['is_host_login', 'num_outbound_cmds'], axis=1);
test_df = test_df.drop(['is_host_login', 'num_outbound_cmds'], axis=1);

In [4]:
train_df = pd.get_dummies(train_df, columns=['protocol_type'])
test_df = pd.get_dummies(test_df, columns=['protocol_type'])

In [5]:
from sklearn import preprocessing

cat_cols = ['service', 'flag']
for col in cat_cols:
    if col in train_df.columns:
        le = preprocessing.LabelEncoder()
        le.fit(list(train_df[col].astype(str).values) + list(test_df[col].astype(str).values))
        train_df[col] = le.transform(list(train_df[col].astype(str).values))
        test_df[col] = le.transform(list(test_df[col].astype(str).values))   

In [6]:
numerical_features = list(train_df.columns[train_df.dtypes != object].values[:-1])
categorical_features = list(train_df.columns[train_df.dtypes == object].values)

corr_table = train_df.corr()
triu = corr_table.where(np.triu(np.ones(corr_table.shape) ,k=1).astype(np.bool))
to_drop = [feat for feat in triu.columns if any(triu[feat] > 0.95)]

train_df = train_df.drop(to_drop, axis=1)

for feat in to_drop:
    if feat in categorical_features:
        categorical_features.remove(feat)
    else:
        numerical_features.remove(feat)

print(f'\nFeatures dropped: {to_drop}')
# plt.figure(figsize=(50, 30))
# _ = sns.heatmap(corr_table, annot=True, fmt='.2f')


Features dropped: ['num_root', 'srv_serror_rate', 'srv_rerror_rate', 'dst_host_same_srv_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'protocol_type_icmp']


In [7]:
train_df = train_df.drop(['num_shells', 'num_file_creations', 'urgent'], axis=1)
test_df = test_df.drop(['num_shells', 'num_file_creations', 'urgent'], axis=1)

In [8]:
x = train_df.drop('class', axis=1).values
x_test = test_df.drop('Id', axis=1).values

In [9]:
from sklearn.preprocessing import StandardScaler

x_scaled = StandardScaler().fit_transform(x)
x_test_scaled = StandardScaler().fit_transform(x_test)

df = pd.DataFrame(x_scaled)
df_test = pd.DataFrame(x_test_scaled)

In [10]:
import numpy as np
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
with_pca = pca.fit_transform(df)
with_pca_test = pca.fit_transform(df_test)

In [11]:
df_final = pd.DataFrame(with_pca)
df_final_test = pd.DataFrame(with_pca_test)

In [12]:
train_df['c1'] = df_final[0]
train_df['c2'] = df_final[1]

In [13]:
test_df['c1'] = df_final_test[0]
test_df['c2'] = df_final_test[1]

In [14]:
X = train_df.drop('class', axis=1)
y = train_df['class']

In [15]:
from imblearn.over_sampling import SMOTE 

oversample = SMOTE()
X_train_over, y_train_over = oversample.fit_resample(X, y)

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_train_over, y_train_over, test_size=0.3)

In [17]:
#basic tools 
import os
import numpy as np
import pandas as pd
import warnings

#tuning hyperparameters
from bayes_opt import BayesianOptimization
from skopt  import BayesSearchCV 

#graph, plots
import matplotlib.pyplot as plt
import seaborn as sns

#building models
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
import time
import sys

#metrics 
from sklearn.metrics import roc_auc_score, roc_curve
warnings.simplefilter(action='ignore', category=FutureWarning)

### Bayesian optimization to find the best parameters for LGBM

In [18]:
%%time

def bayes_parameter_opt_lgb(X, y, init_round=15, opt_round=25, n_folds=3, random_seed=6,n_estimators=10000, output_process=False):
    # prepare data
    train_data = lgb.Dataset(data=X, label=y, free_raw_data=False)
    # parameters
    def lgb_eval(learning_rate,num_leaves, feature_fraction, bagging_fraction, max_depth, max_bin, min_data_in_leaf,min_sum_hessian_in_leaf,subsample):
        params = {'application':'binary', 'metric':'auc'}
        params['learning_rate'] = max(min(learning_rate, 1), 0)
        params["num_leaves"] = int(round(num_leaves))
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['max_depth'] = int(round(max_depth))
        params['max_bin'] = int(round(max_depth))
        params['min_data_in_leaf'] = int(round(min_data_in_leaf))
        params['min_sum_hessian_in_leaf'] = min_sum_hessian_in_leaf
        params['subsample'] = max(min(subsample, 1), 0)
        
        cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed, stratified=True, verbose_eval =200, metrics=['auc'])
        return max(cv_result['auc-mean'])
     
    lgbBO = BayesianOptimization(lgb_eval, {'learning_rate': (0.01, 1.0),
                                            'num_leaves': (24, 80),
                                            'feature_fraction': (0.1, 0.9),
                                            'bagging_fraction': (0.8, 1),
                                            'max_depth': (5, 30),
                                            'max_bin':(20,90),
                                            'min_data_in_leaf': (20, 80),
                                            'min_sum_hessian_in_leaf':(0,100),
                                           'subsample': (0.01, 1.0)}, random_state=200)

    

    
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)
    
    model_auc=[]
    for model in range(len( lgbBO.res)):
        model_auc.append(lgbBO.res[model]['target'])
    
    # return best parameters
    return lgbBO.res[pd.Series(model_auc).idxmax()]['target'],lgbBO.res[pd.Series(model_auc).idxmax()]['params']

opt_params = bayes_parameter_opt_lgb(X, y, init_round=5, opt_round=10, n_folds=3, random_seed=6,n_estimators=10000)

|   iter    |  target   | baggin... | featur... | learni... |  max_bin  | max_depth | min_da... | min_su... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
[LightGBM] [Info] Number of positive: 185147, number of negative: 45396
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 461
[LightGBM] [Info] Number of data points in the train set: 230543, number of used features: 29
[LightGBM] [Info] Number of positive: 185146, number of negative: 45397
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 461
[LightGBM] [Info] Number of data points in the train set: 230543, number of used features: 29
[LightGBM] [Info] Number of positive: 185147, number of negative: 45397
You can set `force



| [0m 1       [0m | [0m 1.0     [0m | [0m 0.9895  [0m | [0m 0.2812  [0m | [0m 0.5985  [0m | [0m 49.98   [0m | [0m 24.1    [0m | [0m 20.17   [0m | [0m 35.74   [0m | [0m 74.94   [0m | [0m 0.4615  [0m |
[LightGBM] [Info] Number of positive: 185147, number of negative: 45396
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 274
[LightGBM] [Info] Number of data points in the train set: 230543, number of used features: 27


[LightGBM] [Info] Number of positive: 185146, number of negative: 45397
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 274
[LightGBM] [Info] Number of data points in the train set: 230543, number of used features: 27
[LightGBM] [Info] Number of positive: 185147, number of negative: 45397
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 274
[LightGBM] [Info] Number of data points in the train set: 230544, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.803091 -> initscore=1.405726
[LightGBM] [Info] Start training from score 1.405726
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.803087 -> initscore=1.405699
[LightGBM] [Info] Start training from score 1.405699
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.803087 -> initscore=1.405704
[LightGBM] [Inf



| [95m 2       [0m | [95m 1.0     [0m | [95m 0.9964  [0m | [95m 0.7939  [0m | [95m 0.9862  [0m | [95m 84.63   [0m | [95m 12.59   [0m | [95m 70.77   [0m | [95m 12.12   [0m | [95m 67.99   [0m | [95m 0.258   [0m |
[LightGBM] [Info] Number of positive: 185147, number of negative: 45396
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 230543, number of used features: 27
[LightGBM] [Info] Number of positive: 185146, number of negative: 45397
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 230543, number of used features: 27
[LightGBM] [Info] Number of positive: 185147, number of negative: 45397
You can set `force_row_wise=true` to remove the overh





| [0m 3       [0m | [0m 1.0     [0m | [0m 0.8192  [0m | [0m 0.8548  [0m | [0m 0.8278  [0m | [0m 56.28   [0m | [0m 26.84   [0m | [0m 54.7    [0m | [0m 45.01   [0m | [0m 62.09   [0m | [0m 0.4252  [0m |
[LightGBM] [Info] Number of positive: 185147, number of negative: 45396
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 456
[LightGBM] [Info] Number of data points in the train set: 230543, number of used features: 27
[LightGBM] [Info] Number of positive: 185146, number of negative: 45397
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 456
[LightGBM] [Info] Number of data points in the train set: 230543, number of used features: 27
[LightGBM] [Info] Number of positive: 185147, number of negative: 45397
You can set `force_row_wise=true` to remove the overhead.
And if



| [0m 4       [0m | [0m 1.0     [0m | [0m 0.9281  [0m | [0m 0.5869  [0m | [0m 0.1144  [0m | [0m 87.62   [0m | [0m 23.97   [0m | [0m 60.78   [0m | [0m 32.93   [0m | [0m 25.48   [0m | [0m 0.8056  [0m |
[LightGBM] [Info] Number of positive: 185147, number of negative: 45396
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 217
[LightGBM] [Info] Number of data points in the train set: 230543, number of used features: 27
[LightGBM] [Info] Number of positive: 185146, number of negative: 45397
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 217
[LightGBM] [Info] Number of data points in the train set: 230543, number of used features: 27
[LightGBM] [Info] Number of positive: 185147, number of negative: 45397
You can set `force_row_wise=true` to remove the overhead.
And if





| [0m 5       [0m | [0m 1.0     [0m | [0m 0.9946  [0m | [0m 0.3264  [0m | [0m 0.6526  [0m | [0m 38.59   [0m | [0m 9.692   [0m | [0m 45.14   [0m | [0m 66.6    [0m | [0m 52.98   [0m | [0m 0.8559  [0m |
[LightGBM] [Info] Number of positive: 185147, number of negative: 45396
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 470
[LightGBM] [Info] Number of data points in the train set: 230543, number of used features: 27
[LightGBM] [Info] Number of positive: 185146, number of negative: 45397
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 470
[LightGBM] [Info] Number of data points in the train set: 230543, number of used features: 27
[LightGBM] [Info] Number of positive: 185147, number of negative: 45397
You can set `force_row_wise=true` to remove the overhead.
And if

| [0m 6       [0m | [0m 1.0     [0m | [0m 0.8333  [0m | [0m 0.7175  [0m | [0m 0.06083 [0m | [0m 87.46   [0m | [0m 25.17   [0m | [0m 62.81   [0m | [0m 32.9    [0m | [0m 29.53   [0m | [0m 0.2623  [0m |
[LightGBM] [Info] Number of positive: 185147, number of negative: 45396
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 308
[LightGBM] [Info] Number of data points in the train set: 230543, number of used features: 27
[LightGBM] [Info] Number of positive: 185146, number of negative: 45397
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 308
[LightGBM] [Info] Number of data points in the train set: 230543, number of used features: 27
[LightGBM] [Info] Number of positive: 185147, number of negative: 45397
You can set `force_row_wise=true` to remove the overhead.
And if





| [0m 7       [0m | [0m 1.0     [0m | [0m 0.9094  [0m | [0m 0.4572  [0m | [0m 0.8601  [0m | [0m 85.95   [0m | [0m 14.68   [0m | [0m 54.13   [0m | [0m 43.72   [0m | [0m 59.64   [0m | [0m 0.8433  [0m |
[LightGBM] [Info] Number of positive: 185147, number of negative: 45396
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 339
[LightGBM] [Info] Number of data points in the train set: 230543, number of used features: 27
[LightGBM] [Info] Number of positive: 185146, number of negative: 45397
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 339
[LightGBM] [Info] Number of data points in the train set: 230543, number of used features: 27
[LightGBM] [Info] Number of positive: 185147, number of negative: 45397
You can set `force_row_wise=true` to remove the overhead.
And if





| [95m 8       [0m | [95m 1.0     [0m | [95m 0.806   [0m | [95m 0.4615  [0m | [95m 0.274   [0m | [95m 87.69   [0m | [95m 17.28   [0m | [95m 74.95   [0m | [95m 14.08   [0m | [95m 68.69   [0m | [95m 0.9412  [0m |
[LightGBM] [Info] Number of positive: 185147, number of negative: 45396
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 423
[LightGBM] [Info] Number of data points in the train set: 230543, number of used features: 27
[LightGBM] [Info] Number of positive: 185146, number of negative: 45397
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 423
[LightGBM] [Info] Number of data points in the train set: 230543, number of used features: 27
[LightGBM] [Info] Number of positive: 185147, number of negative: 45397
You can set `force_row_wise=true` to remove the overh





| [0m 10      [0m | [0m 1.0     [0m | [0m 0.9023  [0m | [0m 0.6931  [0m | [0m 0.4663  [0m | [0m 59.03   [0m | [0m 20.47   [0m | [0m 48.44   [0m | [0m 94.19   [0m | [0m 61.48   [0m | [0m 0.5169  [0m |
[LightGBM] [Info] Number of positive: 185147, number of negative: 45396
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 372
[LightGBM] [Info] Number of data points in the train set: 230543, number of used features: 27
[LightGBM] [Info] Number of positive: 185146, number of negative: 45397
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 372
[LightGBM] [Info] Number of data points in the train set: 230543, number of used features: 27
[LightGBM] [Info] Number of positive: 185147, number of negative: 45397
You can set `force_row_wise=true` to remove the overhead.
And if

| [0m 11      [0m | [0m 1.0     [0m | [0m 0.872   [0m | [0m 0.1299  [0m | [0m 0.01683 [0m | [0m 85.55   [0m | [0m 19.09   [0m | [0m 75.8    [0m | [0m 23.76   [0m | [0m 75.17   [0m | [0m 0.8197  [0m |
[LightGBM] [Info] Number of positive: 185147, number of negative: 45396
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 407
[LightGBM] [Info] Number of data points in the train set: 230543, number of used features: 27
[LightGBM] [Info] Number of positive: 185146, number of negative: 45397
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 407
[LightGBM] [Info] Number of data points in the train set: 230543, number of used features: 27
[LightGBM] [Info] Number of positive: 185147, number of negative: 45397
You can set `force_row_wise=true` to remove the overhead.
And if





| [0m 12      [0m | [0m 1.0     [0m | [0m 0.8293  [0m | [0m 0.762   [0m | [0m 0.9211  [0m | [0m 88.36   [0m | [0m 20.58   [0m | [0m 71.13   [0m | [0m 12.98   [0m | [0m 65.52   [0m | [0m 0.6965  [0m |
[LightGBM] [Info] Number of positive: 185147, number of negative: 45396
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 324
[LightGBM] [Info] Number of data points in the train set: 230543, number of used features: 27
[LightGBM] [Info] Number of positive: 185146, number of negative: 45397
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 324
[LightGBM] [Info] Number of data points in the train set: 230543, number of used features: 27
[LightGBM] [Info] Number of positive: 185147, number of negative: 45397
You can set `force_row_wise=true` to remove the overhead.
And if





| [0m 13      [0m | [0m 1.0     [0m | [0m 0.8505  [0m | [0m 0.5743  [0m | [0m 0.4817  [0m | [0m 81.54   [0m | [0m 15.97   [0m | [0m 72.88   [0m | [0m 7.743   [0m | [0m 63.47   [0m | [0m 0.7332  [0m |
[LightGBM] [Info] Number of positive: 185147, number of negative: 45396
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 308
[LightGBM] [Info] Number of data points in the train set: 230543, number of used features: 27
[LightGBM] [Info] Number of positive: 185146, number of negative: 45397
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 308
[LightGBM] [Info] Number of data points in the train set: 230543, number of used features: 27
[LightGBM] [Info] Number of positive: 185147, number of negative: 45397
You can set `force_row_wise=true` to remove the overhead.
And if



| [0m 14      [0m | [0m 1.0     [0m | [0m 0.8888  [0m | [0m 0.8731  [0m | [0m 0.7     [0m | [0m 89.52   [0m | [0m 15.08   [0m | [0m 72.11   [0m | [0m 4.88    [0m | [0m 67.58   [0m | [0m 0.9589  [0m |
[LightGBM] [Info] Number of positive: 185147, number of negative: 45396
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 355
[LightGBM] [Info] Number of data points in the train set: 230543, number of used features: 27


[LightGBM] [Info] Number of positive: 185146, number of negative: 45397
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 355
[LightGBM] [Info] Number of data points in the train set: 230543, number of used features: 27
[LightGBM] [Info] Number of positive: 185147, number of negative: 45397
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 355
[LightGBM] [Info] Number of data points in the train set: 230544, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.803091 -> initscore=1.405726
[LightGBM] [Info] Start training from score 1.405726
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.803087 -> initscore=1.405699
[LightGBM] [Info] Start training from score 1.405699
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.803087 -> initscore=1.405704
[LightGBM] [Inf



| [95m 15      [0m | [95m 1.0     [0m | [95m 0.9568  [0m | [95m 0.7486  [0m | [95m 0.6521  [0m | [95m 85.55   [0m | [95m 18.17   [0m | [95m 78.75   [0m | [95m 11.33   [0m | [95m 58.7    [0m | [95m 0.7489  [0m |
Wall time: 47.1 s


* Best parameters

In [19]:
opt_params[1]["num_leaves"] = int(round(opt_params[1]["num_leaves"]))
opt_params[1]['max_depth'] = int(round(opt_params[1]['max_depth']))
opt_params[1]['min_data_in_leaf'] = int(round(opt_params[1]['min_data_in_leaf']))
opt_params[1]['max_bin'] = int(round(opt_params[1]['max_bin']))
opt_params[1]['objective']='binary'
opt_params[1]['metric']='auc'
opt_params[1]['is_unbalance']=True
opt_params[1]['boost_from_average']=False
opt_params=opt_params[1]
opt_params

{'bagging_fraction': 0.9567708999269537,
 'feature_fraction': 0.748594348895611,
 'learning_rate': 0.6521421930251328,
 'max_bin': 86,
 'max_depth': 18,
 'min_data_in_leaf': 79,
 'min_sum_hessian_in_leaf': 11.32931118035695,
 'num_leaves': 59,
 'subsample': 0.7489382632506367,
 'objective': 'binary',
 'metric': 'auc',
 'is_unbalance': True,
 'boost_from_average': False}

In [20]:
%%time 
from sklearn.model_selection import StratifiedKFold

target = y
features= [c for c in train_df.columns if c not in ['class']]


folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=31416)
oof = np.zeros(len(train_df))
predictions = np.zeros(len(test_df))
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values, target.values)):
    print("Fold {}".format(fold_))
    trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train_df.iloc[val_idx][features], label=target.iloc[val_idx])

    num_round = 15000
    clf = lgb.train(opt_params, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=500, early_stopping_rounds = 250)
    oof[val_idx] = clf.predict(train_df.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(test_df[features], num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))

Fold 0
[LightGBM] [Info] Number of positive: 249948, number of negative: 61285
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1320
[LightGBM] [Info] Number of data points in the train set: 311233, number of used features: 27
Training until validation scores don't improve for 250 rounds




Early stopping, best iteration is:
[16]	training's auc: 1	valid_1's auc: 0.999999
Fold 1
[LightGBM] [Info] Number of positive: 249948, number of negative: 61285
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1321
[LightGBM] [Info] Number of data points in the train set: 311233, number of used features: 27
Training until validation scores don't improve for 250 rounds






Early stopping, best iteration is:
[103]	training's auc: 1	valid_1's auc: 1
Fold 2
[LightGBM] [Info] Number of positive: 249948, number of negative: 61285
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1325
[LightGBM] [Info] Number of data points in the train set: 311233, number of used features: 27
Training until validation scores don't improve for 250 rounds








Early stopping, best iteration is:
[211]	training's auc: 1	valid_1's auc: 1
Fold 3
[LightGBM] [Info] Number of positive: 249948, number of negative: 61285
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1323
[LightGBM] [Info] Number of data points in the train set: 311233, number of used features: 27
Training until validation scores don't improve for 250 rounds






Early stopping, best iteration is:
[147]	training's auc: 1	valid_1's auc: 1
Fold 4
[LightGBM] [Info] Number of positive: 249948, number of negative: 61285
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1331
[LightGBM] [Info] Number of data points in the train set: 311233, number of used features: 27
Training until validation scores don't improve for 250 rounds






Early stopping, best iteration is:
[26]	training's auc: 1	valid_1's auc: 1
Fold 5
[LightGBM] [Info] Number of positive: 249948, number of negative: 61286
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1326
[LightGBM] [Info] Number of data points in the train set: 311234, number of used features: 27
Training until validation scores don't improve for 250 rounds




Early stopping, best iteration is:
[12]	training's auc: 0.999999	valid_1's auc: 0.999998
Fold 6
[LightGBM] [Info] Number of positive: 249948, number of negative: 61286
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1323
[LightGBM] [Info] Number of data points in the train set: 311234, number of used features: 27
Training until validation scores don't improve for 250 rounds








Early stopping, best iteration is:
[200]	training's auc: 1	valid_1's auc: 0.999999
Fold 7
[LightGBM] [Info] Number of positive: 249948, number of negative: 61286
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1328
[LightGBM] [Info] Number of data points in the train set: 311234, number of used features: 27
Training until validation scores don't improve for 250 rounds








[500]	training's auc: 1	valid_1's auc: 1






Early stopping, best iteration is:
[525]	training's auc: 1	valid_1's auc: 1
Fold 8
[LightGBM] [Info] Number of positive: 249948, number of negative: 61286
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1323
[LightGBM] [Info] Number of data points in the train set: 311234, number of used features: 27
Training until validation scores don't improve for 250 rounds




Early stopping, best iteration is:
[23]	training's auc: 1	valid_1's auc: 0.999998
Fold 9
[LightGBM] [Info] Number of positive: 249948, number of negative: 61286
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1328
[LightGBM] [Info] Number of data points in the train set: 311234, number of used features: 27
Training until validation scores don't improve for 250 rounds








[500]	training's auc: 1	valid_1's auc: 1








Early stopping, best iteration is:
[664]	training's auc: 1	valid_1's auc: 1
CV score: 1.00000 
Wall time: 1min


In [21]:
test_id = test_df.Id.values
test_df = test_df.drop("Id", axis=1)
test_df = test_df.drop(to_drop, axis=1)

In [22]:
submit = pd.DataFrame({'Id': test_id, 'class':predictions})
submit.to_csv('lgbm_BayeSmotePca.csv', index=False)