In [1]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score

import time, warnings, json, gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb

In [2]:
def verbalise_dataset(train, test):
    print('Train shape:' + str(train.shape))
    print('Test shape:' + str(test.shape))
    print()

In [3]:
def load_file(filepath):

    start_time = time.time()
    df = pd.read_csv(filepath, low_memory=False)
    elapsed_time = time.time() - start_time
    print("Dataset loaded, time elapsed: " + str(elapsed_time))

    return df

train = load_file('../data/train.csv')  # (76020, 371)
test = load_file('../data/test.csv')  # (75818, 370)
verbalise_dataset(train, test)

Dataset loaded, time elapsed: 15.59230661392212
Dataset loaded, time elapsed: 17.333653926849365
Train shape:(76020, 371)
Test shape:(75818, 370)



In [4]:
def remove_duplicate_col(train, test):

    print('Removing duplicated features')
    output = []
    columns = train.columns  # list of headers
    for i in range(len(columns)-1):
        for j in range(i+1,len(columns)):
            if np.array_equal(train[columns[i]].values, train[columns[j]].values) and columns[j] not in output:
                    output.append(columns[j])
    
    train = train.drop(output, axis=1)
    test = test.drop(output, axis=1)

    return train, test


clean_train, clean_test = remove_duplicate_col(train, test)
verbalise_dataset(clean_train, clean_test)

Removing duplicated features
Train shape:(76020, 309)
Test shape:(75818, 308)



In [5]:
def remove_constant_col(train, test):

    print('Removing constant features')
    columns = []
    for col in train.columns:
        if train[col].std() == 0:
            columns.append(col)

    train = train.drop(columns, axis=1)
    test = test.drop(columns, axis=1)

    return train, test

clean_train, clean_test = remove_constant_col(clean_train, clean_test)
verbalise_dataset(clean_train, clean_test)

Removing constant features
Train shape:(76020, 308)
Test shape:(75818, 307)



In [6]:
# split data into train and test
X = clean_train.drop(["TARGET","ID"],axis=1)
Y = clean_train['TARGET'].values

test_id = clean_test.ID
test = clean_test.drop(["ID"],axis=1)
seeds = [2534324, 13454236, 34623, 1367457, 12321]

In [8]:
def lgbm_evaluate(**params):
    warnings.simplefilter('ignore')

    params['num_leaves'] = int(params['num_leaves'])
    params['max_depth'] = int(params['max_depth'])

    test_pred_proba = np.zeros(len(X))
    
    n_splits = 10
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2534324)

    for train_idx, valid_idx in skf.split(X, Y):

        # Feature selection
        clf = lgb.LGBMClassifier()
        selector = clf.fit(X, Y)
        fs = SelectFromModel(selector, prefit=True)
        train_df = fs.transform(X)
        test_df = fs.transform(test)

        train_x, train_y = train_df[train_idx], Y[train_idx]
        valid_x, valid_y = train_df[valid_idx], Y[valid_idx]

        clf = lgb.LGBMClassifier(**params, n_estimators=100, nthread=4)
        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric='auc', verbose=False, early_stopping_rounds=100)

        test_pred_proba[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]

        del train_x, train_y, valid_x, valid_y
        gc.collect()

    return roc_auc_score(Y, test_pred_proba)


if __name__ == "__main__":

    # set to ignore warnings
    warnings.simplefilter(action='ignore', category=FutureWarning)
    
    params = {
        'learning_rate': (.01, .02),
        'num_leaves': (20, 35),
        'colsample_bytree': (0.8, 1),
        'subsample': (0.8, 1),
        'max_depth': (7, 9),
        'reg_alpha': (.03, .05),
        'reg_lambda': (.06, .08),
        'min_split_gain': (.01, .03),
        'min_child_weight': (38, 60)
    }

    bo = BayesianOptimization(lgbm_evaluate, params)
    bo.maximize(init_points=5, n_iter=10)
    best_params = bo.max['params']
    
    best_params['num_leaves'] = int(best_params['num_leaves'])

    print(best_params)
    # output copied and stored as params.json
    
    import json
    json.dump(best_params, open('params0.json', 'w'))


|   iter    |  target   | colsam... | learni... | max_depth | min_ch... | min_sp... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.8342  [0m | [0m 0.9345  [0m | [0m 0.01596 [0m | [0m 7.028   [0m | [0m 54.64   [0m | [0m 0.02701 [0m | [0m 21.35   [0m | [0m 0.03943 [0m | [0m 0.06914 [0m | [0m 0.8195  [0m |
| [95m 2       [0m | [95m 0.8345  [0m | [95m 0.8545  [0m | [95m 0.01172 [0m | [95m 7.93    [0m | [95m 48.57   [0m | [95m 0.02514 [0m | [95m 34.93   [0m | [95m 0.0327  [0m | [95m 0.07505 [0m | [95m 0.8822  [0m |
| [95m 3       [0m | [95m 0.8351  [0m | [95m 0.811   [0m | [95m 0.01541 [0m | [95m 8.35    [0m | [95m 57.07   [0m | [95m 0.02991 [0m | [95m 24.16   [0m | [95m 0.03105 [0m | [95m 0.07476 [0m | [95m 0.9812  [0m |
| [95m 4       [0m | [95m 0.8362  [0m | 

In [10]:
def lgbm_evaluate(**params):
    warnings.simplefilter('ignore')

    params['num_leaves'] = int(params['num_leaves'])
    params['max_depth'] = int(params['max_depth'])

    test_pred_proba = np.zeros(len(X))
    
    n_splits = 10
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=13454236)

    for train_idx, valid_idx in skf.split(X, Y):

        # Feature selection
        clf = lgb.LGBMClassifier()
        selector = clf.fit(X, Y)
        fs = SelectFromModel(selector, prefit=True)
        train_df = fs.transform(X)
        test_df = fs.transform(test)

        train_x, train_y = train_df[train_idx], Y[train_idx]
        valid_x, valid_y = train_df[valid_idx], Y[valid_idx]

        clf = lgb.LGBMClassifier(**params, n_estimators=100, nthread=4)
        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric='auc', verbose=False, early_stopping_rounds=100)

        test_pred_proba[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]

        del train_x, train_y, valid_x, valid_y
        gc.collect()

    return roc_auc_score(Y, test_pred_proba)


if __name__ == "__main__":

    # set to ignore warnings
    warnings.simplefilter(action='ignore', category=FutureWarning)
    
    params = {
        'learning_rate': (.01, .02),
        'num_leaves': (20, 35),
        'colsample_bytree': (0.8, 1),
        'subsample': (0.8, 1),
        'max_depth': (7, 9),
        'reg_alpha': (.03, .05),
        'reg_lambda': (.06, .08),
        'min_split_gain': (.01, .03),
        'min_child_weight': (38, 60)
    }

    bo = BayesianOptimization(lgbm_evaluate, params)
    bo.maximize(init_points=5, n_iter=10)
    best_params = bo.max['params']
    
    best_params['num_leaves'] = int(best_params['num_leaves'])

    print(best_params)
    # output copied and stored as params.json
    
    import json
    json.dump(best_params, open('params1.json', 'w'))


|   iter    |  target   | colsam... | learni... | max_depth | min_ch... | min_sp... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.8357  [0m | [0m 0.9047  [0m | [0m 0.01831 [0m | [0m 7.939   [0m | [0m 50.29   [0m | [0m 0.01127 [0m | [0m 29.09   [0m | [0m 0.04976 [0m | [0m 0.07681 [0m | [0m 0.8951  [0m |
| [0m 2       [0m | [0m 0.8355  [0m | [0m 0.804   [0m | [0m 0.01725 [0m | [0m 8.783   [0m | [0m 51.81   [0m | [0m 0.0104  [0m | [0m 26.1    [0m | [0m 0.04386 [0m | [0m 0.06591 [0m | [0m 0.9893  [0m |
| [0m 3       [0m | [0m 0.832   [0m | [0m 0.9728  [0m | [0m 0.01259 [0m | [0m 7.008   [0m | [0m 49.28   [0m | [0m 0.02042 [0m | [0m 22.09   [0m | [0m 0.03606 [0m | [0m 0.07453 [0m | [0m 0.8347  [0m |
| [0m 4       [0m | [0m 0.8353  [0m | [0m 0.8257  [0m | [0m

In [11]:
def lgbm_evaluate(**params):
    warnings.simplefilter('ignore')

    params['num_leaves'] = int(params['num_leaves'])
    params['max_depth'] = int(params['max_depth'])

    test_pred_proba = np.zeros(len(X))
    
    n_splits = 10
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=34623)

    for train_idx, valid_idx in skf.split(X, Y):

        # Feature selection
        clf = lgb.LGBMClassifier()
        selector = clf.fit(X, Y)
        fs = SelectFromModel(selector, prefit=True)
        train_df = fs.transform(X)
        test_df = fs.transform(test)

        train_x, train_y = train_df[train_idx], Y[train_idx]
        valid_x, valid_y = train_df[valid_idx], Y[valid_idx]

        clf = lgb.LGBMClassifier(**params, n_estimators=100, nthread=4)
        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric='auc', verbose=False, early_stopping_rounds=100)

        test_pred_proba[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]

        del train_x, train_y, valid_x, valid_y
        gc.collect()

    return roc_auc_score(Y, test_pred_proba)


if __name__ == "__main__":

    # set to ignore warnings
    warnings.simplefilter(action='ignore', category=FutureWarning)
    
    params = {
        'learning_rate': (.01, .02),
        'num_leaves': (20, 35),
        'colsample_bytree': (0.8, 1),
        'subsample': (0.8, 1),
        'max_depth': (7, 9),
        'reg_alpha': (.03, .05),
        'reg_lambda': (.06, .08),
        'min_split_gain': (.01, .03),
        'min_child_weight': (38, 60)
    }

    bo = BayesianOptimization(lgbm_evaluate, params)
    bo.maximize(init_points=5, n_iter=10)
    best_params = bo.max['params']
    
    best_params['num_leaves'] = int(best_params['num_leaves'])

    print(best_params)
    # output copied and stored as params.json
    
    import json
    json.dump(best_params, open('params2.json', 'w'))


|   iter    |  target   | colsam... | learni... | max_depth | min_ch... | min_sp... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.8347  [0m | [0m 0.8414  [0m | [0m 0.01835 [0m | [0m 7.551   [0m | [0m 59.61   [0m | [0m 0.01476 [0m | [0m 32.0    [0m | [0m 0.03484 [0m | [0m 0.06161 [0m | [0m 0.8317  [0m |
| [0m 2       [0m | [0m 0.8346  [0m | [0m 0.8574  [0m | [0m 0.01535 [0m | [0m 7.52    [0m | [0m 39.4    [0m | [0m 0.02904 [0m | [0m 25.05   [0m | [0m 0.04591 [0m | [0m 0.06508 [0m | [0m 0.9961  [0m |
| [0m 3       [0m | [0m 0.8344  [0m | [0m 0.9869  [0m | [0m 0.0183  [0m | [0m 8.44    [0m | [0m 39.95   [0m | [0m 0.01208 [0m | [0m 23.59   [0m | [0m 0.032   [0m | [0m 0.07836 [0m | [0m 0.9315  [0m |
| [0m 4       [0m | [0m 0.8345  [0m | [0m 0.9408  [0m | [0m

In [12]:
def lgbm_evaluate(**params):
    warnings.simplefilter('ignore')

    params['num_leaves'] = int(params['num_leaves'])
    params['max_depth'] = int(params['max_depth'])

    test_pred_proba = np.zeros(len(X))
    
    n_splits = 10
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1367457)

    for train_idx, valid_idx in skf.split(X, Y):

        # Feature selection
        clf = lgb.LGBMClassifier()
        selector = clf.fit(X, Y)
        fs = SelectFromModel(selector, prefit=True)
        train_df = fs.transform(X)
        test_df = fs.transform(test)

        train_x, train_y = train_df[train_idx], Y[train_idx]
        valid_x, valid_y = train_df[valid_idx], Y[valid_idx]

        clf = lgb.LGBMClassifier(**params, n_estimators=100, nthread=4)
        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric='auc', verbose=False, early_stopping_rounds=100)

        test_pred_proba[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]

        del train_x, train_y, valid_x, valid_y
        gc.collect()

    return roc_auc_score(Y, test_pred_proba)


if __name__ == "__main__":

    # set to ignore warnings
    warnings.simplefilter(action='ignore', category=FutureWarning)
    
    params = {
        'learning_rate': (.01, .02),
        'num_leaves': (20, 35),
        'colsample_bytree': (0.8, 1),
        'subsample': (0.8, 1),
        'max_depth': (7, 9),
        'reg_alpha': (.03, .05),
        'reg_lambda': (.06, .08),
        'min_split_gain': (.01, .03),
        'min_child_weight': (38, 60)
    }

    bo = BayesianOptimization(lgbm_evaluate, params)
    bo.maximize(init_points=5, n_iter=10)
    best_params = bo.max['params']
    
    best_params['num_leaves'] = int(best_params['num_leaves'])

    print(best_params)
    # output copied and stored as params.json
    
    import json
    json.dump(best_params, open('params3.json', 'w'))


|   iter    |  target   | colsam... | learni... | max_depth | min_ch... | min_sp... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.8361  [0m | [0m 0.8415  [0m | [0m 0.01971 [0m | [0m 8.745   [0m | [0m 54.06   [0m | [0m 0.01049 [0m | [0m 26.36   [0m | [0m 0.04994 [0m | [0m 0.06903 [0m | [0m 0.9926  [0m |
| [0m 2       [0m | [0m 0.8328  [0m | [0m 0.9919  [0m | [0m 0.01151 [0m | [0m 8.218   [0m | [0m 58.65   [0m | [0m 0.01025 [0m | [0m 29.8    [0m | [0m 0.04028 [0m | [0m 0.07583 [0m | [0m 0.8913  [0m |
| [0m 3       [0m | [0m 0.8348  [0m | [0m 0.9292  [0m | [0m 0.01589 [0m | [0m 8.94    [0m | [0m 48.14   [0m | [0m 0.01227 [0m | [0m 25.5    [0m | [0m 0.0481  [0m | [0m 0.07888 [0m | [0m 0.9536  [0m |
| [0m 4       [0m | [0m 0.8354  [0m | [0m 0.9435  [0m | [0m

In [13]:
def lgbm_evaluate(**params):
    warnings.simplefilter('ignore')

    params['num_leaves'] = int(params['num_leaves'])
    params['max_depth'] = int(params['max_depth'])

    test_pred_proba = np.zeros(len(X))
    
    n_splits = 10
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=12321)

    for train_idx, valid_idx in skf.split(X, Y):

        # Feature selection
        clf = lgb.LGBMClassifier()
        selector = clf.fit(X, Y)
        fs = SelectFromModel(selector, prefit=True)
        train_df = fs.transform(X)
        test_df = fs.transform(test)

        train_x, train_y = train_df[train_idx], Y[train_idx]
        valid_x, valid_y = train_df[valid_idx], Y[valid_idx]

        clf = lgb.LGBMClassifier(**params, n_estimators=100, nthread=4)
        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric='auc', verbose=False, early_stopping_rounds=100)

        test_pred_proba[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]

        del train_x, train_y, valid_x, valid_y
        gc.collect()

    return roc_auc_score(Y, test_pred_proba)


if __name__ == "__main__":

    # set to ignore warnings
    warnings.simplefilter(action='ignore', category=FutureWarning)
    
    params = {
        'learning_rate': (.01, .02),
        'num_leaves': (20, 35),
        'colsample_bytree': (0.8, 1),
        'subsample': (0.8, 1),
        'max_depth': (7, 9),
        'reg_alpha': (.03, .05),
        'reg_lambda': (.06, .08),
        'min_split_gain': (.01, .03),
        'min_child_weight': (38, 60)
    }

    bo = BayesianOptimization(lgbm_evaluate, params)
    bo.maximize(init_points=5, n_iter=10)
    best_params = bo.max['params']
    
    best_params['num_leaves'] = int(best_params['num_leaves'])

    print(best_params)
    # output copied and stored as params.json
    
    import json
    json.dump(best_params, open('params4.json', 'w'))

|   iter    |  target   | colsam... | learni... | max_depth | min_ch... | min_sp... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.834   [0m | [0m 0.8049  [0m | [0m 0.01419 [0m | [0m 8.289   [0m | [0m 57.73   [0m | [0m 0.0221  [0m | [0m 23.64   [0m | [0m 0.03556 [0m | [0m 0.06523 [0m | [0m 0.8815  [0m |
| [0m 2       [0m | [0m 0.8335  [0m | [0m 0.9196  [0m | [0m 0.01121 [0m | [0m 7.926   [0m | [0m 41.79   [0m | [0m 0.01644 [0m | [0m 30.37   [0m | [0m 0.04919 [0m | [0m 0.07983 [0m | [0m 0.926   [0m |
| [95m 3       [0m | [95m 0.8353  [0m | [95m 0.8188  [0m | [95m 0.01494 [0m | [95m 7.234   [0m | [95m 48.02   [0m | [95m 0.02922 [0m | [95m 30.8    [0m | [95m 0.03877 [0m | [95m 0.07062 [0m | [95m 0.8606  [0m |
| [0m 4       [0m | [0m 0.8341  [0m | [0m 0.8876  

In [None]:
# https://towardsdatascience.com/automated-machine-learning-hyperparameter-tuning-in-python-dfda59b72f8a