In [None]:
# ! pip install smart_open
# ! pip install xgboost
# ! pip install category_encoders
# ! pip install feature-engine
# ! pip install lightgbm
# ! pip install catboost
# ! pip install hyperopt

In [None]:
import pandas as pd
import numpy as np
from smart_open import smart_open
import lightgbm as lgb

# trainpath = 's3://sagemaker-shared-resources/model_outputs/fraud_model_6/intermediate_data/X_train_20210603.csv'
# testpath = 's3://sagemaker-shared-resources/model_outputs/fraud_model_6/intermediate_data/X_test_20210603.csv'
# X_train = pd.read_csv(smart_open(trainpath), low_memory = False)
# X_test = pd.read_csv(smart_open(testpath), low_memory = False)

In [None]:
# S3 locations of the transformed train / hold-out CSVs that the next cell loads.
transformed_trainpath = 's3://sagemaker-shared-resources/model_outputs/fraud_model_6/intermediate_data/X_train_transformed_06242021.csv'
transformed_testpath = 's3://sagemaker-shared-resources/model_outputs/fraud_model_6/intermediate_data/X_test_transformed_06242021.csv'

In [None]:
# Stream both CSVs from S3 into DataFrames. Each frame still carries the 'dep_var'
# target column at this point; it is split off in the next cell.
X_train = pd.read_csv(smart_open(transformed_trainpath), low_memory = False)
X_test = pd.read_csv(smart_open(transformed_testpath), low_memory = False)

In [None]:
# Split the 'dep_var' target off the training frame.
y_train = X_train['dep_var']
X_train = X_train.drop('dep_var', axis = 1)

#HOLD-OUT SET BELOW
# X_test was already read from transformed_testpath in the previous cell, so the
# same file is not downloaded from S3 a second time here.
y_test = X_test['dep_var']
X_test = X_test.drop('dep_var', axis = 1)

In [None]:
# CANNOT TRAIN THESE FEATURES ON NEURAL NETWORK

# for c in X_train.columns:
#     if max(X_train[c]) > 2:
#         print (c, max(X_train[c]))

In [None]:
# Hold out 20% of the training rows as a validation set.
# replace=False is required: np.random.choice samples WITH replacement by default,
# which put duplicate rows into the validation set and removed fewer than 20% of
# the rows from the training data in the next cell.
np.random.seed(42)
val_select = np.random.choice(len(y_train), len(y_train) // 5, replace=False)
# Positional indexing for both frames so X_val and y_val always refer to the same rows.
X_val = X_train.iloc[val_select]
y_val = y_train.iloc[val_select]
X_val.shape, y_val.shape

In [None]:
# Remove the validation rows (matched by index label) from the training data and
# renumber the remaining rows from 0.
X_train = X_train.drop(index=val_select).reset_index(drop=True)
y_train = y_train.drop(index=val_select).reset_index(drop=True)

In [None]:
import time
import sys
from io import StringIO
import os
import shutil

In [None]:
# Make the sagemaker_scripts directory of the model package importable
# (the custom transformer modules are imported from it further down).
module_path = os.path.abspath('fraud_models/fraud-model-6.0.0')
sys.path.append(module_path + "/sagemaker_scripts")

In [None]:
from platform import python_version

import argparse
import csv
import json
import joblib
import numpy as np
import pandas as pd

# basic modules
import sys
import importlib

#parsing modules
import json
import re
from urllib.parse import unquote
from datetime import datetime, timezone

In [None]:
#custom parsing transformers
from parsing_pipeline import FeatureSelector, ParsingTransformer, MissingHandler, FeatureDropper, DateMissingTransformer
from data_integrity_pipeline import DataIntegrityTransformer
from feature_engineering import DateTransformer, idaParser, MissingIndicatorTransformer


In [None]:
#Pipeline imports
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import _VectorizerMixin
from sklearn.feature_selection._base import SelectorMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, MinMaxScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector

In [None]:
#evaluation imports
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, f1_score,recall_score,precision_score, average_precision_score
from sklearn.model_selection import cross_validate

In [None]:
#ML Imports
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression

In [None]:
import xgboost as xgb

In [None]:
import category_encoders as ce

In [None]:
from feature_engine.imputation import CategoricalImputer, MeanMedianImputer
from feature_engine.encoding import RareLabelEncoder, MeanEncoder
from feature_engine.wrappers import SklearnTransformerWrapper

#import featuretools as ft

In [None]:
# Column names of the training frame, in frame order; handed to ParsingTransformer as df_names.
dataframe_cols = X_train.columns.tolist()

In [None]:
# Union of all model input variables, de-duplicated while keeping first-seen order.
# list(set(...)) produced an order that changes between kernel restarts (string
# hashing is randomised per process), so the feature list handed to FeatureSelector
# was not reproducible from run to run.
all_features = list(dict.fromkeys([*numeric_vars, *all_date_vars, *categorical_vars]))

In [None]:
# Parsing pipeline, applied in this order: clean raw values, impute missing dates,
# keep only the selected features, then run the data-integrity transformer.
# The same pipeline is built again in the "JUNE 22 version" cell further down.
# NOTE(review): tmx_vars, find_replace_*_vars, date_*_vars, numeric_vars, all_date_vars and
# categorical_vars are not defined in any visible cell — confirm where they come from.
parsing_pipeline = Pipeline(steps = [('cleaner', ParsingTransformer(df_names = dataframe_cols,
                                                                    tmx_unclean_vars = tmx_vars,
                                                                    exact_match_vars = find_replace_exact_vars,
                                                                    regex_match_vars = find_replace_regex_vars)),
                                     ('date_impute', DateMissingTransformer(datemissing_cols = date_missing_vars,
                                                                            reference_cols = date_missing_ref_vars,
                                                                           reference_dict = date_impute_dict)),
                                ('feature_selector', FeatureSelector(all_features)),
                                ('data_integrity', DataIntegrityTransformer(all_date_vars,
                                                                           numeric_vars,
                                                                           categorical_vars))
                                        ])

In [None]:

#Pipeline imports
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import _VectorizerMixin
from sklearn.feature_selection._base import SelectorMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, MinMaxScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector


#evaluation imports
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, f1_score,recall_score,precision_score, average_precision_score
from sklearn.model_selection import cross_validate

import re


# inference functions ---------------

def model_fn(model_dir):
    """
    Model loader in the form expected by the SageMaker Python SDK.

    Args:
        model_dir: directory on the instance that contains "model.joblib"

    Returns: the object deserialised from "model.joblib" (the fitted pipeline used for prediction)

    """
    model_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_path)


def predict_fn(input_data, model):
    """
    Prediction hook that returns class probabilities rather than hard labels.

    Args:
        input_data: data to score, as deserialized by input_fn
        model: model object loaded in memory by model_fn; must expose predict_proba

    Returns: the result of model.predict_proba(input_data)
    """
    return model.predict_proba(input_data)

In [None]:
# fe_pipeline = Pipeline(steps = [("ida_fe", idaParser(ida_impute_vars)),
#                                 ('dates_fe', DateTransformer(fe_date_vars,
#                                                              fe_timestamp_vars,
#                                                              all_date_vars)),
#                                 ('missing_dealer', MissingHandler())
#                                 ])

# #categorical and numeric transformers
# categorical_transformer = Pipeline(steps = [('cat_imputer', CategoricalImputer(imputation_method = 'missing', fill_value='Missing')),
#                                       ('rare_label_encoder',RareLabelEncoder(tol=0.02, n_categories=3,replace_with='Other')),
#                                            ('cat_encoder', ce.TargetEncoder()),
#                                            ('scaler', StandardScaler())
#                                            ])

# numeric_transformer = Pipeline(steps = [('num_imputer', SimpleImputer()),
#                                        ('scaler', StandardScaler())
#                                        ])

# #combining final cat and num transformer pipelines
# preprocessor = ColumnTransformer(transformers=[
#         ("num_t", numeric_transformer, make_column_selector(dtype_exclude="object")),
#         ("cat_t", categorical_transformer, make_column_selector(dtype_include="object"))],
#                                 n_jobs = -1)




# clf = Pipeline(steps=[('parser', parsing_pipeline),
#                   ('feature creation', fe_pipeline),
#                   ('preprocessor', preprocessor),
#                   ('classifier', xgb.XGBClassifier())])


In [None]:
# Shared components and thresholds consumed by the pipeline definitions in the next cell.
# Imputer for the numeric branch (library defaults).
num_missing_imputer = SimpleImputer()
# Passed to MissingHandler. NOTE(review): presumably a missing-rate cut-off — confirm in parsing_pipeline.
percent_missing = 0.85
# Imputer for the categorical branch, configured with fill_value 'Missing'.
cat_missing_imputer = CategoricalImputer(imputation_method = 'missing', fill_value='Missing')
# Encoder applied to the categorical branch after rare-label grouping.
cat_encoder = ce.TargetEncoder()
# Scaler applied to the numeric branch after imputation.
num_scaler = RobustScaler()
# `tol` argument of RareLabelEncoder in the categorical branch.
tol_rare_label = 0.0017

In [None]:
# JUNE 22 version

#Features that will be analysed and used in initial model training
#Features that will be analysed and used in initial model training.
# De-duplicated with dict.fromkeys so the order is the first-seen order of the three
# lists: list(set(...)) gave an order that changed between kernel restarts (string
# hashing is randomised per process), making the feature list non-reproducible.
all_features = list(dict.fromkeys([*numeric_vars, *all_date_vars, *categorical_vars]))
#cleaning data and ensuring formats of variables
parsing_pipeline = Pipeline(steps = [('cleaner', ParsingTransformer(df_names = dataframe_cols,
                                                                tmx_unclean_vars = tmx_vars,
                                                                exact_match_vars = find_replace_exact_vars,
                                                                regex_match_vars = find_replace_regex_vars)),
                                 ('date_impute', DateMissingTransformer(datemissing_cols = date_missing_vars,
                                                                        reference_cols = date_missing_ref_vars,
                                                                       reference_dict = date_impute_dict)),
                            ('feature_selector', FeatureSelector(all_features)),
                            ('data_integrity', DataIntegrityTransformer(all_date_vars,
                                                                       numeric_vars,
                                                                       categorical_vars))
                                    ])
#creating features
fe_pipeline = Pipeline(steps = [("ida_fe", idaParser(ida_impute_vars)),
                                ('dates_fe', DateTransformer(fe_date_vars,
                                                             fe_timestamp_vars,
                                                             all_date_vars)),
                                ('missing_dealer', MissingHandler(percent_missing))
                                ])
#categorical and numeric transformers
# Categorical branch: impute -> group rare labels into 'Other' -> target-encode.
categorical_transformer = Pipeline(steps = [('cat_imputer', cat_missing_imputer),
                                      ('rare_label_encoder', RareLabelEncoder(tol=tol_rare_label, n_categories=3,replace_with='Other')),
                                           ('cat_encoder', cat_encoder)
                                           ])
# Numeric branch: impute -> scale.
numeric_transformer = Pipeline(steps = [('num_imputer', num_missing_imputer),
                                       ('scaler', num_scaler)
                                       ])
#combining final cat and num transformer pipelines
# Columns are routed by dtype: object columns go to the categorical branch, everything else to the numeric one.
preprocessor = ColumnTransformer(transformers=[
        ("num_t", numeric_transformer, make_column_selector(dtype_exclude="object")),
        ("cat_t", categorical_transformer, make_column_selector(dtype_include="object"))])


In [None]:
# cv_results = cross_validate(clf, X_train, y_train, cv=10,
#                                scoring=('average_precision', 'roc_auc'),
#                                n_jobs = -1)

In [None]:
# %%writefile cv_results_june17.txt

# {'fit_time': array([3092.79319549, 2225.58159685, 3121.47579575, 2224.81072187,
#         2054.69752979, 2321.39297771, 2171.7442019 , 2588.71676588,
#         3289.96349502, 2741.13525772]),
#  'score_time': array([229.11062717, 278.21359372, 221.84894419, 284.22967839,
#         437.0422523 , 281.2246213 , 350.1626215 , 224.6388104 ,
#         173.50640059, 261.87610626]),
#  'test_average_precision': array([0.28749455, 0.61527515, 0.78443781, 0.81882475, 0.89038318,
#         0.88343627, 0.90505814, 0.78419411, 0.72189914, 0.41737601]),
#  'test_roc_auc': array([0.87441938, 0.90931607, 0.97205096, 0.96659792, 0.97365605,
#         0.98095623, 0.98167272, 0.96564525, 0.94869597, 0.86197157])}

In [None]:
# Full data-preparation pipeline (no classifier step): parsing -> feature creation -> preprocessing.
# This only constructs the pipeline; it is not fitted here.
data_transformer = Pipeline(steps=[('parser', parsing_pipeline),
                      ('feature creation', fe_pipeline),
                      ('preprocessor', preprocessor)])

In [None]:
from datetime import timedelta
from datetime import datetime

# Wall-clock timing harness around the (currently commented-out) parsing step.
# Whole datetimes are subtracted directly: the previous version compared time-of-day
# values re-attached to today's date, which yields a negative duration whenever the
# timed step crosses midnight.
start = datetime.now()
# parsing_pipeline.fit_transform(X_train)
end = datetime.now()
end - start

In [None]:
# Apply the data-preparation pipeline to the hold-out set.
# NOTE(review): no visible cell calls fit / fit_transform on data_transformer before this
# transform — confirm the fitting step was not lost (Restart & Run All would fail here).
X_test_transformed = data_transformer.transform(X_test)

In [None]:
# Apply the data-preparation pipeline to the validation set.
# NOTE(review): requires data_transformer to be fitted already; no visible cell does that.
X_val_transformed = data_transformer.transform(X_val)

In [None]:
def get_feature_out(estimator, feature_in):
    """Return the output feature names produced by one fitted transformer.

    Args:
        estimator: a fitted transformer (a pipeline step or a ColumnTransformer entry).
        feature_in: names of the features fed into ``estimator``.

    Returns:
        The output feature names. Vectorizer names are prefixed with ``vec_``;
        selectors return the supported subset of ``feature_in``; transformers that
        expose no naming API return ``feature_in`` unchanged.
    """
    is_vectorizer = isinstance(estimator, _VectorizerMixin)

    if hasattr(estimator, 'get_feature_names'):
        # Legacy naming API: used whenever it exists, so results are unchanged on
        # library versions that still provide it.
        names = estimator.get_feature_names()
    elif hasattr(estimator, 'get_feature_names_out'):
        # scikit-learn >= 1.0 naming API (get_feature_names was removed in 1.2).
        # Vectorizers derive their names from the vocabulary and take no input names.
        if is_vectorizer:
            names = estimator.get_feature_names_out()
        else:
            names = estimator.get_feature_names_out(feature_in)
    elif isinstance(estimator, SelectorMixin):
        return np.array(feature_in)[estimator.get_support()]
    else:
        return feature_in

    if is_vectorizer:
        # handling all vectorizers
        return [f'vec_{f}' for f in names]
    return names

def get_ct_feature_names(ct):
    """Collect the output feature names of a fitted ColumnTransformer.

    Handles plain estimators and Pipelines inside the ColumnTransformer, skips
    transformers set to 'drop', and resolves a 'passthrough' remainder to the
    input column names.

    Args:
        ct: fitted ColumnTransformer (only ``transformers_`` and, for a passthrough
            remainder, its recorded input feature names are read).

    Returns:
        list of output feature names, in ``ct.transformers_`` order.
    """
    output_features = []
    for name, estimator, features in ct.transformers_:
        if isinstance(estimator, str) and estimator == 'drop':
            # Dropped columns produce no output, so they must not contribute names.
            continue
        if name != 'remainder':
            if isinstance(estimator, Pipeline):
                # Thread the names through every step of the sub-pipeline.
                current_features = features
                for step in estimator:
                    current_features = get_feature_out(step, current_features)
                features_out = current_features
            else:
                features_out = get_feature_out(estimator, features)
            output_features.extend(features_out)
        elif isinstance(estimator, str) and estimator == 'passthrough':
            remainder_cols = list(features)
            if all(isinstance(col, str) for col in remainder_cols):
                # Remainder columns already given by name.
                output_features.extend(remainder_cols)
            else:
                # Remainder columns given by position: look the names up. Prefer the
                # public attribute (scikit-learn >= 1.0) and fall back to the private
                # one used by older releases.
                names_in = getattr(ct, 'feature_names_in_', None)
                if names_in is None:
                    names_in = ct._feature_names_in
                output_features.extend(np.asarray(names_in)[remainder_cols])
    return output_features

In [None]:
# Output column names of the 'preprocessor' ColumnTransformer step; used to label the transformed matrices below.
column_names = get_ct_feature_names(data_transformer.named_steps['preprocessor'])

In [None]:
# Wrap the transformed validation matrix in a DataFrame labelled with the preprocessor's output names.
X_val_transformed_named = pd.DataFrame(X_val_transformed, columns = column_names)

In [None]:
# Same labelling for the transformed training matrix.
# NOTE(review): `transformed_X_train` is not assigned in any visible cell — presumably the
# output of fitting data_transformer on X_train in a cell that was removed; confirm.
X_train_transformed_named = pd.DataFrame(transformed_X_train, columns = column_names)

In [None]:
# X_val_transformed = data_transformer.transform(X_val)
# Label the transformed hold-out matrix with the preprocessor's output names.
X_test_transformed_named = pd.DataFrame(X_test_transformed, columns = column_names)

In [None]:
# Cache the labelled train / hold-out frames as CSV in the working directory (row index not written).
X_train_transformed_named.to_csv('X_train_transformed_named.csv', index=False)
X_test_transformed_named.to_csv('X_test_transformed_named.csv', index=False)

In [None]:
import pickle              # import module first
# Persist the transformed validation matrix in the working directory. The context
# manager closes the file even if pickle.dump raises.
with open('X_val_transformed.pkl', 'wb') as f:
    pickle.dump(X_val_transformed, f)

In [None]:
X_test_transformed_named.to_csv('X_test_transformed_named.csv', index=False)

import pickle              # import module first
# Pickle data is bytes, so the file must be opened in binary mode ('wb'); with text
# mode 'w' pickle.dump raises TypeError. The context manager also guarantees the
# file is closed if the dump fails.
with open('X_test_transformed.pkl', 'wb') as f:
    pickle.dump(X_test_transformed, f)

In [None]:
# Reload the cached, labelled training frame written by an earlier cell.
X_train_transformed_named = pd.read_csv('X_train_transformed_named.csv')

# Pickle files must be read in binary mode ('rb'); text mode 'r' makes pickle.load fail.
# Only load pickle files this notebook wrote itself: unpickling untrusted data can
# execute arbitrary code.
with open('X_val_transformed.pkl', 'rb') as f:
    X_val_transformed = pickle.load(f)

In [None]:

# accuracy_score is otherwise only imported in a much later cell, so this cell
# raised NameError when the notebook was run top to bottom.
from sklearn.metrics import accuracy_score

# Hyper-parameters for this LightGBM run.
lgb_params = {'colsample_bytree': 0.4592204345120233,
    'learning_rate': 0.7734962535799013,
    'max_depth': 14,
    'min_child_weight': 47,
    'min_split_gain': 0.47802390488529,
    'num_leaves': 39,
    'reg_alpha': 0.6038325706218322,
    'reg_lambda': 0.26914645113480473,
    'subsample': 0.8926876963076777}

clflgb = lgb.sklearn.LGBMClassifier(**lgb_params)

clflgb.fit(X_train, y_train)

# Print the accuracy: a bare expression in the middle of a cell is evaluated and then discarded.
print("Accuracy score is {}".format(accuracy_score(y_val, clflgb.predict(X_val))))

# Score the validation set with the positive-class probability (column 1 of predict_proba).
probas_ = clflgb.predict_proba(X_val)
plots_ytrue  = y_val.copy()
plots_yscore = probas_[:,1]
auc_roc = roc_auc_score(y_true = plots_ytrue, y_score = plots_yscore)
auc_pr = average_precision_score(y_true = plots_ytrue, y_score = plots_yscore)
print("AUC-ROC score is {}".format(round(auc_roc,4)))
print("AUC-PR score is {}".format(round(auc_pr,4)))

In [None]:
import lightgbm as lgb

# Same hyper-parameters and evaluation as the previous cell, with the accuracy line disabled.
lgb_params = {'colsample_bytree': 0.4592204345120233,
    'learning_rate': 0.7734962535799013,
    'max_depth': 14,
    'min_child_weight': 47,
    'min_split_gain': 0.47802390488529,
    'num_leaves': 39,
    'reg_alpha': 0.6038325706218322,
    'reg_lambda': 0.26914645113480473,
    'subsample': 0.8926876963076777}

clflgb = lgb.sklearn.LGBMClassifier(**lgb_params)

clflgb.fit(X_train, y_train)

# accuracy_score(y_val, clflgb.predict(X_val))

# Score the validation set with the positive-class probability (column 1 of predict_proba)
# and report ROC-AUC and average precision, rounded to 4 decimals.
probas_ = clflgb.predict_proba(X_val)
plots_ytrue  = y_val.copy()
plots_yscore = probas_[:,1]
auc_roc = roc_auc_score(y_true = plots_ytrue, y_score = plots_yscore)
auc_pr = average_precision_score(y_true = plots_ytrue, y_score = plots_yscore)
print("AUC-ROC score is {}".format(round(auc_roc,4)))
print("AUC-PR score is {}".format(round(auc_pr,4)))

In [None]:
# Native LightGBM dataset built from the labelled training frame.
train_data = lgb.Dataset(X_train_transformed_named, label=y_train)
# NOTE(review): get_data is referenced without parentheses, so this cell displays the
# bound method rather than any data — confirm whether a call was intended.
train_data.get_data

In [None]:
# Native LightGBM dataset for validation; only referenced by the commented-out lgb.train call below.
val_data = lgb.Dataset(X_val_transformed, label=y_val)

In [None]:
# Parameter set written with LightGBM's native parameter names (binary objective, AUC metric,
# all seeds fixed to 42).
# NOTE(review): the only consumer in the visible cells is the commented-out lgb.train call
# in the next cell; lgb_params is reassigned before any other cell reads it.
lgb_params = {
        'bagging_fraction': 0.77,
        'bagging_freq': 2,
        'lambda_l1': 0.7,
        'lambda_l2': 2,
        'learning_rate': 0.01,
        'max_depth': 10,
        'min_data_in_leaf': 22,
        'min_gain_to_split': 0.07,
        'min_sum_hessian_in_leaf': 19,
        'num_leaves': 20,
        'feature_fraction': 1,
        'save_binary': True,
        'seed': 42,
        'feature_fraction_seed': 42,
        'bagging_seed': 42,
        'drop_seed': 42,
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'verbosity': -1,
        'metric': 'auc',
        'is_unbalance': True,
        'boost_from_average': 'false',
        'num_threads': 6
}

In [None]:
# Rebuilds the same training dataset as two cells above; the native training call is disabled.
train_data = lgb.Dataset(X_train_transformed_named, label=y_train)
# clf = lgb.train(lgb_params, train_data, 10000, valid_sets=[val_data], verbose_eval=-1, early_stopping_rounds=100)

In [None]:
# Classifier with min_data_in_leaf and min_data_in_bin both set to 1 and every other
# setting left at its default. It is left unfitted here; the next cell cross-validates it.
clflgb = lgb.sklearn.LGBMClassifier(min_data_in_leaf = 1, min_data_in_bin = 1)
# clflgb.fit(X_train, y_train)

In [None]:
# cross_val_score is otherwise only imported in a later cell, so this cell raised
# NameError when the notebook was run top to bottom.
from sklearn.model_selection import cross_val_score

# 20-fold cross-validated accuracy of the classifier defined in the previous cell.
score = cross_val_score(clflgb, X_train, y_train, cv=20,scoring='accuracy',n_jobs=-1)
score

In [None]:
# NOTE(review): in the visible cells `clf` is only assigned by commented-out code (the
# lgb.train call above), so this raises NameError on a fresh kernel unless that is restored.
clf.best_score

In [None]:
import pandas as pd
import numpy as np
from smart_open import smart_open
import lightgbm as lgb

from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, space_eval
from hyperopt.early_stop import no_progress_loss
from sklearn.model_selection import cross_val_score
from sklearn import datasets
from sklearn import svm
import sklearn.metrics


In [None]:
# Candidate hyper-parameter set for the scikit-learn LightGBM wrapper.
# NOTE(review): no visible cell reads lgb_params between this assignment and the next
# reassignment, so this cell currently has no effect.
lgb_params = {'colsample_bytree': 0.4705939,
  'learning_rate': 0.01407,
  'max_depth': 13,
  'min_child_weight': 21,
  'min_split_gain': 0.4238,
  'n_estimators': 787,
  'num_leaves': 182,
  'reg_alpha': 0.3619,
  'reg_lambda': 0.32818,
  'subsample': 0.8505}

In [None]:
# Hyperopt search space for LGBMClassifier.
# hp.uniform entries are continuous ranges; hp.choice entries pick one element of the given
# integer grid. For hp.choice, hyperopt records the INDEX of the chosen element in
# fmin's result / trial values, which is why space_eval is needed to recover real values.
lgb_reg_params = {
    'max_depth':        hp.choice('max_depth', np.arange(7, 15, 1, dtype=int)),
    'learning_rate':    hp.uniform('learning_rate', 0.001, 0.01),
    'min_child_weight': hp.choice('min_child_weight', np.arange(12, 30, 2, dtype=int)),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 0.6),
    'subsample':        hp.uniform('subsample', 0.6, 1),
    'num_leaves':       hp.choice('num_leaves', np.arange(5, 200, 1, dtype=int)),
    'min_split_gain':   hp.uniform('min_split_gain', 0.3, 0.6),
    'reg_alpha':        hp.uniform('reg_alpha', 0.2, 0.5),
    'reg_lambda':       hp.uniform('reg_lambda',0.2, 0.7),
    'n_estimators':     hp.choice('n_estimators', np.arange(500, 1200, 25, dtype=int)),
    'max_delta_step': hp.uniform('max_delta_step',5, 200),
}

def f(params):
    print (params)
    lgbm = lgb.sklearn.LGBMClassifier(n_jobs=-1, early_stopping_rounds=None,**params)
    score = cross_val_score(lgbm, X_train, y_train, cv=10,scoring='average_precision',n_jobs=-1).mean()
    return score

trials = Trials()
result = fmin(
    fn=f,                           # objective function
    space=lgb_reg_params,   # parameter space
    algo=tpe.suggest,             # surrogate algorithm
    max_evals=50,
    early_stop_fn=no_progress_loss(iteration_stop_count=10, percent_increase=0.0),
    trials=trials# no. of evaluations
)
print(result)

space_eval(lgb_reg_params, result)

In [None]:
len([t for t in trials.trials])
# amphetamine

In [None]:
loss = []
for t in trials.trials:
    try:
        print(t['result']['loss'])
        loss.append(t['result']['loss'])
    except:
        pass

np.argmax(loss), np.max(loss)

In [None]:
best_values_dict = [t['misc']['vals'] for t in trials.trials][np.argmax(loss)]
dict(zip(list(best_values_dict.keys()), [x[0] for x in best_values_dict.values()]))


In [None]:
best_values_dict = [t['misc']['vals'] for t in trials.trials][np.argmax([t['result']['loss'] for t in trials.trials])]
dict(zip(list(best_values_dict.keys()), [x[0] for x in best_values_dict.values()]))
# create text file for storing

In [None]:
transformed_trainpath = 's3://sagemaker-shared-resources/model_outputs/fraud_model_6/intermediate_data/X_train_transformed_06242021.csv'
transformed_testpath = 's3://sagemaker-shared-resources/model_outputs/fraud_model_6/intermediate_data/X_test_transformed_06242021.csv'

filtered_trainpath = 's3://sagemaker-shared-resources/model_outputs/fraud_model_6/intermediate_data/X_train_filtered_07072021.csv'
filtered_testpath = 's3://sagemaker-shared-resources/model_outputs/fraud_model_6/intermediate_data/X_test_filtered_07072021.csv'

# Reload train / hold-out from the FILTERED files and split off the 'dep_var' target.
X_train = pd.read_csv(smart_open(filtered_trainpath), low_memory = False)
X_test = pd.read_csv(smart_open(filtered_testpath), low_memory = False)
y_train = X_train['dep_var']
X_train = X_train.drop('dep_var', axis = 1)
y_test = X_test['dep_var']
X_test = X_test.drop('dep_var', axis = 1)

# Hold out 20% of the training rows for validation.
# replace=False is required: np.random.choice samples WITH replacement by default,
# which put duplicate rows into the validation set and removed fewer than 20% of
# the rows from the training data.
np.random.seed(42)
val_select = np.random.choice(len(y_train), len(y_train) // 5, replace=False)
# Positional indexing for both frames so X_val and y_val always refer to the same rows.
X_val = X_train.iloc[val_select]
y_val = y_train.iloc[val_select]

X_train = X_train.drop(val_select).reset_index(drop=True)
y_train = y_train.drop(val_select).reset_index(drop=True)

In [None]:
# Fit LightGBM on the filtered training data with this hyper-parameter set and
# report validation ROC-AUC / average precision.
lgb_params = {'colsample_bytree': 0.4790533301682829,
 'learning_rate': 0.0774950067234158,
 'max_depth': 15,
 'min_child_weight': 28,
 'min_split_gain': 0.3014256129493069,
 'n_estimators': 725,
 'num_leaves': 98,
 'max_delta_step': 8,
 'reg_alpha': 0.33684212620847115,
 'reg_lambda': 0.6818330325814729,
 'subsample': 0.8728280598689245}

clflgb = lgb.sklearn.LGBMClassifier(**lgb_params)

clflgb.fit(X_train, y_train)

# accuracy_score(y_val, clflgb.predict(X_val))

# Positive-class probability is column 1 of predict_proba.
probas_ = clflgb.predict_proba(X_val)
plots_ytrue  = y_val.copy()
plots_yscore = probas_[:,1]
auc_roc = roc_auc_score(y_true = plots_ytrue, y_score = plots_yscore)
auc_pr = average_precision_score(y_true = plots_ytrue, y_score = plots_yscore)
print("AUC-ROC score is {}".format(round(auc_roc,4)))
print("AUC-PR score is {}".format(round(auc_pr,4)))

In [None]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score

# zKMODEL

# Earlier parameter set, kept for reference only:
# lgb_params = {'colsample_bytree': 0.4705939,
#   'learning_rate': 0.001407,
#   'max_depth': 13,
#   'min_child_weight': 21,
#   'min_split_gain': 0.4238,
#   'n_estimators': 787,
#   'num_leaves': 182,
#   'max_delta_step': 2,
#   'reg_alpha': 0.3619,
#   'reg_lambda': 0.32818,
#   'subsample': 0.8505}

# Active parameter set for this run; clflgb fitted here is the model later passed to
# the SHAP and feature-importance cells.
lgb_params = {'colsample_bytree': 0.5617655338175288,
              'learning_rate': 0.00405414048712561,
              'max_delta_step': 173.2194970737864,
              'max_depth': 9, 'min_child_weight': 14,
              'min_split_gain': 0.4017601799355375,
              'n_estimators': 775, 'num_leaves': 154,
              'reg_alpha': 0.4603849990916753,
              'reg_lambda': 0.45386557520650367,
              'subsample': 0.8260702841662341}

clflgb = lgb.sklearn.LGBMClassifier(**lgb_params)

clflgb.fit(X_train, y_train)

# accuracy_score(y_test, clflgb.predict(X_test_transformed))

# Validation metrics from the positive-class probability (column 1 of predict_proba).
probas_ = clflgb.predict_proba(X_val)
plots_ytrue  = y_val.copy()
plots_yscore = probas_[:,1]
auc_roc = roc_auc_score(y_true = plots_ytrue, y_score = plots_yscore)
auc_pr = average_precision_score(y_true = plots_ytrue, y_score = plots_yscore)
print("AUC-ROC score is {}".format(round(auc_roc,4)))
print("AUC-PR score is {}".format(round(auc_pr,4)))

In [None]:
# SHAP values of the fitted classifier on the validation set.
# NOTE(review): `shap` is not imported in any visible cell — this raises NameError on a
# fresh kernel until `import shap` is added to the imports.
shap_values = shap.TreeExplainer(clflgb).shap_values(X_val)

In [None]:
# Summary plot of the SHAP values computed in the previous cell (also requires `shap` to be imported).
shap.summary_plot(shap_values, X_val)

In [None]:
# Same parameter values as the candidate set defined before the hyperopt search.
# NOTE(review): lgb_params is reassigned two cells below before anything reads it, so
# this cell currently has no effect.
lgb_params = {'colsample_bytree': 0.4705939,
  'learning_rate': 0.01407,
  'max_depth': 13,
  'min_child_weight': 21,
  'min_split_gain': 0.4238,
  'n_estimators': 787,
  'num_leaves': 182,
  'reg_alpha': 0.3619,
  'reg_lambda': 0.32818,
  'subsample': 0.8505}

In [None]:
from sklearn.model_selection import cross_validate

In [None]:
# Disabled parameter set. NOTE(review): max_depth 2, min_child_weight 3 and n_estimators 6
# fall outside the grids in lgb_reg_params — presumably raw hp.choice indices that were
# never decoded with space_eval; confirm before reusing.
# lgb_params = {'colsample_bytree': 0.5299950340768536,
#  'learning_rate': 0.005278780380213163,
#  'max_depth': 2,
#  'min_child_weight': 3,
#  'min_split_gain': 0.3141438019439081,
#  'n_estimators': 6,
#  'num_leaves': 107,
#  'reg_alpha': 0.4762560219705076,
#  'reg_lambda': 0.23912490293874628,
#  'subsample': 0.9554349613610617}

# Active parameter set (same values as the tuned model fitted above).
lgb_params = {'colsample_bytree': 0.5617655338175288,
              'learning_rate': 0.00405414048712561,
              'max_delta_step': 173.2194970737864,
              'max_depth': 9, 'min_child_weight': 14,
              'min_split_gain': 0.4017601799355375,
              'n_estimators': 775, 'num_leaves': 154,
              'reg_alpha': 0.4603849990916753,
              'reg_lambda': 0.45386557520650367,
              'subsample': 0.8260702841662341}

lgbm = lgb.sklearn.LGBMClassifier(n_jobs=-1, early_stopping_rounds=None,**lgb_params)

# 10-fold cross-validation on the training split, scoring average precision and ROC-AUC per fold.
cv_results = cross_validate(lgbm, X_train, y_train, cv=10,
                               scoring=('average_precision', 'roc_auc'),
                               n_jobs = -1)

In [None]:
# Report the mean of each per-fold metric from the cross_validate call in the previous cell.
print('printing final results')
auc_pr_mean = cv_results['test_average_precision'].mean()
print('AUC-PR = {}'.format(auc_pr_mean))
auc_roc_mean = cv_results['test_roc_auc'].mean()
print('AUC-ROC = {}'.format(auc_roc_mean))

In [None]:
# Cross-validate the earlier candidate parameter set for comparison with the tuned one.
lgb_params = {'colsample_bytree': 0.4705939,
  'learning_rate': 0.01407,
  'max_depth': 13,
  'min_child_weight': 21,
  'min_split_gain': 0.4238,
  'n_estimators': 787,
  'num_leaves': 182,
  'reg_alpha': 0.3619,
  'reg_lambda': 0.32818,
  'subsample': 0.8505}

lgbm = lgb.sklearn.LGBMClassifier(n_jobs=-1, early_stopping_rounds=None,**lgb_params)

# NOTE(review): X_train_ / y_train_ (trailing underscore) are not assigned in any visible
# cell — confirm whether X_train / y_train or a different dataset was intended.
cv_results = cross_validate(lgbm, X_train_, y_train_, cv=10,
                               scoring=('average_precision', 'roc_auc'),
                               n_jobs = -1)

In [None]:
# Report the mean of each per-fold metric from the most recent cross_validate call.
print('printing final results')
auc_pr_mean = cv_results['test_average_precision'].mean()
print('AUC-PR = {}'.format(auc_pr_mean))
auc_roc_mean = cv_results['test_roc_auc'].mean()
print('AUC-ROC = {}'.format(auc_roc_mean))

In [None]:
def get_lgbm_varimp(model, train_columns, max_vars=50):
    """Rank a LightGBM model's features by importance.

    Args:
        model: either a natively trained lightgbm.basic.Booster or a fitted
            scikit-learn API estimator (LGBMClassifier / LGBMRegressor).
        train_columns: feature names, in the order the model was trained on.
        max_vars: number of top-ranked rows to keep.

    Returns:
        DataFrame with columns 'feature_name' and 'varimp', sorted by 'varimp'
        in descending order and cut to the first ``max_vars`` rows.
    """
    # A Booster exposes feature_importance(); the scikit-learn wrappers expose the
    # feature_importances_ property instead.
    is_native_booster = "basic.Booster" in str(model.__class__)
    importances = model.feature_importance() if is_native_booster else model.feature_importances_

    varimp_table = pd.DataFrame([train_columns, importances]).T
    varimp_table.columns = ['feature_name', 'varimp']

    ranked = varimp_table.sort_values(by='varimp', ascending=False)
    return ranked.iloc[:max_vars]

In [None]:
# Top-50 feature importances of the most recently fitted clflgb, labelled with the training columns.
get_lgbm_varimp(clflgb, X_train.columns, max_vars=50)