In [4]:
# To run this benchmark notebook you also need LightGBM, XGBoost and InterpretML
# (pip install lightgbm xgboost interpret)
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer

from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedShuffleSplit, cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.linear_model import SGDClassifier, LogisticRegression

from interpret.glassbox import ExplainableBoostingClassifier

warnings.filterwarnings("ignore")

In [5]:
def load_breast_data():
    """Load scikit-learn's breast cancer data as a benchmark dataset dict.

    Returns:
        dict: ``{'problem': 'classification', 'full': {'X': ..., 'y': ...}}``
        where ``X`` is a DataFrame with one column per feature name and
        ``y`` is the ``target`` attribute of the loaded bunch.
    """
    bunch = load_breast_cancer()
    columns = list(bunch.feature_names)
    features = pd.DataFrame(bunch.data, columns=columns)
    return {
        'problem': 'classification',
        'full': {'X': features, 'y': bunch.target},
    }


def load_adult_data():
    """Read ``data/adult.data`` into a benchmark dataset dict.

    The file is read without a header row, so column names are assigned
    here. The last column (``Income``) is the label; every other column is
    a feature.

    Returns:
        dict: ``{'problem': 'classification', 'full': {'X': DataFrame, 'y': Series}}``.
    """
    column_names = [
        "Age", "WorkClass", "fnlwgt", "Education", "EducationNum",
        "MaritalStatus", "Occupation", "Relationship", "Race", "Gender",
        "CapitalGain", "CapitalLoss", "HoursPerWeek", "NativeCountry", "Income"
    ]
    frame = pd.read_csv("data/adult.data", header=None)
    frame.columns = column_names

    feature_cols = frame.columns[:-1]
    target_col = frame.columns[-1]
    return {
        'problem': 'classification',
        'full': {
            'X': frame[feature_cols],
            'y': frame[target_col],
        },
    }


def load_heart_data():
    """Read ``data/heart.csv`` into a benchmark dataset dict.

    Source: https://www.kaggle.com/ronitf/heart-disease-uci

    The last column of the CSV is used as the label; all preceding columns
    are the features.

    Returns:
        dict: ``{'problem': 'classification', 'full': {'X': DataFrame, 'y': Series}}``.
    """
    frame = pd.read_csv('data/heart.csv')
    feature_cols, target_col = frame.columns[:-1], frame.columns[-1]
    return {
        'problem': 'classification',
        'full': {'X': frame[feature_cols], 'y': frame[target_col]},
    }


def load_credit_data():
    """Read ``data/creditcard.csv`` into a benchmark dataset dict.

    Source: https://www.kaggle.com/mlg-ulb/creditcardfraud

    Every column except the last is a feature; the last column is the label.

    Returns:
        dict: ``{'problem': 'classification', 'full': {'X': DataFrame, 'y': Series}}``.
    """
    table = pd.read_csv('data/creditcard.csv')
    column_names = table.columns
    payload = {
        'X': table[column_names[:-1]],
        'y': table[column_names[-1]],
    }
    return {'problem': 'classification', 'full': payload}


def load_telco_churn_data():
    """Read the Telco customer churn CSV into a benchmark dataset dict.

    Source:
    https://www.kaggle.com/blastchar/telco-customer-churn/downloads/WA_Fn-UseC_-Telco-Customer-Churn.csv/1

    The first column is an ID and is dropped; the last column is the label
    and everything in between is a feature.

    Returns:
        dict: ``{'problem': 'classification', 'full': {'X': DataFrame, 'y': Series}}``.
    """
    raw = pd.read_csv('data/WA_Fn-UseC_-Telco-Customer-Churn.csv')
    names = raw.columns
    feature_names = names[1:-1]  # skip the leading ID column and the trailing label
    target_name = names[-1]

    features = raw[feature_names]
    target = raw[target_name]  # 'Yes' / 'No' per the original author's note
    return {
        'problem': 'classification',
        'full': {'X': features, 'y': target},
    }

In [9]:
def format_n(x):
    """Render a number as a fixed-point string with three decimal places."""
    return f"{x:.3f}"


def process_model(clf, name, X, y, n_splits=3):
    """Cross-validate ``clf`` and summarise its fit time and ROC AUC.

    Args:
        clf: Estimator (or pipeline) handed to ``cross_validate``.
        name: Label stored in the record under ``'model_name'``.
        X: Feature data passed to ``cross_validate``.
        y: Labels passed to ``cross_validate``; also used to stratify the splits.
        n_splits: Number of stratified shuffle splits (25% held out each time).

    Returns:
        dict: ``model_name`` plus the mean and standard deviation of
        ``fit_time`` and ``test_score``, each formatted by ``format_n``.
    """
    # The seed is fixed so every model is scored on the same splits.
    splitter = StratifiedShuffleSplit(n_splits=n_splits, test_size=0.25, random_state=1337)
    # Only timings and scores are read below, so the fitted estimators are not
    # requested; asking for them would keep one fitted model per split alive.
    scores = cross_validate(clf, X, y, scoring='roc_auc', cv=splitter, n_jobs=None)

    record = {'model_name': name}
    for key in ('fit_time', 'test_score'):
        record[key + '_mean'] = format_n(np.mean(scores[key]))
        record[key + '_std'] = format_n(np.std(scores[key]))
    return record



def _dense_one_hot_encoder():
    """Build a dense-output, unknown-tolerant OneHotEncoder across scikit-learn versions."""
    try:
        # Newer scikit-learn releases name the keyword ``sparse_output``.
        return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        # Older releases only know the original ``sparse`` keyword.
        return OneHotEncoder(sparse=False, handle_unknown='ignore')


def _default_column_transformer(X):
    """One-hot encode object-kind columns of ``X`` and pass every other column through."""
    is_cat = np.array([dt.kind == 'O' for dt in X.dtypes])
    cat_cols = X.columns.values[is_cat]
    num_cols = X.columns.values[~is_cat]

    cat_pipe = Pipeline([('ohe', _dense_one_hot_encoder())])
    num_pipe = Pipeline([('identity', FunctionTransformer())])
    return ColumnTransformer(transformers=[
        ('cat', cat_pipe, cat_cols),
        ('num', num_pipe, num_cols),
    ])


def benchmark_models(dataset_name, X, y, ct=None, n_splits=3, random_state=1337):
    """Benchmark a fixed set of classifiers on one dataset.

    Each model is scored with ``process_model``. Its record is printed as soon
    as it is available and then tagged with the dataset name.

    Args:
        dataset_name: Label printed in the banner and stored in every record
            under ``'dataset_name'``.
        X: pandas DataFrame of features (``X.dtypes`` and ``X.columns`` are
            read when ``ct`` is None).
        y: Labels, passed through to ``process_model``.
        ct: Optional preprocessing transformer used as the first step of each
            pipeline. When None, a ColumnTransformer is built that one-hot
            encodes object-kind columns and leaves the remaining columns as is.
        n_splits: Forwarded to ``process_model``.
        random_state: Seed given to every model constructed here.

    Returns:
        list[dict]: One record per model, in evaluation order.
    """
    if ct is None:
        ct = _default_column_transformer(X)

    records = []
    summary_record = {'dataset_name': dataset_name}

    print()
    print('-' * 78)
    print(dataset_name)
    print('-' * 78)
    print(summary_record)
    print()

    def _evaluate(model_name, estimator):
        # Score one model, echo its record, then tag it with the dataset name.
        record = process_model(estimator, model_name, X, y, n_splits=n_splits)
        print(record)
        record.update(summary_record)
        records.append(record)

    # The two linear models get standardized inputs.
    _evaluate('linear-sgd', Pipeline([
        ('ct', ct),
        ('std', StandardScaler()),
        ('linear-sgd', SGDClassifier(random_state=random_state)),
    ]))

    _evaluate('lr', Pipeline([
        ('ct', ct),
        ('std', StandardScaler()),
        ('lr', LogisticRegression(random_state=random_state)),
    ]))

    _evaluate('rf-100', Pipeline([
        ('ct', ct),
        # n_estimators set to 100 explicitly (was 10) because the scikit-learn
        # default changes between versions.
        ('rf-100', RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=random_state)),
    ]))

    _evaluate('xgb', Pipeline([
        ('ct', ct),
        ('xgb', XGBClassifier(random_state=random_state)),
    ]))

    _evaluate('lgb', Pipeline([
        ('ct', ct),
        ('lgb', lgb.LGBMClassifier(boosting_type="gbdt", num_leaves=16, reg_alpha=0, reg_lambda=1,
                                   max_depth=-1, n_estimators=2000, objective='binary', subsample=0.8,
                                   colsample_bytree=0.8, subsample_freq=1, learning_rate=0.02,
                                   random_state=random_state, metric="auc", n_jobs=-1)),
    ]))

    # No pipeline needed due to EBM handling string datatypes
    ebm_main = ExplainableBoostingClassifier(n_jobs=-1, interactions=0, random_state=random_state)
    _evaluate('ebm main', ebm_main)

    return records

## heart disease

In [10]:
# Start a fresh results list; the dataset cells below append to it and reuse n_splits.
results = []
n_splits = 5

dataset = load_heart_data()
heart_X, heart_y = dataset['full']['X'], dataset['full']['y']
result = benchmark_models('heart', heart_X, heart_y, n_splits=n_splits)
results.append(result)


------------------------------------------------------------------------------
heart
------------------------------------------------------------------------------
{'dataset_name': 'heart'}

{'model_name': 'linear-sgd', 'fit_time_mean': '0.005', 'fit_time_std': '0.001', 'test_score_mean': '0.885', 'test_score_std': '0.018'}
{'model_name': 'lr', 'fit_time_mean': '0.006', 'fit_time_std': '0.001', 'test_score_mean': '0.915', 'test_score_std': '0.034'}
{'model_name': 'rf-100', 'fit_time_mean': '0.120', 'fit_time_std': '0.021', 'test_score_mean': '0.908', 'test_score_std': '0.023'}
{'model_name': 'xgb', 'fit_time_mean': '0.026', 'fit_time_std': '0.002', 'test_score_mean': '0.879', 'test_score_std': '0.017'}
{'model_name': 'lgb', 'fit_time_mean': '0.434', 'fit_time_std': '0.175', 'test_score_mean': '0.869', 'test_score_std': '0.017'}
{'model_name': 'ebm main', 'fit_time_mean': '0.996', 'fit_time_std': '0.479', 'test_score_mean': '0.925', 'test_score_std': '0.014'}


In [12]:
# Show column dtypes and non-null counts for the currently loaded feature frame.
dataset['full']['X'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 13 columns):
age         303 non-null int64
sex         303 non-null int64
cp          303 non-null int64
trestbps    303 non-null int64
chol        303 non-null int64
fbs         303 non-null int64
restecg     303 non-null int64
thalach     303 non-null int64
exang       303 non-null int64
oldpeak     303 non-null float64
slope       303 non-null int64
ca          303 non-null int64
thal        303 non-null int64
dtypes: float64(1), int64(12)
memory usage: 30.9 KB


## breast cancer

In [11]:
# Benchmark every model on the breast-cancer data and keep its records.
dataset = load_breast_data()
breast_X, breast_y = dataset['full']['X'], dataset['full']['y']
result = benchmark_models('breast-cancer', breast_X, breast_y, n_splits=n_splits)
results.append(result)


------------------------------------------------------------------------------
breast-cancer
------------------------------------------------------------------------------
{'dataset_name': 'breast-cancer'}

{'model_name': 'linear-sgd', 'fit_time_mean': '0.007', 'fit_time_std': '0.004', 'test_score_mean': '0.993', 'test_score_std': '0.007'}
{'model_name': 'lr', 'fit_time_mean': '0.010', 'fit_time_std': '0.003', 'test_score_mean': '0.996', 'test_score_std': '0.005'}
{'model_name': 'rf-100', 'fit_time_mean': '0.173', 'fit_time_std': '0.011', 'test_score_mean': '0.991', 'test_score_std': '0.009'}
{'model_name': 'xgb', 'fit_time_mean': '0.084', 'fit_time_std': '0.002', 'test_score_mean': '0.995', 'test_score_std': '0.005'}
{'model_name': 'lgb', 'fit_time_mean': '0.387', 'fit_time_std': '0.014', 'test_score_mean': '0.994', 'test_score_std': '0.007'}
{'model_name': 'ebm main', 'fit_time_mean': '1.892', 'fit_time_std': '0.800', 'test_score_mean': '0.995', 'test_score_std': '0.005'}


## adult data

In [12]:
# Benchmark every model on the adult income data and keep its records.
dataset = load_adult_data()
adult_X, adult_y = dataset['full']['X'], dataset['full']['y']
result = benchmark_models('adult', adult_X, adult_y, n_splits=n_splits)
results.append(result)


------------------------------------------------------------------------------
adult
------------------------------------------------------------------------------
{'dataset_name': 'adult'}

{'model_name': 'linear-sgd', 'fit_time_mean': '0.601', 'fit_time_std': '0.063', 'test_score_mean': '0.891', 'test_score_std': '0.005'}
{'model_name': 'lr', 'fit_time_mean': '0.302', 'fit_time_std': '0.010', 'test_score_mean': '0.907', 'test_score_std': '0.003'}
{'model_name': 'rf-100', 'fit_time_mean': '1.006', 'fit_time_std': '0.009', 'test_score_mean': '0.903', 'test_score_std': '0.002'}
{'model_name': 'xgb', 'fit_time_mean': '8.373', 'fit_time_std': '0.018', 'test_score_mean': '0.922', 'test_score_std': '0.002'}
{'model_name': 'lgb', 'fit_time_mean': '3.465', 'fit_time_std': '0.632', 'test_score_mean': '0.929', 'test_score_std': '0.002'}
{'model_name': 'ebm main', 'fit_time_mean': '47.470', 'fit_time_std': '1.949', 'test_score_mean': '0.929', 'test_score_std': '0.002'}


## credit data

In [15]:
# Load the credit card fraud data and preview the first feature rows.
dataset = load_credit_data()
dataset['full']['X'].head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99


In [19]:
# Per-fold LightGBM run on the currently loaded dataset, with early stopping
# on the validation AUC.
train_x, train_y = dataset['full']['X'], dataset['full']['y']

skf = StratifiedKFold(n_splits=5, random_state=2019, shuffle=True)
for fold, (train_idx, val_idx) in enumerate(skf.split(train_x, train_y)):
    print(f'===========================fold:{fold}===================================')
    X_train, y_train = train_x.iloc[train_idx], train_y[train_idx]
    X_valid, y_valid = train_x.iloc[val_idx], train_y[val_idx]

    # metric="None" disables the built-in metric; AUC is requested through
    # eval_metric in fit() instead. The seed varies per fold.
    clf = lgb.LGBMClassifier(boosting_type="gbdt", num_leaves=10, reg_alpha=0, reg_lambda=1,
                             max_depth=-1, n_estimators=2000, objective='binary', subsample=0.8,
                             colsample_bytree=0.8, subsample_freq=1, learning_rate=0.02,
                             random_state=1000*fold+66, metric="None", n_jobs=-1)

    # NOTE(review): `verbose` and `early_stopping_rounds` as fit() keywords match the
    # LightGBM release that produced the recorded output; newer releases may require
    # callbacks instead - confirm against the installed version.
    clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric=['auc'],
            verbose=100, early_stopping_rounds=200)

    # 'valid_1' is the second eval_set entry, i.e. the held-out fold.
    temp_score = clf.best_score_['valid_1']['auc']
    print(f'=====fold:{fold}, best_score:{temp_score}, best_iteration:{clf.best_iteration_}')

Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.999875	valid_1's auc: 0.982535
[200]	training's auc: 0.999999	valid_1's auc: 0.981885
Early stopping, best iteration is:
[55]	training's auc: 0.997912	valid_1's auc: 0.989163
=====fold:0, best_score:0.9891628594475789, best_iteration:55
Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.99986	valid_1's auc: 0.984373
[200]	training's auc: 0.999999	valid_1's auc: 0.983368
[300]	training's auc: 1	valid_1's auc: 0.983232
Early stopping, best iteration is:
[117]	training's auc: 0.999966	valid_1's auc: 0.984852
=====fold:1, best_score:0.9848524816957717, best_iteration:117
Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.999838	valid_1's auc: 0.984401
[200]	training's auc: 0.999998	valid_1's auc: 0.984817
[300]	training's auc: 1	valid_1's auc: 0.98376
Early stopping, best iteration is:
[136]	training's auc: 0.999976	valid_1's auc: 0.

In [None]:
# Per-fold EBM run on the currently loaded dataset, scored with ROC AUC on the
# held-out fold.
train_x, train_y = dataset['full']['X'], dataset['full']['y']

skf = StratifiedKFold(n_splits=5, random_state=2019, shuffle=True)
for fold, (train_idx, val_idx) in enumerate(skf.split(train_x, train_y)):
    print(f'===========================fold:{fold}===================================')
    X_train, y_train = train_x.iloc[train_idx], train_y[train_idx]
    X_valid, y_valid = train_x.iloc[val_idx], train_y[val_idx]

    # NOTE(review): this cell has no execution count; confirm that the installed
    # interpret release accepts `n_estimators` and `learning_rate` here.
    clf = ExplainableBoostingClassifier(n_jobs=-1, interactions=0, n_estimators=500, learning_rate=0.05)
    clf.fit(X_train, y_train)

    # Probability of the positive class (second column) on the held-out fold.
    val_pred = clf.predict_proba(X_valid)[:, 1]
    temp_score = roc_auc_score(y_valid, val_pred)

    # The summary line mirrors the LightGBM cell; the attribute is read defensively
    # so a model without `best_iteration_` reports None instead of raising.
    best_iteration = getattr(clf, 'best_iteration_', None)
    print(f'=====fold:{fold}, best_score:{temp_score}, best_iteration:{best_iteration}')

In [16]:
# Benchmark every model on the dataset loaded in the credit cell above.
credit_X, credit_y = dataset['full']['X'], dataset['full']['y']
result = benchmark_models('credit-fraud', credit_X, credit_y, n_splits=n_splits)
results.append(result)


------------------------------------------------------------------------------
credit-fraud
------------------------------------------------------------------------------
{'dataset_name': 'credit-fraud'}

{'model_name': 'linear-sgd', 'fit_time_mean': '0.676', 'fit_time_std': '0.084', 'test_score_mean': '0.980', 'test_score_std': '0.007'}
{'model_name': 'lr', 'fit_time_mean': '0.882', 'fit_time_std': '0.075', 'test_score_mean': '0.974', 'test_score_std': '0.008'}
{'model_name': 'rf-100', 'fit_time_mean': '34.787', 'fit_time_std': '1.363', 'test_score_mean': '0.942', 'test_score_std': '0.016'}
{'model_name': 'xgb', 'fit_time_mean': '45.166', 'fit_time_std': '0.739', 'test_score_mean': '0.979', 'test_score_std': '0.004'}
{'model_name': 'lgb', 'fit_time_mean': '17.583', 'fit_time_std': '0.374', 'test_score_mean': '0.984', 'test_score_std': '0.005'}
{'model_name': 'ebm main', 'fit_time_mean': '456.001', 'fit_time_std': '665.034', 'test_score_mean': '0.969', 'test_score_std': '0.007'}


## telco churn data

In [8]:
# Benchmark every model on the telco churn data; this cell uses 3 splits
# rather than the shared n_splits value.
dataset = load_telco_churn_data()
telco_X, telco_y = dataset['full']['X'], dataset['full']['y']
result = benchmark_models('telco-churn', telco_X, telco_y, n_splits=3)
results.append(result)


------------------------------------------------------------------------------
telco-churn
------------------------------------------------------------------------------
{'dataset_name': 'telco-churn'}

{'model_name': 'linear-sgd', 'fit_time_mean': '2.236', 'fit_time_std': '0.213', 'test_score_mean': '0.798', 'test_score_std': '0.008'}
{'model_name': 'lr', 'fit_time_mean': '25.970', 'fit_time_std': '2.931', 'test_score_mean': '0.804', 'test_score_std': '0.015'}
{'model_name': 'rf-100', 'fit_time_mean': '3.310', 'fit_time_std': '1.204', 'test_score_mean': '0.824', 'test_score_std': '0.002'}
{'model_name': 'xgb', 'fit_time_mean': '140.873', 'fit_time_std': '0.970', 'test_score_mean': '0.850', 'test_score_std': '0.006'}
{'model_name': 'ebm main', 'fit_time_mean': '11.451', 'fit_time_std': '1.413', 'test_score_mean': '0.851', 'test_score_std': '0.005'}
{'model_name': 'ebm-interact', 'fit_time_mean': '5.363', 'fit_time_std': '0.854', 'test_score_mean': '0.851', 'test_score_std': '0.005'}


In [9]:
# Flatten the per-dataset record lists into one list of records.
records = []
for dataset_records in results:
    records.extend(dataset_records)

# Keep only the score columns and write them to disk.
report_columns = ['dataset_name', 'model_name', 'test_score_mean', 'test_score_std']
record_df = pd.DataFrame.from_records(records)[report_columns]
record_df.to_csv('ebm-perf-classification-overnight.csv')