In [1]:
!pip install thundergbm

Collecting thundergbm
[?25l  Downloading https://files.pythonhosted.org/packages/39/83/2b3823be05ecceaf0edcb666950dddcba9601ef764b22b6e99cea3259a98/thundergbm-0.3.16-py3-none-any.whl (2.8MB)
[K     |████████████████████████████████| 2.8MB 3.5MB/s 
Installing collected packages: thundergbm
Successfully installed thundergbm-0.3.16


In [2]:
!pip install scikit-optimize

Collecting scikit-optimize
[?25l  Downloading https://files.pythonhosted.org/packages/5c/87/310b52debfbc0cb79764e5770fa3f5c18f6f0754809ea9e2fc185e1b67d3/scikit_optimize-0.7.4-py2.py3-none-any.whl (80kB)
[K     |████                            | 10kB 12.7MB/s eta 0:00:01[K     |████████▏                       | 20kB 2.1MB/s eta 0:00:01[K     |████████████▎                   | 30kB 2.7MB/s eta 0:00:01[K     |████████████████▎               | 40kB 2.0MB/s eta 0:00:01[K     |████████████████████▍           | 51kB 2.3MB/s eta 0:00:01[K     |████████████████████████▌       | 61kB 2.6MB/s eta 0:00:01[K     |████████████████████████████▌   | 71kB 3.0MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 2.6MB/s 
Collecting pyaml>=16.9
  Downloading https://files.pythonhosted.org/packages/15/c4/1310a054d33abc318426a956e7d6df0df76a6ddfa9c66f6310274fb75d42/pyaml-20.4.0-py2.py3-none-any.whl
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-

In [4]:
import pandas as pd
import numpy as np
from numpy import mean
import os
import time
from scipy import interp

from thundergbm import TGBMClassifier

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, precision_recall_curve, auc, roc_curve
from sklearn.preprocessing import label_binarize, LabelEncoder

from skopt.space import Integer
from skopt.space import Real
from skopt.utils import use_named_args
from skopt import gp_minimize


In [37]:
# Globally silence NumPy's divide-by-zero / invalid-value floating-point warnings:
# metrics_calc divides per-class counts (e.g. tp / (tp + fn)) that can be 0 / 0 and
# then averages with np.nanmean. The call returns the previous error settings.
np.seterr(divide='ignore', invalid='ignore')


{'divide': 'warn', 'invalid': 'warn', 'over': 'warn', 'under': 'ignore'}

In [6]:

# Directory whose entries are listed with os.listdir and read one by one as datasets.
files_path = '/path/to/datasets'

In [7]:
# Every entry of the directory is later passed to read_dataset (pd.read_csv), so the
# folder is expected to contain only CSV files.
files_list = os.listdir(files_path)

# Data preparation

In [8]:
def read_dataset(folder_path, filename):
    """Load a single CSV file into a DataFrame.

    Parameters
    ----------
    folder_path : str
        Directory that contains the file.
    filename : str
        Name of the CSV file inside ``folder_path``.

    Returns
    -------
    pandas.DataFrame
        The file parsed by ``pd.read_csv`` with default options.
    """
    csv_path = os.path.join(folder_path, filename)
    return pd.read_csv(csv_path)

In [2]:
def dummy_encode(df):
    """
    One-hot encode the categorical columns of a DataFrame.

    Columns whose dtype is ``category`` or ``object`` are replaced by
    indicator columns named ``<column>_<value>``; every other column is
    kept as it is. A frame without such columns is returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table to encode.

    Returns
    -------
    numpy.ndarray
        ``.values`` of the (possibly encoded) frame.
    """
    cols_to_encode = list(df.select_dtypes(include=['category', 'object']))
    if cols_to_encode:
        # Ask for integer indicators explicitly. pandas >= 2.0 defaults to bool
        # indicator columns, and a frame mixing bool with numeric columns turns
        # into an object-dtype array under `.values`, which numeric consumers
        # cannot use. uint8 was the default of older pandas versions.
        df = pd.get_dummies(df,
                            columns=cols_to_encode,
                            prefix=cols_to_encode,
                            dtype='uint8')

    return df.values

In [11]:
def prepare_data(df):
    """Turn a raw dataset frame into model inputs.

    The last column is treated as the target and all preceding columns as
    features. Missing values in numeric columns (the target included, when
    it is numeric) are filled with the column mean before the split.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset with the target in the last column.

    Returns
    -------
    X : numpy.ndarray
        Features after one-hot encoding by ``dummy_encode``.
    y : numpy.ndarray
        Target encoded to integers by ``LabelEncoder``.
    classes : numpy.ndarray
        Sorted unique values of ``y``.
    """
    # numeric_only=True: with categorical/object columns present, a plain
    # df.mean() raises TypeError on pandas >= 2.0. Older versions silently
    # skipped those columns, which is the behaviour kept here.
    df = df.fillna(df.mean(numeric_only=True))
    features = df.iloc[:, :-1]
    target = df.iloc[:, -1]

    y = LabelEncoder().fit_transform(target)
    X = dummy_encode(features)
    classes = np.unique(y)

    return X, y, classes

# Metrics calculations

In [35]:
def metrics_calc(y_true, y_pred, classes, clf):
    """Compute class-averaged evaluation metrics for one test fold.

    Parameters
    ----------
    y_true : array-like
        True labels of the evaluated samples.
    y_pred : array-like
        Predicted scores. When ``len(classes) == 2`` it is handed as-is to
        ``roc_curve`` / ``precision_recall_curve``; otherwise it is indexed
        as a 2-D array with one column per class.
    classes : array-like
        All class labels of the dataset.
    clf : fitted classifier
        ``clf.group_label`` is read, and when ``len(classes) <= 2`` the
        hard predictions are derived from ``clf.predict_label``.

    Returns
    -------
    tuple of float
        ``(mean_acc, mean_tpr, mean_fpr, mean_precision, mean_roc_auc,
        mean_pr_auc)``, each a ``np.nanmean`` over the per-class values.
    """

    def _curve_aucs(truth, scores):
        # Area under the ROC curve and under the precision-recall curve.
        fpr_curve, tpr_curve, _ = roc_curve(truth, scores)
        precision_curve, recall_curve, _ = precision_recall_curve(truth, scores)
        return auc(fpr_curve, tpr_curve), auc(recall_curve, precision_curve)

    # Work on a copy so the classifier's own label list is not modified.
    group_labels = list(clf.group_label)
    # Add classes missing from the classifier's labels (e.g. absent from the
    # training fold) so the binarized matrix has one column per class.
    if len(group_labels) < len(classes):
        group_labels.extend(sorted(set(classes) - set(group_labels)))
    # `classes` must be passed by keyword: it is keyword-only in current
    # scikit-learn releases.
    y_true_label = label_binarize(y_true, classes=group_labels)

    if len(classes) > 2:
        y_pred_arg = np.argmax(y_pred, axis=1)
        cnf_matrix = confusion_matrix(np.argmax(y_true_label, axis=1),
                                      y_pred_arg,
                                      labels=sorted(group_labels))
    else:
        # NOTE(review): the hard labels come from clf.predict_label, not from
        # y_pred - presumably the output of the latest predict() call; confirm.
        y_pred_arg = np.where(clf.predict_label > 0.5, 1, 0)
        cnf_matrix = confusion_matrix(y_true, y_pred_arg)

    # Per-class one-vs-rest counts taken from the confusion matrix.
    tp = np.diag(cnf_matrix).astype(float)
    fp = cnf_matrix.sum(axis=0).astype(float) - tp
    fn = cnf_matrix.sum(axis=1).astype(float) - tp
    tn = cnf_matrix.sum() - (fp + fn + tp)

    # 0 / 0 yields NaN for classes without samples or predictions; those
    # entries are skipped by nanmean. errstate keeps this independent of the
    # global np.seterr configuration.
    with np.errstate(divide='ignore', invalid='ignore'):
        mean_acc = np.nanmean((tp + tn) / (tp + fp + fn + tn))
        mean_tpr = np.nanmean(tp / (tp + fn))
        mean_fpr = np.nanmean(fp / (fp + tn))
        mean_precision = np.nanmean(tp / (tp + fp))

    roc_auc_list = []
    pr_auc_list = []

    if len(classes) == 2:
        roc_auc, pr_auc = _curve_aucs(y_true, y_pred)
        roc_auc_list.append(roc_auc)
        pr_auc_list.append(pr_auc)
    else:
        # One-vs-rest curves, one per probability column.
        for i in range(y_pred.shape[1]):
            roc_auc, pr_auc = _curve_aucs(y_true_label[:, i], y_pred[:, i])
            roc_auc_list.append(roc_auc)
            pr_auc_list.append(pr_auc)

    mean_roc_auc = np.nanmean(roc_auc_list)
    mean_pr_auc = np.nanmean(pr_auc_list)

    return mean_acc, mean_tpr, mean_fpr, mean_precision, mean_roc_auc, mean_pr_auc

In [15]:
import warnings
# Hide warnings whose text starts with this exact message.
# NOTE(review): presumably raised by skopt's gp_minimize when it proposes a point
# it has already evaluated (likely with this small integer search space) - confirm.
warnings.filterwarnings('ignore', message='The objective has been evaluated at this point before.')

# Training

In [38]:
def _probability_matrix(clf, n_rows, n_classes):
    """Arrange the flat buffer ``clf.predict_label_ptr`` as an (n_rows, n_classes) array.

    The buffer is reshaped to ``(clf.num_class, -1)`` and transposed; columns
    beyond ``clf.num_class`` stay zero.
    """
    prob_array = np.zeros((n_rows, n_classes))
    prob_array[:, :clf.num_class] = np.array(list(clf.predict_label_ptr)).reshape(clf.num_class, -1).T
    return prob_array


def _label_positions(labels, group_labels):
    """Map every label to its position in ``group_labels`` (0 when it is not listed)."""
    position = {label: idx for idx, label in enumerate(group_labels)}
    return np.array([position.get(label, 0) for label in labels])


def _format_metric(value, spec):
    """Format a metric for the results file, writing an undefined (NaN) value as 0."""
    return spec.format(0.0 if np.isnan(value) else value)


for file in files_list:
    print('file', file)

    # The first column is dropped before preparation.
    # NOTE(review): presumably an index/id column - confirm against the datasets.
    df = read_dataset(files_path, file).iloc[:, 1:]

    X, y, classes = prepare_data(df)

    search_space = [Integer(4, 10, name='depth', dtype=int),
                    Integer(10, 80, name='n_trees', dtype=int)]

    @use_named_args(search_space)
    def evaluate_model(**params):
        """Objective for gp_minimize: 1 - mean accuracy over a 3-fold CV for ``params``."""
        results = []
        kf = KFold(n_splits=3, random_state=42, shuffle=True)
        for train_index, test_index in kf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            clf = TGBMClassifier(num_class=len(classes),
                                 objective="multi:softprob")
            clf.set_params(**params)
            clf.fit(X_train, y_train)

            # The return value is not needed; the probabilities are read from
            # clf.predict_label_ptr right after the call.
            clf.predict(X_test)
            # Zero-padded to the full number of classes in case the training
            # fold contained fewer of them.
            y_pred = _probability_matrix(clf, X_test.shape[0], len(classes))

            # Compare positions within clf.group_label with the arg-max column.
            # An explicit lookup is used instead of argmax(label_binarize(...)):
            # for two classes label_binarize returns a single column, so its
            # argmax is always 0 and the score would be meaningless.
            y_test_pos = _label_positions(y_test, clf.group_label)
            results.append(accuracy_score(y_test_pos, y_pred.argmax(axis=1)))

        # gp_minimize minimizes, so return the mean error rate.
        return 1.0 - mean(results)

    # perform optimization
    # NOTE(review): no random_state is passed, so the search is not reproducible.
    result = gp_minimize(evaluate_model, search_space, n_calls=50)

    print('Best Parameters: %s=%d, %s=%d' % (search_space[0].name, result.x[0],
                                             search_space[1].name, result.x[1]))

    best_params_vals = result.x
    params_name = ['depth', 'n_trees']
    best_params = dict(zip(params_name, best_params_vals))

    # Final evaluation of the best parameters with a 10-fold cross validation.
    n_folds = 10
    kf = KFold(n_splits=n_folds, random_state=21, shuffle=True)

    acc_list = []
    tpr_list = []
    fpr_list = []
    precision_list = []
    roc_auc_list = []
    pr_auc_list = []
    training_time_list = []
    inference_time_list = []
    cross_val = []

    for fold_num, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf = TGBMClassifier(objective="multi:softprob")
        clf.set_params(**best_params)

        t0 = time.time()
        clf.fit(X_train, y_train)
        train_time = time.time() - t0  # seconds

        t2 = time.time()
        preds = clf.predict(X_test)
        t3 = time.time()
        inference_time = (t3 - t2) * 1000 / y_test.shape[0]  # milliseconds per sample

        if len(classes) > 2:
            y_pred = _probability_matrix(clf, X_test.shape[0], len(classes))
        else:
            # Two classes: the raw output of predict() is used as the score.
            y_pred = preds

        acc, mean_tpr, mean_fpr, mean_precision, mean_roc_auc, mean_pr_auc = metrics_calc(y_test, y_pred, classes, clf)

        # NaN is replaced before formatting: once a value is the string 'nan',
        # DataFrame.fillna no longer recognises it as missing.
        acc_list.append(_format_metric(acc, "{:.3f}"))
        tpr_list.append(_format_metric(mean_tpr, "{:.3f}"))
        fpr_list.append(_format_metric(mean_fpr, "{:.3f}"))
        precision_list.append(_format_metric(mean_precision, "{:.3f}"))
        roc_auc_list.append(_format_metric(mean_roc_auc, "{:.3f}"))
        pr_auc_list.append(_format_metric(mean_pr_auc, "{:.3f}"))
        training_time_list.append("{:.1f}".format(train_time))
        inference_time_list.append("{:.1f}".format(inference_time))

        cross_val.append(fold_num)

    results_dict = {'Dataset Name': [file.split('.')[0]] * n_folds,
                    'Algorithm Name': ['thundergbm'] * n_folds,
                    'Cross Validation': cross_val,
                    'Hyper Parameters Values': [best_params_vals] * n_folds,
                    'Accuracy': acc_list,
                    'tpr': tpr_list,
                    'FPR': fpr_list,
                    'Precision': precision_list,
                    'AUC': roc_auc_list,
                    'PR-Curve': pr_auc_list,
                    'Training Time': training_time_list,
                    'Inference Time': inference_time_list
                    }

    results_df = pd.DataFrame.from_dict(results_dict)

    # Appended without a header so the rows of all datasets share one file.
    results_df.to_csv('path/to/results/file',
                      mode='a', header=False)



file analcatdata_lawsuit.csv
Best Parameters: depth=6, n_trees=51


