In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
!pip install catboost
!pip install scikit-optimize

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/90/86/c3dcb600b4f9e7584ed90ea9d30a717fb5c0111574675f442c3e7bc19535/catboost-0.24.1-cp36-none-manylinux1_x86_64.whl (66.1MB)
[K     |████████████████████████████████| 66.1MB 43kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.1
Collecting scikit-optimize
[?25l  Downloading https://files.pythonhosted.org/packages/5c/87/310b52debfbc0cb79764e5770fa3f5c18f6f0754809ea9e2fc185e1b67d3/scikit_optimize-0.7.4-py2.py3-none-any.whl (80kB)
[K     |████████████████████████████████| 81kB 3.8MB/s 
Collecting pyaml>=16.9
  Downloading https://files.pythonhosted.org/packages/15/c4/1310a054d33abc318426a956e7d6df0df76a6ddfa9c66f6310274fb75d42/pyaml-20.4.0-py2.py3-none-any.whl
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-20.4.0 scikit-optimize-0.7.4


In [14]:
import pandas as pd
import numpy as np
import os
import time

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score

from scipy import interp

import catboost
from catboost import CatBoostClassifier, Pool

from numpy import mean
from skopt.space import Integer
from skopt.space import Real
from skopt.utils import use_named_args
from skopt import gp_minimize

In [6]:
files_path = '/content/drive/My Drive/ml_2020/project/classification_datasets'

In [7]:
files_list = os.listdir(files_path)

# Data preparation

In [8]:
def read_dataset(folder_path, filename):
    df = pd.read_csv(os.path.join(folder_path, filename))
    return df

In [9]:
def prepare_data(df, target_col=None):
    df = df.fillna(df.mean())
    df = df.fillna('')

    if target_col:
        X = df.iloc[:, df.columns != target_col]
        y = df.loc[:, df.columns == target_col]
    else:
        X = df.iloc[:, :-1]
        y = df.iloc[:, -1]

    lb = LabelEncoder()
    y = lb.fit_transform(y)

    cat_cols = list(X.select_dtypes(include=['category','object']))
    cat_features = [X.columns.get_loc(c) for c in cat_cols]        

    classes = np.unique(y)

    return X.values, y, cat_features, classes

#Metrics calculationns

In [16]:
def metrics_calc(y_true, y_pred, classes):

    y_true_label = label_binarize(y_true, classes)

    y_pred_arg = np.argmax(y_pred, axis=1)

    cnf_matrix = confusion_matrix(y_true, y_pred_arg)

    fp = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)  
    fn = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
    tp = np.diag(cnf_matrix)
    tn = cnf_matrix.sum() - (fp + fn + tp)

    fp = fp.astype(float)
    fn = fn.astype(float)
    tp = tp.astype(float)
    tn = tn.astype(float)

    tpr = tp/(tp+fn)
    fpr = fp/(fp+tn)
    precision = tp/(tp+fp)
    acc = (tp+tn)/(tp+fp+fn+tn)

    mean_acc = np.nanmean(acc)
    mean_tpr = np.nanmean(tpr)
    mean_fpr = np.nanmean(fpr)
    mean_precision = np.nanmean(precision)

    precision = dict()
    recall = dict()
    fpr = dict()
    tpr = dict()
    roc_auc_list = []
    pr_auc_list = []

    if len(classes) == 2:
        fpr, tpr, threshold_roc = roc_curve(y_true, y_pred[:, 1])

        roc_auc = auc(fpr, tpr)
        roc_auc_list.append(roc_auc)

        
        precision, recall, threshold_pr = precision_recall_curve(y_true,
                                                            y_pred[:, 1])
        
        pr_auc = auc(recall, precision)
        pr_auc_list.append(pr_auc)

    else:
        for i in range(y_pred.shape[1]):
            fpr[i], tpr[i], threshold_roc = roc_curve(y_true_label[:, i], y_pred[:, i])
            roc_auc = auc(fpr[i], tpr[i])
            roc_auc_list.append(roc_auc)

            precision[i], recall[i], threshold_pr = precision_recall_curve(y_true_label[:, i],
                                                                y_pred[:, i])

            pr_auc = auc(recall[i], precision[i])

            pr_auc_list.append(pr_auc)


    mean_roc_auc = np.nanmean(roc_auc_list)
    mean_pr_auc = np.nanmean(pr_auc_list)

    print('mean roc auc',mean_roc_auc)
    print('mean pr auc', mean_pr_auc)

    return mean_acc, mean_tpr, mean_fpr, mean_precision, mean_roc_auc, mean_pr_auc


# Hyperparameters optimization

In [20]:
def hyper_optimization(file, X, y):
    search_space = [Integer(2, 4, name='depth', dtype=int),
                    Real(0.1, 0.3, name='learning_rate', prior='log_uniform'),
                    Integer(10, 20, name='l2_leaf_reg', dtype=int)]

    @use_named_args(search_space)
    def evaluate_model(**params):
        results = []
        kf = KFold(n_splits=3, random_state=42, shuffle=True)
        kf.get_n_splits(X)
        for train_index, test_index in kf.split(X, y):

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            clf = CatBoostClassifier(iterations=50,
                                    random_seed=0, 
                                    custom_metric=['AUC:hints=skip_train~false'],
                                    eval_metric='AUC',
                                    )

            clf.set_params(**params)

            clf.fit(X_train, y_train, cat_features, verbose=False)

            y_pred = clf.predict_proba(X_test)
            y_pred_arg = np.argmax(y_pred, axis=1)

            result = accuracy_score(y_test, y_pred_arg)

            results.append(result)

        # calculate the mean of the scores
        estimate = mean(results)
        return 1.0 - estimate

    # perform optimization
    result = gp_minimize(evaluate_model, search_space, n_calls=50)

    return result


# Training

In [21]:
for file in files_list:
    print('file', file)

    df = read_dataset(files_path, file)

    if file == 'analcatdata_germangss.csv':
        X, y, cat_features, classes = prepare_data(df, 'Political_system')
    else:
        X, y, cat_features, classes = prepare_data(df)

    result = hyper_optimization(file, X, y)

    best_params_vals = result.x
    params_name = ['depth', 'learning_rate', 'l2_leaf_reg']
    best_params = dict(zip(params_name, best_params_vals))

    print(best_params)

    kf = KFold(n_splits=10, random_state=42, shuffle=True)
    kf.get_n_splits(X)

    acc_list = []
    tpr_list = []
    fpr_list = []
    precision_list = []
    roc_auc_list = []
    pr_auc_list = []
    training_time_list = []
    inference_time_list = []
    cross_val = []

    for fold_num, data_index in enumerate(kf.split(X, y)):
        train_index, test_index = data_index[0], data_index[1]
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf = CatBoostClassifier(iterations=300,
                           random_seed=0, 
                           custom_metric=['AUC:hints=skip_train~false'],
                           eval_metric='AUC',
                        )

        clf.set_params(**best_params)

        t0 = time.time()

        clf.fit(X_train, y_train, cat_features, verbose=False)

        t1 = time.time()
        train_time = t1 - t0

        print('train time', train_time)

        t2 = time.time()

        y_pred = clf.predict_proba(X_test)

        t3 = time.time()
        inference_time = (t3 - t2) * 1000 / y_test.shape[0]

        acc, mean_tpr, mean_fpr, mean_precision, mean_roc_auc, mean_pr_auc = metrics_calc(y_test, y_pred, classes)

        acc_list.append("{:.3f}".format(acc))
        tpr_list.append("{:.3f}".format(mean_tpr))
        fpr_list.append("{:.3f}".format(mean_fpr))
        precision_list.append("{:.3f}".format(mean_precision))
        roc_auc_list.append("{:.3f}".format(mean_roc_auc))
        pr_auc_list.append("{:.3f}".format(mean_pr_auc))
        training_time_list.append("{:.1f}".format(train_time))
        inference_time_list.append("{:.1f}".format(inference_time))

        cross_val.append(fold_num + 1)

    results_dict = {'Dataset Name':[file.split('.')[0]] * 10,
                'Algorithm Name':['catboost'] * 10,
                'Cross Validation':cross_val,
                'Hyper Parameters Values': [best_params_vals] * 10,
                'Accuracy':acc_list,
                'tpr':tpr_list,
                'FPR':fpr_list,
                'Precision':precision_list,
                'AUC':roc_auc_list,
                'PR-Curve':pr_auc_list,
                'Training Time':training_time_list,
                'Inference Time':inference_time_list
                }

    df_res = pd.DataFrame.from_dict(results_dict)
    df_res = df_res.fillna(0)

    df_res.to_csv('path/to/results',
              mode='a', header=False, index=False)

file analcatdata_lawsuit.csv
{'depth': 3, 'learning_rate': 0.25477871012925496, 'l2_leaf_reg': 12}
train time 0.18136358261108398
mean roc auc 0.98
mean pr auc 0.7916666666666666
train time 0.13842296600341797
mean roc auc 0.9791666666666666
mean pr auc 0.8777777777777778
train time 0.13599443435668945
mean roc auc 1.0
mean pr auc 1.0




train time 0.14283394813537598
mean roc auc 1.0
mean pr auc 1.0
train time 0.13167905807495117
mean roc auc 1.0
mean pr auc 1.0
train time 0.14202642440795898
mean roc auc 1.0
mean pr auc 1.0
train time 0.13060665130615234
mean roc auc 1.0
mean pr auc 1.0
train time 0.14205622673034668
mean roc auc 1.0
mean pr auc 1.0
train time 0.1385810375213623
mean roc auc 1.0
mean pr auc 1.0
train time 0.1402289867401123
mean roc auc 1.0
mean pr auc 1.0


FileNotFoundError: ignored