In [None]:
## This notebook does not display any charts in Colab. Download it and run it in Jupyter notebook

In [None]:
# Last amended: 29th May, 2022
# Data Source: https://www.kaggle.com/c/amazon-employee-access-challenge
# catboost Ref: 
#                1. https://github.com/catboost/tutorials/blob/master/python_tutorial_with_tasks.ipynb
#                2. Useful examples: https://catboost.ai/en/docs/concepts/python-usages-examples
#                3. Data visualization: https://catboost.ai/en/docs/features/visualization

# Objective:
#             a. Predict an employee's access needs, given his/her job role
#             b. catboost learning visualization
#             c. catboost crossvalidation
#             d. catboost hyperparameter tuning
#             e. catboost and SHAP integration. Explaining models
#
"""

On Anaconda create a new environment 'catboost' as follows (one line code):

   >conda create -n catboost python=3.7.13 scikit-learn pandas numpy ipython shap jupyter catboost matplotlib seaborn ipywidgets hyperopt -c anaconda -c conda-forge


If required run the following command in 'catboost' environment.

    >conda activate catboost
    >ipython kernel install --name catboost --user

"""

In this tutorial we use the Amazon Employee Access Challenge dataset from a [Kaggle](https://www.kaggle.com) competition for our experiments. The data can be downloaded [here](https://www.kaggle.com/c/amazon-employee-access-challenge/data). The same dataset also ships with the catboost library (`catboost.datasets.amazon()`); the loading cell below can use either that copy or CSV files read from disk.

## Libraries installation
Only needed in Colab

In [None]:
# -1.0  Not needed if software already installed
#!pip install --user --upgrade catboost
#!pip install --user --upgrade ipywidgets

# SHAP (SHapley Additive exPlanations) is a game theoretic
#  approach to explain the output of any machine learning model.
# https://shap.readthedocs.io/en/latest/index.html
#!pip install shap

#!pip install sklearn
#!pip install --upgrade numpy
#!jupyter nbextension enable --py widgetsnbextension

In [None]:
# -1.1 Not needed
#      Check catboost and python versions
#      Restart runtime if import fails:

#import catboost           # 1.0.6
#print(catboost.__version__)
#!python --version         # 3.7.13

## Call libraries

In [None]:
# 1.0 Call libraries

# 1.1 Data manipulation
import pandas as pd
import numpy as np

# 1.2 Catboost related
from catboost import *
from catboost import CatBoostClassifier, Pool, metrics, cv
from catboost import datasets

# 1.2.1 For catboost results visualization
from catboost import MetricVisualizer

# 1.3 Bayesian optimization
import hyperopt

# 1.4
import sklearn
from sklearn.model_selection import train_test_split

# 1.5 Plotting, in general
import matplotlib.pyplot as plt
import os, time ,gc

In [None]:
# 1.6 Numpy output be printed to
#      how many decimal points:

np.set_printoptions(precision=4)

In [None]:
# 1.7 Display multiple command outputs from a cell:

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# 1.8 Set path to dataset

# Location of train.csv / test.csv. The original machine-specific folder is
# kept as the default; set the AMAZON_DATA_DIR environment variable to point
# the notebook at a different folder without editing this cell.
path = os.environ.get(
    "AMAZON_DATA_DIR",
    "D:\\data\\OneDrive\\Documents\\Amazon.com-Employee_Access_ Challenge",
)
os.chdir(path)      # all relative paths below are resolved against this folder
os.listdir()

## Reading the data

In [None]:
# 2.0 Load train/test data from catboost package:
#     Machine must be connected to Internet to download the dataset

#(train_df,test_df) =  catboost.datasets.amazon()

# 2.0.1 Read from disk
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [None]:
# 2.1 Very brief exploration:

# The print() calls emit section banners. Each bare expression after a banner
# is rendered as cell output because ast_node_interactivity is set to "all"
# in the setup cell.
print("\n===dtypes===")
train_df.dtypes      # dtype of each column
print("\n===Shape ===")
train_df.shape       # (rows, columns)
print("\n======")
train_df.head()      # first rows of the frame

In [None]:
# 2.2 Get unique values in each feature:

# Cardinality of every column, computed in one pass over the frame
for column_name, distinct_count in train_df.nunique().items():
    print(column_name, distinct_count)

## Preparing your data

### Transform all columns to category:

In [None]:
# 2.3 Shrink memory usage:

# Footprint of the frame as loaded, in megabytes
print("\n==Memory usage in mb==")
train_df.memory_usage().sum()/1000000   # 2.621648

print("\n")

# Cast all columns to 'category' with a single frame-level call
train_df = train_df.astype('category')

# Footprint after the cast, same unit as above
print("\n==Memory usage after transformation==")
train_df.memory_usage().sum()/1000000   # 1.221142

Extract X and y

In [None]:
# 2.4 X and y:

# Predictors are every column except the target; the target is ACTION
X = train_df.drop(columns='ACTION')
y = train_df['ACTION']

Categorical features declaration

In [None]:
# 2.5 Get column indicies of all categorical features

# Every predictor column is declared categorical, so list all positions
cat_features = [position for position in range(len(X.columns))]
cat_features

Is our data balanced?

In [None]:
# 2.6 Not much

y.value_counts()
y.value_counts(normalize = True)

### What is a Pool class:
See [here](https://stackoverflow.com/a/68238224/3282777)

CatBoost works only with <i>Pools</i>, its internal data format. If you pass a numpy array to it, the array is implicitly converted to a Pool first, without telling you. If you need to apply many formulas to one dataset, using a Pool drastically increases performance (by about 10x), because you skip the conversion step each time.

In [None]:
# 3.0

# 3.1 Pool will be created and saved in this folder
# Target folder for the exported copies; created on first run
dataset_dir = './amazon'
if not os.path.exists(dataset_dir):
    os.makedirs(dataset_dir)

# 3.2 / 3.3 Comma-separated copies of train and test, header row included
for frame, file_name in ((train_df, 'train.csv'), (test_df, 'test.csv')):
    frame.to_csv(os.path.join(dataset_dir, file_name),
                 index=False, sep=',', header=True)

# 3.4 / 3.5 Tab-separated copies of train and test, no header row
for frame, file_name in ((train_df, 'train.tsv'), (test_df, 'test.tsv')):
    frame.to_csv(os.path.join(dataset_dir, file_name),
                 index=False, sep='\t', header=False)


In [None]:
# 3.6 Understanding code that follows:

# 3.6.1
n_columns = len(train_df.columns)
list(range(1, n_columns))    # [1,2,3,4,5,6,7,8,9]

# 3.6.2 Pair each non-target column name with a 0-based counter
predictor_columns = train_df.columns[1:]
list(enumerate(predictor_columns))    # [(0, 'RESOURCE'),(1, 'MGR_ID'),(2, 'ROLE_ROLLUP_1')...
# The same pairs as a dict
dict(enumerate(predictor_columns))    # {0: 'RESOURCE',1: 'MGR_ID', 2: 'ROLE_ROLLUP_1',

# 3.6.3 Keep that dict for the column-description file
feature_names = dict(enumerate(predictor_columns))
feature_names       #  {0: 'RESOURCE',1: 'MGR_ID', 2: 'ROLE_ROLLUP_1',

In [None]:
# 3.7 Generate a columns description file (cd) with the given structure.
#     https://catboost.ai/en/docs/concepts/python-reference_utils_create_cd

from catboost.utils import create_cd

# 3.7.1 Column 0 is the label; columns 1..last are categorical.
cd_path = os.path.join(dataset_dir, 'train.cd')          # where the description is stored
categorical_columns = list(range(1, len(train_df.columns)))  # [1, 2, 3, 4, 5, 6, 7, 8, 9]
create_cd(
    label=0,
    cat_features=categorical_columns,
    feature_names=feature_names,      # dict built in step 3.6.3
    output_path=cd_path,
)

In [None]:
# 3.8 Just look at your saved data:
#     A complex powershell command:

# Show the first five lines of each exported file. Plain Python is used
# instead of a PowerShell call so the cell also runs outside Windows.
print("\n\n==train.csv==\n")
with open(os.path.join(dataset_dir, 'train.csv')) as preview_file:
    for _ in range(5):
        print(preview_file.readline(), end='')

print("\n\n==train.tsv==\n")
with open(os.path.join(dataset_dir, 'train.tsv')) as preview_file:
    for _ in range(5):
        print(preview_file.readline(), end='')


In [None]:
# 3.9 Your column descriptor file

# Print the column-description file with plain Python so the cell does not
# depend on a Windows shell command.
with open(os.path.join(dataset_dir, 'train.cd')) as cd_file:
    print(cd_file.read())

In [None]:
help(Pool)

"""
Pool(  data, label=None, 
       cat_features=None,
       text_features=None, embedding_features=None,
       column_description=None, pairs=None, delimiter='\t',
       has_header=False, ignore_csv_quoting=False, weight=None,
       group_id=None, group_weight=None, subgroup_id=None,
       pairs_weight=None, baseline=None, feature_names=None, 
       thread_count=-1, log_cout=<ipykernel.iostream.OutStream object at 0x0
       )
"""

In [None]:
# 4.0 What is a Pool in catboost? 
#     https://catboost.ai/en/docs/concepts/python-reference_pool
#     See StackOverflow here: https://stackoverflow.com/a/65852092/3282777
#     Pool used in CatBoost as a data structure to train model from.

# 4.1 One way to define pool
# pool1: built from the in-memory frame, features plus label
pool1 = Pool(
             data=X, label=y,              # Has both X,y
             cat_features=cat_features     # [0, 1, 2, 3, 4, 5, 6, 7, 8]
             )


# 4.2 Pool read from the exported csv file; column roles come from the
#     column-description file written in step 3.7
pool2 = Pool(
                data=os.path.join(dataset_dir, 'train.csv'), 
                delimiter=',', 
                column_description=os.path.join(dataset_dir, 'train.cd'),
                has_header=True
)


# 4.3 Pool built from the features only (no label is passed)
pool3 = Pool(data=X, 
             cat_features=cat_features
            )


# Compare the shapes reported by the three pools
print('Dataset shape')
print('dataset 1:' + str(pool1.shape) +
      '\ndataset 2:' + str(pool2.shape) + 
      '\ndataset 3:' + str(pool3.shape)) 

    

# Compare the feature names reported by the three pools
print('\n')
print('Column names')
print('dataset 1:')
print(pool1.get_feature_names()) 
print('\ndataset 2:')
print(pool2.get_feature_names())
print('\ndataset 3:')
print(pool3.get_feature_names())


## Split your data into train and validation

In [None]:
# 5.0

# 80% of the rows go to training, the rest to validation; the fixed
# random_state keeps the partition the same across runs.
split_options = dict(train_size=0.8, random_state=1234)
X_train, X_validation, y_train, y_validation = train_test_split(X, y, **split_options)

#### Selecting the objective function

The default optimized objective depends on various conditions:

    Logloss — The target has only two different values 
    MultiClass — The target has more than two different values 
    

## Train the model

In [None]:
# 5.1 Instantiate the model:
#     https://catboost.ai/en/docs/concepts/python-reference_catboostclassifier

# loss_function options are: Logloss, CrossEntropy, MultiClass, MultiClassOneVsAll
model = CatBoostClassifier(iterations=50, loss_function='Logloss', verbose=5)


# 5.2 Train the model:
#     https://catboost.ai/en/docs/concepts/python-reference_catboostclassifier_fit

validation_pair = (X_validation, y_validation)   # evaluated and reported during training
model.fit(X_train, y_train, cat_features=cat_features, eval_set=validation_pair)

## Metrics calculation and graph plotting

In [None]:
# 6.0 Fitting as also 
#     https://catboost.ai/en/docs/concepts/python-reference_catboostclassifier

# Same kind of classifier as before, now also tracking AUC and Accuracy
metric_model_settings = dict(
    iterations=50,
    random_seed=63,
    learning_rate=0.5,
    custom_loss=['AUC', 'Accuracy'],
)
model = CatBoostClassifier(**metric_model_settings)


# 6.1
#  https://catboost.ai/en/docs/concepts/python-reference_catboostclassifier_fit

# plot=True requests the interactive training chart; text logging is off
model.fit(X_train, y_train,
          cat_features=cat_features,
          eval_set=(X_validation, y_validation),
          verbose=False,
          plot=True)

## Model comparison

In [None]:
# 7.0 Compare model in one graph
#     First store necessary data in files:
#     All data is getting stored in files.
#     Nothing is displayed (stdout)

# 7.1 Define Ist model:

##======== Defining====
# The two models differ only in learning rate and in the folder (train_dir)
# that receives their training data.
model1 = CatBoostClassifier(learning_rate=0.7, iterations=100, random_seed=0,
                            train_dir='learing_rate_0.7')    # folder created in current dir

# 7.2 Second model
model2 = CatBoostClassifier(learning_rate=0.01, iterations=100, random_seed=0,
                            train_dir='learing_rate_0.01')   # another folder in current dir

#=======Fitting ==========

# Both fits share the same validation set, categorical columns and silent logging
shared_fit_options = dict(
    eval_set=(X_validation, y_validation),
    cat_features=cat_features,
    verbose=False,
)

# 7.3 Fit first model
model1.fit(X_train, y_train, **shared_fit_options)


# 7.4 Fit second model
model2.fit(X_train, y_train, **shared_fit_options)

In [None]:
# 7.5 Some useful results
#     https://catboost.ai/en/docs/concepts/python-reference_catboostclassifier#attributes

# NOTE(review): `model` here is the classifier fitted in step 6, not model1 or
# model2 from the comparison cell just above. Confirm this is the intended object.
model.best_score_
model.best_iteration_
model.tree_count_    # How many trees were constructed

In [None]:
# 7.5 Read data from directories listed in the list:

MetricVisualizer(['learing_rate_0.01',   # Ist  folder
                  'learing_rate_0.7']    # IInd folder
                ).start()

## Cross-validation
Refer [here](https://catboost.ai/en/docs/features/cross-validation) and [here](https://catboost.ai/en/docs/concepts/python-reference_cv)

Training can be launched in cross-validation mode. In this case, only the training dataset is required. This dataset is split, and the resulting folds are used as the learning and evaluation datasets. If the input dataset contains the GroupId column, all objects from one group are added to the same fold.

Each cross-validation run from the command-line interface launches one training out of N trainings in N-fold cross-validation. Use the cv function of the Python package instead of the command-line version. It returns aggregated results out-of-the-box.

Description

Plot the following information during training:

    the metric values per fold;
    the custom loss values, if any;
    the loss function change during feature selection;
    the time that has passed since training started;
    the remaining time until the end of training.
    This option can be used if training is performed in Jupyter notebook.


In [None]:
# 8.0
# Specify the parameters explicitly:
params = {
    'loss_function': 'Logloss',
    'iterations': 80,
    'custom_loss': 'AUC',
    'random_seed': 63,
    'learning_rate': 0.5,
}

# 8.1 Perform cross-validation
cv_pool = Pool(X, label=y, cat_features=cat_features)   # cv takes a Pool, not numpy arrays
cv_data = cv(
    params=params,
    pool=cv_pool,
    fold_count=5,
    shuffle=True,
    partition_random_seed=0,
    type='Classical',
    plot=True,
    stratified=False,
    verbose=False,
)

In [None]:
# 8.2 See Ist five rows of result:
cv_data.head()

In [None]:
# 8.3 Best values
# Lowest mean test Logloss, the iteration where it occurs, and the
# standard deviation across folds at that iteration.
test_logloss_mean = cv_data['test-Logloss-mean']
best_value = np.min(test_logloss_mean)
best_iter = np.argmin(test_logloss_mean)
stdDev = cv_data['test-Logloss-std'][best_iter]

print(best_value, best_iter, stdDev, sep='\n')



Type of Cross-validation:<br>
[Possible values](https://catboost.ai/en/docs/concepts/python-reference_cv#type):

    Classical — The dataset is split into fold_count folds, fold_count trainings are performed. Each test set consists of a single fold, and the corresponding train set consists of the remaining k–1 folds.

    Inverted — The dataset is split into fold_count folds, fold_count trainings are performed. Each test set consists of the first k–1 folds, and the corresponding train set consists of the remaining fold.

    TimeSeries — The dataset is split into (fold_count + 1) consecutive parts without shuffling the data, fold_count trainings are performed. The k-th train set consists of the first k folds, and the corresponding test set consists of the (k+1)-th fold.
    
Default is Classical    


## Overfitting detector
Use early_stopping_rounds

In [None]:
# 9.0
# Up to 200 iterations, with early_stopping_rounds=20 as the overfitting detector
early_stop_settings = dict(
    iterations=200,
    random_seed=63,
    learning_rate=0.5,
    early_stopping_rounds=20,
)
model_with_early_stop = CatBoostClassifier(**early_stop_settings)

# 9.1 Fit against the validation set with the training chart enabled
model_with_early_stop.fit(X_train, y_train,
                          cat_features=cat_features,
                          eval_set=(X_validation, y_validation),
                          verbose=False,
                          plot=True)

In [None]:
# 9.2 How many trees were constructed?
print(model_with_early_stop.tree_count_)

In [None]:
# 10.0 Change eval
# Same settings as the previous early-stopping model, plus eval_metric='AUC'.
# List of supported metrics is here:
# https://catboost.ai/en/docs/references/custom-metric__supported-metrics
auc_early_stop_settings = dict(
    eval_metric='AUC',
    iterations=200,
    random_seed=63,
    learning_rate=0.5,
    early_stopping_rounds=20,
)
model_with_early_stop = CatBoostClassifier(**auc_early_stop_settings)

# 10.1 Text logging on this time; plotting left disabled
model_with_early_stop.fit(X_train, y_train,
                          cat_features=cat_features,
                          eval_set=(X_validation, y_validation),
                          verbose=True,
                          #plot=True
                          )

In [None]:
# 10.2 How many trees?
print(model_with_early_stop.tree_count_)

![](https://habrastorage.org/webt/y4/1q/yq/y41qyqfm9mcerp2ziys48phpjia.png)

In [None]:
# 11.0 ROC curve
from catboost.utils import get_roc_curve
# Import auc explicitly: a bare `import sklearn` does not reliably expose the
# `sklearn.metrics` submodule as an attribute on every scikit-learn version.
from sklearn.metrics import auc

# ROC points of `model` on the held-out validation rows
eval_pool = Pool(X_validation, y_validation, cat_features=cat_features)
curve = get_roc_curve(model, eval_pool)
(fpr, tpr, thresholds) = curve
roc_auc = auc(fpr, tpr)      # area under the (fpr, tpr) curve

In [None]:
# 11.1 

import matplotlib.pyplot as plt

# Draw the ROC curve from the fpr/tpr arrays and roc_auc computed in step 11.0
plt.figure(figsize=(16, 8))
lw = 2   # line width shared by both lines

# Model curve, with the area under the curve shown in the legend label
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc, alpha=0.5)

# Dashed diagonal from (0, 0) to (1, 1) as a visual reference
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--', alpha=0.5)

# Axis ranges, tick size, grid, labels, title and legend
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.grid(True)
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.title('Receiver operating characteristic', fontsize=20)
plt.legend(loc="lower right", fontsize=16)
plt.show()

In [None]:
# 12.0
from catboost.utils import get_fpr_curve
from catboost.utils import get_fnr_curve
# NOTE: both calls rebind `thresholds`, and the first also rebinds `fpr`,
# replacing the values unpacked from `curve` in step 11.0. Re-running the ROC
# plot cell after this one would use these rebound values.
(thresholds, fpr) = get_fpr_curve(curve=curve)
(thresholds, fnr) = get_fnr_curve(curve=curve)

In [None]:
# 12.1
# Plot FPR and FNR against the threshold, using the arrays from step 12.0
plt.figure(figsize=(16, 8))
lw = 2   # line width shared by both curves

plt.plot(thresholds, fpr, color='blue', lw=lw, label='FPR', alpha=0.5)
plt.plot(thresholds, fnr, color='green', lw=lw, label='FNR', alpha=0.5)

# Axis ranges, tick size, grid, labels, title and legend
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.grid(True)
plt.xlabel('Threshold', fontsize=16)
plt.ylabel('Error Rate', fontsize=16)
plt.title('FPR-FNR curves', fontsize=20)
plt.legend(loc="lower left", fontsize=16)
plt.show()

In [None]:
# 13.0
from catboost.utils import select_threshold

print(select_threshold(model=model, data=eval_pool, FNR=0.01))
print(select_threshold(model=model, data=eval_pool, FPR=0.01))

## Model predictions

In [None]:
# 14.0 Make predictions:

print(model.predict_proba(X=X_validation))

In [None]:
# 14.1
print(model.predict(data=X_validation))

## Metric evaluation on a new dataset

In [None]:
# 15.0
# Rebinds `model`: the following cells (eval_metrics, feature importance,
# SHAP values) operate on this instance, not on the earlier classifiers.
model = CatBoostClassifier(
    random_seed=63,
    iterations=200,
    learning_rate=0.03,
)


# Fitted on the training split only; no eval_set is passed here.
model.fit(
    X_train, y_train,
    cat_features=cat_features,
    verbose=50
)

In [None]:
# 15.1
# NOTE: this assignment rebinds the notebook-level name `metrics`, which the
# import cell bound to catboost's `metrics` module. After this cell runs,
# `metrics` holds the eval_metrics result (indexed by metric name in step 15.2).
# `pool1` wraps the full X/y, so it includes the rows used to fit `model`.
metrics = model.eval_metrics(
    data=pool1,
    metrics=['Logloss','AUC'],
    ntree_start=0,
    ntree_end=0,
    eval_period=1,
    plot=True
)

In [None]:
# 15.2
print('AUC values:')
print(np.array(metrics['AUC']))


## Feature importances

In [None]:
# 15.3 Feature importances are in decreasing order
model.get_feature_importance(prettified=True)

## Shap values
Get feature importance observation by observation

In [None]:
# 16.0 Get feature importance 
shap_values = model.get_feature_importance(pool1, type='ShapValues')
shap_values.shape  # (32769, 10)

In [None]:
# 16.1 For the Ist two-observations impt of respective features.
#      The last number is the overall score. It indicates how strongly
#      +ve or -ve class is (sort of probability):

shap_values[:2,:]

In [None]:
# 16.2 The last dimension: 
expected_value = shap_values[0,-1]
expected_value

In [None]:
# 16.3 Keep only the per-feature SHAP columns (one per column of X), dropping
#      the trailing expected-value column. Slicing to X.shape[1] instead of
#      [:, :-1] makes the cell safe to re-run: a second run leaves the array
#      unchanged rather than removing another column.
shap_values = shap_values[:, :X.shape[1]]

In [None]:
# 16.4
shap_values.shape
shap_values

In [None]:
# 16.5
import shap

shap.initjs()
shap.force_plot(expected_value, shap_values[2,:], X.iloc[2,:])
X.iloc[2,:]
y[2]

In [None]:
# 16.6
import shap

shap.initjs()
shap.force_plot(expected_value, shap_values[91,:], X.iloc[91,:])
y[91]

In [None]:
# 16.7
shap.summary_plot(shap_values, X)

In [None]:
# 16.8 Sample by sample, effect of a particular feature: Look Vertically
# Y-axis: select ROLE_TITLE effects
# X-axis: select sample order by similarity or original sample ordering
# Limit X-axis to 100 samples
X_small = X.iloc[0:100]          # first 100 feature rows
shap_small = shap_values[:100]   # SHAP rows for the same 100 samples
shap.force_plot(expected_value, shap_small, X_small)

## Feature evaluation

In [None]:
# 17.0 Not clear
# Explicit import instead of a star import, so the origin of the name is
# visible and no unrelated names are pulled into the notebook namespace.
from catboost.eval.catboost_evaluation import CatboostEvaluation

# Training settings used for each evaluation run
learn_params = {'iterations': 20, # 2000
                'learning_rate': 0.5, # we set big learning_rate,
                                      # because we have small
                                      # #iterations
                'random_seed': 0,
                'verbose': False,
                'loss_function' : 'Logloss',
                'boosting_type': 'Plain'}

# The evaluator reads the tab-separated export and its column description
evaluator = CatboostEvaluation('amazon/train.tsv',
                               fold_size=10000, # <= 50% of dataset
                               fold_count=20,
                               column_description='amazon/train.cd',
                               partition_random_seed=0,
                               #working_dir=... 
)

# Evaluate features 6, 7 and 8 using Logloss and Accuracy
result = evaluator.eval_features(learn_config=learn_params,
                                 eval_metrics=['Logloss', 'Accuracy'],
                                 features_to_eval=[6, 7, 8])

In [None]:
# Explicit import of the two names this cell uses, instead of a star import
from catboost.eval.evaluation_result import ScoreConfig, ScoreType

# Compare the Logloss results against the baseline, using the relative score type
logloss_result = result.get_metric_results('Logloss')
logloss_result.get_baseline_comparison(
    ScoreConfig(ScoreType.Rel, overfit_iterations_info=False)
)

## Saving the model

In [None]:
# 18.0
# Fit your model with best parameters, run it and then save it:

# 18.1 Fit model with best parameters
my_best_model = CatBoostClassifier(iterations=10)

# 18.2 Train it, evaluating on the validation split, without text logging
validation_pair = (X_validation, y_validation)
my_best_model.fit(X_train, y_train,
                  eval_set=validation_pair,
                  cat_features=cat_features,
                  verbose=False)

# 18.3 Save it twice: once with the default format, once as JSON
my_best_model.save_model('catboost_model.bin')
my_best_model.save_model('catboost_model.json', format='json')

In [None]:
# 18.4 Load the saved model
my_best_model.load_model('catboost_model.bin')
print(my_best_model.get_params())
print(my_best_model.random_seed_)

## Hyperparameter tuning

### Training speed

In [None]:
# 19.0
import hyperopt

def hyperopt_objective(params):
    """Objective for hyperopt: cross-validate one sampled configuration.

    Parameters
    ----------
    params : dict
        One sample from the search space. The keys read here are
        'l2_leaf_reg', 'learning_rate', 'grow_policy', 'depth' and
        'border_count'.

    Returns
    -------
    float
        ``1 - best_accuracy``, where ``best_accuracy`` is the maximum of the
        'test-Accuracy-mean' column returned by ``cv``. hyperopt minimises,
        so a higher accuracy gives a lower value.
    """
    # The metric and loss are given by name rather than as metrics.Accuracy()
    # / metrics.Logloss(): an earlier cell rebinds the notebook-level name
    # `metrics` to an eval_metrics result, which made those attribute lookups
    # fail when the notebook is run top to bottom.
    model = CatBoostClassifier(
        l2_leaf_reg=params['l2_leaf_reg'],
        learning_rate=params['learning_rate'],
        grow_policy=params['grow_policy'],
        depth=int(params['depth']),                 # An integer depth is expected
        border_count=int(params['border_count']),   # If you remove int(), one gets an error
        iterations=8,
        eval_metric='Accuracy',
        random_seed=42,
        verbose=False,
        loss_function='Logloss',
    )

    # Cross-validate on the full data using the parameters held by the model
    cv_data = cv(
        Pool(X, y, cat_features=cat_features),
        model.get_params(),
        logging_level='Silent',
    )

    best_accuracy = np.max(cv_data['test-Accuracy-mean'])

    return 1 - best_accuracy  # as hyperopt minimises

In [None]:
# Search space sampled by hyperopt; the keys match those read in hyperopt_objective
params_space = {
    'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
    'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),  # Any value between the two numbers
    'depth' : hyperopt.hp.quniform('depth', 4,10,1) ,  # Depth should be in steps of 1, not in decimals
    'border_count' : hyperopt.hp.uniform('border_count', 32,255),
    'grow_policy'  : hyperopt.hp.choice('grow_policy', ['Depthwise','SymmetricTree']) # fmin reports the chosen option's index (0 or 1)
                                                                                     # Interchange the two options to see results
}

# Collects the result of every evaluation
trials = hyperopt.Trials()

best = hyperopt.fmin(
                      hyperopt_objective,
                      space=params_space,
                      algo=hyperopt.tpe.suggest,
                      max_evals=8,
                      trials=trials,
                      verbose = True
                    )

# Raw result: hp.choice parameters appear as the index of the chosen option
print(best)

# Same result decoded back into actual parameter values (e.g. the grow_policy name)
print(hyperopt.space_eval(params_space, best))

## Calculate predictions for the contest

In [None]:
# Features of the contest test set: every column except the row identifier
X_test = test_df.drop('id', axis=1)
test_pool = Pool(data=X_test, cat_features=cat_features)

# `best_model` was never defined in this notebook; the model that was trained
# and saved in step 18 is `my_best_model`, so that one is used here.
contest_predictions = my_best_model.predict_proba(test_pool)
print('Predictoins:')
print(contest_predictions)

## Prepare the submission

In [None]:
# Write one "id,probability" line per test row. The context manager closes the
# file even if a write fails part-way through.
with open('submit.csv', 'w') as f:
    f.write('Id,Action\n')
    for row_id, class_probabilities in zip(test_df['id'], contest_predictions):
        # column 1 of predict_proba output is written as the Action value
        f.write(str(row_id) + ',' + str(class_probabilities[1]) + '\n')

Submit your solution [here](https://www.kaggle.com/c/amazon-employee-access-challenge/submit).
Good luck!!!

In [None]:
# The notebook-level `cv_data` (step 8) has no 'test-Accuracy-mean' column;
# the accuracy-based cross-validation lives inside hyperopt_objective, which
# returns 1 - best_accuracy as the loss. The best accuracy found by the search
# is therefore 1 minus the lowest loss recorded in `trials`.
best_search_loss = trials.best_trial['result']['loss']
print('Precise validation accuracy score: {}'.format(1 - best_search_loss))