# Installing Packages and Prerequisites

In [None]:
%pip install numpy
%pip install joblib
%pip install shap
%pip install scikit-learn

In [None]:
import numpy as np;
import constants;
import os;
from fileOrganizer import unpack;
import random;
import shap;
from explainer import explainer;

  from .autonotebook import tqdm as notebook_tqdm


# Model selection and Applying SHAP

In Assignment 1 (see the notebook `CSI5155 Assignment 1 Evaluation Part - Kelvin Mock 300453668.ipynb` and the report `CSI5155 Assignment 1 Report.pdf`), we compared the Area Under the Curve (AUC) of 6 models. We also concluded in the report that the AUC is a good metric for inferring the overall accuracy of a model. For simplicity, we only consider the **original** models, i.e. those trained without any sampling technique applied. We reached the following conclusions:
- Based on the Chocolate dataset, the best classifier (with the largest AUC) is the Decision Tree classifier; and, 
- Based on the Magic Mushroom dataset, the best classifier (with the largest AUC) is the Multi-Layer Perceptron (MLP) classifier,

whereas,

- Based on the Chocolate dataset, the worst classifier (with the lowest AUC) is the Support Vector Machine (SVM) classifier; and, 
- Based on the Magic Mushroom dataset, the worst classifier (with the lowest AUC) is also the SVM classifier. 

## Loading the Models from files

In [None]:
# Load the four post-trained Assignment 1 models (best / worst per dataset).
# NOTE(review): `unpack` presumably deserializes a pickled/joblib file - only load trusted files.
_asm1_root = constants.ASM1_DIR
_asm1_files = constants.filepaths

# Chocolate - best: Decision Tree, worst: SVM
choco_bestModel = unpack(os.path.join(_asm1_root, _asm1_files["choc_posttrained_decisionTree"]))
choco_worstModel = unpack(os.path.join(_asm1_root, _asm1_files["choc_posttrained_SVC"]))

# Magic Mushroom - best: MLP, worst: SVM
mush_bestModel = unpack(os.path.join(_asm1_root, _asm1_files["mushrooms_posttrained_MLP"]))
mush_worstModel = unpack(os.path.join(_asm1_root, _asm1_files["mushrooms_posttrained_SVC"]))

### Inspecting the Imported Models

In [None]:
# Display the Chocolate best-model object (Decision Tree) exactly as it was unpacked from file.
choco_bestModel

Since the Decision Tree classifier has been optimized by RandomizedSearchCV, we need to extract the estimator. 

In [None]:
# Unwrap the fitted estimator from the hyper-parameter search object.
# getattr with a fallback keeps this cell idempotent: on a re-run the name already
# holds the bare estimator, which has no `best_estimator_` attribute.
choco_bestModel = getattr(choco_bestModel, "best_estimator_", choco_bestModel)
choco_bestModel

In [None]:
# Display the Chocolate worst-model object (SVM) exactly as it was unpacked from file.
choco_worstModel

Since the SVM classifier has been optimized by RandomizedSearchCV, we need to extract the estimator. 

In [None]:
# Unwrap the fitted estimator from the hyper-parameter search object.
# getattr with a fallback keeps this cell idempotent: on a re-run the name already
# holds the bare estimator, which has no `best_estimator_` attribute.
choco_worstModel = getattr(choco_worstModel, "best_estimator_", choco_worstModel)
choco_worstModel

In [None]:
# Display the Magic Mushroom best-model object (MLP) exactly as it was unpacked from file.
mush_bestModel

In [None]:
# Display the Magic Mushroom worst-model object (SVM) exactly as it was unpacked from file.
mush_worstModel

Since the SVM classifier has been optimized by RandomizedSearchCV, we need to extract the estimator. 

In [None]:
# Unwrap the fitted estimator from the hyper-parameter search object.
# getattr with a fallback keeps this cell idempotent: on a re-run the name already
# holds the bare estimator, which has no `best_estimator_` attribute.
mush_worstModel = getattr(mush_worstModel, "best_estimator_", mush_worstModel)
mush_worstModel

## Loading the Datasets from files

In [None]:
def _unpack_asm1(key):
    """Unpack the Assignment 1 artifact registered under `key` in `constants.filepaths`."""
    return unpack(os.path.join(constants.ASM1_DIR, constants.filepaths[key]))


# Training sets (samples, labels)
choco_X_train, choco_y_train = map(
    _unpack_asm1, ("choc_train-set_samples", "choc_train-set_labels")
)
mush_X_train, mush_y_train = map(
    _unpack_asm1, ("mushrooms_train-set_samples", "mushrooms_train-set_labels")
)

# Test sets (samples, labels)
choco_X_test, choco_y_test = map(
    _unpack_asm1, ("choc_test-set_samples", "choc_test-set_labels")
)
mush_X_test, mush_y_test = map(
    _unpack_asm1, ("mushrooms_test-set_samples", "mushrooms_test-set_labels")
)

### Inspecting the Imported Data

In [None]:
def describe_split(dataset, split, X, y):
    """Print the sizes, feature count and label distribution of one data split.

    Parameters
    ----------
    dataset : str
        Human-readable dataset name used in the messages (e.g. "Chocolate").
    split : str
        Split name in lower case as it should read in a sentence ("training" or "test").
    X : sequence of samples
        Indexable collection of samples; each sample must support `len()`.
    y : array-like
        Labels aligned with `X`.
    """
    print(f"-----{dataset} Dataset {split.capitalize()} Set-----")
    print(f"Size of the samples array in the {split} set from {dataset} dataset: {len(X)}")
    print(f"Size of the labels array in the {split} set from {dataset} dataset: {len(y)}")
    # Every sample is expected to have the same width, so the first one is enough
    # (and keeps the output deterministic instead of probing a random row).
    print(f"Number of features in a sample in the {split} set from {dataset} dataset: {len(X[0])}")
    labels, label_counts = np.unique(y, return_counts=True)
    for label, count in zip(labels, label_counts):
        print(f"Label '{label}' has: {count} samples.")


# One entry per split, in display order. The Mushroom training set was previously
# mislabelled as "test set" in its messages; the shared helper fixes that.
_splits_to_describe = [
    ("Chocolate", "training", choco_X_train, choco_y_train),
    ("Mushroom", "training", mush_X_train, mush_y_train),
    ("Chocolate", "test", choco_X_test, choco_y_test),
    ("Mushroom", "test", mush_X_test, mush_y_test),
]

for position, (dataset, split, X, y) in enumerate(_splits_to_describe):
    if position:
        print()  # blank line between sections, none after the last
    describe_split(dataset, split, X, y)

-----Chocolate Dataset Training Set-----
Size of the samples array in the training set from Chocolate dataset: 1256
Size of the labels array in the training set from Chocolate dataset: 1256
Number of features in a sample in the training set from Chocolate dataset: 13
Label 'non-user' has: 27 samples.
Label 'user' has: 1229 samples.

-----Mushroom Dataset Training Set-----
Size of the samples array in the test set from Mushroom dataset: 1256
Size of the labels array in the test set from Mushroom dataset: 1256
Number of features in a sample in the test set from Mushroom dataset: 13
Label 'non-user' has: 805 samples.
Label 'user' has: 451 samples.

-----Chocolate Dataset Test Set-----
Size of the samples array in the test set from Chocolate dataset: 629
Size of the labels array in the test set from Chocolate dataset: 629
Number of features in a sample in the test set from Chocolate dataset: 13
Label 'non-user' has: 8 samples.
Label 'user' has: 621 samples.

-----Mushroom Dataset Test Set-

## Apply SHAP Method

A SHAP value quantifies the impact of each feature on a model's prediction for a given sample.

### Instantiating Explainers

In [None]:
# Build the explainer for the Chocolate best model (Decision Tree) and display it.
treeexmplainer_choco = explainer(
    modelType="tree",
    X_train=choco_X_train,
    model=choco_bestModel,
)
treeexmplainer_choco

Start instantiating an explainer.
A tree explainer is instantiated successfully.


<explainer.explainer at 0x2b2c6c209a0>

In [None]:
# A kernel explainer's cost grows with the number of background rows. With the full
# 1256-row training set SHAP warns about slow run times, so summarize the background
# with a fixed-seed subsample, as that warning recommends.
# NOTE(review): assumes the project `explainer` wrapper forwards `X_train` to SHAP as
# the background data (the warning's row count matches the training set) - confirm.
choco_background = shap.sample(choco_X_train, 100, random_state=0)

kernelExplainer_choco = explainer(
    model=choco_worstModel,  # SVM
    X_train=choco_background,
    modelType="svm"
)
kernelExplainer_choco


Using 1256 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


Start instantiating an explainer.
A kernel explainer is instantiated successfully.


<explainer.explainer at 0x2b2c6c235b0>

In [None]:
# A kernel explainer's cost grows with the number of background rows. With the full
# 1256-row training set SHAP warns about slow run times, so summarize the background
# with a fixed-seed subsample, as that warning recommends.
# NOTE(review): assumes the project `explainer` wrapper forwards `X_train` to SHAP as
# the background data (the warning's row count matches the training set) - confirm.
mush_background = shap.sample(mush_X_train, 100, random_state=0)

kernelExplainer_mush_best = explainer(
    model=mush_bestModel,  # MLP
    X_train=mush_background,
    modelType="mlp"
)
kernelExplainer_mush_best

Start instantiating an explainer.


Using 1256 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


A kernel explainer is instantiated successfully.


<explainer.explainer at 0x2b2c6c233d0>

In [None]:
# A kernel explainer's cost grows with the number of background rows. With the full
# 1256-row training set SHAP warns about slow run times, so summarize the background
# with a fixed-seed subsample, as that warning recommends.
# NOTE(review): assumes the project `explainer` wrapper forwards `X_train` to SHAP as
# the background data (the warning's row count matches the training set) - confirm.
mush_background = shap.sample(mush_X_train, 100, random_state=0)

kernelExplainer_mush_worst = explainer(
    model=mush_worstModel,  # SVM
    X_train=mush_background,
    modelType="svm"
)
kernelExplainer_mush_worst

Using 1256 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


Start instantiating an explainer.
A kernel explainer is instantiated successfully.


<explainer.explainer at 0x2b2c6c22fb0>

### Calculating SHAP values

In [None]:
# Decision Tree
# Decision Tree: SHAP values for every sample of the Chocolate test set.
SHAP_choco_best = treeexmplainer_choco.explain(X_test=choco_X_test)
print(f"Shape of the SHAP values set results: {SHAP_choco_best.shape}")

# Spot-check one randomly chosen sample.
spot_check_index = random.randrange(len(SHAP_choco_best))
print(f"The SHAP value for a sample: \n{SHAP_choco_best[spot_check_index]}")

A tree explainer is found.
Shape of the SHAP values set results: (629, 13, 2)
The SHAP value for a sample: 
[[ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.00145422 -0.00145422]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]]


In [None]:
# SVM
# SVM: SHAP values for every sample of the Chocolate test set.
SHAP_choco_worst = kernelExplainer_choco.explain(X_test=choco_X_test)
print(f"Shape of the SHAP values set results: {SHAP_choco_worst.shape}")

# Spot-check one randomly chosen sample.
spot_check_index = random.randrange(len(SHAP_choco_worst))
print(f"The SHAP value for a sample: \n{SHAP_choco_worst[spot_check_index]}")

A kernel explainer is found


  0%|          | 0/629 [00:16<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# MLP
# MLP: SHAP values for the Magic Mushroom test set.
# The mushroom model must be explained on the mushroom test samples; the previous
# version passed the Chocolate test set here by mistake.
SHAP_mush_best = kernelExplainer_mush_best.explain(
    X_test=mush_X_test
)
print(f"Shape of the SHAP values set results: {SHAP_mush_best.shape}")
# Spot-check one randomly chosen sample.
print(f"The SHAP value for a sample: \n{SHAP_mush_best[random.randint(0, len(SHAP_mush_best)-1)]}")

In [None]:
# SVM
# SVM: SHAP values for the Magic Mushroom test set.
# The mushroom model must be explained on the mushroom test samples; the previous
# version passed the Chocolate test set here by mistake.
SHAP_mush_worst = kernelExplainer_mush_worst.explain(
    X_test=mush_X_test
)
print(f"Shape of the SHAP values set results: {SHAP_mush_worst.shape}")
# Spot-check one randomly chosen sample.
print(f"The SHAP value for a sample: \n{SHAP_mush_worst[random.randint(0, len(SHAP_mush_worst)-1)]}")

### Store the SHAP values after calculation

In [None]:
# Persist each SHAP result next to the notebook so it can be reloaded without recomputation.
_shap_outputs = (
    ("SHAP_choco_best.npy", SHAP_choco_best),
    ("SHAP_choco_worst.npy", SHAP_choco_worst),
    ("SHAP_mush_best.npy", SHAP_mush_best),
    ("SHAP_mush_worst.npy", SHAP_mush_worst),
)
for npy_path, shap_values in _shap_outputs:
    np.save(npy_path, shap_values)