From 8d8b0d2ab4e5167deba1c61d47a622781004143e Mon Sep 17 00:00:00 2001 From: Joris Clement Date: Wed, 12 May 2021 16:12:53 +0200 Subject: [PATCH 1/7] Add general feature importance estimation --- poetry.lock | 48 +++++++++++++++++++++++++++++++++++++++++- pyproject.toml | 1 + src/thesis/classify.py | 12 ++++++----- 3 files changed, 55 insertions(+), 6 deletions(-) diff --git a/poetry.lock b/poetry.lock index 05bd6d58..9bb12ee0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1003,6 +1003,31 @@ numpy = ">=1.15" pandas = ">=0.23" scipy = ">=1.0" +[[package]] +name = "shap" +version = "0.39.0" +description = "A unified approach to explain the output of any machine learning model." +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +cloudpickle = "*" +numba = "*" +numpy = "*" +pandas = "*" +scikit-learn = "*" +scipy = "*" +slicer = "0.0.7" +tqdm = ">4.25.0" + +[package.extras] +all = ["pyod", "pytest", "sphinx-rtd-theme", "sentencepiece", "transformers", "lightgbm", "torch", "numpydoc", "nbsphinx", "opencv-python", "ipython", "pytest-mpl", "matplotlib", "xgboost", "catboost", "sphinx", "pytest-cov", "pyspark", "lime"] +docs = ["matplotlib", "ipython", "numpydoc", "sphinx-rtd-theme", "sphinx", "nbsphinx"] +others = ["lime"] +plots = ["matplotlib", "ipython"] +test = ["pytest", "pytest-mpl", "pytest-cov", "xgboost", "lightgbm", "catboost", "pyspark", "pyod", "transformers", "torch", "sentencepiece", "opencv-python"] + [[package]] name = "six" version = "1.15.0" @@ -1011,6 +1036,14 @@ category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +[[package]] +name = "slicer" +version = "0.0.7" +description = "A small package for big slicing." +category = "main" +optional = false +python-versions = ">=3.6" + [[package]] name = "sortedcontainers" version = "2.3.0" @@ -1283,7 +1316,7 @@ heapdict = "*" [metadata] lock-version = "1.1" python-versions = "^3.8" -content-hash = "a50261bab581ed0cbe19aa08fb97bae8fe987573ac88ec23c8093c12377217b4" +content-hash = "72ff87359639478a079fa8b76b86856e4acd7f6412eabcba3ebb3675b93a7c43" [metadata.files] absl-py = [ @@ -1349,6 +1382,9 @@ coverage = [ {file = "coverage-5.5-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:8963a499849a1fc54b35b1c9f162f4108017b2e6db2c46c1bed93a72262ed083"}, {file = "coverage-5.5-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:869a64f53488f40fa5b5b9dcb9e9b2962a66a87dab37790f3fcfb5144b996ef5"}, {file = "coverage-5.5-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:4a7697d8cb0f27399b0e393c0b90f0f1e40c82023ea4d45d22bce7032a5d7b81"}, + {file = "coverage-5.5-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:8d0a0725ad7c1a0bcd8d1b437e191107d457e2ec1084b9f190630a4fb1af78e6"}, + {file = "coverage-5.5-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:51cb9476a3987c8967ebab3f0fe144819781fca264f57f89760037a2ea191cb0"}, + {file = "coverage-5.5-cp310-cp310-win_amd64.whl", hash = "sha256:c0891a6a97b09c1f3e073a890514d5012eb256845c451bd48f7968ef939bf4ae"}, {file = "coverage-5.5-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:3487286bc29a5aa4b93a072e9592f22254291ce96a9fbc5251f566b6b7343cdb"}, {file = "coverage-5.5-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:deee1077aae10d8fa88cb02c845cfba9b62c55e1183f52f6ae6a2df6a2187160"}, {file = "coverage-5.5-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:f11642dddbb0253cc8853254301b51390ba0081750a8ac03f20ea8103f0c56b6"}, @@ -2148,10 +2184,20 @@ seaborn = [ {file = "seaborn-0.11.1-py3-none-any.whl", hash = 
"sha256:4e1cce9489449a1c6ff3c567f2113cdb41122f727e27a984950d004a88ef3c5c"}, {file = "seaborn-0.11.1.tar.gz", hash = "sha256:44e78eaed937c5a87fc7a892c329a7cc091060b67ebd1d0d306b446a74ba01ad"}, ] +shap = [ + {file = "shap-0.39.0-cp36-cp36m-win_amd64.whl", hash = "sha256:bf9af9b089ef95cb1ac0df80a43f8144aa9095d10f282cb5c19643ff88a6a79d"}, + {file = "shap-0.39.0-cp37-cp37m-win_amd64.whl", hash = "sha256:b44f9fbb7349f5406b98b4ec24c672f8fe932606bb7574a8aae2238410c55289"}, + {file = "shap-0.39.0-cp38-cp38-win_amd64.whl", hash = "sha256:c0d51b44c15eae1c12e51ed498f898cfc5e12d6be7e0d4f733ce6453f6ec85a4"}, + {file = "shap-0.39.0.tar.gz", hash = "sha256:0196a6c12cc98f8b48ce9c5968550902432b80290da6fa7be8655441a1c6251a"}, +] six = [ {file = "six-1.15.0-py2.py3-none-any.whl", hash = "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"}, {file = "six-1.15.0.tar.gz", hash = "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259"}, ] +slicer = [ + {file = "slicer-0.0.7-py3-none-any.whl", hash = "sha256:0b94faa5251c0f23782c03f7b7eedda91d80144059645f452c4bc80fab875976"}, + {file = "slicer-0.0.7.tar.gz", hash = "sha256:f5d5f7b45f98d155b9c0ba6554fa9770c6b26d5793a3e77a1030fb56910ebeec"}, +] sortedcontainers = [ {file = "sortedcontainers-2.3.0-py2.py3-none-any.whl", hash = "sha256:37257a32add0a3ee490bb170b599e93095eed89a55da91fa9f48753ea12fd73f"}, {file = "sortedcontainers-2.3.0.tar.gz", hash = "sha256:59cc937650cf60d677c16775597c89a960658a09cf7c1a668f86e1e4464b10a1"}, diff --git a/pyproject.toml b/pyproject.toml index 640b77a0..5221a929 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ lightgbm = "^3.1.1" pyts = "^0.11.0" imbalanced-learn = "^0.8.0" dtaidistance = "^2.2.5" +shap = "^0.39.0" [tool.poetry.dev-dependencies] pytest = "^6.1.0" diff --git a/src/thesis/classify.py b/src/thesis/classify.py index 7b99c87e..c7b2c07a 100644 --- a/src/thesis/classify.py +++ b/src/thesis/classify.py @@ -13,6 +13,7 @@ from lightgbm import LGBMClassifier import numpy as np import pandas as pd +import shap from sklearn import metrics from sklearn.base import BaseEstimator from sklearn.metrics import ( @@ -505,11 +506,12 @@ def _save_models( X = self.get_X(model_name) self._train(pipeline, X, range(0, len(X)), range(0), False) - if isinstance(get_classifier(pipeline), LGBMClassifier): - lightgbm.plot_importance(get_classifier(pipeline)) - util.finish_plot( - "feature_importance", model_folder, self.config["general"]["show_plots"] - ) + explainer = shap.Explainer(model) + shap_values = explainer(X) + shap.plots.bar(shap_values) + util.finish_plot( + "feature_importance", model_folder, self.config["general"]["show_plots"] + ) if is_keras(pipeline): pipeline.named_steps["classifier"].model.save( From c80c2f5d8a1915c5e49ec78c02b253c4bcf5af18 Mon Sep 17 00:00:00 2001 From: Joris Clement Date: Wed, 12 May 2021 18:40:17 +0200 Subject: [PATCH 2/7] Fixup feature importance --- mypy.ini | 3 +++ src/thesis/classify.py | 19 ++++++++++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/mypy.ini b/mypy.ini index 4fa674b4..ee8946bb 100644 --- a/mypy.ini +++ b/mypy.ini @@ -53,3 +53,6 @@ ignore_missing_imports = True [mypy-dtaidistance.*] ignore_missing_imports = True + +[mypy-shap.*] +ignore_missing_imports = True diff --git a/src/thesis/classify.py b/src/thesis/classify.py index c7b2c07a..7c4e791d 100644 --- a/src/thesis/classify.py +++ b/src/thesis/classify.py @@ -506,9 +506,22 @@ def _save_models( X = self.get_X(model_name) self._train(pipeline, X, range(0, len(X)), 
range(0), False)
 
-        explainer = shap.Explainer(model)
-        shap_values = explainer(X)
-        shap.plots.bar(shap_values)
+        explainer = shap.Explainer(
+            pipeline.named_steps["classifier"],
+            feature_names=get_feature_names(pipeline[-2]),
+            output_names=self.defect_names,
+        )
+        X_tr = pd.DataFrame(
+            data=pipeline[:-1].transform(X),
+            index=X.index,
+            columns=get_feature_names(pipeline[-2]),
+        )
+        shap.summary_plot(
+            explainer.shap_values(X_tr),
+            X_tr,
+            class_names=self.defect_names,
+            show=self.config["general"]["show_plots"],
+        )
         util.finish_plot(
             "feature_importance", model_folder, self.config["general"]["show_plots"]
         )

From ceca09bafb3b4ca93f7529ddb202b66288048771 Mon Sep 17 00:00:00 2001
From: Joris Clement
Date: Wed, 12 May 2021 18:41:54 +0200
Subject: [PATCH 3/7] Fix defects and dependent defect_names

---
 src/thesis/classify.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/thesis/classify.py b/src/thesis/classify.py
index 7c4e791d..4796d456 100644
--- a/src/thesis/classify.py
+++ b/src/thesis/classify.py
@@ -193,7 +193,7 @@ def __init__(self, config):
             index=build_index(self.measurements),
             dtype=np.int8,
         )
-        self.defects: Final = sorted(set(self.y))
+        self.defects: Final = [data.Defect(i) for i in sorted(set(self.y))]
         self.defect_names: Final = data.get_names(self.defects)
 
         self.cv_splits: Final = self._generate_cv_splits()

From 845dc6cb4b8a730241534f6f47833a9303e62b69 Mon Sep 17 00:00:00 2001
From: Joris Clement
Date: Thu, 13 May 2021 15:05:11 +0200
Subject: [PATCH 4/7] Show importance of all features

---
 src/thesis/classify.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/thesis/classify.py b/src/thesis/classify.py
index 4796d456..5ced3d24 100644
--- a/src/thesis/classify.py
+++ b/src/thesis/classify.py
@@ -520,6 +520,7 @@ def _save_models(
             explainer.shap_values(X_tr),
             X_tr,
             class_names=self.defect_names,
+            max_display=X.shape[1],
             show=self.config["general"]["show_plots"],
         )
         util.finish_plot(

From c252a0930dae6bc3d3779f453245a0473661dbc4 Mon Sep 17 00:00:00 2001
From: Joris Clement
Date: Thu, 13 May 2021 15:08:38 +0200
Subject: [PATCH 5/7] Improve feature names, align with document

Changes:
- Be more consistent in naming
- Remove a duplicate definition
- Correct a description

Issue #105.
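
Note: the new names use Python unicode escapes so that plot labels
render with Greek letters and the delta sign; checked in a Python
shell (illustration only, not part of this patch):

    >>> f"A-Weib-\u03B1"
    'A-Weib-α'
    >>> "\u0394t"
    'Δt'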
---
 src/thesis/fingerprint.py | 113 +++++++++++++++++++-------------------
 1 file changed, 56 insertions(+), 57 deletions(-)

diff --git a/src/thesis/fingerprint.py b/src/thesis/fingerprint.py
index b4628b82..fa48f64e 100644
--- a/src/thesis/fingerprint.py
+++ b/src/thesis/fingerprint.py
@@ -24,10 +24,10 @@
 from .data import CLASS, get_defects, PATH, PD, TIME_DIFF, VOLTAGE_SIGN
 
-PD_ID = "PD-Value"
-PD_DIFF_ID = "PD-Diff"
-TD_ID = "TimeDiff"
-CORR_ID = "Correlate"
+PD_ID = "A"
+PD_DIFF_ID = "\u0394A"
+TD_ID = "\u0394t"
+CORR_ID = "Corr"
 
 
 class Group(Enum):
@@ -41,31 +41,31 @@ def __str__(self):
 
 # @note: parameter in TU Graz fingerprint
-PD_VAR = f"{PD_ID} Variance"
-PD_SKEW = f"{PD_ID} Skewness"
-PD_KURT = f"{PD_ID} Kurtosis"
-PD_WEIB_A = f"{PD_ID} Weibull A"
-PD_WEIB_B = f"{PD_ID} Weibull B"
+PD_VAR = f"{PD_ID}-Var"
+PD_SKEW = f"{PD_ID}-Skew"
+PD_KURT = f"{PD_ID}-Kurt"
+PD_WEIB_A = f"{PD_ID}-Weib-\u03B1"
+PD_WEIB_B = f"{PD_ID}-Weib-\u03B2"
 
-PD_DIFF_WEIB_B = f"{PD_DIFF_ID} Weibull B"
+PD_DIFF_WEIB_B = f"{PD_DIFF_ID}-Weib-\u03B2"
 
-TD_MAX = f"{TD_ID} Max"
-TD_MEAN = f"{TD_ID} Mean"
-TD_MIN = f"{TD_ID} Min"
-TD_VAR = f"{TD_ID} Variance"
-TD_SKEW = f"{TD_ID} Skewness"
-TD_KURT = f"{TD_ID} Kurtosis"
-TDIFF_NORM_WEIB_A = f"{TD_ID} Sorted Normed Weibull A"
-TDIFF_NORM_WEIB_B = f"{TD_ID} Sorted Normed Weibull B"
+TD_MAX = f"{TD_ID}-Max"
+TD_MEAN = f"{TD_ID}-Mean"
+TD_MIN = f"{TD_ID}-Min"
+TD_VAR = f"{TD_ID}-Var"
+TD_SKEW = f"{TD_ID}-Skew"
+TD_KURT = f"{TD_ID}-Kurt"
+TDIFF_NORM_WEIB_A = f"{TD_ID}-Norm-Weib-\u03B1"
+TDIFF_NORM_WEIB_B = f"{TD_ID}-Norm-Weib-\u03B2"
 
 # @note: parameter in Lukas fingerprint
-PDS_PER_SEC = "Number of PDs/sec"
+PDS_PER_SEC = "PDs/Sec"
 
-PD_MEAN = f"{PD_ID} Mean"
-PD_MAX = f"{PD_ID} Max"
-PD_CV = f"{PD_ID} std/mean"
-PD_SUM = f"{PD_ID} Sum"
+PD_MEAN = f"{PD_ID}-Mean"
+PD_MAX = f"{PD_ID}-Max"
+PD_CV = f"{PD_ID}-Std/Mean"
+PD_SUM = f"{PD_ID}-Sum"
 
 PD_DIFF_MEAN = f"{PD_DIFF_ID} Mean"
 PD_DIFF_SKEW = f"{PD_DIFF_ID} Skewness"
@@ -73,14 +73,13 @@ def __str__(self):
 PD_DIFF_KURT = f"{PD_DIFF_ID} Kurtosis"
 PD_DIFF_WEIB_A = f"{PD_DIFF_ID} Weibull A"
 
-TD_MEDIAN = f"{TD_ID} Median"
+TD_MEDIAN = f"{TD_ID}-Median"
 
-CORR_PD_DIFF_TO_PD_BINS = f"{CORR_ID} {PD_DIFF_ID} - PD Bins"
-CORR_NEXT_PD_TO_PD_BINS = f"{CORR_ID} Next PD - PD Bins"
-CORR_NEXT_PD_TO_PD = f"{CORR_ID} Next PD - PD"
-CORR_PD_DIFF_TO_PD = f"{CORR_ID} {PD_DIFF_ID} - PD"
-CORR_PD_DIFF_TO_TD = f"{CORR_ID} PD - {PD_DIFF_ID}"
-CORR_PD_TO_TD = f"{CORR_ID} PD - {TD_ID}"
+CORR_PD_DIFF_TO_PD_BINS = f"{CORR_ID}-{PD_DIFF_ID}-{PD_ID}-Bins"
+CORR_NEXT_PD_TO_PD_BINS = f"Auto-{CORR_ID}-{PD_ID}-Bins"
+CORR_NEXT_PD_TO_PD = f"Auto-{CORR_ID}-{PD_ID}-1"
+CORR_PD_DIFF_TO_PD = f"{CORR_ID}-{PD_DIFF_ID}-{PD_ID}"
+CORR_PD_TO_TD = f"{CORR_ID}-{PD_ID}-{TD_ID}"
 
 # @note: parameter in own fingerprint
 DURATION = "duration"
@@ -89,37 +88,37 @@ def __str__(self):
 PD_MIN = f"{PD_ID} min"
 PD_MEDIAN = f"{PD_ID} Median"
 PD_STD = f"{PD_ID} std"
-PD_VAR = f"{PD_ID} var"
-PD_NUM_PEAKS_50 = f"{PD_ID} num peaks 50"
-PD_NUM_PEAKS_10 = f"{PD_ID} num peaks 10"
-PD_NUM_PEAKS_5 = f"{PD_ID} num peaks 5"
-PD_NUM_PEAKS_100 = f"{PD_ID} num peaks 100"
+PD_VAR = f"{PD_ID}-Var"
+PD_NUM_PEAKS_50 = f"{PD_ID}-Num-peaks-50"
+PD_NUM_PEAKS_10 = f"{PD_ID}-Num-peaks-10"
+PD_NUM_PEAKS_5 = f"{PD_ID}-Num-peaks-5"
+PD_NUM_PEAKS_100 = f"{PD_ID}-Num-peaks-100"
 PD_RATIO = f"{PD_ID} ratio"
 PD_PERC_REOCCUR = f"{PD_ID} percentage reocurring"
-PD_COUNT_ABOVE_MEAN = f"{PD_ID} count above mean"
-PD_COUNT_BELOW_MEAN = f"{PD_ID} count below mean"
-PD_CHANGE_QUANTILES = f"{PD_ID} ChangeQuantiles"
-PD_NORM_WEIB_A = f"{PD_ID} Weibull normed sorted A"
-PD_NORM_WEIB_B = f"{PD_ID} Weibull normed sorted B"
-
-TD_LONGEST_STRIKE_BELOW_MEAN = f"{TD_ID} longest strike below mean"
-TD_CHANGE_QUANTILES = f"{TD_ID} ChangeQuantiles"
-TD_SUM = f"{TD_ID} Sum"
-
-CORR_2ND_NEXT_PD_TO_PD = f"{CORR_ID} Auto 2nd Next {PD_ID}"
-CORR_3RD_NEXT_PD_TO_PD = f"{CORR_ID} Auto 3rd Next {PD_ID}"
-CORR_5TH_NEXT_PD_TO_PD = f"{CORR_ID} Auto 5th Next {PD_ID}"
-CORR_10TH_NEXT_PD_TO_PD = f"{CORR_ID} Auto 10th Next {PD_ID}"
-
-AUTOCORR_NEXT_TD = f"{CORR_ID} Auto Next {TD_ID}"
-AUTOCORR_2ND_NEXT_TD = f"{CORR_ID} Auto 2nd Next {TD_ID}"
-AUTOCORR_3RD_NEXT_TD = f"{CORR_ID} Auto 3rd Next {TD_ID}"
-AUTOCORR_5TH_NEXT_TD = f"{CORR_ID} Auto 5th Next {TD_ID}"
-AUTOCORR_10TH_NEXT_TD = f"{CORR_ID} Auto 10th Next {TD_ID}"
+PD_COUNT_ABOVE_MEAN = f"{PD_ID}-Num->-mean"
+PD_COUNT_BELOW_MEAN = f"{PD_ID}-Num-<-mean"
+PD_CHANGE_QUANTILES = f"{PD_ID}-change-quantiles"
+PD_NORM_WEIB_A = f"{PD_ID}-norm-Weib-\u03B1"
+PD_NORM_WEIB_B = f"{PD_ID}-norm-Weib-\u03B2"
+TD_LONGEST_STRIKE_BELOW_MEAN = f"{PD_ID}-max-strike-<-mean"
+
+TD_CHANGE_QUANTILES = f"{TD_ID}-Change-quantiles"
+TD_SUM = f"{TD_ID}-Sum"
+
+CORR_2ND_NEXT_PD_TO_PD = f"Auto-{CORR_ID}-{PD_ID}-2"
+CORR_3RD_NEXT_PD_TO_PD = f"Auto-{CORR_ID}-{PD_ID}-3"
+CORR_5TH_NEXT_PD_TO_PD = f"Auto-{CORR_ID}-{PD_ID}-5"
+CORR_10TH_NEXT_PD_TO_PD = f"Auto-{CORR_ID}-{PD_ID}-10"
+
+AUTOCORR_NEXT_TD = f"Auto-{CORR_ID}-{TD_ID}-1"
+AUTOCORR_2ND_NEXT_TD = f"Auto-{CORR_ID}-{TD_ID}-2"
+AUTOCORR_3RD_NEXT_TD = f"Auto-{CORR_ID}-{TD_ID}-3"
+AUTOCORR_5TH_NEXT_TD = f"Auto-{CORR_ID}-{TD_ID}-5"
+AUTOCORR_10TH_NEXT_TD = f"Auto-{CORR_ID}-{TD_ID}-10"
 
 # @note: further parameters
-PD_BY_TD_WEIB_A = f"{PD_ID} / {TD_ID} Weibull A"
-PD_BY_TD_WEIB_B = f"{PD_ID} / {TD_ID} Weibull B"
+PD_BY_TD_WEIB_A = f"{PD_ID}/{TD_ID}-Weib-\u03B1"
+PD_BY_TD_WEIB_B = f"{PD_ID}/{TD_ID}-Weib-\u03B2"
 
 
 def get_parameter_group(df: pd.DataFrame, group: Group) -> pd.DataFrame:

From 2d8712c4e70d800c692b4bc95f3c4b6d65b5c162 Mon Sep 17 00:00:00 2001
From: Joris Clement
Date: Fri, 14 May 2021 11:24:07 +0200
Subject: [PATCH 6/7] Calc feature importance only for certain models

Calculate the feature importance only for fingerprint (feature-based)
models, and exclude the k-NN classifier because the library does not
seem to support it.
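
For context, the pattern the diff below implements -- run every
pipeline step except the final classifier, then explain only that
classifier on the transformed features -- looks roughly like the
following sketch, where `pipeline` and `X` stand for any fitted
feature-based pipeline and its input:

    import shap

    # Feature matrix exactly as the final classifier sees it.
    X_transformed = pipeline[:-1].transform(X)

    # Let SHAP pick a suitable explainer for the final estimator;
    # k-NN does not seem to have one, hence its exclusion here.
    explainer = shap.Explainer(pipeline[-1])
    shap_values = explainer.shap_values(X_transformed)
    shap.summary_plot(shap_values, X_transformed)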
---
 src/thesis/classify.py | 52 +++++++++++++++++++++++++-----------------
 1 file changed, 31 insertions(+), 21 deletions(-)

diff --git a/src/thesis/classify.py b/src/thesis/classify.py
index 5ced3d24..f8636515 100644
--- a/src/thesis/classify.py
+++ b/src/thesis/classify.py
@@ -21,6 +21,7 @@
     balanced_accuracy_score,
 )
 from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold
+from sklearn.neighbors import KNeighborsClassifier
 from sklearn.pipeline import FeatureUnion, Pipeline
 from sklearn.preprocessing import FunctionTransformer
 from sklearn.utils import estimator_html_repr
@@ -73,6 +74,10 @@
 tensorflow.random.set_seed(SEED)
 
 
+def is_pipeline_finger(pipeline: Pipeline) -> bool:
+    return is_data_finger(list(pipeline.named_steps.keys())[0])
+
+
 def combine(dataPart: DataPart, metric_name: str):
     return f"{dataPart}_{metric_name}"
 
@@ -351,7 +356,7 @@ def _train(
             (get_data_transformer(pipeline).transform(X_val), y_val),
             (get_data_transformer(pipeline).transform(X_train), y_train),
         ]
-        if is_data_finger(list(pipeline.named_steps.keys())[0]):
+        if is_pipeline_finger(pipeline):
            feature_name, categorical_feature = get_categorical_features_info(
                 get_data_transformer(pipeline), X
             )
@@ -506,26 +511,31 @@ def _save_models(
             X = self.get_X(model_name)
             self._train(pipeline, X, range(0, len(X)), range(0), False)
 
-        explainer = shap.Explainer(
-            pipeline.named_steps["classifier"],
-            feature_names=get_feature_names(pipeline[-2]),
-            output_names=self.defect_names,
-        )
-        X_tr = pd.DataFrame(
-            data=pipeline[:-1].transform(X),
-            index=X.index,
-            columns=get_feature_names(pipeline[-2]),
-        )
-        shap.summary_plot(
-            explainer.shap_values(X_tr),
-            X_tr,
-            class_names=self.defect_names,
-            max_display=X.shape[1],
-            show=self.config["general"]["show_plots"],
-        )
-        util.finish_plot(
-            "feature_importance", model_folder, self.config["general"]["show_plots"]
-        )
+        if (
+            is_pipeline_finger(pipeline)
+            and not isinstance(get_classifier(pipeline), KNeighborsClassifier)
+            and "finger_all" not in pipeline.named_steps
+        ):
+            explainer = shap.Explainer(
+                pipeline.named_steps["classifier"],
+                feature_names=get_feature_names(pipeline[0]),
+                output_names=self.defect_names,
+            )
+            X_tr = pd.DataFrame(
+                data=pipeline[:-1].transform(X),
+                index=X.index,
+                columns=get_feature_names(pipeline[0]),
+            )
+            shap.summary_plot(
+                explainer.shap_values(X_tr),
+                X_tr,
+                class_names=self.defect_names,
+                max_display=X.shape[1],
+                show=self.config["general"]["show_plots"],
+            )
+            util.finish_plot(
+                "feature_importance", model_folder, self.config["general"]["show_plots"]
+            )
 
         if is_keras(pipeline):
             pipeline.named_steps["classifier"].model.save(

From 7010057e1c2be1887449203876a1c295cfcfddff Mon Sep 17 00:00:00 2001
From: Joris Clement
Date: Fri, 14 May 2021 13:36:34 +0200
Subject: [PATCH 7/7] Fix failing CI

The CI fails because part of the SHAP dependency is not installed
correctly. Part of the error output is:

    C extension was not built during install!

Fix this by restricting the feature importance calculation to the
LGBM classifier, as that classifier does not need the C extension.
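
LightGBM presumably sidesteps the problem because SHAP can delegate
the contribution computation to LightGBM itself instead of to its
own compiled extension. The equivalent native call, shown only for
illustration with dummy data:

    import numpy as np
    from lightgbm import LGBMClassifier

    X = np.random.rand(100, 5)
    y = np.random.randint(0, 3, size=100)
    clf = LGBMClassifier(n_estimators=10).fit(X, y)
    # One contribution per feature plus a bias column (per class),
    # computed natively by LightGBM -- no SHAP C extension needed.
    contrib = clf.predict(X, pred_contrib=True)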
---
 src/thesis/classify.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/thesis/classify.py b/src/thesis/classify.py
index f8636515..a62a912f 100644
--- a/src/thesis/classify.py
+++ b/src/thesis/classify.py
@@ -21,7 +21,6 @@
     balanced_accuracy_score,
 )
 from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold
-from sklearn.neighbors import KNeighborsClassifier
 from sklearn.pipeline import FeatureUnion, Pipeline
 from sklearn.preprocessing import FunctionTransformer
 from sklearn.utils import estimator_html_repr
@@ -513,7 +512,9 @@ def _save_models(
         if (
             is_pipeline_finger(pipeline)
-            and not isinstance(get_classifier(pipeline), KNeighborsClassifier)
+            # @note: LGBMClassifier is chosen to make the tests pass the CI,
+            # but all classifiers except k-NN also work.
+            and isinstance(get_classifier(pipeline), LGBMClassifier)
             and "finger_all" not in pipeline.named_steps
         ):
             explainer = shap.Explainer(