From 8d8b0d2ab4e5167deba1c61d47a622781004143e Mon Sep 17 00:00:00 2001 From: Joris Clement Date: Wed, 12 May 2021 16:12:53 +0200 Subject: [PATCH 1/7] Add general feature importance estimation --- poetry.lock | 48 +++++++++++++++++++++++++++++++++++++++++- pyproject.toml | 1 + src/thesis/classify.py | 12 ++++++----- 3 files changed, 55 insertions(+), 6 deletions(-) diff --git a/poetry.lock b/poetry.lock index 05bd6d58..9bb12ee0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1003,6 +1003,31 @@ numpy = ">=1.15" pandas = ">=0.23" scipy = ">=1.0" +[[package]] +name = "shap" +version = "0.39.0" +description = "A unified approach to explain the output of any machine learning model." +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +cloudpickle = "*" +numba = "*" +numpy = "*" +pandas = "*" +scikit-learn = "*" +scipy = "*" +slicer = "0.0.7" +tqdm = ">4.25.0" + +[package.extras] +all = ["pyod", "pytest", "sphinx-rtd-theme", "sentencepiece", "transformers", "lightgbm", "torch", "numpydoc", "nbsphinx", "opencv-python", "ipython", "pytest-mpl", "matplotlib", "xgboost", "catboost", "sphinx", "pytest-cov", "pyspark", "lime"] +docs = ["matplotlib", "ipython", "numpydoc", "sphinx-rtd-theme", "sphinx", "nbsphinx"] +others = ["lime"] +plots = ["matplotlib", "ipython"] +test = ["pytest", "pytest-mpl", "pytest-cov", "xgboost", "lightgbm", "catboost", "pyspark", "pyod", "transformers", "torch", "sentencepiece", "opencv-python"] + [[package]] name = "six" version = "1.15.0" @@ -1011,6 +1036,14 @@ category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +[[package]] +name = "slicer" +version = "0.0.7" +description = "A small package for big slicing." +category = "main" +optional = false +python-versions = ">=3.6" + [[package]] name = "sortedcontainers" version = "2.3.0" @@ -1283,7 +1316,7 @@ heapdict = "*" [metadata] lock-version = "1.1" python-versions = "^3.8" -content-hash = "a50261bab581ed0cbe19aa08fb97bae8fe987573ac88ec23c8093c12377217b4" +content-hash = "72ff87359639478a079fa8b76b86856e4acd7f6412eabcba3ebb3675b93a7c43" [metadata.files] absl-py = [ @@ -1349,6 +1382,9 @@ coverage = [ {file = "coverage-5.5-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:8963a499849a1fc54b35b1c9f162f4108017b2e6db2c46c1bed93a72262ed083"}, {file = "coverage-5.5-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:869a64f53488f40fa5b5b9dcb9e9b2962a66a87dab37790f3fcfb5144b996ef5"}, {file = "coverage-5.5-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:4a7697d8cb0f27399b0e393c0b90f0f1e40c82023ea4d45d22bce7032a5d7b81"}, + {file = "coverage-5.5-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:8d0a0725ad7c1a0bcd8d1b437e191107d457e2ec1084b9f190630a4fb1af78e6"}, + {file = "coverage-5.5-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:51cb9476a3987c8967ebab3f0fe144819781fca264f57f89760037a2ea191cb0"}, + {file = "coverage-5.5-cp310-cp310-win_amd64.whl", hash = "sha256:c0891a6a97b09c1f3e073a890514d5012eb256845c451bd48f7968ef939bf4ae"}, {file = "coverage-5.5-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:3487286bc29a5aa4b93a072e9592f22254291ce96a9fbc5251f566b6b7343cdb"}, {file = "coverage-5.5-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:deee1077aae10d8fa88cb02c845cfba9b62c55e1183f52f6ae6a2df6a2187160"}, {file = "coverage-5.5-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:f11642dddbb0253cc8853254301b51390ba0081750a8ac03f20ea8103f0c56b6"}, @@ -2148,10 +2184,20 @@ seaborn = [ {file = "seaborn-0.11.1-py3-none-any.whl", hash = 
"sha256:4e1cce9489449a1c6ff3c567f2113cdb41122f727e27a984950d004a88ef3c5c"}, {file = "seaborn-0.11.1.tar.gz", hash = "sha256:44e78eaed937c5a87fc7a892c329a7cc091060b67ebd1d0d306b446a74ba01ad"}, ] +shap = [ + {file = "shap-0.39.0-cp36-cp36m-win_amd64.whl", hash = "sha256:bf9af9b089ef95cb1ac0df80a43f8144aa9095d10f282cb5c19643ff88a6a79d"}, + {file = "shap-0.39.0-cp37-cp37m-win_amd64.whl", hash = "sha256:b44f9fbb7349f5406b98b4ec24c672f8fe932606bb7574a8aae2238410c55289"}, + {file = "shap-0.39.0-cp38-cp38-win_amd64.whl", hash = "sha256:c0d51b44c15eae1c12e51ed498f898cfc5e12d6be7e0d4f733ce6453f6ec85a4"}, + {file = "shap-0.39.0.tar.gz", hash = "sha256:0196a6c12cc98f8b48ce9c5968550902432b80290da6fa7be8655441a1c6251a"}, +] six = [ {file = "six-1.15.0-py2.py3-none-any.whl", hash = "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"}, {file = "six-1.15.0.tar.gz", hash = "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259"}, ] +slicer = [ + {file = "slicer-0.0.7-py3-none-any.whl", hash = "sha256:0b94faa5251c0f23782c03f7b7eedda91d80144059645f452c4bc80fab875976"}, + {file = "slicer-0.0.7.tar.gz", hash = "sha256:f5d5f7b45f98d155b9c0ba6554fa9770c6b26d5793a3e77a1030fb56910ebeec"}, +] sortedcontainers = [ {file = "sortedcontainers-2.3.0-py2.py3-none-any.whl", hash = "sha256:37257a32add0a3ee490bb170b599e93095eed89a55da91fa9f48753ea12fd73f"}, {file = "sortedcontainers-2.3.0.tar.gz", hash = "sha256:59cc937650cf60d677c16775597c89a960658a09cf7c1a668f86e1e4464b10a1"}, diff --git a/pyproject.toml b/pyproject.toml index 640b77a0..5221a929 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ lightgbm = "^3.1.1" pyts = "^0.11.0" imbalanced-learn = "^0.8.0" dtaidistance = "^2.2.5" +shap = "^0.39.0" [tool.poetry.dev-dependencies] pytest = "^6.1.0" diff --git a/src/thesis/classify.py b/src/thesis/classify.py index 7b99c87e..c7b2c07a 100644 --- a/src/thesis/classify.py +++ b/src/thesis/classify.py @@ -13,6 +13,7 @@ from lightgbm import LGBMClassifier import numpy as np import pandas as pd +import shap from sklearn import metrics from sklearn.base import BaseEstimator from sklearn.metrics import ( @@ -505,11 +506,12 @@ def _save_models( X = self.get_X(model_name) self._train(pipeline, X, range(0, len(X)), range(0), False) - if isinstance(get_classifier(pipeline), LGBMClassifier): - lightgbm.plot_importance(get_classifier(pipeline)) - util.finish_plot( - "feature_importance", model_folder, self.config["general"]["show_plots"] - ) + explainer = shap.Explainer(model) + shap_values = explainer(X) + shap.plots.bar(shap_values) + util.finish_plot( + "feature_importance", model_folder, self.config["general"]["show_plots"] + ) if is_keras(pipeline): pipeline.named_steps["classifier"].model.save( From c80c2f5d8a1915c5e49ec78c02b253c4bcf5af18 Mon Sep 17 00:00:00 2001 From: Joris Clement Date: Wed, 12 May 2021 18:40:17 +0200 Subject: [PATCH 2/7] Fixup feature importance --- mypy.ini | 3 +++ src/thesis/classify.py | 19 ++++++++++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/mypy.ini b/mypy.ini index 4fa674b4..ee8946bb 100644 --- a/mypy.ini +++ b/mypy.ini @@ -53,3 +53,6 @@ ignore_missing_imports = True [mypy-dtaidistance.*] ignore_missing_imports = True + +[mypy-shap.*] +ignore_missing_imports = True diff --git a/src/thesis/classify.py b/src/thesis/classify.py index c7b2c07a..7c4e791d 100644 --- a/src/thesis/classify.py +++ b/src/thesis/classify.py @@ -506,9 +506,22 @@ def _save_models( X = self.get_X(model_name) self._train(pipeline, X, range(0, len(X)), 
range(0), False)
 
-        explainer = shap.Explainer(model)
-        shap_values = explainer(X)
-        shap.plots.bar(shap_values)
+        explainer = shap.Explainer(
+            pipeline.named_steps["classifier"],
+            feature_names=get_feature_names(pipeline[-2]),
+            output_names=self.defect_names,
+        )
+        X_tr = pd.DataFrame(
+            data=pipeline[:-1].transform(X),
+            index=X.index,
+            columns=get_feature_names(pipeline[-2]),
+        )
+        shap.summary_plot(
+            explainer.shap_values(X_tr),
+            X_tr,
+            class_names=self.defect_names,
+            show=self.config["general"]["show_plots"],
+        )
         util.finish_plot(
             "feature_importance", model_folder, self.config["general"]["show_plots"]
         )

From ceca09bafb3b4ca93f7529ddb202b66288048771 Mon Sep 17 00:00:00 2001
From: Joris Clement
Date: Wed, 12 May 2021 18:41:54 +0200
Subject: [PATCH 3/7] Fix defects and dependent defect_names

---
 src/thesis/classify.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/thesis/classify.py b/src/thesis/classify.py
index 7c4e791d..4796d456 100644
--- a/src/thesis/classify.py
+++ b/src/thesis/classify.py
@@ -193,7 +193,7 @@ def __init__(self, config):
             index=build_index(self.measurements),
             dtype=np.int8,
         )
-        self.defects: Final = sorted(set(self.y))
+        self.defects: Final = [data.Defect(i) for i in sorted(set(self.y))]
         self.defect_names: Final = data.get_names(self.defects)
 
         self.cv_splits: Final = self._generate_cv_splits()

From 845dc6cb4b8a730241534f6f47833a9303e62b69 Mon Sep 17 00:00:00 2001
From: Joris Clement
Date: Thu, 13 May 2021 15:05:11 +0200
Subject: [PATCH 4/7] Show importance of all features

---
 src/thesis/classify.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/thesis/classify.py b/src/thesis/classify.py
index 4796d456..5ced3d24 100644
--- a/src/thesis/classify.py
+++ b/src/thesis/classify.py
@@ -520,6 +520,7 @@ def _save_models(
             explainer.shap_values(X_tr),
             X_tr,
             class_names=self.defect_names,
+            max_display=X.shape[1],
             show=self.config["general"]["show_plots"],
         )
         util.finish_plot(

From c252a0930dae6bc3d3779f453245a0473661dbc4 Mon Sep 17 00:00:00 2001
From: Joris Clement
Date: Thu, 13 May 2021 15:08:38 +0200
Subject: [PATCH 5/7] Improve feature names, align with document

Changes:
- Be more consistent in naming
- Remove a duplicate definition
- Correct a description

Issue #105.
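
Note: the new names use Python unicode escapes so that plot labels
render with Greek letters and the delta sign; checked in a Python
shell (illustration only, not part of this patch):

    >>> f"A-Weib-\u03B1"
    'A-Weib-α'
    >>> "\u0394t"
    'Δt'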
---
 src/thesis/fingerprint.py | 113 +++++++++++++++++++-------------------
 1 file changed, 56 insertions(+), 57 deletions(-)

diff --git a/src/thesis/fingerprint.py b/src/thesis/fingerprint.py
index b4628b82..fa48f64e 100644
--- a/src/thesis/fingerprint.py
+++ b/src/thesis/fingerprint.py
@@ -24,10 +24,10 @@
 from .data import CLASS, get_defects, PATH, PD, TIME_DIFF, VOLTAGE_SIGN
 
-PD_ID = "PD-Value"
-PD_DIFF_ID = "PD-Diff"
-TD_ID = "TimeDiff"
-CORR_ID = "Correlate"
+PD_ID = "A"
+PD_DIFF_ID = "\u0394A"
+TD_ID = "\u0394t"
+CORR_ID = "Corr"
 
 
 class Group(Enum):
@@ -41,31 +41,31 @@ def __str__(self):
 
 # @note: parameter in TU Graz fingerprint
-PD_VAR = f"{PD_ID} Variance"
-PD_SKEW = f"{PD_ID} Skewness"
-PD_KURT = f"{PD_ID} Kurtosis"
-PD_WEIB_A = f"{PD_ID} Weibull A"
-PD_WEIB_B = f"{PD_ID} Weibull B"
+PD_VAR = f"{PD_ID}-Var"
+PD_SKEW = f"{PD_ID}-Skew"
+PD_KURT = f"{PD_ID}-Kurt"
+PD_WEIB_A = f"{PD_ID}-Weib-\u03B1"
+PD_WEIB_B = f"{PD_ID}-Weib-\u03B2"
 
-PD_DIFF_WEIB_B = f"{PD_DIFF_ID} Weibull B"
+PD_DIFF_WEIB_B = f"{PD_DIFF_ID}-Weib-\u03B2"
 
-TD_MAX = f"{TD_ID} Max"
-TD_MEAN = f"{TD_ID} Mean"
-TD_MIN = f"{TD_ID} Min"
-TD_VAR = f"{TD_ID} Variance"
-TD_SKEW = f"{TD_ID} Skewness"
-TD_KURT = f"{TD_ID} Kurtosis"
-TDIFF_NORM_WEIB_A = f"{TD_ID} Sorted Normed Weibull A"
-TDIFF_NORM_WEIB_B = f"{TD_ID} Sorted Normed Weibull B"
+TD_MAX = f"{TD_ID}-Max"
+TD_MEAN = f"{TD_ID}-Mean"
+TD_MIN = f"{TD_ID}-Min"
+TD_VAR = f"{TD_ID}-Var"
+TD_SKEW = f"{TD_ID}-Skew"
+TD_KURT = f"{TD_ID}-Kurt"
+TDIFF_NORM_WEIB_A = f"{TD_ID}-Norm-Weib-\u03B1"
+TDIFF_NORM_WEIB_B = f"{TD_ID}-Norm-Weib-\u03B2"
 
 # @note: parameter in Lukas fingerprint
-PDS_PER_SEC = "Number of PDs/sec"
+PDS_PER_SEC = "PDs/Sec"
 
-PD_MEAN = f"{PD_ID} Mean"
-PD_MAX = f"{PD_ID} Max"
-PD_CV = f"{PD_ID} std/mean"
-PD_SUM = f"{PD_ID} Sum"
+PD_MEAN = f"{PD_ID}-Mean"
+PD_MAX = f"{PD_ID}-Max"
+PD_CV = f"{PD_ID}-Std/Mean"
+PD_SUM = f"{PD_ID}-Sum"
 
 PD_DIFF_MEAN = f"{PD_DIFF_ID} Mean"
 PD_DIFF_SKEW = f"{PD_DIFF_ID} Skewness"
@@ -73,14 +73,13 @@ def __str__(self):
 PD_DIFF_KURT = f"{PD_DIFF_ID} Kurtosis"
 PD_DIFF_WEIB_A = f"{PD_DIFF_ID} Weibull A"
 
-TD_MEDIAN = f"{TD_ID} Median"
+TD_MEDIAN = f"{TD_ID}-Median"
 
-CORR_PD_DIFF_TO_PD_BINS = f"{CORR_ID} {PD_DIFF_ID} - PD Bins"
-CORR_NEXT_PD_TO_PD_BINS = f"{CORR_ID} Next PD - PD Bins"
-CORR_NEXT_PD_TO_PD = f"{CORR_ID} Next PD - PD"
-CORR_PD_DIFF_TO_PD = f"{CORR_ID} {PD_DIFF_ID} - PD"
-CORR_PD_DIFF_TO_TD = f"{CORR_ID} PD - {PD_DIFF_ID}"
-CORR_PD_TO_TD = f"{CORR_ID} PD - {TD_ID}"
+CORR_PD_DIFF_TO_PD_BINS = f"{CORR_ID}-{PD_DIFF_ID}-{PD_ID}-Bins"
+CORR_NEXT_PD_TO_PD_BINS = f"Auto-{CORR_ID}-{PD_ID}-Bins"
+CORR_NEXT_PD_TO_PD = f"Auto-{CORR_ID}-{PD_ID}-1"
+CORR_PD_DIFF_TO_PD = f"{CORR_ID}-{PD_DIFF_ID}-{PD_ID}"
+CORR_PD_TO_TD = f"{CORR_ID}-{PD_ID}-{TD_ID}"
 
 # @note: parameter in own fingerprint
 DURATION = "duration"
@@ -89,37 +88,37 @@ def __str__(self):
 PD_MIN = f"{PD_ID} min"
 PD_MEDIAN = f"{PD_ID} Median"
 PD_STD = f"{PD_ID} std"
-PD_VAR = f"{PD_ID} var"
-PD_NUM_PEAKS_50 = f"{PD_ID} num peaks 50"
-PD_NUM_PEAKS_10 = f"{PD_ID} num peaks 10"
-PD_NUM_PEAKS_5 = f"{PD_ID} num peaks 5"
-PD_NUM_PEAKS_100 = f"{PD_ID} num peaks 100"
+PD_VAR = f"{PD_ID}-Var"
+PD_NUM_PEAKS_50 = f"{PD_ID}-Num-peaks-50"
+PD_NUM_PEAKS_10 = f"{PD_ID}-Num-peaks-10"
+PD_NUM_PEAKS_5 = f"{PD_ID}-Num-peaks-5"
+PD_NUM_PEAKS_100 = f"{PD_ID}-Num-peaks-100"
 PD_RATIO = f"{PD_ID} ratio"
 PD_PERC_REOCCUR = f"{PD_ID} percentage reocurring"
-PD_COUNT_ABOVE_MEAN = f"{PD_ID} count above mean"
-PD_COUNT_BELOW_MEAN = f"{PD_ID} count below mean"
-PD_CHANGE_QUANTILES = f"{PD_ID} ChangeQuantiles"
-PD_NORM_WEIB_A = f"{PD_ID} Weibull normed sorted A"
-PD_NORM_WEIB_B = f"{PD_ID} Weibull normed sorted B"
-
-TD_LONGEST_STRIKE_BELOW_MEAN = f"{TD_ID} longest strike below mean"
-TD_CHANGE_QUANTILES = f"{TD_ID} ChangeQuantiles"
-TD_SUM = f"{TD_ID} Sum"
-
-CORR_2ND_NEXT_PD_TO_PD = f"{CORR_ID} Auto 2nd Next {PD_ID}"
-CORR_3RD_NEXT_PD_TO_PD = f"{CORR_ID} Auto 3rd Next {PD_ID}"
-CORR_5TH_NEXT_PD_TO_PD = f"{CORR_ID} Auto 5th Next {PD_ID}"
-CORR_10TH_NEXT_PD_TO_PD = f"{CORR_ID} Auto 10th Next {PD_ID}"
-
-AUTOCORR_NEXT_TD = f"{CORR_ID} Auto Next {TD_ID}"
-AUTOCORR_2ND_NEXT_TD = f"{CORR_ID} Auto 2nd Next {TD_ID}"
-AUTOCORR_3RD_NEXT_TD = f"{CORR_ID} Auto 3rd Next {TD_ID}"
-AUTOCORR_5TH_NEXT_TD = f"{CORR_ID} Auto 5th Next {TD_ID}"
-AUTOCORR_10TH_NEXT_TD = f"{CORR_ID} Auto 10th Next {TD_ID}"
+PD_COUNT_ABOVE_MEAN = f"{PD_ID}-Num->-mean"
+PD_COUNT_BELOW_MEAN = f"{PD_ID}-Num-<-mean"
+PD_CHANGE_QUANTILES = f"{PD_ID}-change-quantiles"
+PD_NORM_WEIB_A = f"{PD_ID}-norm-Weib-\u03B1"
+PD_NORM_WEIB_B = f"{PD_ID}-norm-Weib-\u03B2"
+TD_LONGEST_STRIKE_BELOW_MEAN = f"{PD_ID}-max-strike-<-mean"
+
+TD_CHANGE_QUANTILES = f"{TD_ID}-Change-quantiles"
+TD_SUM = f"{TD_ID}-Sum"
+
+CORR_2ND_NEXT_PD_TO_PD = f"Auto-{CORR_ID}-{PD_ID}-2"
+CORR_3RD_NEXT_PD_TO_PD = f"Auto-{CORR_ID}-{PD_ID}-3"
+CORR_5TH_NEXT_PD_TO_PD = f"Auto-{CORR_ID}-{PD_ID}-5"
+CORR_10TH_NEXT_PD_TO_PD = f"Auto-{CORR_ID}-{PD_ID}-10"
+
+AUTOCORR_NEXT_TD = f"Auto-{CORR_ID}-{TD_ID}-1"
+AUTOCORR_2ND_NEXT_TD = f"Auto-{CORR_ID}-{TD_ID}-2"
+AUTOCORR_3RD_NEXT_TD = f"Auto-{CORR_ID}-{TD_ID}-3"
+AUTOCORR_5TH_NEXT_TD = f"Auto-{CORR_ID}-{TD_ID}-5"
+AUTOCORR_10TH_NEXT_TD = f"Auto-{CORR_ID}-{TD_ID}-10"
 
 # @note: further parameters
-PD_BY_TD_WEIB_A = f"{PD_ID} / {TD_ID} Weibull A"
-PD_BY_TD_WEIB_B = f"{PD_ID} / {TD_ID} Weibull B"
+PD_BY_TD_WEIB_A = f"{PD_ID}/{TD_ID}-Weib-\u03B1"
+PD_BY_TD_WEIB_B = f"{PD_ID}/{TD_ID}-Weib-\u03B2"
 
 
 def get_parameter_group(df: pd.DataFrame, group: Group) -> pd.DataFrame:

From 2d8712c4e70d800c692b4bc95f3c4b6d65b5c162 Mon Sep 17 00:00:00 2001
From: Joris Clement
Date: Fri, 14 May 2021 11:24:07 +0200
Subject: [PATCH 6/7] Calc feature importance only for certain models

Calculate the feature importance only for fingerprint (feature-based)
models, and exclude the k-NN classifier because the library does not
seem to support it.
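
For context, the pattern the diff below implements -- run every
pipeline step except the final classifier, then explain only that
classifier on the transformed features -- looks roughly like the
following sketch, where `pipeline` and `X` stand for any fitted
feature-based pipeline and its input:

    import shap

    # Feature matrix exactly as the final classifier sees it.
    X_transformed = pipeline[:-1].transform(X)

    # Let SHAP pick a suitable explainer for the final estimator;
    # k-NN does not seem to have one, hence its exclusion here.
    explainer = shap.Explainer(pipeline[-1])
    shap_values = explainer.shap_values(X_transformed)
    shap.summary_plot(shap_values, X_transformed)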
---
 src/thesis/classify.py | 52 +++++++++++++++++++++++++-----------------
 1 file changed, 31 insertions(+), 21 deletions(-)

diff --git a/src/thesis/classify.py b/src/thesis/classify.py
index 5ced3d24..f8636515 100644
--- a/src/thesis/classify.py
+++ b/src/thesis/classify.py
@@ -21,6 +21,7 @@
     balanced_accuracy_score,
 )
 from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold
+from sklearn.neighbors import KNeighborsClassifier
 from sklearn.pipeline import FeatureUnion, Pipeline
 from sklearn.preprocessing import FunctionTransformer
 from sklearn.utils import estimator_html_repr
@@ -73,6 +74,10 @@
 tensorflow.random.set_seed(SEED)
 
 
+def is_pipeline_finger(pipeline: Pipeline) -> bool:
+    return is_data_finger(list(pipeline.named_steps.keys())[0])
+
+
 def combine(dataPart: DataPart, metric_name: str):
     return f"{dataPart}_{metric_name}"
 
@@ -351,7 +356,7 @@ def _train(
             (get_data_transformer(pipeline).transform(X_val), y_val),
             (get_data_transformer(pipeline).transform(X_train), y_train),
         ]
-        if is_data_finger(list(pipeline.named_steps.keys())[0]):
+        if is_pipeline_finger(pipeline):
            feature_name, categorical_feature = get_categorical_features_info(
                 get_data_transformer(pipeline), X
             )
@@ -506,26 +511,31 @@ def _save_models(
             X = self.get_X(model_name)
             self._train(pipeline, X, range(0, len(X)), range(0), False)
 
-        explainer = shap.Explainer(
-            pipeline.named_steps["classifier"],
-            feature_names=get_feature_names(pipeline[-2]),
-            output_names=self.defect_names,
-        )
-        X_tr = pd.DataFrame(
-            data=pipeline[:-1].transform(X),
-            index=X.index,
-            columns=get_feature_names(pipeline[-2]),
-        )
-        shap.summary_plot(
-            explainer.shap_values(X_tr),
-            X_tr,
-            class_names=self.defect_names,
-            max_display=X.shape[1],
-            show=self.config["general"]["show_plots"],
-        )
-        util.finish_plot(
-            "feature_importance", model_folder, self.config["general"]["show_plots"]
-        )
+        if (
+            is_pipeline_finger(pipeline)
+            and not isinstance(get_classifier(pipeline), KNeighborsClassifier)
+            and "finger_all" not in pipeline.named_steps
+        ):
+            explainer = shap.Explainer(
+                pipeline.named_steps["classifier"],
+                feature_names=get_feature_names(pipeline[0]),
+                output_names=self.defect_names,
+            )
+            X_tr = pd.DataFrame(
+                data=pipeline[:-1].transform(X),
+                index=X.index,
+                columns=get_feature_names(pipeline[0]),
+            )
+            shap.summary_plot(
+                explainer.shap_values(X_tr),
+                X_tr,
+                class_names=self.defect_names,
+                max_display=X.shape[1],
+                show=self.config["general"]["show_plots"],
+            )
+            util.finish_plot(
+                "feature_importance", model_folder, self.config["general"]["show_plots"]
+            )
 
         if is_keras(pipeline):
             pipeline.named_steps["classifier"].model.save(

From 7010057e1c2be1887449203876a1c295cfcfddff Mon Sep 17 00:00:00 2001
From: Joris Clement
Date: Fri, 14 May 2021 13:36:34 +0200
Subject: [PATCH 7/7] Fix failing CI

The CI fails because part of the SHAP dependency is not installed
correctly. Part of the error output is:

    C extension was not built during install!

Fix this by restricting the feature importance calculation to the
LGBM classifier, as that classifier does not need the C extension.
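
LightGBM presumably sidesteps the problem because SHAP can delegate
the contribution computation to LightGBM itself instead of to its
own compiled extension. The equivalent native call, shown only for
illustration with dummy data:

    import numpy as np
    from lightgbm import LGBMClassifier

    X = np.random.rand(100, 5)
    y = np.random.randint(0, 3, size=100)
    clf = LGBMClassifier(n_estimators=10).fit(X, y)
    # One contribution per feature plus a bias column (per class),
    # computed natively by LightGBM -- no SHAP C extension needed.
    contrib = clf.predict(X, pred_contrib=True)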
---
 src/thesis/classify.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/thesis/classify.py b/src/thesis/classify.py
index f8636515..a62a912f 100644
--- a/src/thesis/classify.py
+++ b/src/thesis/classify.py
@@ -21,7 +21,6 @@
     balanced_accuracy_score,
 )
 from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold
-from sklearn.neighbors import KNeighborsClassifier
 from sklearn.pipeline import FeatureUnion, Pipeline
 from sklearn.preprocessing import FunctionTransformer
 from sklearn.utils import estimator_html_repr
@@ -513,7 +512,9 @@ def _save_models(
         if (
             is_pipeline_finger(pipeline)
-            and not isinstance(get_classifier(pipeline), KNeighborsClassifier)
+            # @note: LGBMClassifier is chosen to make the tests pass the CI,
+            # but all classifiers except k-NN also work.
+            and isinstance(get_classifier(pipeline), LGBMClassifier)
             and "finger_all" not in pipeline.named_steps
         ):
             explainer = shap.Explainer(