
Commit

Merge pull request #106 from flyingdutchman23/feature-importance
Add feature importance plot
joclement committed May 14, 2021
2 parents 9c9365b + 7010057 commit cce9858
Showing 5 changed files with 138 additions and 62 deletions.
3 changes: 3 additions & 0 deletions mypy.ini
@@ -53,3 +53,6 @@ ignore_missing_imports = True

[mypy-dtaidistance.*]
ignore_missing_imports = True

[mypy-shap.*]
ignore_missing_imports = True
48 changes: 47 additions & 1 deletion poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -21,6 +21,7 @@ lightgbm = "^3.1.1"
pyts = "^0.11.0"
imbalanced-learn = "^0.8.0"
dtaidistance = "^2.2.5"
shap = "^0.39.0"

[tool.poetry.dev-dependencies]
pytest = "^6.1.0"
35 changes: 31 additions & 4 deletions src/thesis/classify.py
@@ -13,6 +13,7 @@
from lightgbm import LGBMClassifier
import numpy as np
import pandas as pd
import shap
from sklearn import metrics
from sklearn.base import BaseEstimator
from sklearn.metrics import (
@@ -72,6 +73,10 @@
tensorflow.random.set_seed(SEED)


def is_pipeline_finger(pipeline: Pipeline) -> bool:
    return is_data_finger(list(pipeline.named_steps.keys())[0])


def combine(dataPart: DataPart, metric_name: str):
    return f"{dataPart}_{metric_name}"

@@ -192,7 +197,7 @@ def __init__(self, config):
index=build_index(self.measurements),
dtype=np.int8,
)
self.defects: Final = sorted(set(self.y))
self.defects: Final = [data.Defect(i) for i in sorted(set(self.y))]
self.defect_names: Final = data.get_names(self.defects)
self.cv_splits: Final = self._generate_cv_splits()

@@ -350,7 +355,7 @@ def _train(
(get_data_transformer(pipeline).transform(X_val), y_val),
(get_data_transformer(pipeline).transform(X_train), y_train),
]
if is_data_finger(list(pipeline.named_steps.keys())[0]):
if is_pipeline_finger(pipeline):
feature_name, categorical_feature = get_categorical_features_info(
get_data_transformer(pipeline), X
)
@@ -505,8 +510,30 @@ def _save_models(
X = self.get_X(model_name)
self._train(pipeline, X, range(0, len(X)), range(0), False)

if isinstance(get_classifier(pipeline), LGBMClassifier):
lightgbm.plot_importance(get_classifier(pipeline))
if (
is_pipeline_finger(pipeline)
# @note: LGBMClassifier is chosen to make the tests pass the CI,
# but all classifiers except k-NN also work.
and isinstance(get_classifier(pipeline), LGBMClassifier)
and "finger_all" not in pipeline.named_steps
):
explainer = shap.Explainer(
pipeline.named_steps["classifier"],
feature_names=get_feature_names(pipeline[0]),
output_names=self.defect_names,
)
X_tr = pd.DataFrame(
data=pipeline[:-1].transform(X),
index=X.index,
columns=get_feature_names(pipeline[0]),
)
shap.summary_plot(
explainer.shap_values(X_tr),
X_tr,
class_names=self.defect_names,
max_display=X.shape[1],
show=self.config["general"]["show_plots"],
)
util.finish_plot(
"feature_importance", model_folder, self.config["general"]["show_plots"]
)
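
The block above wires SHAP into the model-saving step: the fitted LGBMClassifier is handed to a shap.Explainer, per-class SHAP values are computed for the transformed feature table, and shap.summary_plot renders them as the feature-importance figure that util.finish_plot then saves. Below is a minimal, standalone sketch of that pattern; the random data, the feature names, and the defect class names are made up for illustration and are not the repository's actual data or API.

# Standalone sketch of the SHAP feature-importance plot added above.
# The data, feature names, and class names here are hypothetical.
import lightgbm
import numpy as np
import pandas as pd
import shap

rng = np.random.default_rng(42)
feature_names = ["A-Var", "A-Skew", "\u0394t-Mean", "\u0394t-Max"]
X = pd.DataFrame(rng.normal(size=(300, 4)), columns=feature_names)
y = rng.integers(0, 3, size=len(X))  # three made-up defect classes

model = lightgbm.LGBMClassifier(random_state=42).fit(X, y)

# For a multiclass LightGBM model, shap_values() returns one array per class.
explainer = shap.Explainer(model, feature_names=feature_names)
shap_values = explainer.shap_values(X)

# Summary plot of per-feature importance, one colour per class.
shap.summary_plot(
    shap_values,
    X,
    class_names=["cavity", "particle", "floating"],  # hypothetical labels
    max_display=X.shape[1],
    show=False,  # mirrors config["general"]["show_plots"] in the real code
)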
113 changes: 56 additions & 57 deletions src/thesis/fingerprint.py
@@ -24,10 +24,10 @@
from .data import CLASS, get_defects, PATH, PD, TIME_DIFF, VOLTAGE_SIGN


PD_ID = "PD-Value"
PD_DIFF_ID = "PD-Diff"
TD_ID = "TimeDiff"
CORR_ID = "Correlate"
PD_ID = "A"
PD_DIFF_ID = "\u0394A"
TD_ID = "\u0394t"
CORR_ID = "Corr"


class Group(Enum):
@@ -41,46 +41,45 @@ def __str__(self):


# @note: parameter in TU Graz fingerprint
PD_VAR = f"{PD_ID} Variance"
PD_SKEW = f"{PD_ID} Skewness"
PD_KURT = f"{PD_ID} Kurtosis"
PD_WEIB_A = f"{PD_ID} Weibull A"
PD_WEIB_B = f"{PD_ID} Weibull B"
PD_VAR = f"{PD_ID}-Var"
PD_SKEW = f"{PD_ID}-Skew"
PD_KURT = f"{PD_ID}-Kurt"
PD_WEIB_A = f"{PD_ID}-Weib-\u03B1"
PD_WEIB_B = f"{PD_ID}-Weib-\u03B2"

PD_DIFF_WEIB_B = f"{PD_DIFF_ID} Weibull B"
PD_DIFF_WEIB_B = f"{PD_DIFF_ID}-Weib-\u03B2"

TD_MAX = f"{TD_ID} Max"
TD_MEAN = f"{TD_ID} Mean"
TD_MIN = f"{TD_ID} Min"
TD_VAR = f"{TD_ID} Variance"
TD_SKEW = f"{TD_ID} Skewness"
TD_KURT = f"{TD_ID} Kurtosis"
TDIFF_NORM_WEIB_A = f"{TD_ID} Sorted Normed Weibull A"
TDIFF_NORM_WEIB_B = f"{TD_ID} Sorted Normed Weibull B"
TD_MAX = f"{TD_ID}-Max"
TD_MEAN = f"{TD_ID}-Mean"
TD_MIN = f"{TD_ID}-Min"
TD_VAR = f"{TD_ID}-Var"
TD_SKEW = f"{TD_ID}-Skew"
TD_KURT = f"{TD_ID}-Kurt"
TDIFF_NORM_WEIB_A = f"{TD_ID}-Norm-Weib-\u03B1"
TDIFF_NORM_WEIB_B = f"{TD_ID}-Norm-Weib-\u03B2"


# @note: parameter in Lukas fingerprint
PDS_PER_SEC = "Number of PDs/sec"
PDS_PER_SEC = "PDs/Sec"

PD_MEAN = f"{PD_ID} Mean"
PD_MAX = f"{PD_ID} Max"
PD_CV = f"{PD_ID} std/mean"
PD_SUM = f"{PD_ID} Sum"
PD_MEAN = f"{PD_ID}-Mean"
PD_MAX = f"{PD_ID}-Max"
PD_CV = f"{PD_ID}-Std/Mean"
PD_SUM = f"{PD_ID}-Sum"

PD_DIFF_MEAN = f"{PD_DIFF_ID} Mean"
PD_DIFF_SKEW = f"{PD_DIFF_ID} Skewness"
PD_DIFF_VAR = f"{PD_DIFF_ID} Variance"
PD_DIFF_KURT = f"{PD_DIFF_ID} Kurtosis"
PD_DIFF_WEIB_A = f"{PD_DIFF_ID} Weibull A"

TD_MEDIAN = f"{TD_ID} Median"
TD_MEDIAN = f"{TD_ID}-Median"

CORR_PD_DIFF_TO_PD_BINS = f"{CORR_ID} {PD_DIFF_ID} - PD Bins"
CORR_NEXT_PD_TO_PD_BINS = f"{CORR_ID} Next PD - PD Bins"
CORR_NEXT_PD_TO_PD = f"{CORR_ID} Next PD - PD"
CORR_PD_DIFF_TO_PD = f"{CORR_ID} {PD_DIFF_ID} - PD"
CORR_PD_DIFF_TO_TD = f"{CORR_ID} PD - {PD_DIFF_ID}"
CORR_PD_TO_TD = f"{CORR_ID} PD - {TD_ID}"
CORR_PD_DIFF_TO_PD_BINS = f"{CORR_ID}-{PD_DIFF_ID}-{PD_ID}-Bins"
CORR_NEXT_PD_TO_PD_BINS = f"Auto-{CORR_ID}-{PD_ID}-Bins"
CORR_NEXT_PD_TO_PD = f"Auto-{CORR_ID}-{PD_ID}-1"
CORR_PD_DIFF_TO_PD = f"{CORR_ID}-{PD_DIFF_ID}-{PD_ID}"
CORR_PD_TO_TD = f"{CORR_ID}-{PD_ID}-{TD_ID}"

# @note: parameter in own fingerprint
DURATION = "duration"
@@ -89,37 +88,37 @@ def __str__(self):
PD_MIN = f"{PD_ID} min"
PD_MEDIAN = f"{PD_ID} Median"
PD_STD = f"{PD_ID} std"
PD_VAR = f"{PD_ID} var"
PD_NUM_PEAKS_50 = f"{PD_ID} num peaks 50"
PD_NUM_PEAKS_10 = f"{PD_ID} num peaks 10"
PD_NUM_PEAKS_5 = f"{PD_ID} num peaks 5"
PD_NUM_PEAKS_100 = f"{PD_ID} num peaks 100"
PD_VAR = f"{PD_ID}-Var"
PD_NUM_PEAKS_50 = f"{PD_ID}-Num-peaks-50"
PD_NUM_PEAKS_10 = f"{PD_ID}-Num-peaks-10"
PD_NUM_PEAKS_5 = f"{PD_ID}-Num-peaks-5"
PD_NUM_PEAKS_100 = f"{PD_ID}-Num-peaks-100"
PD_RATIO = f"{PD_ID} ratio"
PD_PERC_REOCCUR = f"{PD_ID} percentage reocurring"
PD_COUNT_ABOVE_MEAN = f"{PD_ID} count above mean"
PD_COUNT_BELOW_MEAN = f"{PD_ID} count below mean"
PD_CHANGE_QUANTILES = f"{PD_ID} ChangeQuantiles"
PD_NORM_WEIB_A = f"{PD_ID} Weibull normed sorted A"
PD_NORM_WEIB_B = f"{PD_ID} Weibull normed sorted B"

TD_LONGEST_STRIKE_BELOW_MEAN = f"{TD_ID} longest strike below mean"
TD_CHANGE_QUANTILES = f"{TD_ID} ChangeQuantiles"
TD_SUM = f"{TD_ID} Sum"

CORR_2ND_NEXT_PD_TO_PD = f"{CORR_ID} Auto 2nd Next {PD_ID}"
CORR_3RD_NEXT_PD_TO_PD = f"{CORR_ID} Auto 3rd Next {PD_ID}"
CORR_5TH_NEXT_PD_TO_PD = f"{CORR_ID} Auto 5th Next {PD_ID}"
CORR_10TH_NEXT_PD_TO_PD = f"{CORR_ID} Auto 10th Next {PD_ID}"

AUTOCORR_NEXT_TD = f"{CORR_ID} Auto Next {TD_ID}"
AUTOCORR_2ND_NEXT_TD = f"{CORR_ID} Auto 2nd Next {TD_ID}"
AUTOCORR_3RD_NEXT_TD = f"{CORR_ID} Auto 3rd Next {TD_ID}"
AUTOCORR_5TH_NEXT_TD = f"{CORR_ID} Auto 5th Next {TD_ID}"
AUTOCORR_10TH_NEXT_TD = f"{CORR_ID} Auto 10th Next {TD_ID}"
PD_COUNT_ABOVE_MEAN = f"{PD_ID}-Num->-mean"
PD_COUNT_BELOW_MEAN = f"{PD_ID}-Num-<-mean"
PD_CHANGE_QUANTILES = f"{PD_ID}-change-quantiles"
PD_NORM_WEIB_A = f"{PD_ID}-norm-Weib-\u03B1"
PD_NORM_WEIB_B = f"{PD_ID}-norm-Weib-\u03B2"
TD_LONGEST_STRIKE_BELOW_MEAN = f"{TD_ID}-max-strike-<-mean"

TD_CHANGE_QUANTILES = f"{TD_ID}-Change-quantiles"
TD_SUM = f"{TD_ID}-Sum"

CORR_2ND_NEXT_PD_TO_PD = f"Auto-{CORR_ID}-{PD_ID}-2"
CORR_3RD_NEXT_PD_TO_PD = f"Auto-{CORR_ID}-{PD_ID}-3"
CORR_5TH_NEXT_PD_TO_PD = f"Auto-{CORR_ID}-{PD_ID}-5"
CORR_10TH_NEXT_PD_TO_PD = f"Auto-{CORR_ID}-{PD_ID}-10"

AUTOCORR_NEXT_TD = f"Auto-{CORR_ID}-{TD_ID}-1"
AUTOCORR_2ND_NEXT_TD = f"Auto-{CORR_ID}-{TD_ID}-2"
AUTOCORR_3RD_NEXT_TD = f"Auto-{CORR_ID}-{TD_ID}-3"
AUTOCORR_5TH_NEXT_TD = f"Auto-{CORR_ID}-{TD_ID}-5"
AUTOCORR_10TH_NEXT_TD = f"Auto-{CORR_ID}-{TD_ID}-10"

# @note: further parameters
PD_BY_TD_WEIB_A = f"{PD_ID} / {TD_ID} Weibull A"
PD_BY_TD_WEIB_B = f"{PD_ID} / {TD_ID} Weibull B"
PD_BY_TD_WEIB_A = f"{PD_ID}/{TD_ID}-Weib-\u03B1"
PD_BY_TD_WEIB_B = f"{PD_ID}/{TD_ID}-Weib-\u03B2"


def get_parameter_group(df: pd.DataFrame, group: Group) -> pd.DataFrame:
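
The renames in this file shorten the fingerprint feature labels (for example, "PD-Value Variance" becomes "A-Var") so they stay legible on the axes of the new SHAP summary plot. A small illustrative sketch of how such constants can double as column labels follows; the extract_fingerprint helper and the sample values are hypothetical, not this module's actual API.

# Illustrative only: compact name constants used as feature/column labels.
# extract_fingerprint and the sample values are hypothetical.
import pandas as pd

PD_ID = "A"
TD_ID = "\u0394t"
PD_VAR = f"{PD_ID}-Var"
TD_MEAN = f"{TD_ID}-Mean"


def extract_fingerprint(pd_values: pd.Series, time_diffs: pd.Series) -> pd.Series:
    # One fingerprint row; the index labels become DataFrame columns and,
    # later, the tick labels of the feature-importance plot.
    return pd.Series({PD_VAR: pd_values.var(), TD_MEAN: time_diffs.mean()})


features = pd.DataFrame(
    [extract_fingerprint(pd.Series([0.1, 0.4, 0.2]), pd.Series([0.01, 0.03, 0.02]))]
)
print(list(features.columns))  # ['A-Var', 'Δt-Mean']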
