Merge pull request #1 from jmyrberg/modified-aakr

Added modified AAKR
jmyrberg · Dec 29, 2020 · 5fa7c93 · 5fa7c93
2 parents 88dc468 + 9528f70
commit 5fa7c93
Show file tree

Hide file tree

Showing 5 changed files with 120 additions and 20 deletions.
diff --git a/README.md b/README.md
@@ -35,7 +35,7 @@ X_obs_nc = aakr.transform(X_obs)
 
 ## References
 
-* [Assessment of Statistical and Classification Models For Monitoring EDF’s  Assets”](https://link.springer.com/chapter/10.1007/978-0-85729-320-6_52)
+* [Assessment of Statistical and Classification Models For Monitoring EDF’s  Assets](https://link.springer.com/chapter/10.1007/978-0-85729-320-6_52)
 
 * [A modified Auto Associative Kernel Regression method for robust signal reconstruction in nuclear power plant components](https://www.researchgate.net/publication/292538769_A_modified_Auto_Associative_Kernel_Regression_method_for_robust_signal_reconstruction_in_nuclear_power_plant_components)
 

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-0.0.1dev6
+0.0.1a
diff --git a/aakr/_aakr.py b/aakr/_aakr.py
@@ -17,7 +17,15 @@ class AAKR(TransformerMixin, BaseEstimator):
         Metric for calculating kernel distances, see available metrics from
         `sklearn.metrics.pairwise_distances <https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances.html>`_.
     bw : float, default=1.0
-        Kernel bandwith parameter.
+        Gaussian Radial Basis Function (RBF) bandwidth parameter.
+    modified : bool, default=False
+        Whether to use the modified version of AAKR (see reference [2]). The
+        modified version reduces the contribution provided by those signals
+        which are expected to be subject to the abnormal conditions.
+    penalty : array-like of shape (n_features,) or None, default=None
+        Penalty vector for the modified AAKR - only used when parameter
+        modified=True. If modified AAKR is used and penalty=None, the
+        penalty vector is determined automatically.
     n_jobs : int, default=-1
         The number of jobs to run in parallel.
 
@@ -37,11 +45,37 @@ class AAKR(TransformerMixin, BaseEstimator):
            signal reconstruction in nuclear power plant components", European
            Safety and Reliability Conference ESREL.
     """
-    def __init__(self, metric='euclidean', bw=1, n_jobs=-1):
+    def __init__(self, metric='euclidean', bw=1., modified=False, penalty=None,
+                 n_jobs=-1):
         self.metric = metric
         self.bw = bw
+        self.modified = modified
+        self.penalty = penalty
         self.n_jobs = n_jobs
-        # TODO: Implement modified -version
+
+    def _fit_validation(self, X):
+        X = check_array(X)
+
+        if self.modified:
+            if self.penalty is not None:
+                penalty = check_array(self.penalty, ensure_2d=False)
+                if len(penalty) != X.shape[1]:
+                    raise ValueError('Shape of input is different from what '
+                                     'is defined in penalty vector ('
+                                     f'{X.shape[1]} != {len(penalty)})')
+        elif not self.modified and self.penalty is not None:
+            raise ValueError('Parameter `penalty` given, but `modified=False`.'
+                             'Please set `modified=True` to make use of the '
+                             'penalty vector, or set `penalty=None`.')
+
+    def _rbf_kernel(self, X_obs_nc, X_obs):
+        # Kernel regression
+        D = pairwise_distances(X=X_obs_nc, Y=X_obs,
+                               metric=self.metric, n_jobs=self.n_jobs)
+        k = 1 / np.sqrt(2 * np.pi * self.bw ** 2)
+        w = k * np.exp(-D ** 2 / (2 * self.bw ** 2))
+
+        return w
 
     def fit(self, X, y=None):
         """Fit normal condition examples.
@@ -59,9 +93,10 @@ def fit(self, X, y=None):
             Returns self.
         """
         # Validation
-        X = check_array(X)
+        self._fit_validation(X)
 
-        # Save history
+        # Fit = save history
+        # TODO: Add pruning options as a parameter... sampling?
         self.X_ = X
 
         return self
@@ -82,7 +117,7 @@ def partial_fit(self, X, y=None):
             Returns self.
         """
         # Validation
-        X = check_array(X)
+        self._fit_validation(X)
 
         # Fit
         if hasattr(self, 'X_'):
@@ -95,7 +130,7 @@ def partial_fit(self, X, y=None):
 
         return self
 
-    def transform(self, X, **kwargs):
+    def transform(self, X):
         """Transform given array into expected values in normal conditions.
 
         Parameters
@@ -117,12 +152,46 @@ def transform(self, X, **kwargs):
             raise ValueError('Shape of input is different from what was seen'
                              'in `fit`')
 
-        # Kernel regression
-        D = pairwise_distances(X=self.X_, Y=X, metric=self.metric,
-                               n_jobs=self.n_jobs, **kwargs)
-        k = 1 / np.sqrt(2 * np.pi * self.bw ** 2)
-        w = k * np.exp(-D ** 2 / (2 * self.bw ** 2))
-        w_sum = w.sum(0)
-        X_nc = w.T.dot(self.X_) / np.where(w_sum == 0, 1, w_sum)[:, None]
+        # Modified AAKR basically sorts the columns
+        # TODO: Needs to be verified that everything here is correct
+        if self.modified:
+            X_obs_nc = self.X_
+            X_nc = np.zeros(X.shape)
+
+            # Penalty matrix (J x J, where J is the number of features)
+            if self.penalty is None:
+                D = np.diag(np.arange(X.shape[1]) + 1) ** 2.
+                D /= D.sum()
+            else:
+                D = np.diag(self.penalty).astype('float')
+
+            for i, X_obs in enumerate(X):  # TODO: Vectorize
+                # Standardized contributions in decreasing order (J, 1)
+                diff = (np.abs(X_obs - X_obs_nc) / X_obs_nc.std(0)).sum(0)
+                order = diff.argsort()[::-1]
+
+                # Historical examples with ordered signals and penalty applied
+                # (N_obs_nc x J)
+                row_selector = np.arange(len(X_obs_nc))[:, np.newaxis]
+                X_obs_nc_new = X_obs_nc[row_selector, order].dot(D)
+
+                # New observations with ordered features and penalty applied
+                # (1 x J)
+                X_obs_new = X_obs[order].dot(D)[np.newaxis, :]
+
+                # Weights for each observation (N_obs_nc, 1)
+                w = self._rbf_kernel(X_obs_nc_new, X_obs_new)
+
+                # Apply kernel and save the results (1, J)
+                w_sum = w.sum(0)
+                w_div = np.where(w_sum == 0, 1, w_sum)[:, np.newaxis]
+
+                X_nc[i, :] = w.T.dot(X_obs_nc) / w_div
+        else:
+            w = self._rbf_kernel(self.X_, X)
+            w_sum = w.sum(0)
+            w_div = np.where(w_sum == 0, 1, w_sum)[:, np.newaxis]
+
+            X_nc = w.T.dot(self.X_) / w_div
 
         return X_nc
diff --git a/setup.py b/setup.py
@@ -18,7 +18,7 @@
     author_email='jesse.myrberg@gmail.com',
     url='https://github.com/jmyrberg/aakr',
     keywords=['aakr', 'auto', 'associative', 'kernel', 'regression', 'anomaly',
-              'detection'],
+              'detection', 'signal', 'reconstruction'],
     install_requires=[
         'numpy>=1.19.4',
         'pandas>=1.1.5',
@@ -27,7 +27,7 @@
     packages=setuptools.find_packages(),
     include_package_data=True,
     classifiers=[
-        'Development Status :: 2 - Pre-Alpha',
+        'Development Status :: 3 - Alpha',
         'Programming Language :: Python :: 3',
         'License :: OSI Approved :: MIT License',
         'Intended Audience :: Science/Research',
@@ -43,7 +43,8 @@
     extras_require={
         'tests': [
             'pytest',
-            'pytest-cov'],
+            'pytest-cov'
+        ],
         'docs': [
             'sphinx',
             'sphinx_rtd_theme',

diff --git a/tests/test_aakr.py b/tests/test_aakr.py
@@ -4,7 +4,11 @@
 import pytest
 
 from sklearn.datasets import load_linnerud
-from sklearn.utils.testing import assert_allclose
+
+try:  # scikit-learn < 0.24.0
+    from sklearn.utils.testing import assert_allclose
+except ModuleNotFoundError:  # scikit-learn >= 0.24.0
+    from sklearn.utils._testing import assert_allclose
 
 from aakr import AAKR
 
@@ -19,6 +23,8 @@ def test_aakr(data):
     aakr = AAKR()
     assert aakr.metric == 'euclidean'
     assert aakr.bw == 1
+    assert not aakr.modified
+    assert aakr.penalty is None
     assert aakr.n_jobs == -1
 
     aakr.fit(X)
@@ -44,3 +50,27 @@ def test_aakr_partial_fit_input_shape_mismatch(data):
 
     with pytest.raises(ValueError, match='Shape of input is different'):
         aakr.partial_fit(X[:, :-1])
+
+
+def test_aakr_modified(data):
+    X = data[0]
+
+    # Modified, no penalty given
+    aakr = AAKR(modified=True, penalty=None)
+    X_nc = aakr.fit(X).transform(X[:3])
+    assert hasattr(aakr, 'X_')
+    assert_allclose(X_nc, X[:3], atol=1.)
+
+    # Modified, penalty given
+    aakr = AAKR(modified=True, penalty=[1] * X.shape[1])
+    X_nc = aakr.fit(X).transform(X[:3])
+    assert hasattr(aakr, 'X_')
+    assert_allclose(X_nc, X[:3], atol=1.)
+
+    # Modified, penalty given, mismatch with input data
+    with pytest.raises(ValueError, match='Shape of input is different from'):
+        AAKR(modified=True, penalty=[1] * (X.shape[1] - 1)).fit(X)
+
+    # No modified, penalty given
+    with pytest.raises(ValueError, match='Parameter `penalty` given, but'):
+        AAKR(modified=False, penalty=[1] * X.shape[1]).fit(X)