Commit

add AD
hkaneko1985 committed Sep 16, 2019
1 parent ed24eef commit 269344a
Showing 4 changed files with 227 additions and 1 deletion.
1 change: 1 addition & 0 deletions dcekit/validation/__init__.py
@@ -12,3 +12,4 @@
from .validation import y_randomization_with_hyperparam_opt
from .validation import double_cross_validation
from .validation import mae_cce
from .applicability_domain import ApplicabilityDomain
126 changes: 126 additions & 0 deletions dcekit/validation/applicability_domain.py
@@ -0,0 +1,126 @@
# -*- coding: utf-8 -*-
# %reset -f
"""
@author: Hiromasa Kaneko
"""

import sys

import numpy as np
from sklearn.svm import OneClassSVM
from sklearn.neighbors import NearestNeighbors, LocalOutlierFactor
from scipy.spatial.distance import cdist


class ApplicabilityDomain():
    def __init__(self, method_name='ocsvm', rate_of_outliers=0.01, gamma='auto', nu=0.5, n_neighbors=10,
                 metric='minkowski', p=2):
        """
        Applicability Domain (AD)

        Parameters
        ----------
        method_name : str, default 'ocsvm'
            The name of the method used to set the AD: 'knn', 'lof', or 'ocsvm'
        rate_of_outliers : float, default 0.01
            Rate of outlier samples, used to set the threshold
        gamma : (only for 'ocsvm') float, default 'auto'
            Kernel coefficient for 'rbf'. The default 'auto' optimizes gamma by
            maximizing the variance of the Gram matrix
        nu : (only for 'ocsvm') float, default 0.5
            An upper bound on the fraction of training errors and a lower bound on
            the fraction of support vectors. Should be in the interval (0, 1].
            https://scikit-learn.org/stable/modules/generated/sklearn.svm.OneClassSVM.html#sklearn.svm.OneClassSVM
        n_neighbors : (only for 'knn' and 'lof') int, default 10
            Number of neighbors to use for each query
            https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
        metric : string or callable, default 'minkowski'
            Metric to use for distance computation. Any metric from scikit-learn
            or scipy.spatial.distance can be used.
            https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
        p : integer, default 2
            Parameter for the Minkowski metric. When p = 1, this is equivalent to
            the Manhattan distance (l1), and to the Euclidean distance (l2) when
            p = 2. For arbitrary p, the Minkowski distance (l_p) is used.
            https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
        """

        if method_name not in ('knn', 'lof', 'ocsvm'):
            sys.exit('There is no AD method named \'{0}\'. Please check the variable method_name.'.format(method_name))

        self.method_name = method_name
        self.rate_of_outliers = rate_of_outliers
        self.gamma = gamma
        self.nu = nu
        self.n_neighbors = n_neighbors
        self.metric = metric
        self.p = p

    def fit(self, x):
        """
        Applicability Domain (AD)

        Set the AD

        Parameters
        ----------
        x : numpy.array or pandas.DataFrame
            m x n matrix of X-variables of training data,
            m is the number of training samples and
            n is the number of X-variables
        """

        x = np.array(x)

        if self.method_name == 'ocsvm':
            if self.gamma == 'auto':
                # select the gamma that maximizes the variance of the Gram matrix
                # (computed with standardized Euclidean distances) over 2**-20 ... 2**10
                ocsvm_gammas = 2 ** np.arange(-20, 11, dtype=float)
                variance_of_gram_matrix = []
                for ocsvm_gamma in ocsvm_gammas:
                    gram_matrix = np.exp(-ocsvm_gamma * cdist(x, x, metric='seuclidean'))
                    variance_of_gram_matrix.append(gram_matrix.var(ddof=1))
                self.optimal_gamma = ocsvm_gammas[variance_of_gram_matrix.index(max(variance_of_gram_matrix))]
            else:
                self.optimal_gamma = self.gamma
            self.ad = OneClassSVM(kernel='rbf', gamma=self.optimal_gamma, nu=self.nu)
            self.ad.fit(x)
            ad_values = np.ndarray.flatten(self.ad.decision_function(x))

        elif self.method_name == 'knn':
            self.ad = NearestNeighbors(n_neighbors=self.n_neighbors, metric=self.metric, p=self.p)
            self.ad.fit(x)
            # mean distance to the n_neighbors nearest training samples, excluding each sample itself
            knn_dist_all, knn_ind_all = self.ad.kneighbors(None)
            ad_values = 1 / (knn_dist_all.mean(axis=1) + 1)
        elif self.method_name == 'lof':
            self.ad = LocalOutlierFactor(n_neighbors=self.n_neighbors, metric=self.metric, p=self.p,
                                         novelty=True, contamination=self.rate_of_outliers)
            self.ad.fit(x)
            ad_values = self.ad.negative_outlier_factor_ - self.ad.offset_

        # threshold: the rate_of_outliers quantile of the AD values of the training data
        self.offset = np.percentile(ad_values, 100 * self.rate_of_outliers)


    def predict(self, x):
        """
        Applicability Domain (AD)

        Predict AD values

        Parameters
        ----------
        x : numpy.array or pandas.DataFrame
            k x n matrix of X-variables of test data, autoscaled with the mean and
            standard deviation of the training data; k is the number of test samples

        Returns
        -------
        ad_values : numpy.array, shape (n_samples,)
            values lower than 0 mean the sample is outside the AD
        """

        x = np.array(x)

        if self.method_name == 'ocsvm':
            ad_values = np.ndarray.flatten(self.ad.decision_function(x))
        elif self.method_name == 'knn':
            knn_dist_all, knn_ind_all = self.ad.kneighbors(x)
            ad_values = 1 / (knn_dist_all.mean(axis=1) + 1)
        elif self.method_name == 'lof':
            ad_values = np.ndarray.flatten(self.ad.decision_function(x))

        return ad_values - self.offset
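
A minimal usage sketch of the new class (not part of the commit; the random data and the variable names x_train/x_new are hypothetical, and it assumes dcekit with this commit is installed):

import numpy as np
from dcekit.validation import ApplicabilityDomain

# hypothetical autoscaled data: 100 training samples and 5 query samples with 3 features
rng = np.random.RandomState(0)
x_train = rng.randn(100, 3)
x_new = rng.randn(5, 3)

ad = ApplicabilityDomain(method_name='knn', rate_of_outliers=0.01, n_neighbors=10)
ad.fit(x_train)
ad_values = ad.predict(x_new)
inside_ad = ad_values >= 0  # True where a query sample falls inside the AD
print(inside_ad)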
99 changes: 99 additions & 0 deletions demo_ad.py
@@ -0,0 +1,99 @@
# -*- coding: utf-8 -*-
# %reset -f
"""
@author: Hiromasa Kaneko
"""

import matplotlib.figure as figure
import matplotlib.pyplot as plt
import numpy as np
from dcekit.validation import ApplicabilityDomain
from sklearn.datasets import load_boston
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, RBF, ConstantKernel
from sklearn.model_selection import train_test_split

method_name = 'ocsvm' # 'ocsvm' or 'knn' or 'lof'
rate_of_outliers = 0.05

number_of_test_samples = 400

# load dataset
boston = load_boston()
y = boston.target
x = boston.data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=number_of_test_samples, random_state=0)

# alternative: sort the samples by y and split without shuffling (extrapolation setting)
#index = np.argsort(boston.target)
#y = boston.target[index]
#x = boston.data[index, :]
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=number_of_test_samples, shuffle=False)

# autoscaling
autoscaled_x_train = (x_train - x_train.mean(axis=0)) / x_train.std(axis=0, ddof=1)
autoscaled_y_train = (y_train - y_train.mean()) / y_train.std(ddof=1)
autoscaled_x_test = (x_test - x_train.mean(axis=0)) / x_train.std(axis=0, ddof=1)

# Gaussian process regression
model = GaussianProcessRegressor(ConstantKernel() * RBF() + WhiteKernel(), alpha=0)
model.fit(autoscaled_x_train, autoscaled_y_train)

# AD
ad = ApplicabilityDomain(method_name=method_name, rate_of_outliers=rate_of_outliers)
ad.fit(autoscaled_x_train)

# calculate y in training data
calculated_y_train = model.predict(autoscaled_x_train) * y_train.std(ddof=1) + y_train.mean()
# yy-plot
plt.rcParams['font.size'] = 18  # font size of axis labels and tick labels
plt.figure(figsize=figure.figaspect(1))
plt.scatter(y_train, calculated_y_train, c='blue')
y_max = np.max(np.array([np.array(y_train), calculated_y_train]))
y_min = np.min(np.array([np.array(y_train), calculated_y_train]))
plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
         [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')
plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))
plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))
plt.xlabel('Actual Y')
plt.ylabel('Calculated Y')
plt.show()
# r2, RMSE, MAE
print('r2: {0}'.format(float(1 - sum((y_train - calculated_y_train) ** 2) / sum((y_train - y_train.mean()) ** 2))))
print('RMSE: {0}'.format(float((sum((y_train - calculated_y_train) ** 2) / len(y_train)) ** 0.5)))
print('MAE: {0}'.format(float(sum(abs(y_train - calculated_y_train)) / len(y_train))))

# prediction
predicted_y_test, predicted_y_test_std = model.predict(autoscaled_x_test, return_std=True)
predicted_y_test = predicted_y_test * y_train.std(ddof=1) + y_train.mean()
predicted_y_test_std = predicted_y_test_std * y_train.std(ddof=1)
predicted_ad = ad.predict(autoscaled_x_test)

# yy-plot
plt.figure(figsize=figure.figaspect(1))
plt.scatter(y_test, predicted_y_test, c='blue')
y_max = np.max(np.array([np.array(y_test), predicted_y_test]))
y_min = np.min(np.array([np.array(y_test), predicted_y_test]))
plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
         [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')
plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))
plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))
plt.xlabel('Actual Y')
plt.ylabel('Predicted Y')
plt.show()
# r2p, RMSEp, MAEp
print('r2p: {0}'.format(float(1 - sum((y_test - predicted_y_test) ** 2) / sum((y_test - y_test.mean()) ** 2))))
print('RMSEp: {0}'.format(float((sum((y_test - predicted_y_test) ** 2) / len(y_test)) ** 0.5)))
print('MAEp: {0}'.format(float(sum(abs(y_test - predicted_y_test)) / len(y_test))))
print('')

#plt.scatter(predicted_y_test_std, abs(y_test - predicted_y_test), c='blue')
#plt.xlabel('Std. of estimated Y')
#plt.ylabel('Error of Y')
#plt.show()

plt.scatter(predicted_ad, abs(y_test - predicted_y_test), c='blue')
plt.xlabel('AD index (values below 0 mean outside the AD)')
plt.ylabel('Error of Y')
plt.show()
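
One natural follow-up to the demo (a sketch reusing the variables defined above, not part of the commit) is to restrict the error statistics to test samples inside the AD:

inside_ad = predicted_ad >= 0
print('{0} of {1} test samples are inside the AD'.format(inside_ad.sum(), len(predicted_ad)))
print('MAEp (inside AD): {0}'.format(float(sum(abs(y_test[inside_ad] - predicted_y_test[inside_ad])) / inside_ad.sum())))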
2 changes: 1 addition & 1 deletion setup.py
@@ -13,7 +13,7 @@

setup(
name='dcekit',
version='2.2.1',
version='2.3.0',
description='Data Chemical Engineering toolkit',
long_description=readme,
author='Hiromasa Kaneko',
