Commit

add AD
hkaneko1985 committed Sep 16, 2019
1 parent ed24eef commit 269344a
Showing 4 changed files with 227 additions and 1 deletion.
1 change: 1 addition & 0 deletions dcekit/validation/__init__.py
@@ -12,3 +12,4 @@
from .validation import y_randomization_with_hyperparam_opt
from .validation import double_cross_validation
from .validation import mae_cce
from .applicability_domain import ApplicabilityDomain
126 changes: 126 additions & 0 deletions dcekit/validation/applicability_domain.py
@@ -0,0 +1,126 @@
# -*- coding: utf-8 -*-
# %reset -f
"""
@author: Hiromasa Kaneko
"""

import sys

import numpy as np
from sklearn.svm import OneClassSVM
from sklearn.neighbors import NearestNeighbors, LocalOutlierFactor
from scipy.spatial.distance import cdist


class ApplicabilityDomain():
    def __init__(self, method_name='ocsvm', rate_of_outliers=0.01, gamma='auto', nu=0.5, n_neighbors=10,
                 metric='minkowski', p=2):
        """
        Applicability Domain (AD)

        Parameters
        ----------
        method_name : str, default 'ocsvm'
            The name of the method used to set the AD: 'knn', 'lof', or 'ocsvm'
        rate_of_outliers : float, default 0.01
            Rate of outlier samples, used to set the threshold
        gamma : (only for 'ocsvm') float, default 'auto'
            Kernel coefficient for 'rbf'. The default 'auto' optimizes gamma by
            maximizing the variance of the Gram matrix
        nu : (only for 'ocsvm') float, default 0.5
            An upper bound on the fraction of training errors and a lower bound on
            the fraction of support vectors. Should be in the interval (0, 1].
            https://scikit-learn.org/stable/modules/generated/sklearn.svm.OneClassSVM.html#sklearn.svm.OneClassSVM
        n_neighbors : (only for 'knn' and 'lof') int, default 10
            Number of neighbors to use for each query
            https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
        metric : string or callable, default 'minkowski'
            Metric to use for distance computation. Any metric from scikit-learn
            or scipy.spatial.distance can be used.
            https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
        p : integer, default 2
            Parameter for the Minkowski metric. When p = 1, this is equivalent to
            the Manhattan distance (l1), and to the Euclidean distance (l2) when
            p = 2. For arbitrary p, the Minkowski distance (l_p) is used.
            https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
        """

        if method_name not in ('knn', 'lof', 'ocsvm'):
            sys.exit('There is no AD method named \'{0}\'. Please check the variable method_name.'.format(method_name))

        self.method_name = method_name
        self.rate_of_outliers = rate_of_outliers
        self.gamma = gamma
        self.nu = nu
        self.n_neighbors = n_neighbors
        self.metric = metric
        self.p = p

    def fit(self, x):
        """
        Applicability Domain (AD)

        Set the AD

        Parameters
        ----------
        x : numpy.array or pandas.DataFrame
            m x n matrix of X-variables of training data,
            m is the number of training samples and
            n is the number of X-variables
        """

        x = np.array(x)

        if self.method_name == 'ocsvm':
            if self.gamma == 'auto':
                # select the gamma that maximizes the variance of the Gram matrix
                # (computed with standardized Euclidean distances) over 2**-20 ... 2**10
                ocsvm_gammas = 2 ** np.arange(-20, 11, dtype=float)
                variance_of_gram_matrix = []
                for ocsvm_gamma in ocsvm_gammas:
                    gram_matrix = np.exp(-ocsvm_gamma * cdist(x, x, metric='seuclidean'))
                    variance_of_gram_matrix.append(gram_matrix.var(ddof=1))
                self.optimal_gamma = ocsvm_gammas[variance_of_gram_matrix.index(max(variance_of_gram_matrix))]
            else:
                self.optimal_gamma = self.gamma
            self.ad = OneClassSVM(kernel='rbf', gamma=self.optimal_gamma, nu=self.nu)
            self.ad.fit(x)
            ad_values = np.ndarray.flatten(self.ad.decision_function(x))

        elif self.method_name == 'knn':
            self.ad = NearestNeighbors(n_neighbors=self.n_neighbors, metric=self.metric, p=self.p)
            self.ad.fit(x)
            # mean distance to the n_neighbors nearest training samples, excluding each sample itself
            knn_dist_all, knn_ind_all = self.ad.kneighbors(None)
            ad_values = 1 / (knn_dist_all.mean(axis=1) + 1)
        elif self.method_name == 'lof':
            self.ad = LocalOutlierFactor(n_neighbors=self.n_neighbors, metric=self.metric, p=self.p,
                                         novelty=True, contamination=self.rate_of_outliers)
            self.ad.fit(x)
            ad_values = self.ad.negative_outlier_factor_ - self.ad.offset_

        # threshold: the rate_of_outliers quantile of the AD values of the training data
        self.offset = np.percentile(ad_values, 100 * self.rate_of_outliers)


    def predict(self, x):
        """
        Applicability Domain (AD)

        Predict AD values

        Parameters
        ----------
        x : numpy.array or pandas.DataFrame
            k x n matrix of X-variables of test data, autoscaled with the mean and
            standard deviation of the training data; k is the number of test samples

        Returns
        -------
        ad_values : numpy.array, shape (n_samples,)
            values lower than 0 mean the sample is outside the AD
        """

        x = np.array(x)

        if self.method_name == 'ocsvm':
            ad_values = np.ndarray.flatten(self.ad.decision_function(x))
        elif self.method_name == 'knn':
            knn_dist_all, knn_ind_all = self.ad.kneighbors(x)
            ad_values = 1 / (knn_dist_all.mean(axis=1) + 1)
        elif self.method_name == 'lof':
            ad_values = np.ndarray.flatten(self.ad.decision_function(x))

        return ad_values - self.offset
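
A minimal usage sketch of the new class (not part of the commit; the random data and the variable names x_train/x_new are hypothetical, and it assumes dcekit with this commit is installed):

import numpy as np
from dcekit.validation import ApplicabilityDomain

# hypothetical autoscaled data: 100 training samples and 5 query samples with 3 features
rng = np.random.RandomState(0)
x_train = rng.randn(100, 3)
x_new = rng.randn(5, 3)

ad = ApplicabilityDomain(method_name='knn', rate_of_outliers=0.01, n_neighbors=10)
ad.fit(x_train)
ad_values = ad.predict(x_new)
inside_ad = ad_values >= 0  # True where a query sample falls inside the AD
print(inside_ad)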
99 changes: 99 additions & 0 deletions demo_ad.py
@@ -0,0 +1,99 @@
# -*- coding: utf-8 -*-
# %reset -f
"""
@author: Hiromasa Kaneko
"""

import matplotlib.figure as figure
import matplotlib.pyplot as plt
import numpy as np
from dcekit.validation import ApplicabilityDomain
from sklearn.datasets import load_boston
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, RBF, ConstantKernel
from sklearn.model_selection import train_test_split

method_name = 'ocsvm' # 'ocsvm' or 'knn' or 'lof'
rate_of_outliers = 0.05

number_of_test_samples = 400

# load dataset
boston = load_boston()
y = boston.target
x = boston.data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=number_of_test_samples, random_state=0)

# alternative: sort the samples by y and split without shuffling (extrapolation setting)
#index = np.argsort(boston.target)
#y = boston.target[index]
#x = boston.data[index, :]
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=number_of_test_samples, shuffle=False)

# autoscaling
autoscaled_x_train = (x_train - x_train.mean(axis=0)) / x_train.std(axis=0, ddof=1)
autoscaled_y_train = (y_train - y_train.mean()) / y_train.std(ddof=1)
autoscaled_x_test = (x_test - x_train.mean(axis=0)) / x_train.std(axis=0, ddof=1)

# Gaussian process regression
model = GaussianProcessRegressor(ConstantKernel() * RBF() + WhiteKernel(), alpha=0)
model.fit(autoscaled_x_train, autoscaled_y_train)

# AD
ad = ApplicabilityDomain(method_name=method_name, rate_of_outliers=rate_of_outliers)
ad.fit(autoscaled_x_train)

# calculate y in training data
calculated_y_train = model.predict(autoscaled_x_train) * y_train.std(ddof=1) + y_train.mean()
# yy-plot
plt.rcParams['font.size'] = 18  # font size of axis labels and tick labels
plt.figure(figsize=figure.figaspect(1))
plt.scatter(y_train, calculated_y_train, c='blue')
y_max = np.max(np.array([np.array(y_train), calculated_y_train]))
y_min = np.min(np.array([np.array(y_train), calculated_y_train]))
plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
         [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')
plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))
plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))
plt.xlabel('Actual Y')
plt.ylabel('Calculated Y')
plt.show()
# r2, RMSE, MAE
print('r2: {0}'.format(float(1 - sum((y_train - calculated_y_train) ** 2) / sum((y_train - y_train.mean()) ** 2))))
print('RMSE: {0}'.format(float((sum((y_train - calculated_y_train) ** 2) / len(y_train)) ** 0.5)))
print('MAE: {0}'.format(float(sum(abs(y_train - calculated_y_train)) / len(y_train))))

# prediction
predicted_y_test, predicted_y_test_std = model.predict(autoscaled_x_test, return_std=True)
predicted_y_test = predicted_y_test * y_train.std(ddof=1) + y_train.mean()
predicted_y_test_std = predicted_y_test_std * y_train.std(ddof=1)
predicted_ad = ad.predict(autoscaled_x_test)

# yy-plot
plt.figure(figsize=figure.figaspect(1))
plt.scatter(y_test, predicted_y_test, c='blue')
y_max = np.max(np.array([np.array(y_test), predicted_y_test]))
y_min = np.min(np.array([np.array(y_test), predicted_y_test]))
plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
         [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')
plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))
plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))
plt.xlabel('Actual Y')
plt.ylabel('Predicted Y')
plt.show()
# r2p, RMSEp, MAEp
print('r2p: {0}'.format(float(1 - sum((y_test - predicted_y_test) ** 2) / sum((y_test - y_test.mean()) ** 2))))
print('RMSEp: {0}'.format(float((sum((y_test - predicted_y_test) ** 2) / len(y_test)) ** 0.5)))
print('MAEp: {0}'.format(float(sum(abs(y_test - predicted_y_test)) / len(y_test))))
print('')

#plt.scatter(predicted_y_test_std, abs(y_test - predicted_y_test), c='blue')
#plt.xlabel('Std. of estimated Y')
#plt.ylabel('Error of Y')
#plt.show()

plt.scatter(predicted_ad, abs(y_test - predicted_y_test), c='blue')
plt.xlabel('AD index (values below 0 mean outside the AD)')
plt.ylabel('Error of Y')
plt.show()
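
One natural follow-up to the demo (a sketch reusing the variables defined above, not part of the commit) is to restrict the error statistics to test samples inside the AD:

inside_ad = predicted_ad >= 0
print('{0} of {1} test samples are inside the AD'.format(inside_ad.sum(), len(predicted_ad)))
print('MAEp (inside AD): {0}'.format(float(sum(abs(y_test[inside_ad] - predicted_y_test[inside_ad])) / inside_ad.sum())))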
2 changes: 1 addition & 1 deletion setup.py
@@ -13,7 +13,7 @@

setup(
name='dcekit',
version='2.2.1',
version='2.3.0',
description='Data Chemical Engineering toolkit',
long_description=readme,
author='Hiromasa Kaneko',
