Commit
1 parent ed24eef · commit 269344a · Showing 4 changed files with 227 additions and 1 deletion.
@@ -0,0 +1,126 @@
# -*- coding: utf-8 -*-
# %reset -f
"""
@author: Hiromasa Kaneko
"""

import sys

import numpy as np
from scipy.spatial.distance import cdist
from sklearn.neighbors import NearestNeighbors, LocalOutlierFactor
from sklearn.svm import OneClassSVM


class ApplicabilityDomain():
    def __init__(self, method_name='ocsvm', rate_of_outliers=0.01, gamma='auto', nu=0.5, n_neighbors=10,
                 metric='minkowski', p=2):
        """
        Applicability Domain (AD)

        Parameters
        ----------
        method_name : str, default 'ocsvm'
            The name of the method used to set the AD: 'knn', 'lof', or 'ocsvm'
        rate_of_outliers : float, default 0.01
            Rate of outlier samples. This is used to set the threshold
        gamma : (only for 'ocsvm') float, default 'auto'
            Kernel coefficient for 'rbf'. The default 'auto' optimizes gamma to maximize the variance of the Gram matrix
        nu : (only for 'ocsvm') float, default 0.5
            An upper bound on the fraction of training errors and a lower bound on the fraction of support vectors. Should be in the interval (0, 1].
            https://scikit-learn.org/stable/modules/generated/sklearn.svm.OneClassSVM.html#sklearn.svm.OneClassSVM
        n_neighbors : (only for 'knn' and 'lof') int, default 10
            Number of neighbors to use for each query
            https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
        metric : str or callable, default 'minkowski'
            Metric to use for distance computation. Any metric from scikit-learn or scipy.spatial.distance can be used.
            https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
        p : int, default 2
            Parameter for the Minkowski metric from sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
            https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
        """

        if method_name not in ('knn', 'lof', 'ocsvm'):
            sys.exit('There is no AD method named \'{0}\'. Please check the value of method_name.'.format(method_name))

        self.method_name = method_name
        self.rate_of_outliers = rate_of_outliers
        self.gamma = gamma
        self.nu = nu
        self.n_neighbors = n_neighbors
        self.metric = metric
        self.p = p

    def fit(self, x):
        """
        Applicability Domain (AD)

        Set the AD

        Parameters
        ----------
        x : numpy.array or pandas.DataFrame
            m x n matrix of X-variables of training data,
            m is the number of training samples and
            n is the number of X-variables
        """

        x = np.array(x)

        if self.method_name == 'ocsvm':
            if self.gamma == 'auto':
                # optimize gamma by maximizing the variance of the Gram matrix;
                # the RBF kernel uses the squared Euclidean distance
                ocsvm_gammas = 2 ** np.arange(-20, 11, dtype=float)
                variance_of_gram_matrix = []
                for ocsvm_gamma in ocsvm_gammas:
                    gram_matrix = np.exp(-ocsvm_gamma * cdist(x, x, metric='sqeuclidean'))
                    variance_of_gram_matrix.append(gram_matrix.var(ddof=1))
                self.optimal_gamma = ocsvm_gammas[variance_of_gram_matrix.index(max(variance_of_gram_matrix))]
            else:
                self.optimal_gamma = self.gamma
            self.ad = OneClassSVM(kernel='rbf', gamma=self.optimal_gamma, nu=self.nu)
            self.ad.fit(x)
            ad_values = np.ndarray.flatten(self.ad.decision_function(x))
        elif self.method_name == 'knn':
            self.ad = NearestNeighbors(n_neighbors=self.n_neighbors, metric=self.metric, p=self.p)
            self.ad.fit(x)
            # kneighbors(None) returns the neighbors of each training sample, excluding the sample itself
            knn_dist_all, knn_ind_all = self.ad.kneighbors(None)
            ad_values = 1 / (knn_dist_all.mean(axis=1) + 1)
        elif self.method_name == 'lof':
            self.ad = LocalOutlierFactor(n_neighbors=self.n_neighbors, metric=self.metric, p=self.p,
                                         novelty=True, contamination=self.rate_of_outliers)
            self.ad.fit(x)
            ad_values = self.ad.negative_outlier_factor_ - self.ad.offset_

        # threshold chosen so that rate_of_outliers of the training samples fall outside the AD
        self.offset = np.percentile(ad_values, 100 * self.rate_of_outliers)

    def predict(self, x):
        """
        Applicability Domain (AD)

        Predict AD-values

        Parameters
        ----------
        x : numpy.array or pandas.DataFrame
            k x n matrix of X-variables of test data, which is autoscaled with training data,
            and k is the number of test samples

        Returns
        -------
        ad_values : numpy.array, shape (n_samples,)
            values lower than 0 mean outside of the AD
        """

        x = np.array(x)

        if self.method_name == 'ocsvm':
            ad_values = np.ndarray.flatten(self.ad.decision_function(x))
        elif self.method_name == 'knn':
            knn_dist_all, knn_ind_all = self.ad.kneighbors(x)
            ad_values = 1 / (knn_dist_all.mean(axis=1) + 1)
        elif self.method_name == 'lof':
            ad_values = np.ndarray.flatten(self.ad.decision_function(x))

        return ad_values - self.offset
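A minimal usage sketch of the class above (not part of this commit; it assumes the class is importable as dcekit.validation.ApplicabilityDomain, as in the demo script below, and uses random data purely for illustration):

    import numpy as np
    from dcekit.validation import ApplicabilityDomain

    rng = np.random.RandomState(0)
    x_train = rng.randn(100, 5)      # 100 training samples, 5 X-variables (already autoscaled)
    x_test = rng.randn(20, 5) * 3    # wider spread, so some samples should fall outside the AD

    ad = ApplicabilityDomain(method_name='knn', rate_of_outliers=0.05, n_neighbors=10)
    ad.fit(x_train)                  # sets the threshold so ~5% of training samples are outside
    ad_values = ad.predict(x_test)   # values lower than 0 mean outside the AD
    print('inside AD:', np.sum(ad_values >= 0), 'of', len(ad_values))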
@@ -0,0 +1,99 @@
# -*- coding: utf-8 -*-
# %reset -f
"""
@author: Hiromasa Kaneko
"""

import matplotlib.figure as figure
import matplotlib.pyplot as plt
import numpy as np
from dcekit.validation import ApplicabilityDomain
from sklearn.datasets import load_boston
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, RBF, ConstantKernel
from sklearn.model_selection import train_test_split

method_name = 'ocsvm'  # 'ocsvm', 'knn', or 'lof'
rate_of_outliers = 0.05

number_of_test_samples = 400

# load dataset
boston = load_boston()
y = boston.target
x = boston.data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=number_of_test_samples, random_state=0)

# alternative: sort by y so that the test set is extrapolative
#index = np.argsort(boston.target)
#y = boston.target[index]
#x = boston.data[index, :]
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=number_of_test_samples, shuffle=False)

# autoscaling
autoscaled_x_train = (x_train - x_train.mean(axis=0)) / x_train.std(axis=0, ddof=1)
autoscaled_y_train = (y_train - y_train.mean()) / y_train.std(ddof=1)
autoscaled_x_test = (x_test - x_train.mean(axis=0)) / x_train.std(axis=0, ddof=1)

# Gaussian process regression
model = GaussianProcessRegressor(ConstantKernel() * RBF() + WhiteKernel(), alpha=0)
model.fit(autoscaled_x_train, autoscaled_y_train)

# AD
ad = ApplicabilityDomain(method_name=method_name, rate_of_outliers=rate_of_outliers)
ad.fit(autoscaled_x_train)

# calculate y in training data
calculated_y_train = model.predict(autoscaled_x_train) * y_train.std(ddof=1) + y_train.mean()
# yy-plot
plt.rcParams['font.size'] = 18  # font size of axis labels and tick labels
plt.figure(figsize=figure.figaspect(1))
plt.scatter(y_train, calculated_y_train, c='blue')
y_max = np.max(np.array([np.array(y_train), calculated_y_train]))
y_min = np.min(np.array([np.array(y_train), calculated_y_train]))
plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
         [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')
plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))
plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))
plt.xlabel('Actual Y')
plt.ylabel('Calculated Y')
plt.show()
# r2, RMSE, MAE
print('r2: {0}'.format(float(1 - sum((y_train - calculated_y_train) ** 2) / sum((y_train - y_train.mean()) ** 2))))
print('RMSE: {0}'.format(float((sum((y_train - calculated_y_train) ** 2) / len(y_train)) ** 0.5)))
print('MAE: {0}'.format(float(sum(abs(y_train - calculated_y_train)) / len(y_train))))

# prediction
predicted_y_test, predicted_y_test_std = model.predict(autoscaled_x_test, return_std=True)
predicted_y_test = predicted_y_test * y_train.std(ddof=1) + y_train.mean()
predicted_y_test_std = predicted_y_test_std * y_train.std(ddof=1)
predicted_ad = ad.predict(autoscaled_x_test)

# yy-plot
plt.figure(figsize=figure.figaspect(1))
plt.scatter(y_test, predicted_y_test, c='blue')
y_max = np.max(np.array([np.array(y_test), predicted_y_test]))
y_min = np.min(np.array([np.array(y_test), predicted_y_test]))
plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
         [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')
plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))
plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))
plt.xlabel('Actual Y')
plt.ylabel('Predicted Y')
plt.show()
# r2p, RMSEp, MAEp
print('r2p: {0}'.format(float(1 - sum((y_test - predicted_y_test) ** 2) / sum((y_test - y_test.mean()) ** 2))))
print('RMSEp: {0}'.format(float((sum((y_test - predicted_y_test) ** 2) / len(y_test)) ** 0.5)))
print('MAEp: {0}'.format(float(sum(abs(y_test - predicted_y_test)) / len(y_test))))
print('')

# error of y vs. std. of predicted y
#plt.scatter(predicted_y_test_std, abs(y_test - predicted_y_test), c='blue')
#plt.xlabel('Std. of estimated Y')
#plt.ylabel('Error of Y')
#plt.show()

# error of y vs. AD index
plt.scatter(predicted_ad, abs(y_test - predicted_y_test), c='blue')
plt.xlabel('AD index (lower than 0 means outside of AD)')
plt.ylabel('Error of Y')
plt.show()
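As a quick numerical companion to the last plot, one could compare the prediction error inside and outside the AD (a sketch reusing the variables above, not part of the commit; if the AD works as intended, the average error outside should be larger):

    inside_ad = predicted_ad >= 0  # boolean mask: True means inside the AD
    # note: either group may be empty for a given split, in which case mean() returns nan
    print('MAE inside AD: {0}'.format(float(abs(y_test - predicted_y_test)[inside_ad].mean())))
    print('MAE outside AD: {0}'.format(float(abs(y_test - predicted_y_test)[~inside_ad].mean())))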