In [1]:
import numpy as np
import scipy.stats as st

In [48]:
# m = number of random projection vectors
# d = data dimension
# N = number of data points
m = 3
d = 10
N = 100
eps = 0.1

In [3]:
# Sample m multivariate normal vectors and scale by the norm to get
# m vectors uniformly from the d-dimensional unit sphere
mean = np.zeros(d)
cov = np.eye(d)

sampled = st.multivariate_normal.rvs(mean, cov, m)
normalized_samples = np.array([row / np.linalg.norm(row) for row in sampled])

In [4]:
sampled.shape

(3, 10)

In [27]:
# Just generate some training data
data = st.multivariate_normal.rvs(np.zeros(d), np.eye(d), size = N)

In [28]:
data.shape

(100, 10)

In [29]:
# Compute all dot products between data vectors x_j and random projection vectors w_i
# <w_i, x_j> = dot_products[i][j]
dot_products = normalized_samples @ data.T

In [30]:
dot_products.shape

(3, 100)

In [31]:
# Find minimum and maximum dot products for each w_i
# These are the intervals which are considered to be non-anomalous
min_values = np.array([np.min(row) for row in dot_products])
max_values = np.array([np.max(row) for row in dot_products])

inlier_intervals = np.array([ [np.min(row), np.max(row)] for row in dot_products ] )

In [33]:
# min-max normalization
scaled_dot_products = np.divide(dot_products - min_values.repeat(N).reshape(m,N), (max_values - min_values).repeat(N).reshape(m,N))

In [35]:
scaled_dot_products.shape

(3, 100)

In [38]:
sorted_scaled_dot_products = np.sort(scaled_dot_products, axis=1)

In [74]:
two_d_interval_indices = np.argwhere(np.diff(sorted_scaled_dot_products, axis=1) >= eps)
outlier_break_points = [[ index[1] for index in two_d_interval_indices  if index[0] == i ] for i in range(m)]

In [75]:
outlier_intervals = [[ [sorted_scaled_dot_products[i,index], sorted_scaled_dot_products[i,index+1]] for index in outlier_break_points[i] ] for i in range(m)]

In [76]:
outlier_intervals

[[[0.0042703461587965865, 0.17099141692006106]],
 [[0.8013201539486069, 1.0]],
 [[0.0001230418983577699, 0.14251461597498036]]]

In [136]:
# Generate testing data from distribution different from training data.
N_new = 10
new_data = st.multivariate_normal.rvs(np.ones(d) * 2, np.eye(d), size = N_new)

In [137]:
# Project testing data onto the random vectors w_i
# and check if inside inlier interval
projection_new_data = normalized_samples @ new_data.T
scaled_new_data = np.divide(projection_new_data.reshape(m,N_new) - min_values.repeat(N_new).reshape(m,N_new), (max_values - min_values).repeat(N_new).reshape(m,N_new))
# inliers_new_data = np.array([projection_new_data[i] >= inlier_intervals[i][0] and projection_new_data[i] <= inlier_intervals[i][1] for i in range(m)])

In [138]:
outlier_intervals

[[[0.0042703461587965865, 0.17099141692006106]],
 [[0.8013201539486069, 1.0]],
 [[0.0001230418983577699, 0.14251461597498036]]]

In [139]:
scaled_new_data

array([[ 1.25135726,  1.08655127,  0.8087682 ,  0.8317424 ,  0.94954866,
         1.01614168,  1.07829789,  1.24427788,  1.33538295,  0.83206588],
       [ 0.30539805,  0.12073241,  0.07732909, -0.08947978,  0.36831865,
         0.29777877,  0.13189884, -0.00194933,  0.09569101,  0.50769019],
       [ 1.0805636 ,  0.61482895,  0.82426173,  0.9576015 ,  0.44735815,
         0.78898119,  1.18924816,  0.97623143,  0.72110488,  0.6805248 ]])

In [140]:
outliers_new_data = np.array(
  [[  scaled_new_data[i][j] > 1 or 
      scaled_new_data[i][j] < 0 or 
      any([ scaled_new_data[i][j] > outlier[0] and scaled_new_data[i][j] < outlier[1] for outlier in outlier_intervals[i] ]) 
  for i in range(m)] for j in range(N_new)]
)

In [141]:
new_data_is_outlier = [ any(data_point) for data_point in outliers_new_data ]

In [142]:
new_data_is_outlier

[True, True, False, True, False, True, True, True, True, False]

In [14]:
inliers_new_data.all()

False

In [15]:
i = 2
j = 30
projection_i_j = dot_products[i][j] * normalized_samples[i]

In [21]:
np.linalg.norm(projection_i_j) - np.abs(dot_products[i][j]) < 1e-10

True