In [2]:
import numpy as np
from numpy.linalg import inv
from scipy.stats import multivariate_normal
import matplotlib.pyplot as plt

In [15]:
def get_theta(mu, cov, rho, n_samples=5000000, random_state=None):
    '''
    Returns explicit threshold theta for a given percentage rho of anomalies in 
    data distributed as a Gaussian with mean mu and covariance matrix cov. 

    The threshold is estimated by Monte Carlo: n_samples points are drawn from
    the Gaussian, their Mahalanobis distances to mu are computed, and theta is
    the (100 - rho)-th percentile of those distances.
    
    Parameters
        mu            mean of Gaussian distribution
        cov           covariance matrix of Gaussian distribution
        rho           percentage of anomalies, which must be between 0 and 100 inclusive
        n_samples     number of random points used for the estimate
                      (default 5000000, the value that used to be hard-coded)
        random_state  optional seed or random generator forwarded to
                      multivariate_normal.rvs; pass an int for a reproducible
                      threshold, None (default) draws without a fixed seed

    Returns
        theta  Mahalanobis distance exceeded by about rho percent of the samples

    Raises
        ValueError  if rho is not within [0, 100]
    '''
    # fail fast, before the expensive sampling step
    if not 0 <= rho <= 100:
        raise ValueError('rho must be between 0 and 100 inclusive, got %r' % (rho,))
    # generate random variables (data)
    X = multivariate_normal.rvs(mean=mu, cov=cov, size=n_samples,
                                random_state=random_state)
    # rvs squeezes singleton axes; restore the (n_samples, dim) layout
    X = np.reshape(X, (n_samples, -1))
    # center data (for x_i - mu)
    Z = X - mu
    # calculate the mahalanobis distance
    # d_M(x_i, mu)^2 = (x_i - mu)^T Sigma^-1 (x_i - mu)
    d = np.sqrt(np.sum(Z.dot(inv(np.atleast_2d(cov))) * Z, axis=1))
    # theta = distance below which (100 - rho) percent of the samples fall
    return np.percentile(d, 100 - rho)

# get_theta([0, 0], [[1, 0], [0, 1]], 5)

In [4]:
# styling and figure size
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases dropped the old names, so use whichever name this
# installation provides and fall back to the default style otherwise.
_style_candidates = ('seaborn-v0_8-dark', 'seaborn-dark')
plt.style.use(next((s for s in _style_candidates if s in plt.style.available), 'default'))
plt.rcParams['figure.figsize'] = 16, 10

In [16]:
# utility functions for plotting etc

def gen_data(mu, cov, n=1000):
    '''
    Draw random samples from a Gaussian distribution.

    Parameters
        mu    mean of the Gaussian distribution
        cov   covariance matrix of the Gaussian distribution
        n     number of points to draw (default 1000)

    Returns
        the samples produced by multivariate_normal.rvs
    '''
    samples = multivariate_normal.rvs(mean=mu, cov=cov, size=n)
    return samples

def plt_points(data):
    '''
    Draw the first two columns of data against each other as point markers.

    Parameters
        data  array indexed as data[:, 0] (horizontal axis, 'x1') and
              data[:, 1] (vertical axis, 'x2')
    '''
    x1, x2 = data[:, 0], data[:, 1]
    # lime-coloured circles with a thin black outline
    marker_style = dict(c='lime', markeredgewidth=0.5, markeredgecolor='black')
    plt.plot(x1, x2, 'o', **marker_style)

    plt.xlabel('x1')
    plt.ylabel('x2')
    # same scale on both axes
    plt.axis('equal')

    plt.show()
    
def plt_pdf(mu, cov):
    '''
    plot the density function of a bivariate gaussian distribution as a 3-D surface
    
    Parameters
        mu    mean of the distribution (two components; list or array)
        cov   2x2 covariance matrix (nested list or array)
    '''
    # accept plain lists as well as arrays
    mu = np.asarray(mu, dtype=float)
    cov = np.asarray(cov, dtype=float)
    distr = multivariate_normal(cov=cov, mean=mu)

    # Mesh grid covering the 3-sigma box around the mean.
    # The diagonal of cov holds variances, so the standard deviations
    # are their square roots.
    sigma_1, sigma_2 = np.sqrt(np.diag(cov))
    x = np.linspace(mu[0] - 3*sigma_1, mu[0] + 3*sigma_1, num=100)
    y = np.linspace(mu[1] - 3*sigma_2, mu[1] + 3*sigma_2, num=100)
    X, Y = np.meshgrid(x, y)

    # Density at every grid point in a single call: pdf treats the last
    # axis of its argument as the components of each point.
    pdf = distr.pdf(np.dstack((X, Y)))

    # Plotting the density function values
    ax = plt.figure().add_subplot(111, projection='3d')
    ax.plot_surface(X, Y, pdf, cmap='viridis')
    ax.set_xlabel("x1")
    ax.set_ylabel("x2")
    # hide the density tick values on the z axis
    ax.set_zticks([])

    plt.show()

# assignment
1. Sample a data set $D$ of size $n$ from $\mathcal{N}(x; \mu, \Sigma)$. Fix a percentage $\rho$ of anomalies.
2. Use the function get_theta(mu, cov, rho) provided by the notebook to
obtain an explicit threshold θ given the percentage ρ. Note that θ is part
of the ground-truth and therefore considered as unknown.
3. Determine the true anomalies of D. For this, use the explicit threshold θ
together with the Mahalanobis distance $d^*_M$ defined by the true μ and Σ.
4. Use the data D to estimate μ and Σ. Construct the Mahalanobis distance
$d_M$ defined by the estimates $\hat{\mu}$ and $\hat{\Sigma}$.
5. Predict the anomalies of D using the Mahalanobis distance $d_M$ and the
Euclidean distance $d_E$. Anomalies are the ρ percent of points $x_i \in D$ farthest
from $\hat{\mu}$ (do not use θ). Assess precision and recall of both detectors.

In [17]:
# Ground-truth parameters of the bivariate Gaussian distribution.
# Naming convention: suffix '_T' marks a ground-truth variable,
#                    suffix '_E' marks an estimated variable.
mu_T = np.zeros(2, dtype=int)  # mean at the origin (0, 0)
cov = np.eye(2, dtype=int)  # covariance matrix Sigma (identity)
rho = 5  # preset percentage of outliers
size = 5000  # number of data points

In [18]:
# 1. generate dataset: `size` points drawn from the ground-truth Gaussian
# (the ground-truth mean is defined as mu_T in the configuration cell)
D = multivariate_normal.rvs(mean=mu_T, cov=cov, size=size)

In [19]:
# 2. use get_theta to get the 'groundtruth' explicit threshold
# (the ground-truth mean is defined as mu_T in the configuration cell)
theta = get_theta(mu_T, cov, rho)

In [21]:
# 3. determine subset of true anomalies of dataset D
# start by calculating the mahalanobis distance of each point from the
# ground-truth mean mu_T, using the ground-truth covariance cov
Z = D - mu_T
d_star_M = np.sqrt(np.sum(Z.dot(inv(cov)) * Z, axis=1))
# boolean mask: True where the distance exceeds the groundtruth threshold theta
T = d_star_M > theta  # True marks a true anomaly
# fraction of points flagged as true anomalies (expected to be close to rho / 100)
np.count_nonzero(T) / T.size

0.0466

In [11]:
# 4. Use the data D to estimate mu and Sigma. Construct the Mahalanobis distance
# d_M defined by the estimates mu_hat and Sigma_hat.
# NOTE(review): this cell only displays D; the estimation itself is not performed here.

D

array([[-0.3317541 , -0.65547892],
       [-0.24176314,  0.74385974],
       [ 0.84266411, -0.29247938],
       ...,
       [ 0.79165354,  0.04219234],
       [ 0.90316687, -2.18269991],
       [-0.54494302, -0.67902587]])