<a href="https://colab.research.google.com/github/khayk5ay/Anomaly_Detection/blob/main/Anomaly_Detection_Algorithm_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The essence of an anomaly detection algorithm is to detect unusual observations, i.e. those whose probability of occurrence is very low.

In [None]:
# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import random

In [None]:
# Build a random two-feature dataset for analysing the algorithm
def generate_dataset():
  """
  creates a random training set and a labelled validation set

  returns:
  X_train (numpy ndarray) : 50 * 2 array of observations considered normal
  X_val (numpy ndarray) : 30 * 2 array with 20 normal observations followed by 10 anomalous ones
  y_val (numpy ndarray) : 30 labels for X_val, 0 for normal and 1 for anomalous
  """
  # Training set: first feature drawn between 25 and 30, second between 3 and 5
  X_train = np.array(
    [[random.uniform(25,30), random.uniform(3,5)] for _ in range(50)]
  )

  # Each validation group: (count, first feature range, second feature range, label).
  # The first group uses the same ranges as the training set; the other two
  # are drawn from wider ranges and labelled anomalous.
  val_groups = (
    (20, (25, 30), (3, 5), 0),
    (5, (19, 27), (2, 7), 1),
    (5, (28, 35), (2, 7), 1),
  )

  samples = []
  labels = []
  for count, first_range, second_range, label in val_groups:
    for _ in range(count):
      samples.append([random.uniform(*first_range), random.uniform(*second_range)])
      labels.append(label)

  return X_train, np.array(samples), np.array(labels)

In [None]:
# Generate the training set together with the labelled validation set
X_train, X_val, y_val = generate_dataset()

# Show the training array as the cell output
X_train

In [None]:
# Visualise the data: training points as circles, validation points as red crosses
# (first feature on the x-axis, second feature on the y-axis)
plt.scatter(X_train[:,0], X_train[:, 1], marker='o')
plt.scatter(X_val[:,0], X_val[:, 1], color='r', marker='x')
# Use fixed axis limits rather than matplotlib's automatic ones
plt.ylim(0,8)
plt.xlim(10,40)

In [None]:
# Look at the distribution of each of the features in X_train

# Two stacked panels, one histogram per feature
fig, ax = plt.subplots(2,1, figsize=(10,10))
 
# Top panel: first feature (5 bins); bottom panel: second feature (3 bins)
ax[0].hist(X_train[:,0], bins=5) 
ax[1].hist(X_train[:,1], bins=3)
plt.show()

Even though the data does not necessarily follow a normal (Gaussian) distribution, the algorithm can still do well at detecting anomalies.

In [None]:
# Get the values of Mean and Variance for the data set
def get_gaussian_distribution(X):
  """
  estimates the per-feature Gaussian parameters of the data

  parameters:
  X (array-like) : m * n dimensioned unlabeled data (m observations, n features)

  returns:
  mu (numpy ndarray) : length-n array holding the mean of each feature
  var (numpy ndarray) : length-n array holding the variance of each feature,
                        computed by dividing by m (not m - 1)

  raises:
  ValueError : if X is not two-dimensional or holds no observations
  """
  # Accept lists as well as arrays
  X = np.asarray(X)
  if X.ndim != 2:
    raise ValueError("X must be a 2-D array of shape (m, n)")
  if X.shape[0] == 0:
    raise ValueError("X must contain at least one observation")

  # Compute the mean of each feature in the data set
  mu = X.mean(axis=0)
  # Compute the variance of each feature in the data set (default ddof=0 divides by m)
  var = X.var(axis=0)

  return mu, var

In [None]:
# Estimate the per-feature mean and variance from the training set
mu, var = get_gaussian_distribution(X_train)

In [None]:
# Report the estimated parameters
print(f"The mean is {mu}")
print(f"The variance is {var}")

$$ p(x ; \mu,\sigma ^2) = \frac{1}{\sqrt{2 \pi \sigma ^2}}\exp\left( - \frac{(x - \mu)^2}{2 \sigma ^2} \right)$$

In [None]:
# Get the probability of every observation, given the array of all observations
# and the mean and variance of the gaussian distribution of each feature

def get_probabilities(X, mu, var):
  """
  computes the probability of each observation under per-feature gaussians

  The probability of an observation is the product of the gaussian
  densities of its individual features.

  parameters:
  X (array-like) : m * n dimensioned data (m observations, n features)
  mu (array-like) : length-n array holding the mean of each feature
  var (array-like) : length-n array holding the variance of each feature;
                     a variance of zero leads to a division by zero

  returns:
  p_x_j (numpy ndarray) : length-m array holding the probability of each observation

  raises:
  ValueError : if X holds observations but is not two-dimensional
  """
  X = np.asarray(X, dtype=float)
  mu = np.asarray(mu, dtype=float)
  var = np.asarray(var, dtype=float)

  # No observations means there is nothing to score
  if len(X) == 0:
    return np.zeros(0)
  if X.ndim != 2:
    raise ValueError("X must be a 2-D array of shape (m, n)")

  # Compute the probability of every individual feature in one vectorised step;
  # mu and var broadcast across the rows of X
  denom = np.sqrt(2 * np.pi * var)
  exp_val = -((X - mu) ** 2) / (2 * var)
  p_x_i = (1 / denom) * np.exp(exp_val)

  # Overall probability of each observation is the product of its feature probabilities
  p_x_j = np.prod(p_x_i, axis=1)

  return p_x_j

In [None]:
# Probability of every training observation under the fitted mean and variance
get_probabilities(X_train, mu, var)

In [None]:
# Determine the threshold probability below which an observation will be considered abnormal
# The threshold is chosen as the one giving the best F1 score
def select_threshold(y_val, p_val):
  """
  selects the probability threshold that gives the best F1 score

  An observation is predicted anomalous when its probability is strictly
  below the threshold. 1000 evenly spaced thresholds between the smallest
  and the largest probability are tried.

  parameters:
  y_val (array-like) : length-m array of labels, 1 for anomalous and 0 for normal
  p_val (array-like) : length-m array holding the probability of each observation

  returns:
  best_F1 : the highest F1 score found (0 if no threshold yields a true positive)
  best_epsilon : the threshold that achieved best_F1 (0 if none did)
  """
  y_val = np.asarray(y_val)
  p_val = np.asarray(p_val)

  best_F1 = 0
  best_epsilon = 0
  step_value = (p_val.max() - p_val.min()) / 1000

  # All probabilities are identical: no threshold can separate the observations,
  # and a zero step cannot be passed to np.arange
  if step_value == 0:
    return best_F1, best_epsilon

  is_anomaly = y_val == 1
  is_normal = y_val == 0

  # Consider a wide range of threshold values
  for epsilon in np.arange(p_val.min(), p_val.max(), step_value):

    predictions = p_val < epsilon
    # Compute the True Positive (tp), False Positive (fp), False Negative (fn)
    tp = np.count_nonzero(predictions & is_anomaly)
    fp = np.count_nonzero(predictions & is_normal)
    fn = np.count_nonzero(~predictions & is_anomaly)

    # Without a true positive, precision and recall are 0 or undefined (0 / 0),
    # so this threshold cannot improve on the best score
    if tp == 0:
      continue

    precision_score = tp / (tp + fp)
    recall_score = tp / (tp + fn)

    # Compute the F1 score for this value of epsilon
    F1 = (2 * precision_score * recall_score) / (precision_score + recall_score)

    if F1 > best_F1:
      best_F1 = F1
      best_epsilon = epsilon

  return best_F1, best_epsilon


In [None]:
# Probability of every validation observation under the fitted mean and variance
get_probabilities(X_val, mu, var)

In [None]:
# Pick the threshold with the best F1 score on the labelled validation set
F1_score, epsilon = select_threshold(y_val, get_probabilities(X_val, mu, var))
print("F1 Score ", F1_score)
print("Best Epsilon ", epsilon)