In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Anomaly Detection

In this notebook, we will try to find the observations in a data set that deviate from **normal** behaviour.

A model must be built that gives the probability of an observation $x$. If that probability is very low, the observation is considered anomalous.
Hyperparameter: the minimum probability $\epsilon$ (the cut-off that decides whether an observation is anomalous or not).

$$
\begin{cases}
p(x) < \epsilon: & \text{Anomaly} \\
p(x) \geqslant \epsilon: & \text{Ok}
\end{cases}
$$

## Importing the data

In [116]:
# Load the reference observations; they are passed as `x_train` to fit the model further down.
X = pd.read_csv("heights.csv")

In [117]:
# Preview the first rows of the reference data.
X.head()

Unnamed: 0,Estatura,Edad
0,1.77,26.5
1,1.74,26.5
2,1.72,24.0
3,1.78,26.5
4,1.65,32.0


## Defining the function

In [199]:
def anomaly_detection(x_train, x_test, threshold, exp_name):
    """Flag rows of ``x_test`` that are unlikely under a per-feature Gaussian fit of ``x_train``.

    For every column of ``x_train`` the mean and the (pandas default) variance
    are estimated. Each test value is converted to a z-score and scored with
    the standard-normal density of that z-score. The per-feature scores are
    multiplied together, and a row is flagged when the product is less than
    or equal to ``threshold``.

    Note that the score is the density of the *standardized* value, so with
    ``k`` features the product can never exceed ``(2 * pi) ** (-k / 2)``
    (about 0.159 for two features). Any threshold at or above that bound
    flags every row.

    Side effect: on success one row describing the run is appended to the
    module-level ``log`` DataFrame, which must already exist and contain a
    ``'Name'`` column.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Reference observations used to estimate the mean and spread of each feature.
    x_test : pandas.DataFrame
        Observations to score. It must contain every column of ``x_train``;
        columns are matched by name, so their order does not matter.
    threshold : float
        Cut-off on the product of per-feature scores (inclusive).
    exp_name : str
        Unique experiment name used as the key in ``log``.

    Returns
    -------
    tuple
        ``(flags, log_info)`` where ``flags`` is a NumPy array of 0/1 values
        (1 = anomaly), one per row of ``x_test``, and ``log_info`` is the dict
        that was appended to ``log``. If ``exp_name`` is already present in
        ``log`` a message is printed, nothing is logged and ``(0, 0)`` is returned.
    """
    global log

    used_names = log['Name'].unique()
    if exp_name in used_names:
        print('The experiment name is already in use. Please change to a name that is not in the following list: '
              + str(used_names))
        return(0,0)

    # Per-feature parameters, looked up by column label below.
    mu = x_train.mean()
    sigma = np.sqrt(x_train.var())

    df = x_test.copy()

    # Remember exactly which score columns were created here, so unrelated
    # input columns that happen to end in 'prob' never enter the product.
    prob_cols = []
    for col in x_train.columns:
        z_col = str(col) + '_z'
        prob_col = str(col) + '_prob'
        # Select the test column by name rather than position so a different
        # column order in x_test cannot pair a feature with the wrong parameters.
        df[z_col] = (x_test[col] - mu[col]) / sigma[col]
        df[prob_col] = stats.norm.pdf(df[z_col], 0, 1)
        prob_cols.append(prob_col)

    df['Anomaly'] = np.where(df[prob_cols].prod(axis=1) <= threshold, 1, 0)

    log_info = {
        'Name': exp_name,
        'Description': 'Anomaly Detection Model',
        'Features': np.array(x_train.columns),
        'Configuration': 'Threshold:' + str(threshold),
        # Row count per Anomaly value, taken from the first column of the scored frame.
        'Results': df.groupby('Anomaly').count().iloc[:,0],
    }

    # DataFrame.append was removed in pandas 2.0; build a one-row frame instead.
    entry = pd.DataFrame([log_info])
    if len(log) == 0:
        # Nothing to concatenate yet: adopt the new row but keep the declared column order.
        all_cols = list(log.columns) + [c for c in entry.columns if c not in log.columns]
        log = entry.reindex(columns=all_cols)
    else:
        log = pd.concat([log, entry], ignore_index=True)

    return(np.array(df['Anomaly']), log_info)

In [214]:
# Experiment registry that anomaly_detection reads and extends through `global log`.
# It must exist before the function is first called; re-running this cell clears all
# recorded experiments (and frees their names for reuse).
log = pd.DataFrame(
    columns=[
        'Name',
        'Description',
        'Features',
        'Configuration',
        'Results',
    ]
)

Now we import a data set of anomalous observations to check the model.

In [215]:
# Load the observations to be scored; they are passed as `x_test` below.
df_test = pd.read_csv('anomalies.csv')

In [216]:
# Preview the first rows of the data to be scored.
df_test.head()

Unnamed: 0,Estatura,Edad
0,0.25,2.4
1,175.1,2.5
2,0.15,250.0
3,150.0,14.0


In [217]:
# First run: fit on X, score df_test with a 0.05 cut-off, and record it as 'first_test'.
# The returned (flags, log_info) tuple is displayed as the cell output.
anomaly_detection(x_train=X, x_test=df_test, threshold=0.05, exp_name='first_test')

(array([1, 1, 1, 1]),
 {'Name': 'first_test',
  'Description': 'Anomaly Detection Model',
  'Features': array(['Estatura', 'Edad'], dtype=object),
  'Configuration': 'Threshold:0.05',
  'Results': Anomaly
  1    4
  Name: Estatura, dtype: int64})

In [218]:
# Show the experiment log after the first run.
log

Unnamed: 0,Name,Description,Features,Configuration,Results
0,first_test,Anomaly Detection Model,"[Estatura, Edad]",Threshold:0.05,"Anomaly 1 4 Name: Estatura, dtype: int64"


### Finding the best threshold

In [220]:
# Sweep the threshold over 101 evenly spaced values in [0, 1], logging one experiment per value.
# Experiment names must be unique in `log`: re-running this cell without re-creating `log`
# makes every call print the "already in use" message and return (0, 0).
th = np.linspace(0.0, 1.0, num=101)
for i, eps in enumerate(th):
    print('Experiment ', str(i), ':')
    ad = anomaly_detection(x_train=X, x_test=df_test, threshold=eps, exp_name='exp' + str(i))
    print(ad[1])

Experiment  0 :
The experiment name is already in use. Please change to a name that is not in the following list: ['first_test' 'exp0']
0
Experiment  1 :
{'Name': 'exp1', 'Description': 'Anomaly Detection Model', 'Features': array(['Estatura', 'Edad'], dtype=object), 'Configuration': 'Threshold:0.01', 'Results': Anomaly
1    4
Name: Estatura, dtype: int64}
Experiment  2 :
{'Name': 'exp2', 'Description': 'Anomaly Detection Model', 'Features': array(['Estatura', 'Edad'], dtype=object), 'Configuration': 'Threshold:0.02', 'Results': Anomaly
1    4
Name: Estatura, dtype: int64}
Experiment  3 :
{'Name': 'exp3', 'Description': 'Anomaly Detection Model', 'Features': array(['Estatura', 'Edad'], dtype=object), 'Configuration': 'Threshold:0.03', 'Results': Anomaly
1    4
Name: Estatura, dtype: int64}
Experiment  4 :
{'Name': 'exp4', 'Description': 'Anomaly Detection Model', 'Features': array(['Estatura', 'Edad'], dtype=object), 'Configuration': 'Threshold:0.04', 'Results': Anomaly
1    4
Name: Es

{'Name': 'exp40', 'Description': 'Anomaly Detection Model', 'Features': array(['Estatura', 'Edad'], dtype=object), 'Configuration': 'Threshold:0.4', 'Results': Anomaly
1    4
Name: Estatura, dtype: int64}
Experiment  41 :
{'Name': 'exp41', 'Description': 'Anomaly Detection Model', 'Features': array(['Estatura', 'Edad'], dtype=object), 'Configuration': 'Threshold:0.41000000000000003', 'Results': Anomaly
1    4
Name: Estatura, dtype: int64}
Experiment  42 :
{'Name': 'exp42', 'Description': 'Anomaly Detection Model', 'Features': array(['Estatura', 'Edad'], dtype=object), 'Configuration': 'Threshold:0.42', 'Results': Anomaly
1    4
Name: Estatura, dtype: int64}
Experiment  43 :
{'Name': 'exp43', 'Description': 'Anomaly Detection Model', 'Features': array(['Estatura', 'Edad'], dtype=object), 'Configuration': 'Threshold:0.43', 'Results': Anomaly
1    4
Name: Estatura, dtype: int64}
Experiment  44 :
{'Name': 'exp44', 'Description': 'Anomaly Detection Model', 'Features': array(['Estatura', 'Ed

{'Name': 'exp79', 'Description': 'Anomaly Detection Model', 'Features': array(['Estatura', 'Edad'], dtype=object), 'Configuration': 'Threshold:0.79', 'Results': Anomaly
1    4
Name: Estatura, dtype: int64}
Experiment  80 :
{'Name': 'exp80', 'Description': 'Anomaly Detection Model', 'Features': array(['Estatura', 'Edad'], dtype=object), 'Configuration': 'Threshold:0.8', 'Results': Anomaly
1    4
Name: Estatura, dtype: int64}
Experiment  81 :
{'Name': 'exp81', 'Description': 'Anomaly Detection Model', 'Features': array(['Estatura', 'Edad'], dtype=object), 'Configuration': 'Threshold:0.81', 'Results': Anomaly
1    4
Name: Estatura, dtype: int64}
Experiment  82 :
{'Name': 'exp82', 'Description': 'Anomaly Detection Model', 'Features': array(['Estatura', 'Edad'], dtype=object), 'Configuration': 'Threshold:0.8200000000000001', 'Results': Anomaly
1    4
Name: Estatura, dtype: int64}
Experiment  83 :
{'Name': 'exp83', 'Description': 'Anomaly Detection Model', 'Features': array(['Estatura', 'Eda

In [221]:
# Show the experiment log after the threshold sweep.
log

Unnamed: 0,Name,Description,Features,Configuration,Results
0,first_test,Anomaly Detection Model,"[Estatura, Edad]",Threshold:0.05,"Anomaly 1 4 Name: Estatura, dtype: int64"
1,exp0,Anomaly Detection Model,"[Estatura, Edad]",Threshold:0.0,"Anomaly 0 1 1 3 Name: Estatura, dtype: i..."
2,exp1,Anomaly Detection Model,"[Estatura, Edad]",Threshold:0.01,"Anomaly 1 4 Name: Estatura, dtype: int64"
3,exp2,Anomaly Detection Model,"[Estatura, Edad]",Threshold:0.02,"Anomaly 1 4 Name: Estatura, dtype: int64"
4,exp3,Anomaly Detection Model,"[Estatura, Edad]",Threshold:0.03,"Anomaly 1 4 Name: Estatura, dtype: int64"
5,exp4,Anomaly Detection Model,"[Estatura, Edad]",Threshold:0.04,"Anomaly 1 4 Name: Estatura, dtype: int64"
6,exp5,Anomaly Detection Model,"[Estatura, Edad]",Threshold:0.05,"Anomaly 1 4 Name: Estatura, dtype: int64"
7,exp6,Anomaly Detection Model,"[Estatura, Edad]",Threshold:0.06,"Anomaly 1 4 Name: Estatura, dtype: int64"
8,exp7,Anomaly Detection Model,"[Estatura, Edad]",Threshold:0.07,"Anomaly 1 4 Name: Estatura, dtype: int64"
9,exp8,Anomaly Detection Model,"[Estatura, Edad]",Threshold:0.08,"Anomaly 1 4 Name: Estatura, dtype: int64"


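Every threshold from 0.01 to 1.00 (in steps of 0.01) gives the same result: all four test observations are flagged as anomalies. Only the threshold of 0.0 differs, leaving one observation unflagged.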