In [None]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib.font_manager
import seaborn as sns


In [None]:
# Import models
from pyod.models.abod import ABOD
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging

In [None]:
# Load the raw records from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="rca.csv")

In [None]:
# Quick look at the raw data: latency on the x-axis against source on the y-axis.
df.plot.scatter(x='latency', y='source')

In [None]:
# Encode the `source` column as integer class codes so it can be used as a
# numeric feature. The fitted encoder is kept in `le` so the codes can be
# mapped back to the original labels later.
le = preprocessing.LabelEncoder()
sourceTransformed = le.fit_transform(df['source'])

# Show the learned classes; the position of each label in this list is its code.
list(le.classes_)

In [None]:
# Shape each feature as a single column so the two can be placed side by side.
X1 = np.reshape(df['latency'].values, (-1, 1))
X2 = np.reshape(sourceTransformed, (-1, 1))

# Print both column vectors, one after the other.
print(X1, X2, sep='\n')

# Feature matrix: column 0 is latency, column 1 is the encoded source.
X = np.hstack((X1, X2))

In [None]:
# le = preprocessing.LabelEncoder()
# test = pd.DataFrame(df.source)
# sourceTransformed = le.fit_transform(test.values)


# X1 = df.latency.values.reshape(-1,1)
# X2 = sourceTransformed.reshape(-1,1)

# print(X1)
# print(X2)

# X = np.concatenate((X1,X2), axis=1)

In [None]:
# Expected share of anomalous rows; every detector below receives it as its
# `contamination` setting.
outliers_fraction = 0.2
# Fixed seed for the randomized detectors (FeatureBagging, IForest) so that a
# fresh "Restart & Run All" reproduces the same scores and labels.
random_state = 42
# Four PyOD detectors to compare, keyed by display name.
classifiers = {
    'FeatureBagging': FeatureBagging(contamination=outliers_fraction,
                                     random_state=random_state),
    'KNN': KNN(contamination=outliers_fraction),
    'HBOS': HBOS(contamination=outliers_fraction),
    'IForest': IForest(contamination=outliers_fraction,
                       random_state=random_state),
}

In [None]:
# Fit each detector on X, report its inlier/outlier counts, and draw its
# decision surface together with the labelled points, one panel per detector.

# Build the evaluation grid from the observed feature ranges (plus a margin) so
# the data always falls inside the plotted region, whatever its scale.
feature_min, feature_max = X.min(axis=0), X.max(axis=0)
margin = np.maximum(0.1 * (feature_max - feature_min), 1.0)
(x_lo, y_lo), (x_hi, y_hi) = feature_min - margin, feature_max + margin
xx, yy = np.meshgrid(np.linspace(x_lo, x_hi, 200), np.linspace(y_lo, y_hi, 200))
grid_points = np.c_[xx.ravel(), yy.ravel()]

# Two panels per row; squeeze=False keeps `axes` 2-D even for a single row.
n_rows = max(1, (len(classifiers) + 1) // 2)
fig, axes = plt.subplots(n_rows, 2, figsize=(10, 5 * n_rows), squeeze=False)
panels = axes.ravel()

for i, (clf_name, clf) in enumerate(classifiers.items()):
    ax = panels[i]
    clf.fit(X)

    # Raw anomaly scores, negated: PyOD gives outliers the larger scores, so
    # after negation the most anomalous points have the lowest values.
    scores_pred = clf.decision_function(X) * -1

    # Binary labels from the detector: 0 = inlier, 1 = outlier.
    y_pred = clf.predict(X)
    is_outlier = y_pred == 1
    n_outliers = np.count_nonzero(is_outlier)
    n_inliers = len(y_pred) - n_outliers

    print('OUTLIERS : ',n_outliers,'INLIERS : ',n_inliers, clf_name)

    # Score separating inliers from outliers: the lowest `outliers_fraction`
    # share of the negated scores lies below this percentile.
    threshold = stats.scoreatpercentile(scores_pred, 100 * outliers_fraction)

    # Negated anomaly score at every grid point, shaped like the grid.
    Z = (clf.decision_function(grid_points) * -1).reshape(xx.shape)

    # Contour levels must be strictly increasing, so each banded contour is
    # drawn only when the grid scores actually span that band.
    if Z.min() < threshold:
        # Score contours on the outlier side (grid minimum up to the threshold).
        ax.contour(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 15))
    # Decision boundary at the threshold, in red.
    ax.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='red')
    if threshold < Z.max():
        # Blue contour lines at the threshold and at the grid's maximum score.
        ax.contour(xx, yy, Z, levels=[threshold, Z.max()], colors='blue')

    # Inliers as white dots, outliers as black dots, selected by predicted label.
    ax.scatter(X[~is_outlier, 0], X[~is_outlier, 1], c='white', s=12, edgecolor='g')
    ax.scatter(X[is_outlier, 0], X[is_outlier, 1], c='black', s=12, edgecolor='g')

    ax.set_title(clf_name)
    ax.set_xlabel('latency')
    ax.set_ylabel('source (label-encoded)')
    ax.set_xlim((x_lo, x_hi))
    ax.set_ylim((y_lo, y_hi))

# Hide any panel left without a detector (odd number of classifiers).
for unused_ax in panels[len(classifiers):]:
    unused_ax.set_visible(False)
plt.show()

