In [1]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import pandas as pd
from datetime import datetime, timedelta, timezone
import time
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib.font_manager
import seaborn as sns
from sklearn.metrics import confusion_matrix

In [2]:
# Import models
from pyod.models.abod import ABOD
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging

In [3]:
# reading the dataset
df = pd.read_csv("public/data_source/rca_2020_04_24.csv")
faultsDataFrame = pd.read_csv("public/data_source/ret_info24.csv")

In [4]:
le = preprocessing.LabelEncoder()
le.fit(df['source'])
LabelEncoder()
list(le.classes_)
sourceTransformed = le.transform(df['source'])

In [5]:
X1 = df['latency'].values.reshape(-1,1)
X2 = sourceTransformed.reshape(-1,1)

X = np.concatenate((X1,X2), axis=1)

In [6]:
outliers_fraction = 0.1
# Test 4 different methods
classifiers = {
        #'FeatureBagging': FeatureBagging(contamination=outliers_fraction),
        'KNN': KNN(contamination=outliers_fraction),
        'HBOS': HBOS(contamination=outliers_fraction),
        #'IForest': IForest(contamination=outliers_fraction)
}

In [7]:
def getMetricsConfusionMatrix(faults, predicted):
    confusion_matrix = metrics.confusion_matrix(faults, predicted)
    cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = [False, True])
    cm_display.plot()
    plt.show()
    del confusion_matrix, cm_display    

In [8]:
def getConfusionMatrix(faults, predicted):
    
    truePositive = []
    trueNegative = []
    falsePositive = []
    falseNegative = []
    predicted = predicted.assign(date_time = list(map( lambda x: datetime.fromtimestamp(x/1000), predicted.timestamp)))
    #predicted = predicted.assign(date_time = datetime.fromtimestamp((predicted.timestamp)/1000))
    
    for i, act in faults.iterrows():        
        minTime = datetime.strptime(act.time_preliminary,'%Y-%m-%d %H:%M:%S+08:00')
        maxTime = minTime + timedelta(minutes=5)

        truePositive.append(predicted.loc[(predicted.date_time > minTime) & (predicted.date_time < maxTime) | (predicted.predition == 1) & (predicted.target == act.ground_truth)])
        trueNegative.append(predicted.loc[(predicted.date_time > minTime) & (predicted.date_time < maxTime) | (predicted.predition == 0) & (predicted.target == act.ground_truth)])
        falsePositive.append(predicted.loc[(predicted.date_time < minTime) | (predicted.date_time > maxTime) & (predicted.predition == 1)])
        falseNegative.append(predicted.loc[(predicted.date_time < minTime) | (predicted.date_time > maxTime) & (predicted.predition == 0)])

        del maxTime, minTime
    
    else: #remove all empty row
        truePositive = list(filter(lambda dfTP: not dfTP.empty, truePositive))        
        trueNegative = list(filter(lambda dfTN: not dfTN.empty, trueNegative))
        falsePositive = list(filter(lambda dfFP: not dfFP.empty, falsePositive))
        falseNegative = list(filter(lambda dfFN: not dfFN.empty, falseNegative))

        count = 0
        for i, *trueP in truePositive:
             count = count + len(trueP)
        print("True Positive: ", count)

        count = 0
        for i, *trueN in trueNegative:
             count = count + len(trueN)
        print("True Negative: ", count)

        count = 0
        for i, *falseP in falsePositive:
             count = count + len(falseP)
        print("False Positive: ", count)

        count = 0
        for i, *falseN in falseNegative:
             count = count + len(falseN)
        print("False Negative: ", count)

        


        # print("\nTrue Positive: ", len(truePositive))
        # print("True Negative: ", len(trueNegative))
        # print("False Positive: ", len(falsePositive))
        # print("False Negative: ", len(falseNegative))
        # print((falseNegative))

    return 

In [9]:
print("FaultsData: ", faultsDataFrame)

FaultsData:     fault_id           time_preliminary duration ground_truth
0        43  2020-05-20 19:40:00+08:00     5min       os_022
1        44  2020-05-20 19:40:00+08:00     5min       os_022
2        45  2020-05-20 19:40:00+08:00     5min   docker_001
3        46  2020-05-20 19:40:00+08:00     5min   docker_001
4        47  2020-05-20 19:40:00+08:00     5min   docker_001
5        48  2020-04-24 05:17:00+08:00     5min   docker_002


In [10]:
xx, yy = np.meshgrid(np.linspace(-10, 10, 200), np.linspace(-10, 10, 200))
plt.figure(figsize=(10, 10))

for i, (clf_name, clf) in enumerate(classifiers.items()):
    clf.fit(X)
    
    scores_pred = clf.decision_function(X) * -1

    y_pred = clf.predict(X)

    np.unique(y_pred, return_counts=True)

    n_inliers = len(y_pred) - np.count_nonzero(y_pred)
    n_outliers = np.count_nonzero(y_pred == 1)   

    outliers = np.where(y_pred == 1)    
    outliersList = df.iloc[outliers[0], :]

    print('\n\nOUTLIERS : ', n_outliers, 'INLIERS : ', n_inliers, clf_name)
    print(outliersList.head(5))

    # expected = faultsDataFrame
    # predicted = y_pred
    # results = confusion_matrix(expected, predicted)
    # print(results)
    
    predicted = df.assign(predition = y_pred)
    getConfusionMatrix(faultsDataFrame, predicted)

    #pd.DataFrame(outliersList).to_csv("public/results/"+clf_name+"_preresult.csv")

    # threshold value to consider a datapoint inlier or outlier - the probability that the prediction is true
#    threshold = stats.scoreatpercentile(scores_pred, 100 * outliers_fraction)

    # decision function calculates the raw anomaly score for every point
#    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
#    Z = Z.reshape(xx.shape)

    # plot outliers and contour
#    subplot = plt .subplot(2, 2, i + 1)
#    subplot.contour(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 15))
#    subplot.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='red')

    # fill orange contour lines where range of anomaly score is from threshold to maximum anomaly score
#    subplot.contour(xx, yy, Z, levels=[threshold, Z.max()], colors='blue')

    # scatter plot of inliers with white dots
#    subplot.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1],c='white', s=12, edgecolor='g')
    # scatter plot of outliers with black dots
#    subplot.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1],c='black', s=12, edgecolor='g')
#    subplot.axis('tight')

#    subplot.set_title(clf_name)
#    subplot.set_xlim((-15, 15))
#    subplot.set_ylim((-15, 15))

#plt.show()





OUTLIERS :  2004 INLIERS :  197996 KNN
                 trace_id     timestamp  latency  succ      source      target
55   c8c8c171a5313e154968  1.590000e+12      232  True      os_021  docker_004
61   321fa171a5313e154064  1.590000e+12      179  True  docker_001  docker_007
113  321fa171a5313e154064  1.590000e+12     1230  True      os_022  docker_001
119  e151f171a5313e166714  1.590000e+12      182  True  docker_001  docker_007
138  e151f171a5313e166714  1.590000e+12      183  True  docker_001  docker_007
True Positive:  42
True Negative:  42
False Positive:  7
False Negative:  7


OUTLIERS :  11 INLIERS :  199989 HBOS
                   trace_id     timestamp  latency  succ      source  \
2379   9fa77171a53144456718  1.590000e+12     2638  True        None   
36245  2f62c171a531b7e64532  1.590000e+12     2806  True  docker_001   
36468  65de4171a531b8f27324  1.590000e+12     3003  True  docker_001   
36950  3c255171a531ba2c7330  1.590000e+12     2904  True  docker_001   
37747  db

<Figure size 720x720 with 0 Axes>