In [1]:
from sklearn import preprocessing
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Import models
from pyod.models.abod import ABOD
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest

## Loading and Reviewing the Data

In [None]:
# Load the two input tables:
#   df              - the per-request records (rca_dataset_new.csv)
#   faultsDataFrame - the fault records used later as ground truth (ret_info.csv)
df, faultsDataFrame = (
    pd.read_csv(csv_path)
    for csv_path in (
        "public/data_source/rca_dataset_new.csv",
        "public/data_source/ret_info.csv",
    )
)

##### Convert "succ" from boolean to integer (True=1, False=0) and "timestamp" from epoch milliseconds to datetime

In [None]:
# Encode the boolean success flag as an integer: True -> 1, False -> 0.
succ_map = {True : 1, False : 0}
df['succ'] = df['succ'].map(succ_map)

# `timestamp` holds epoch milliseconds. The fault times it is compared against
# in getConfusionMatrix are parsed as naive wall-clock values carrying a literal
# "+08:00" offset, so convert to naive UTC+8 wall-clock time explicitly rather
# than relying on the local timezone of whichever machine runs the notebook.
# The dtype guard makes the cell safe to re-run: an already converted column is
# left untouched instead of being shifted a second time.
if not pd.api.types.is_datetime64_any_dtype(df['timestamp']):
    df['timestamp'] = (
        pd.to_datetime(df['timestamp'].astype('int64'), unit='ms')
        + pd.Timedelta(hours=8)
    )

##### Convert "target" to integer labels and concatenate the features that will be used

In [None]:
# Map every distinct `target` value to an integer code; `le` stays fitted so
# the codes can be translated back to the original labels later.
le = preprocessing.LabelEncoder()
targetTransformed = le.fit_transform(df['target'])

# Turn each feature into a single-column matrix (n_samples x 1).
X1 = df['latency'].values[:, np.newaxis]
X2 = targetTransformed[:, np.newaxis]

# Feature matrix fed to the detectors: column 0 = latency, column 1 = encoded target.
X = np.hstack((X1, X2))

#### Algorithms

In [None]:
# Value handed to every detector as its `contamination` argument.
outliers_fraction = 0.2

# Detectors to evaluate, keyed by the name printed in the results.
# Only KNN is enabled at the moment; the entries below are disabled alternatives.
classifiers = {}
classifiers['KNN'] = KNN(contamination=outliers_fraction)
#classifiers['FeatureBagging'] = FeatureBagging(contamination=outliers_fraction)
#classifiers['HBOS'] = HBOS(contamination=outliers_fraction)
#classifiers['IForest'] = IForest(contamination=outliers_fraction)

In [None]:
def getConfusionMatrix(faults, predicted):
    """Print and return confusion-matrix counts of predictions against known faults.

    Each fault opens a 5-minute window that starts at its ``time_preliminary``.
    For every fault, rows of ``predicted`` are bucketed as follows:

    * True Positive  - strictly inside the window and ``is_outlier == 1``
    * False Negative - strictly inside the window and ``is_outlier == 0``
    * False Positive - strictly outside the window, ``is_outlier == 1`` and
      ``target`` equal to the fault's ``ground_truth``
    * True Negative  - strictly outside the window, ``is_outlier == 0`` and
      ``target`` equal to the fault's ``ground_truth``

    Rows whose timestamp is exactly on a window boundary match none of the
    buckets. The per-fault selections of each bucket are concatenated and
    de-duplicated with ``DataFrame.drop_duplicates`` before being counted, so a
    row picked up by several faults is counted once per bucket.

    Parameters
    ----------
    faults : pandas.DataFrame
        Must provide ``time_preliminary`` (string formatted as
        ``'%Y-%m-%d %H:%M:%S+08:00'``) and ``ground_truth``.
    predicted : pandas.DataFrame
        Must provide ``timestamp`` (datetime), ``target`` and ``is_outlier``.

    Returns
    -------
    dict
        Counts keyed by ``"True Positive"``, ``"True Negative"``,
        ``"False Positive"`` and ``"False Negative"``. The same numbers are
        also printed. All counts are 0 when ``faults`` is empty.
    """
    buckets = {
        "True Positive": [],
        "True Negative": [],
        "False Positive": [],
        "False Negative": [],
    }

    # These masks do not depend on the fault, so build them once.
    flagged = predicted.is_outlier == 1
    cleared = predicted.is_outlier == 0

    for _, act in faults.iterrows():
        # The "+08:00" suffix is matched literally, so the result is a naive
        # wall-clock time that is compared directly with predicted.timestamp.
        minTime = np.datetime64(datetime.strptime(act.time_preliminary, '%Y-%m-%d %H:%M:%S+08:00'))
        maxTime = minTime + np.timedelta64(5, 'm')

        inWindow = (predicted.timestamp > minTime) & (predicted.timestamp < maxTime)
        outWindow = (predicted.timestamp < minTime) | (predicted.timestamp > maxTime)
        onTarget = predicted.target == act.ground_truth

        buckets["True Positive"].append(predicted.loc[inWindow & flagged])
        buckets["True Negative"].append(predicted.loc[outWindow & cleared & onTarget])
        buckets["False Positive"].append(predicted.loc[outWindow & flagged & onTarget])
        buckets["False Negative"].append(predicted.loc[inWindow & cleared])

    counts = {}
    for label, frames in buckets.items():
        # Skip empty selections: pd.concat raises on an empty list, which is
        # what happens when there are no faults at all.
        frames = [frame for frame in frames if not frame.empty]
        if frames:
            # reset_index must NOT use inplace=True here: with inplace=True it
            # returns None and the count below would fail.
            merged = pd.concat(frames).drop_duplicates().reset_index(drop=True)
            counts[label] = len(merged)
        else:
            counts[label] = 0
        print(label + ": ", counts[label])

    return counts

In [None]:
# Fit every configured detector on the feature matrix and compare its outlier
# flags with the known faults.
for clf_name, clf in classifiers.items():

    clf.fit(X)

    # X is the very data the detector was just fitted on, so read the labels
    # and scores that fit() stored for the training samples instead of calling
    # predict(X) / decision_function(X). Those calls would score the whole
    # dataset two more times, and for neighbour-based detectors such as KNN
    # re-scoring training points can let each point match itself.
    y_pred = clf.labels_

    # Negated training scores; not used below, kept for the optional plot.
    scores_pred = clf.decision_scores_ * -1

    n_outliers = np.count_nonzero(y_pred)
    n_inliers = len(y_pred) - n_outliers

    print('\n\nOUTLIERS: ', n_outliers, 'INLIERS: ', n_inliers, clf_name)

    # Copy of df with the detector's 0/1 flag attached as `is_outlier`.
    predictedDF = df.assign(is_outlier=y_pred)

    # Optional visual check (disabled):
    #outliersDF = predictedDF.loc[lambda x: x.is_outlier == 1, :]
    #inliersDF = predictedDF.loc[lambda x: x.is_outlier == 0, :]

    #plt.scatter(inliersDF.timestamp, inliersDF.latency, color="green", label="Inliers", alpha= 0.3)
    #plt.scatter(outliersDF.timestamp, outliersDF.latency, color="red", label="Outliers", alpha= 0.3)
    #plt.xlabel("Time")
    #plt.ylabel("Latency")
    #plt.legend()
    #plt.show()

    getConfusionMatrix(faultsDataFrame, predictedDF)

