In [15]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import pandas as pd
from datetime import datetime, timedelta, timezone
import time
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib.font_manager
import seaborn as sns
from sklearn.metrics import confusion_matrix
from pyod.utils.example import visualize

In [16]:
# Import models
from pyod.models.abod import ABOD
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest

## Loading and Reviewing the Data

In [17]:
# Load the two CSV inputs in one pass: the trace records that the rest of the
# notebook reads through `df`, and the fault records kept in `faultsDataFrame`.
df, faultsDataFrame = (
    pd.read_csv(csv_path)
    for csv_path in (
        "public/data_source/rca_dataset.csv",
        "public/data_source/ret_info.csv",
    )
)

##### Change "succ" from boolean to integer, True=1, False=0

In [18]:
# Recode the `succ` column through an explicit lookup table (True -> 1, False -> 0).
# Series.map() yields NaN for any value that is not a key of succ_map, so a
# missing or non-boolean entry in `succ` becomes NaN rather than raising.
succ_map = {True : 1, False : 0}
df['succ'] = df['succ'].map(succ_map)

In [19]:
# NOTE(review): this rebinds the name `datetime` to the *module*; the imports cell
# at the top of the notebook bound it to the *class* via
# `from datetime import datetime, timedelta, timezone`. From here on the class
# must be spelled `datetime.datetime`.
import datetime

# Replace each raw `timestamp` value with a datetime object. The division by 1000
# suggests the raw values are epoch milliseconds -- TODO confirm against the data.
# fromtimestamp() is called without a tz argument, so the result is naive local
# time and depends on the timezone of the machine running the notebook.
# NOTE(review): the cell overwrites its own input column, so it is not safe to
# run twice on the same frame.
df['timestamp'] = df['timestamp'].apply(lambda d: datetime.datetime.fromtimestamp(int(d) / 1000))


##### Converting "target" to Int

In [20]:
# Encode the categorical `target` column as integer codes so it can be used as a
# numeric feature. The fitted encoder is kept in `le` so the codes can be mapped
# back to the original labels with `le.inverse_transform(...)` / `le.classes_`.
# (The previous version also contained a stray `LabelEncoder()` expression --
# pasted cell output -- and a discarded `list(le.classes_)`; both were no-ops.)
le = preprocessing.LabelEncoder()
targetTransformed = le.fit_transform(df['target'])

##### Concatenate features that will be used

In [21]:
# Build the two-column feature matrix fed to the detectors:
#   column 0 -> latency, column 1 -> integer-encoded target.
# Each 1-D array is turned into an (n, 1) column before stacking side by side.
X1 = df['latency'].to_numpy()[:, np.newaxis]
X2 = targetTransformed[:, np.newaxis]
X = np.hstack((X1, X2))

#### Algorithms

In [22]:
# Value passed as `contamination` to every detector below (presumably the expected
# share of outliers in the data -- confirm against the PyOD documentation).
outliers_fraction = 0.1
# Detectors to run, keyed by the display name used in printed output.
# Only KNN is enabled; the other entries are commented out.
# NOTE(review): FeatureBagging is not among the models imported in the visible
# import cell (ABOD, KNN, HBOS, IForest), so enabling it also requires an import.
classifiers = {
        #'FeatureBagging': FeatureBagging(contamination=outliers_fraction),
        'KNN': KNN(contamination=outliers_fraction),
        #'HBOS': HBOS(contamination=outliers_fraction),
        #'IForest': IForest(contamination=outliers_fraction)
}

In [12]:
def getConfusionMatrix(faults, predicted, window_minutes=5):
    """Print and return confusion-matrix counts for predicted outliers vs. known faults.

    A row of ``predicted`` is treated as *actually faulty* when, for at least one
    row of ``faults``, its ``timestamp`` lies inside the inclusive window
    ``[time_preliminary, time_preliminary + window_minutes]`` and its ``target``
    equals that fault's ``ground_truth``. Every row is then classified exactly once:

    * true positive  -- faulty and ``outliers == 1``
    * false negative -- faulty and ``outliers == 0``
    * false positive -- not faulty and ``outliers == 1``
    * true negative  -- not faulty and ``outliers == 0``

    Parameters
    ----------
    faults : pandas.DataFrame
        Must provide ``time_preliminary`` (strings in the format
        ``'%Y-%m-%d %H:%M:%S+08:00'``) and ``ground_truth``.
    predicted : pandas.DataFrame
        Must provide ``timestamp`` (naive datetime values comparable with the
        parsed fault times), ``target`` and ``outliers`` (1 = outlier, 0 = inlier).
    window_minutes : int or float, optional
        Length of the window after ``time_preliminary`` during which a matching
        row counts as faulty. Defaults to 5, the value previously hard-coded.

    Returns
    -------
    dict
        Row counts under the keys ``'true_positive'``, ``'true_negative'``,
        ``'false_positive'`` and ``'false_negative'``. The same numbers are printed.

    Raises
    ------
    ValueError
        If a ``time_preliminary`` value does not match the expected format.
    KeyError
        If a required column is missing from either frame.
    """
    # Aliased local import: the notebook binds the bare name `datetime` to the
    # class in one cell and to the module in another, so avoid relying on either.
    import datetime as _dt

    window = _dt.timedelta(minutes=window_minutes)

    # Use the timestamps of the frame that was passed in. The previous version
    # read the global `df` here, which silently ignored the `predicted` argument.
    date_time = predicted['timestamp']

    # Row-level ground truth: True where the row falls into any fault window for
    # its own target. Built once so that no row is counted once per fault.
    is_faulty = pd.Series(False, index=predicted.index, dtype=bool)
    for start_text, culprit in zip(faults['time_preliminary'], faults['ground_truth']):
        # The '+08:00' suffix is matched literally, so minTime is a naive datetime.
        minTime = _dt.datetime.strptime(start_text, '%Y-%m-%d %H:%M:%S+08:00')
        maxTime = minTime + window
        in_window = (date_time >= minTime) & (date_time <= maxTime)
        is_faulty = is_faulty | (in_window & (predicted['target'] == culprit))

    is_outlier = predicted['outliers'] == 1
    is_inlier = predicted['outliers'] == 0

    # Every combination is fully parenthesised: `&` binds tighter than `|`, which
    # is what made the previous false-positive / false-negative masks wrong.
    counts = {
        'true_positive': int((is_faulty & is_outlier).sum()),
        'true_negative': int((~is_faulty & is_inlier).sum()),
        'false_positive': int((~is_faulty & is_outlier).sum()),
        'false_negative': int((is_faulty & is_inlier).sum()),
    }

    print("True Positive: ", counts['true_positive'])
    print("True Negative: ", counts['true_negative'])
    print("False Positive: ", counts['false_positive'])
    print("False Negative: ", counts['false_negative'])

    return counts

In [None]:
# Fit each configured detector on X, label every row, report the outlier/inlier
# split, and save one scatter plot per detector.
for clf_name, clf in classifiers.items():
    clf.fit(X)

    # Labels for the same rows the detector was fitted on (1 = outlier, 0 = inlier).
    y_pred = clf.predict(X)
    # Sign-flipped decision scores.
    # NOTE(review): not used anywhere in this cell -- drop this line if no later
    # cell reads `scores_pred`, since it costs a second full pass over X.
    scores_pred = clf.decision_function(X) * -1

    n_inliers = len(y_pred) - np.count_nonzero(y_pred)
    n_outliers = np.count_nonzero(y_pred == 1)

    # Positional indices of the rows flagged as outliers.
    outliers = np.where(y_pred == 1)

    print('\n\nOUTLIERS: ', n_outliers, 'INLIERS: ', n_inliers, clf_name)

    # Copy of the dataset with the detector's label attached as column `outliers`.
    predicted = df.assign(outliers=y_pred)

    #getConfusionMatrix(faultsDataFrame, predicted)
    print(predicted.head(5))

    # Visualize the result. pyod.utils.example.visualize cannot be used here: it
    # requires train/test features plus ground-truth labels as positional
    # arguments, and the previous call (only clf_name and y_test_pred) raised a
    # TypeError. Plot the two feature columns directly instead and, in keeping
    # with the original show_figure=False / save_figure=True intent, write the
    # figure to "<clf_name>.png" without displaying it.
    flagged = y_pred == 1
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.scatter(X[~flagged, 0], X[~flagged, 1], s=10, c='tab:blue', label='inliers')
    ax.scatter(X[flagged, 0], X[flagged, 1], s=10, c='tab:red', label='outliers')
    ax.set_title(clf_name)
    ax.set_xlabel('latency')
    ax.set_ylabel('target (label-encoded)')
    ax.legend()
    fig.savefig(clf_name + '.png', dpi=300)
    # Close explicitly so figures do not accumulate when several detectors run.
    plt.close(fig)

