In [33]:
import json
import numpy as np
import hdbscan
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from prefixspan import PrefixSpan
import pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:
trainingJson = json.load(open("data/training_adfald.json",'r'))
testingJson  = json.load(open("data/testing_adfald.json",'r'))
vocabulary = open("data/adfald_vocabulary.txt",'r').readlines()
vocabulary = [v.replace('\n','') for v in vocabulary]
d = len(vocabulary)
n = len(trainingJson)
n_test = len(testingJson)
print(d, n, n_test)

341 4164 1788


In [78]:
X = []
for i in range(n):
    data = trainingJson[i]['actionsQueue']
    bagOfWords = [data.count(vocabulary[j]) for j in range(d)]
    X.append(bagOfWords)
X = np.array(X)
print(X.shape)

(4164, 341)


In [79]:
X_test = []
y_test = []
for i in range(n_test):
    data = testingJson[i]['actionsQueue']
    bagOfWords = [data.count(vocabulary[j]) for j in range(d)]
    X_test.append(bagOfWords)
    y_test.append(int(testingJson[i]['label']))
X_test = np.array(X_test)
y_test = np.array(y_test)
print(X_test.shape)
print(y_test.shape)

(1788, 341)
(1788,)


In [3]:
db = []
for i in range(n):
    data = trainingJson[i]['actionsQueue']
    db.append(data)
    
ps = PrefixSpan(db)
top_patterns = ps.topk(800)  

In [7]:
with open("data/top_800.txt", "w") as outp:
    for tp in top_patterns:
        outp.write(str(tp[0]) + "\t")
        for i in range(len(tp[1])):
            outp.write(str(tp[1][i]))
            if i < len(tp[1])-1:
                outp.write(',')
        outp.write('\n')

In [10]:
patterns = []
for tp in top_patterns:
    patterns.append(tp[1])

# Calculate Feature Representation

In [19]:
def patternIsPresent(sequence, pattern):
    return any(sequence[i:i+len(pattern)] == pattern for i in range(len(sequence) - len(pattern) + 1))

def getFeaturesForPattern(sequence, patterns):
    x = np.zeros(len(patterns))
    for i in range(len(patterns)):
        if patternIsPresent(sequence, patterns[i]):
            x[i] += 1.
    return x

def getFeatureRepresentationPatternsOnly(sequences, patterns):
    X = []
    for s in sequences:
        x = getFeaturesForPattern(s, patterns)
        X.append(x)
    return np.array(X)
    
def getFeatureRepresentationPatternsAndBagOfWords(sequences, patterns, vocabulary):
    X = []
    d = len(vocabulary)
    for s in sequences:
        bagOfWords = [data.count(vocabulary[j]) for j in range(d)]
        xPattern = getFeaturesForPattern(s, patterns)
        x= np.hstack((bagOfWords,xPattern))
        X.append(x)
    return np.array(X)

In [12]:
sequences = []
for i in range(n):
    sequences.append(trainingJson[i]['actionsQueue'])

In [13]:
Xpo = getFeatureRepresentationPatternsOnly(sequences, patterns)

In [20]:
Xpbow = getFeatureRepresentationPatternsAndBagOfWords(sequences, patterns, vocabulary)

In [21]:
test_sequences = []
y_test = []
for i in range(n_test):
    test_sequences.append(testingJson[i]['actionsQueue'])
    y_test.append(int(testingJson[i]['label']))
y_test = np.array(y_test)
Xpo_test = getFeatureRepresentationPatternsOnly(test_sequences, patterns)
Xpbow_test = getFeatureRepresentationPatternsAndBagOfWords(test_sequences, patterns, vocabulary)

In [22]:
print(Xpo.shape)
print(Xpbow.shape)
print(Xpo_test.shape)
print(Xpbow_test.shape)
print(y_test.shape)

(4164, 800)
(4164, 1141)
(1788, 800)
(1788, 1141)
(1788,)


In [28]:
pickle.dump(Xpo,open('data/Xpo.pckl','wb'))
pickle.dump(Xpbow,open('data/Xpbow.pckl','wb'))
pickle.dump(Xpo_test,open('data/Xpo_test.pckl','wb'))
pickle.dump(Xpbow_test,open('data/Xpbow_test.pckl','wb'))
pickle.dump(y_test,open('data/y_test.pckl','wb'))

# HDBSCAN

# Outlier Scores after Training on all Data (train + test)

In [29]:
#Parameter Evaluation with Outlier Scores
X_total = np.vstack((Xpo,Xpo_test))
y_total = np.hstack((np.zeros((n,)),y_test))
print(X_total.shape, y_total.shape)
paramsAUCtotal = {}
for minClusterSize in [2,10,15,50,75,100]:
    for minSamples in [2,10,15,50,75,100]:
        clusterer = hdbscan.HDBSCAN(min_cluster_size=minClusterSize, min_samples=minSamples)
        clusterer.fit(X_total)
        y_pred = np.nan_to_num(clusterer.outlier_scores_)
        auc = roc_auc_score(y_total, y_pred)
        print(minClusterSize, minSamples, auc)
        paramsAUCtotal[auc] = [minClusterSize, minSamples]

maxAuctotal = max(paramsAUCtotal.keys())
bestParamsAuctotal = paramsAUCtotal[maxAuctotal]
print("Best params: ", bestParamsAuctotal, " with AUC: ",maxAuctotal)

(5952, 800) (5952,)


  self._outlier_scores = outlier_scores(self._condensed_tree)


2 2 0.49777131253997503


  self._outlier_scores = outlier_scores(self._condensed_tree)


2 10 0.4887448129040631


  self._outlier_scores = outlier_scores(self._condensed_tree)


2 15 0.48641673507264765


  self._outlier_scores = outlier_scores(self._condensed_tree)


2 50 0.4869113695375206


  self._outlier_scores = outlier_scores(self._condensed_tree)


2 75 0.4916974279007827


  self._outlier_scores = outlier_scores(self._condensed_tree)


2 100 0.49359266838943316


  self._outlier_scores = outlier_scores(self._condensed_tree)


10 2 0.4924885598077698


  self._outlier_scores = outlier_scores(self._condensed_tree)


10 10 0.48449844940721115


  self._outlier_scores = outlier_scores(self._condensed_tree)


10 15 0.4720786440475467


  self._outlier_scores = outlier_scores(self._condensed_tree)


10 50 0.46393288214567846


  self._outlier_scores = outlier_scores(self._condensed_tree)


10 75 0.4740972212924044


  self._outlier_scores = outlier_scores(self._condensed_tree)


10 100 0.47622548842900386


  self._outlier_scores = outlier_scores(self._condensed_tree)


15 2 0.48748698398115603


  self._outlier_scores = outlier_scores(self._condensed_tree)


15 10 0.4819270454074954


  self._outlier_scores = outlier_scores(self._condensed_tree)


15 15 0.47695714060596195


  self._outlier_scores = outlier_scores(self._condensed_tree)


15 50 0.4633817290628776


  self._outlier_scores = outlier_scores(self._condensed_tree)


15 75 0.46440575887380936


  self._outlier_scores = outlier_scores(self._condensed_tree)


15 100 0.46797042286740703


  self._outlier_scores = outlier_scores(self._condensed_tree)


50 2 0.4948153501991412


  self._outlier_scores = outlier_scores(self._condensed_tree)


50 10 0.4728692609785163


  self._outlier_scores = outlier_scores(self._condensed_tree)


50 15 0.48502720103324787


  self._outlier_scores = outlier_scores(self._condensed_tree)


50 50 0.4896148648857423


  self._outlier_scores = outlier_scores(self._condensed_tree)


50 75 0.4861539685596841


  self._outlier_scores = outlier_scores(self._condensed_tree)


50 100 0.47558382831111556


  self._outlier_scores = outlier_scores(self._condensed_tree)


75 2 0.5006092166287816


  self._outlier_scores = outlier_scores(self._condensed_tree)


75 10 0.49976581465601144


  self._outlier_scores = outlier_scores(self._condensed_tree)


75 15 0.5063311151599671


  self._outlier_scores = outlier_scores(self._condensed_tree)


75 50 0.4809187223650996


  self._outlier_scores = outlier_scores(self._condensed_tree)


75 75 0.5273515349890157


  self._outlier_scores = outlier_scores(self._condensed_tree)


75 100 0.48547523016853106


  self._outlier_scores = outlier_scores(self._condensed_tree)


100 2 0.5091231863832102


  self._outlier_scores = outlier_scores(self._condensed_tree)


100 10 0.49839443352123086


  self._outlier_scores = outlier_scores(self._condensed_tree)


100 15 0.517309245158453


  self._outlier_scores = outlier_scores(self._condensed_tree)


100 50 0.5014624031458855


  self._outlier_scores = outlier_scores(self._condensed_tree)


100 75 0.5136411997293286
100 100 0.521937205884322
Best params:  [75, 75]  with AUC:  0.5273515349890157


  self._outlier_scores = outlier_scores(self._condensed_tree)


In [30]:
#Parameter Evaluation with Outlier Scores
X_total = np.vstack((Xpbow,Xpbow_test))
y_total = np.hstack((np.zeros((n,)),y_test))
print(X_total.shape, y_total.shape)
paramsAUCtotal = {}
for minClusterSize in [2,10,15,50,75,100]:
    for minSamples in [2,10,15,50,75,100]:
        clusterer = hdbscan.HDBSCAN(min_cluster_size=minClusterSize, min_samples=minSamples)
        clusterer.fit(X_total)
        y_pred = np.nan_to_num(clusterer.outlier_scores_)
        auc = roc_auc_score(y_total, y_pred)
        print(minClusterSize, minSamples, auc)
        paramsAUCtotal[auc] = [minClusterSize, minSamples]

maxAuctotal = max(paramsAUCtotal.keys())
bestParamsAuctotal = paramsAUCtotal[maxAuctotal]
print("Best params: ", bestParamsAuctotal, " with AUC: ",maxAuctotal)

(5952, 1141) (5952,)


  self._outlier_scores = outlier_scores(self._condensed_tree)


2 2 0.49777131253997503


  self._outlier_scores = outlier_scores(self._condensed_tree)


2 10 0.4887448129040631


  self._outlier_scores = outlier_scores(self._condensed_tree)


2 15 0.48641673507264765


  self._outlier_scores = outlier_scores(self._condensed_tree)


2 50 0.4869113695375206


  self._outlier_scores = outlier_scores(self._condensed_tree)


2 75 0.4916974279007827


  self._outlier_scores = outlier_scores(self._condensed_tree)


2 100 0.49359266838943316


  self._outlier_scores = outlier_scores(self._condensed_tree)


10 2 0.4924885598077698


  self._outlier_scores = outlier_scores(self._condensed_tree)


10 10 0.48449844940721115


  self._outlier_scores = outlier_scores(self._condensed_tree)


10 15 0.4720786440475467


  self._outlier_scores = outlier_scores(self._condensed_tree)


10 50 0.46393288214567846


  self._outlier_scores = outlier_scores(self._condensed_tree)


10 75 0.4740972212924044


  self._outlier_scores = outlier_scores(self._condensed_tree)


10 100 0.47622548842900386


  self._outlier_scores = outlier_scores(self._condensed_tree)


15 2 0.48748698398115603


  self._outlier_scores = outlier_scores(self._condensed_tree)


15 10 0.4819270454074954


  self._outlier_scores = outlier_scores(self._condensed_tree)


15 15 0.47695714060596195


  self._outlier_scores = outlier_scores(self._condensed_tree)


15 50 0.4633817290628776


  self._outlier_scores = outlier_scores(self._condensed_tree)


15 75 0.46440575887380936


  self._outlier_scores = outlier_scores(self._condensed_tree)


15 100 0.46797042286740703


  self._outlier_scores = outlier_scores(self._condensed_tree)


50 2 0.4948153501991412


  self._outlier_scores = outlier_scores(self._condensed_tree)


50 10 0.4728692609785163


  self._outlier_scores = outlier_scores(self._condensed_tree)


50 15 0.48502720103324787


  self._outlier_scores = outlier_scores(self._condensed_tree)


50 50 0.4896148648857423


  self._outlier_scores = outlier_scores(self._condensed_tree)


50 75 0.4861539685596841


  self._outlier_scores = outlier_scores(self._condensed_tree)


50 100 0.47558382831111556


  self._outlier_scores = outlier_scores(self._condensed_tree)


75 2 0.5006092166287816


  self._outlier_scores = outlier_scores(self._condensed_tree)


75 10 0.49976581465601144


  self._outlier_scores = outlier_scores(self._condensed_tree)


75 15 0.5063311151599671


  self._outlier_scores = outlier_scores(self._condensed_tree)


75 50 0.4809187223650996


  self._outlier_scores = outlier_scores(self._condensed_tree)


75 75 0.5273515349890157


  self._outlier_scores = outlier_scores(self._condensed_tree)


75 100 0.48547523016853106


  self._outlier_scores = outlier_scores(self._condensed_tree)


100 2 0.5091231863832102


  self._outlier_scores = outlier_scores(self._condensed_tree)


100 10 0.49839443352123086


  self._outlier_scores = outlier_scores(self._condensed_tree)


100 15 0.517309245158453


  self._outlier_scores = outlier_scores(self._condensed_tree)


100 50 0.5014624031458855


  self._outlier_scores = outlier_scores(self._condensed_tree)


100 75 0.5136411997293286
100 100 0.521937205884322
Best params:  [75, 75]  with AUC:  0.5273515349890157


  self._outlier_scores = outlier_scores(self._condensed_tree)


# Use Strength from Cluster Predictions

In [31]:
#Parameter Evaluation with Strengths, Patterns only
paramsAUC = {}
for minClusterSize in [2,10,15,50,75,100]:
    for minSamples in [2,10,15,50,75,100]:
        clusterer = hdbscan.HDBSCAN(min_cluster_size=minClusterSize, min_samples=minSamples, prediction_data=True)
        clusterer.fit(Xpo)
        labels, strengths = hdbscan.approximate_predict(clusterer, Xpo_test)
        maxStrength = max(strengths)
        y_pred = np.array([maxStrength - s for s in strengths])
        auc = roc_auc_score(y_test, y_pred)
        print(minClusterSize, minSamples, auc)
        paramsAUC[auc] = [minClusterSize, minSamples]

maxAuc = max(paramsAUC.keys())
bestParamsAuc = paramsAUC[maxAuc]
print("Best params: ", bestParamsAuc, " with AUC: ",maxAuc)

2 2 0.5079251851203862
2 10 0.553762099077357
2 15 0.5768301060550705
2 50 0.5403186797919036
2 75 0.5685395686785055
2 100 0.5745299305830713
10 2 0.5664413661086897
10 10 0.5592571771135113
10 15 0.5579199106688004
10 50 0.5105115960747789
10 75 0.5504842203845977
10 100 0.5372157585175961
15 2 0.5609675145240386
15 10 0.5540566964951913
15 15 0.5299801114581671
15 50 0.501811323861619
15 75 0.5561786984197228
15 100 0.5307371882284532
50 2 0.5196351108663995
50 10 0.5126792670313328
50 15 0.5011565199940308
50 50 0.49252700261921545
50 75 0.5547230784272357
50 100 0.4719617872414875
75 2 0.5295382153314157
75 10 0.4933085219700205
75 15 0.4960557393751962
75 50 0.4833931962147448
75 75 0.5913637930768321
75 100 0.5514014603798635
100 2 0.5174905960387582
100 10 0.5121344547760802
100 15 0.49779888130168315
100 50 0.461241786006494
100 75 0.5891504530882558
100 100 0.5695346389959502
Best params:  [75, 75]  with AUC:  0.5913637930768321


In [32]:
#Parameter Evaluation with Strengths, Patterns and Bag of Words
paramsAUC = {}
for minClusterSize in [2,10,15,50,75,100]:
    for minSamples in [2,10,15,50,75,100]:
        clusterer = hdbscan.HDBSCAN(min_cluster_size=minClusterSize, min_samples=minSamples, prediction_data=True)
        clusterer.fit(Xpbow)
        labels, strengths = hdbscan.approximate_predict(clusterer, Xpbow_test)
        maxStrength = max(strengths)
        y_pred = np.array([maxStrength - s for s in strengths])
        auc = roc_auc_score(y_test, y_pred)
        print(minClusterSize, minSamples, auc)
        paramsAUC[auc] = [minClusterSize, minSamples]

maxAuc = max(paramsAUC.keys())
bestParamsAuc = paramsAUC[maxAuc]
print("Best params: ", bestParamsAuc, " with AUC: ",maxAuc)

2 2 0.5079251851203862
2 10 0.553762099077357
2 15 0.5768301060550705
2 50 0.5403186797919036
2 75 0.5685395686785055
2 100 0.5745299305830713
10 2 0.5664413661086897
10 10 0.5592571771135113
10 15 0.5579199106688004
10 50 0.5105115960747789
10 75 0.5504842203845977
10 100 0.5372157585175961
15 2 0.5609675145240386
15 10 0.5540566964951913
15 15 0.5299801114581671
15 50 0.501811323861619
15 75 0.5561786984197228
15 100 0.5307371882284532
50 2 0.5196351108663995
50 10 0.5126792670313328
50 15 0.5011565199940308
50 50 0.49252700261921545
50 75 0.5547230784272357
50 100 0.4719617872414875
75 2 0.5295382153314157
75 10 0.4933085219700205
75 15 0.4960557393751962
75 50 0.4833931962147448
75 75 0.5913637930768321
75 100 0.5514014603798635
100 2 0.5174905960387582
100 10 0.5121344547760802
100 15 0.49779888130168315
100 50 0.461241786006494
100 75 0.5891504530882558
100 100 0.5695346389959502
Best params:  [75, 75]  with AUC:  0.5913637930768321


# Calculate More Error Measures for Best Model

In [68]:
def calcSensAndSpec(cm):
    tn = cm[0,0]
    fn = cm[1,0]
    fp = cm[0,1]
    tp = cm[1,1]
    print(cm)
    TPR = float(tp)/float(tp + fn)
    TNR = float(tn)/float(tn + fp)
    return TPR, TNR

In [70]:
# Patterns and Bag of Words
clusterer = hdbscan.HDBSCAN(min_cluster_size=75, min_samples=75, prediction_data=True)
clusterer.fit(Xpbow)
labels, strengths = hdbscan.approximate_predict(clusterer, Xpbow_test)
y_pred_bin = np.where(labels == -1, labels, 0.)*-1
cm = confusion_matrix(y_test, y_pred_bin)
print(calcSensAndSpec(cm))

In [76]:
# Patterns only
clusterer = hdbscan.HDBSCAN(min_cluster_size=75, min_samples=75, prediction_data=True)
clusterer.fit(Xpo)
labels, strengths = hdbscan.approximate_predict(clusterer, Xpo_test)
y_pred_bin = np.where(labels == -1, labels, 0.)*-1
cm = confusion_matrix(y_test, y_pred_bin)
print(calcSensAndSpec(cm))

[[700 342]
 [220 526]]
(0.7050938337801609, 0.6717850287907869)


In [80]:
# Bag of Words
clusterer = hdbscan.HDBSCAN(min_cluster_size=2, min_samples=10, prediction_data=True)
clusterer.fit(X)
labels, strengths = hdbscan.approximate_predict(clusterer, X_test)
y_pred_bin = np.where(labels == -1, labels, 0.)*-1
cm = confusion_matrix(y_test, y_pred_bin)
print(calcSensAndSpec(cm))

[[596 446]
 [ 26 720]]
(0.9651474530831099, 0.5719769673704415)
