In [8]:
import json
import numpy as np
import hdbscan
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
import pickle

In [9]:
trainingJson = json.load(open("data/training_adfald.json",'r'))
testingJson  = json.load(open("data/testing_adfald.json",'r'))
vocabulary = open("data/adfald_vocabulary.txt",'r').readlines()
vocabulary = [v.replace('\n','') for v in vocabulary]
d = len(vocabulary)
n = len(trainingJson)
n_test = len(testingJson)
print(d, n, n_test)

341 4164 1788


In [10]:
X = []
for i in range(n):
    data = trainingJson[i]['actionsQueue']
    bagOfWords = [data.count(vocabulary[j]) for j in range(d)]
    X.append(bagOfWords)
X = np.array(X)
print(X.shape)
    

(4164, 341)


In [4]:
X_test = []
y_test = []
for i in range(n_test):
    data = testingJson[i]['actionsQueue']
    bagOfWords = [data.count(vocabulary[j]) for j in range(d)]
    X_test.append(bagOfWords)
    y_test.append(int(testingJson[i]['label']))
X_test = np.array(X_test)
y_test = np.array(y_test)
print(X_test.shape)
print(y_test.shape)

(1788, 341)
(1788,)


In [7]:
pickle.dump(X, open('data/X.pckl', 'wb'))
pickle.dump(X_test, open('data/X_test.pckl', 'wb'))
pickle.dump(y_test, open('data/y_test.pckl', 'wb'))

In [None]:
clusterer = hdbscan.HDBSCAN(min_cluster_size=10, min_samples=10, prediction_data=True)

In [None]:
clusterer.fit(X)

In [None]:
clusterer.labels_.tolist().count(-1)

In [None]:
y_pred, strengths = hdbscan.approximate_predict(clusterer, X_test)

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
confusion_matrix(y_test, y_pred)

In [12]:
#Parameter Evaluation
params = {}
for minClusterSize in [2,10,15,50,75,100]:
    for minSamples in [2,10,15,50,75,100]:
        clusterer = hdbscan.HDBSCAN(min_cluster_size=minClusterSize, min_samples=minSamples, prediction_data=True)
        clusterer.fit(X)
        clusterer.labels_.tolist().count(-1)
        y_pred, strengths = hdbscan.approximate_predict(clusterer, X_test)
        acc = accuracy_score(y_test, y_pred)
        print(minClusterSize, minSamples, acc)
        params[acc] = [minClusterSize, minSamples]

maxAcc = max(params.keys())
bestParams = params[maxAcc]
print("Best params: ", bestParams, " with ACC: ",maxAcc)
        

2 2 0.0011185682326621924
2 10 0.0
2 15 0.0
2 50 0.0005592841163310962
2 75 0.0011185682326621924
2 100 0.0
10 2 0.002796420581655481
10 10 0.002796420581655481
10 15 0.002796420581655481
10 50 0.008389261744966443
10 75 0.0
10 100 0.0
15 2 0.002796420581655481
15 10 0.002796420581655481
15 15 0.002796420581655481
15 50 0.008389261744966443
15 75 0.0
15 100 0.11409395973154363
50 2 0.008389261744966443
50 10 0.008389261744966443
50 15 0.008389261744966443
50 50 0.008389261744966443
50 75 0.3076062639821029
50 100 0.12136465324384788
75 2 0.21252796420581654
75 10 0.31543624161073824
75 15 0.31208053691275167
75 50 0.31096196868008946
75 75 0.3076062639821029
75 100 0.12136465324384788
100 2 0.2203579418344519
100 10 0.1935123042505593
100 15 0.18512304250559283
100 50 0.14261744966442952
100 75 0.12248322147651007
100 100 0.12136465324384788
Best params:  [75, 10]  with ACC:  0.31543624161073824


In [15]:
#Parameter Evaluation with Strengths
paramsAUC = {}
for minClusterSize in [2,10,15,50,75,100]:
    for minSamples in [2,10,15,50,75,100]:
        clusterer = hdbscan.HDBSCAN(min_cluster_size=minClusterSize, min_samples=minSamples, prediction_data=True)
        clusterer.fit(X)
        labels, strengths = hdbscan.approximate_predict(clusterer, X_test)
        maxStrength = max(strengths)
        y_pred = np.array([maxStrength - s for s in strengths])
        auc = roc_auc_score(y_test, y_pred)
        print(minClusterSize, minSamples, auc)
        paramsAUC[auc] = [minClusterSize, minSamples]

maxAuc = max(paramsAUC.keys())
bestParamsAuc = paramsAUC[maxAuc]
print("Best params: ", bestParamsAuc, " with AUC: ",maxAuc)


2 2 0.518875461192901
2 10 0.5658450957891866
2 15 0.561287197748195
2 50 0.5451280791219195
2 75 0.4808215794538241
2 100 0.48195751622215477
10 2 0.5532732474669768
10 10 0.5419325333319611
10 15 0.5577784009921115
10 50 0.5236507695553508
10 75 0.5029292503074618
10 100 0.46738201952318964
15 2 0.5488941661992559
15 10 0.5571763416403801
15 15 0.5578208538951182
15 50 0.5231638476223801
15 75 0.46330718406035004
15 100 0.466607575656219
50 2 0.5510470429623378
50 10 0.5327318314439647
50 15 0.5304245804881311
50 50 0.5232448940735748
50 75 0.46267360664426527
50 100 0.46630718920615644
75 2 0.5029646277266342
75 10 0.4962789387288829
75 15 0.4935625961622575
75 50 0.46339530599537904
75 75 0.46267360664426527
75 100 0.46630718920615644
100 2 0.5167245141072283
100 10 0.497016718725075
100 15 0.4966616580817462
100 50 0.46989896209084403
100 75 0.46286078535297664
100 100 0.46630718920615644
Best params:  [2, 10]  with AUC:  0.5658450957891866


In [11]:
#Parameter Evaluation with Outlier Scores
X_total = np.vstack((X,X_test))
y_total = np.hstack((np.zeros((n,)),y_test))
print(X_total.shape, y_total.shape)
paramsAUCtotal = {}
for minClusterSize in [2,10,15,50,75,100]:
    for minSamples in [2,10,15,50,75,100]:
        clusterer = hdbscan.HDBSCAN(min_cluster_size=minClusterSize, min_samples=minSamples)
        clusterer.fit(X_total)
        y_pred = np.nan_to_num(clusterer.outlier_scores_)
        auc = roc_auc_score(y_total, y_pred)
        print(minClusterSize, minSamples, auc)
        paramsAUCtotal[auc] = [minClusterSize, minSamples]

maxAuctotal = max(paramsAUCtotal.keys())
bestParamsAuctotal = paramsAUCtotal[maxAuctotal]
print("Best params: ", bestParamsAuctotal, " with AUC: ",maxAuctotal)


(5952, 341) (5952,)


  self._outlier_scores = outlier_scores(self._condensed_tree)


2 2 0.6198315719436945


  self._outlier_scores = outlier_scores(self._condensed_tree)


2 10 0.6932538914162766


  self._outlier_scores = outlier_scores(self._condensed_tree)


2 15 0.6633711205569156
2 50 0.669561647264087
2 75 0.6773710010824796
2 100 0.6864573666804337


  self._outlier_scores = outlier_scores(self._condensed_tree)


10 2 0.585994428989442


  self._outlier_scores = outlier_scores(self._condensed_tree)


10 10 0.6126991541003936


  self._outlier_scores = outlier_scores(self._condensed_tree)


10 15 0.6030428387949973


  self._outlier_scores = outlier_scores(self._condensed_tree)


10 50 0.6702115469982562


  self._outlier_scores = outlier_scores(self._condensed_tree)


10 75 0.6537722250774781


  self._outlier_scores = outlier_scores(self._condensed_tree)


10 100 0.6489221809440333


  self._outlier_scores = outlier_scores(self._condensed_tree)


15 2 0.5605197240964488


  self._outlier_scores = outlier_scores(self._condensed_tree)


15 10 0.5666443338733715


  self._outlier_scores = outlier_scores(self._condensed_tree)


15 15 0.6007732107415757


  self._outlier_scores = outlier_scores(self._condensed_tree)


15 50 0.6623788905150687


  self._outlier_scores = outlier_scores(self._condensed_tree)


15 75 0.6450427378596979


  self._outlier_scores = outlier_scores(self._condensed_tree)


15 100 0.6388389505200742


  self._outlier_scores = outlier_scores(self._condensed_tree)


50 2 0.5952923209866116


  self._outlier_scores = outlier_scores(self._condensed_tree)


50 10 0.6249469574701906


  self._outlier_scores = outlier_scores(self._condensed_tree)


50 15 0.6481010516840231


  self._outlier_scores = outlier_scores(self._condensed_tree)


50 50 0.6759658890185485


  self._outlier_scores = outlier_scores(self._condensed_tree)


50 75 0.667772749323064


  self._outlier_scores = outlier_scores(self._condensed_tree)


50 100 0.6497366155158154


  self._outlier_scores = outlier_scores(self._condensed_tree)


75 2 0.5962983266369285


  self._outlier_scores = outlier_scores(self._condensed_tree)


75 10 0.6445330403463111


  self._outlier_scores = outlier_scores(self._condensed_tree)


75 15 0.6544501910046049


  self._outlier_scores = outlier_scores(self._condensed_tree)


75 50 0.65743550697844


  self._outlier_scores = outlier_scores(self._condensed_tree)


75 75 0.6694901943416495


  self._outlier_scores = outlier_scores(self._condensed_tree)


75 100 0.6508188376167321


  self._outlier_scores = outlier_scores(self._condensed_tree)


100 2 0.5662814045249912


  self._outlier_scores = outlier_scores(self._condensed_tree)


100 10 0.575312410201057


  self._outlier_scores = outlier_scores(self._condensed_tree)


100 15 0.5982016779978556


  self._outlier_scores = outlier_scores(self._condensed_tree)


100 50 0.6121781271146204


  self._outlier_scores = outlier_scores(self._condensed_tree)


100 75 0.6248238782019921
100 100 0.6517254528956586
Best params:  [2, 10]  with ACC:  0.6932538914162766


  self._outlier_scores = outlier_scores(self._condensed_tree)
