## Create Training from Bootstrap

In [1]:
import logging
import random
import click
import math
from collections import deque
from copy import copy
import time
import json
import sys
sys.path.insert(0, '../src/streamspot')

from iostream import *
from graph import *
from streamhash import *
from cluster import *
import utils as U
import param as P

logging.basicConfig(level=logging.DEBUG)

In [None]:
edges = '../baseline/sbustreamspot-data/all.tsv'
bootstrap = '../baseline/streamspot-bootstrap-clusters/01-C50_k10_all.txt'
chunk_length = 50
num_parallel_graphs = 10
max_num_edges = 100

In [None]:
par = int(num_parallel_graphs)

clusters, cluster_thresholds, global_threshold = read_bootstrap_clusters(bootstrap)
cluster_sizes = []
train_gids = set()
cluster_map = {}

statistics = []

for i, cluster in clusters.items():
    cluster_sizes.append(len(cluster))
    for g in cluster:
        train_gids.add(g)
        cluster_map[g] = i

logging.debug(f"Training Graphs: {train_gids}")

test_gids, train_edges, test_edges, num_test_edges = read_edges(edges, train_gids)
random.shuffle(test_gids)



In [None]:
H = U.allocate_random_bits(chunk_length)

### Get test sketches

In [None]:
test_graph = {}

for gid, edges in test_edges.items():
#     print(e[:10])
    logging.debug(f'updating test graph {gid}')
    for e in edges:
        update_graphs(e, test_graph)
test_graph

In [None]:

#Construct static test Shingles
test_streamhash_sketches = {}
test_streamhash_projections = {}
for gid in train_gids:
    logging.info(f" {gid}")
    temp_shingle_vector = construct_temp_shingle_vector(test_graph[gid],
                                                        chunk_length)
    
    test_streamhash_sketches[gid], test_streamhash_projections[gid] \
        = construct_streamhash_sketch(temp_shingle_vector, H)
    
    logging.debug(f"Sketch for test graph {gid}: {test_streamhash_sketches[gid]}")
#     logging.debug(f"Projection for test graph {gid}: {test_streamhash_projections[gid]}")

with open('test_graph_sketches.json', 'w') as fout:
    fout.write(json.dumps(test_streamhash_sketches))
with open('test_graph_projections.json', 'w') as fout:
    fout.write(json.dumps(test_streamhash_projections))

### Get Test Anomaly Score

In [None]:
with open('centroid_sketches.json', 'r') as fin:
    centroid_sketches = json.loads(fin.read())

with open('test_graph_sketches.json', 'r') as fin:
    test_sketches = json.loads(fin.read())

In [None]:
centroid_sketches

In [None]:
test_sketches

In [None]:
centroid_sketches.keys()

In [None]:
len(test_sketches.keys())

In [None]:
def get_anomaly_scores(centroid_sketches, test_sketches):
    test_anomaly_scores = {}
    for gid in test_sketches.keys():
        logging.debug(gid)
        test_anomaly_scores[gid] = []
        for i in centroid_sketches.keys():
            dist = 1.0 - math.cos(
                P.PI * (1.0 - streamhash_similarity(test_sketches[gid],
                                                    centroid_sketches[i]))
            )
            test_anomaly_scores[gid].append(dist)
    
    return test_anomaly_scores

anomaly_scores = get_anomaly_scores(centroid_sketches, test_sketches)

In [None]:
import pandas as pd
test_scores = pd.DataFrame([(k, min(v)) for k, v in anomaly_scores.items()], columns=['gid', 'anomaly_score'])
test_scores

In [None]:
test_scores['y_true'] = test_scores['gid'].apply(lambda x: int(x.startswith('3')))
test_scores

In [None]:
from sklearn import metrics

fpr, tpr, thresholds = metrics.roc_curve(test_scores.y_true, test_scores.anomaly_score)

In [None]:
metrics.auc(fpr, tpr)

# Static Evaluation

In [1]:
import pandas as pd
import plotly.express as px

In [31]:
def show_roc_auc(dataset_type, threshold=0.5):
    print(f"\n++++++++++++++++++++ {dataset_type} ++++++++++++++++++++")
    anomaly_scores = pd.read_csv(f'../baseline/sbustreamspot-data/test_anomaly_scores_cpp_{dataset_type}.txt', names=['scores'])

    anomaly_scores = anomaly_scores.reset_index()
    print(anomaly_scores.head())

    test_scores = anomaly_scores[anomaly_scores['scores']!=-2]
    test_scores['y_true'] = test_scores['index'].apply(lambda x: int(x>=300 and x<=399))

    hfig = px.histogram(test_scores, x='scores')
    hfig.add_vline(x=threshold, line_width=3, line_dash="dash", line_color="green", annotation_text=f'global threshold = {threshold}', annotation_position='top')
    hfig.show()

    from sklearn import metrics
    import plotly.graph_objects as go

    fpr, tpr, thresholds = metrics.roc_curve(test_scores.y_true, test_scores.scores)
    auc = metrics.auc(fpr, tpr)

    # precision, recall, thresholds = precision_recall_curve(test_scores.y_true, test_scores.scores)


    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=fpr,
        y=tpr,
        name='ROC'
    ))

    fig.add_trace(go.Scatter(
        x=[0.0,1.0],
        y=[0.0,1.0],
        name='',
        line = dict(color='gray', width=4, dash='dash')
    ))

    fig.update_layout(
        title=f"{dataset_type} ROC (AUC = {auc})",
        xaxis_title="FPR",
        yaxis_title="TPR",
    #     legend_title="Legend Title",
    )
    fig.show()

In [32]:
for t, s in [
    ('all', 0.4823),
    ('gfc', 1.0288), 
    ('ydc', 0.9742)
]:
    show_roc_auc(t, s)


++++++++++++++++++++ all ++++++++++++++++++++
   index    scores
0      0 -2.000000
1      1 -2.000000
2      2  0.295118
3      3 -2.000000
4      4  0.110424




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




++++++++++++++++++++ gfc ++++++++++++++++++++
   index  scores
0      0    -2.0
1      1    -2.0
2      2    -2.0
3      3    -2.0
4      4    -2.0




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




++++++++++++++++++++ ydc ++++++++++++++++++++
   index  scores
0      0    -2.0
1      1    -2.0
2      2    -2.0
3      3    -2.0
4      4    -2.0




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

