In [1]:
import plotly.express as px
import numpy as np
import pandas as pd
import pickle
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score, auc
import pickle
import turicreate as tc
import re
import glob
from utils import load_net
import plotly.graph_objects as go

In [2]:
# Label table for the APKs. The first CSV column becomes the index; downstream
# code looks rows up by APK identifier and reads the 'malware_label' column.
labels = pd.read_csv(
    '../res/newclusterres/apks_labels_hashed.csv',
    index_col=0,
)

In [18]:
def read_and_conv(fname, labels):
    """Load pickled evaluation results and tabulate them per APK.

    Parameters
    ----------
    fname : str
        Path to a pickle file holding an iterable of ``(record, value)``
        pairs, where ``record`` is a mapping with at least the keys
        ``'apk'`` and ``'nn'``.
    labels : pandas.DataFrame
        Frame indexed by APK identifier with a ``'malware_label'`` column.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``record['apk']`` (used as the index) with the
        columns ``nearest`` (``record['nn']``), ``prob`` (the paired value)
        and ``true_val`` (the logical negation of the APK's
        ``malware_label``). If an APK occurs more than once, the last
        occurrence wins.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code; only use this on trusted files.
    """
    with open(fname, 'rb') as handle:
        results = pickle.load(handle)

    # Keyed by APK so repeated entries collapse onto a single row.
    rows = {
        record['apk']: [
            record['nn'],
            score,
            not labels.loc[record['apk']]['malware_label'],
        ]
        for record, score in results
    }
    return pd.DataFrame.from_dict(
        rows, orient='index', columns=['nearest', 'prob', 'true_val']
    )
# Denominator for the "Compression" axis: network sizes are divided by this.
# NOTE(review): presumably the size of the full, uncompressed data set - confirm.
f_size = 189_452
#glob.glob('../res/newclusterres/streamed-0.65-0.2/')

def get_file_names(gamma, p_size):
    """Build the result-file paths for one (gamma, p_size) run under ``2ndrun``.

    Both values are interpolated verbatim into the paths, so they must be
    formatted exactly as they appear in the directory names on disk.

    Returns
    -------
    tuple of (str, str)
        ``(eval_name, net_name)``: path of the evaluation-results pickle and
        path of the merged voting-network pickle.
    """
    return (
        f"../res/newclusterres/2ndrun/streamed-{gamma}-{p_size}/{gamma}-{p_size}-evalresults.pickle",
        f"../res/newclusterres/2ndrun/streamed-{gamma}-{p_size}/merged-{gamma}-0-tc-nets-voting.pickle",
    )
    
def read_go(gammas, p_sizes, get_file_names=get_file_names):
    """Collect PR-AUC and network size for every (p_size, gamma) combination.

    For each combination the network file is loaded to obtain its length, and
    the evaluation results are read and scored with the area under the
    precision-recall curve.

    Parameters
    ----------
    gammas : iterable
        Gamma values to sweep; order is preserved in the per-partition lists.
    p_sizes : iterable
        Partition-size identifiers; each becomes a key of the returned dicts.
    get_file_names : callable, optional
        Called as ``get_file_names(gamma=..., p_size=...)`` and must return
        ``(eval_name, net_name)``.

    Returns
    -------
    tuple of (dict, dict)
        ``(aucs, sizes)``. Both map each ``p_size`` to a list aligned with
        ``gammas``: ``aucs`` holds the PR-AUC values, ``sizes`` holds
        ``len(net)`` of the loaded network.

    Notes
    -----
    Reads the module-level ``labels`` frame; it must be loaded beforehand.
    """
    aucs = dict()
    sizes = dict()
    for p_size in p_sizes:
        pr_aucs = []
        net_sizes = []
        for gamma in gammas:
            eval_name, net_name = get_file_names(gamma=gamma, p_size=p_size)

            # Only the length of the network is needed here.
            _, net = load_net(net_name)
            net_sizes.append(len(net))

            df = read_and_conv(eval_name, labels)
            # The thresholds returned by precision_recall_curve are not used.
            precision, recall, _ = precision_recall_curve(df['true_val'], df['prob'])
            pr_aucs.append(auc(recall, precision))

        aucs[p_size] = pr_aucs
        sizes[p_size] = net_sizes

    return aucs, sizes

def fix_keys(di):
    """Return a copy of ``di`` with every key converted to a rescaled float.

    Each key is passed through ``float``; keys strictly greater than 1 are
    then multiplied by 10, all other keys are left as they are. Values are
    carried over untouched.
    """
    rescaled = {}
    for key, value in di.items():
        numeric = float(key)
        if numeric > 1:
            numeric = numeric * 10
        rescaled[numeric] = value
    return rescaled
    

In [38]:
# Gamma values swept in this run (previously listed but disabled: 0.3, 0.35, 0.4, 0.45, 0.5, 0.6).
gammas = [0.65, 0.75, 0.85, 0.95]

# Partition sizes written as strings ("0.02" ... "0.08", "0.10", "0.15") ...
p_sizes_new = [f"0.0{digit}" for digit in range(2, 10, 2)] + ['0.10', '0.15']
aucs_new, sizes_new = read_go(gammas=gammas, p_sizes=p_sizes_new)

# ... and partition sizes written as bare integers (2, 4, 6, 8).
p_sizes_old = list(range(2, 10, 2))
aucs_old, sizes_old = read_go(gammas=gammas, p_sizes=p_sizes_old)

# Merge both sweeps and normalise the keys to floats.
# NOTE(review): after fix_keys the string keys stay 0.02 ... 0.15 while the
# integer keys become 20.0 ... 80.0 - confirm this mixed scale is intended,
# since the plots label every key with "%".
sizes = fix_keys({**sizes_old, **sizes_new})
aucs = fix_keys({**aucs_old, **aucs_new})


In [25]:

fig = go.Figure()

# One trace per partition size (ascending key order); each point is one gamma.
for part in sorted(sizes):
    compression = [size / f_size for size in sizes[part]]
    fig.add_trace(
        go.Scatter(
            x=compression,
            y=aucs[part],
            name=f"{part} %",
            text=gammas,
            hovertemplate='AuC: %{y:.3f}<br>Comp: %{x}<br>Epsilon: %{text}',
        )
    )

fig.update_layout(
    title="Compression vs. PRAuC",
    xaxis_title="Compression",
    yaxis_title="PRAuC",
    legend_title="Size of the origin partition",
    hovermode="x unified",
)

fig.show()

In [None]:
Claim: randomness is better than network 
    
Test even smaller networks (smaller initial sets)

Look for a case where they become "worse" than the larger origin network

Perhaps rerun: selection of apks should be randomized (not necessarily the network creation)

Baseline prediction with the origin network?

In [28]:
## Just origins

In [35]:
def get_origin_file_names(gamma, p_size):
    """Build the result-file paths for one (gamma, p_size) run under ``ori/complete_nets``.

    Drop-in alternative to ``get_file_names`` with the same signature and
    return shape. Both values are interpolated verbatim into the paths.

    Returns
    -------
    tuple of (str, str)
        ``(eval_name, net_name)``: path of the evaluation-results pickle and
        path of the streamed voting-network pickle.
    """
    return (
        f"../res/newclusterres/ori/complete_nets/streamed-{gamma}-{p_size}/{gamma}-evalresults.pickle",
        f"../res/newclusterres/ori/complete_nets/streamed-{gamma}-{p_size}/{gamma}-streamed-0-voting.pickle",
    )

# Gamma values swept for the origin-only networks (previously listed but disabled: 0.3, 0.35, 0.4, 0.45, 0.5, 0.6).
gammas = [0.65, 0.75, 0.85, 0.95]

# String-formatted partition sizes followed by the bare-integer ones, in a single sweep.
p_sizes_new = (
    [f"0.0{digit}" for digit in range(2, 10, 2)]
    + ['0.10', '0.15']
    + list(range(2, 10, 2))
)

aucs_ori, sizes_ori = read_go(
    gammas=gammas,
    p_sizes=p_sizes_new,
    get_file_names=get_origin_file_names,
)

# Normalise the keys to floats.
sizes_ori, aucs_ori = fix_keys(sizes_ori), fix_keys(aucs_ori)

In [40]:

fig = go.Figure()

# (legend prefix, sizes per partition, PR-AUCs per partition).
# Origin-only results are added first, then the full networks, so the trace
# order matches the legend order of the two separate loops this replaces.
trace_groups = [
    ("ori ", sizes_ori, aucs_ori),
    ("", sizes, aucs),
]

for prefix, sizes_by_part, aucs_by_part in trace_groups:
    # One trace per partition size (ascending key order); each point is one gamma.
    for part, net_sizes in sorted(sizes_by_part.items()):
        fig.add_trace(
            go.Scatter(
                x=[size / f_size for size in net_sizes],
                y=aucs_by_part[part],
                name=f"{prefix}{part} %",
                text=gammas,
                # NOTE(review): the hover text says "Epsilon" but shows the gamma values - confirm the wording.
                hovertemplate='AuC: %{y:.3f}<br>Comp: %{x}<br>Epsilon: %{text}',
            )
        )

fig.update_layout(
    title="Origins and full networks Compression vs. PRAuC",
    xaxis_title="Compression",
    yaxis_title="PRAuC",
    legend_title="Size of the origin partition",
    hovermode="x unified",
)

fig.show()