In [14]:
import configparser
import os, math, time, sys
import pandas as pd

sys.path.append('../')
from datetime import datetime
from core.utils import savepkl, loadpkl, calculate_metrics
from core.conversion import make_whitelist, multiple_ngram
from core.selection import make_graph, get_representatives
from core.clustering import clustering
from core.generation import make_signature

In [15]:
# Parse the run configuration; later cells read the PATH and DIME sections from it.
properties = configparser.ConfigParser()
# read() returns the list of files it parsed successfully (shown as the cell
# output); a missing file is skipped silently, so an empty list here means
# 'config.ini' was not found in the working directory.
properties.read('config.ini')

['config.ini']

In [16]:
# Input locations (strings, labels, train/test splits) and the output
# directory prefix, all read from the [PATH] section of the configuration.
string_path, label_path, train_path, test_path, save_path = (
    properties.get('PATH', option)
    for option in ('string_path', 'label_path', 'train_path', 'test_path', 'save_path')
)

In [17]:
# Integer-valued options of the [DIME] section.
N, thetaB, thetaM, thetaD, min_samples = (
    properties.getint('DIME', option)
    for option in ('N', 'thetaB', 'thetaM', 'thetaD', 'min_samples')
)
# Float-valued options of the [DIME] section.
eps, Rc, Rd = (
    properties.getfloat('DIME', option)
    for option in ('eps', 'Rc', 'Rd')
)

In [18]:
# Load the four inputs in one pass: per-sample strings, per-sample labels,
# and the train / test collections of sample keys.
# NOTE(review): loadpkl is presumably pickle-based — only point these paths
# at trusted files.
data_strings, data_labels, train, test = (
    loadpkl(path)
    for path in (string_path, label_path, train_path, test_path)
)

In [19]:
# Partition each split by label: anything not labelled 'benign' counts as malware.
train_malware = [
    sample for sample in train
    if data_labels[sample] != 'benign'
]
train_benign = [
    sample for sample in train
    if data_labels[sample] == 'benign'
]
test_malware = [
    sample for sample in test
    if data_labels[sample] != 'benign'
]
test_benign = [
    sample for sample in test
    if data_labels[sample] == 'benign'
]

In [20]:
# Strings of every benign training sample, in train_benign order.
train_benign_string = [data_strings[sample] for sample in train_benign]
# Build the whitelist from the benign training strings; it is subtracted from
# the malware n-grams in the training cell.
whitelist = make_whitelist(
    train_benign_string,
    N=N,
    thetaB=thetaB,
)

Creating whitelist:   0%|          | 0/80 [00:00<?, ?it/s]

Creating whitelist: 100%|██████████| 80/80 [00:00<00:00, 357.15it/s]


In [21]:
# Malware labels that actually occur in the training split. Deriving them from
# train_malware (rather than from every entry of data_labels) keeps labels that
# only appear in the test split out of the training loop, where they would
# yield an empty sample list. train_malware already excludes 'benign'.
train_labels = {data_labels[key] for key in train_malware}

In [22]:
# Display the label set that the training and evaluation cells iterate over.
train_labels

{'gafgyt', 'mirai', 'miraia'}

In [27]:
%%time

# For every malware label: build the n-gram graph, pick representatives,
# cluster, generate signatures, and persist each intermediate result.
for label in train_labels:
    # "/" is replaced so the label can be embedded in a file name.
    label_tag = label.replace("/", ".")
    # File-name stems; each stage appends the parameters it depends on.
    graph_stem = f'{label_tag}_{N}_{thetaB}'
    representatives_stem = f'{graph_stem}_{thetaD}_{thetaM}'
    cluster_stem = f'{representatives_stem}_{eps}_{min_samples}'
    signature_stem = f'{cluster_stem}_{Rc}'

    train_string = [data_strings[key] for key in train_malware if data_labels[key] == label]
    train_size = len(train_string)

    # Union of the n-grams of all samples of this label, minus the whitelist.
    train_ngrams = set().union(
        *(multiple_ngram(sample, N=N) for sample in train_string)
    ) - whitelist

    # Per-sample n-gram sets with the whitelist removed.
    # NOTE(review): multiple_ngram is called without N here, whereas the
    # whitelist, train_ngrams and the graph use N=N — confirm the default
    # matches the configured N.
    file_ngram = [multiple_ngram(sample) - whitelist for sample in train_string]

    graph_path = save_path + f'graph/{graph_stem}.pkl'
    bipartite_graph = make_graph(train_string, train_ngrams, graph_path, N)

    representatives_path = save_path + f'representatives/{representatives_stem}.pkl'
    representatives = get_representatives(
        bipartite_graph, train_size, thetaD=thetaD, thetaM=thetaM
    )
    savepkl(representatives_path, representatives)

    cluster_path = save_path + f'cluster/{cluster_stem}.pkl'
    # NOTE(review): the graph is obtained a second time with identical
    # arguments before clustering — presumably because get_representatives
    # may modify it; confirm, otherwise this call is redundant.
    bipartite_graph = make_graph(train_string, train_ngrams, graph_path, N)
    cluster_labels, wordset = clustering(
        bipartite_graph, representatives, train_size, eps=eps, min_samples=min_samples
    )
    savepkl(cluster_path, (cluster_labels, wordset))

    signature_path = save_path + f'signature/{signature_stem}.pkl'
    signatures = make_signature(file_ngram, cluster_labels, Rc, whitelist)
    savepkl(signature_path, signatures)

CPU times: total: 297 ms
Wall time: 609 ms


In [28]:
# Strings of the benign test samples, then one n-gram collection per sample.
# NOTE(review): multiple_ngram is called without N here, while the whitelist
# and graph are built with N=N — confirm the default matches the configured N.
test_benign_strings = [data_strings[key] for key in test_benign]
test_benign_ngrams = [multiple_ngram(sample) for sample in test_benign_strings]

In [29]:
# Strings of the malware test samples, then one n-gram collection per sample.
# NOTE(review): multiple_ngram is called without N here, while the whitelist
# and graph are built with N=N — confirm the default matches the configured N.
test_malware_strings = [data_strings[key] for key in test_malware]
test_malware_ngrams = [multiple_ngram(sample) for sample in test_malware_strings]

In [30]:
%%time

# Rows of the result table; filled at the end of this cell.
test_result = []

# Collect the stored signatures of every label, using the same file-name
# scheme as the training cell. Labels without a signature file are reported
# and skipped.
signatures = []
for label in train_labels:
    label_tag = label.replace("/", ".")
    signatures_path = save_path + f'signature/{label_tag}_{N}_{thetaB}_{thetaD}_{thetaM}_{eps}_{min_samples}_{Rc}.pkl'
    if os.path.exists(signatures_path):
        signatures.append(loadpkl(signatures_path))
    else:
        print(f"{label}'s signature not exists")

def _is_detected(ngrams, compiled_signatures):
    """Return True if `ngrams` satisfies at least one signature.

    Args:
        ngrams: set of n-grams extracted from one sample.
        compiled_signatures: iterable of ``(sig, required)`` pairs, where
            ``sig`` is a non-empty set of signature n-grams and ``required``
            is the minimum number of them that must occur in ``ngrams``.

    Returns:
        bool: True as soon as one signature reaches its required overlap.
    """
    return any(len(ngrams & sig) >= required for sig, required in compiled_signatures)


# Materialise every non-empty signature once, together with its match
# threshold ceil(len(sig) * Rd), instead of rebuilding the sets for each
# test sample. The n-gram collection is read from sig_info[1][0].
compiled_signatures = []
for signature in signatures:
    for sig_info in signature:
        sig = set(sig_info[1][0])
        if sig:  # empty signatures can never be evaluated meaningfully; skip them
            compiled_signatures.append((sig, math.ceil(len(sig) * Rd)))

# Malware samples: detected -> true positive, missed -> false negative.
tp = sum(_is_detected(ngrams, compiled_signatures) for ngrams in test_malware_ngrams)
fn = len(test_malware_ngrams) - tp

# Benign samples: detected -> false positive, not detected -> true negative.
fp = sum(_is_detected(ngrams, compiled_signatures) for ngrams in test_benign_ngrams)
tn = len(test_benign_ngrams) - fp

# One result row: the parameters of this run followed by the confusion counts.
result_columns = ['N', 'thetaD', 'thetaM', 'eps', 'Rc', 'Rd', 'tp', 'tn', 'fp', 'fn']
test_result.append([N, thetaD, thetaM, eps, Rc, Rd, tp, tn, fp, fn])

test_result = pd.DataFrame(test_result, columns=result_columns)
# calculate_metrics is applied row-wise and fills the four metric columns.
test_result[['acc.', 'pre.', 'rec.', 'f1-score']] = test_result.apply(calculate_metrics, axis=1)

# Timestamped file name (local time) so successive runs do not overwrite each other.
datetime_str = datetime.now().strftime("%Y%m%d%H%M%S")
test_result.to_csv(f'./test_result_{datetime_str}.csv', index=False)

CPU times: total: 0 ns
Wall time: 25 ms
