In [5]:
# import module

import pickle
from core.MV2 import MV2
from core.JIG import JIG
from core.SG2 import SG2 
from core.AWL import AWL

In [6]:
# GIPS의 입력 스트링에서 리스트 혹은 셋으로 변경

def main(payload_path, signature_path, stopword_path, virtual_vector_path, big_group_path,
         window_size, K, M, # MV2 파라미터
         thetaJ,  # JIG 파라미터
         vector_size, eps, minpts, ngram, hh1_size, hh2_size, hh3_size, ratio # SG2, AWL 파라미터
        ):
    
    with open(payload_path, 'rb') as f:
        payloads_data = pickle.load(f)

    print(len(payloads_data))
    payloads = [x for x in payloads_data]

    print(len(payloads))

    # 빅 그룹 식별
    minhashed_virtual_vectors = MV2(payloads=payloads, window_size=window_size, K=K, M=M)
    with open(virtual_vector_path, 'wb') as f:
        pickle.dump(minhashed_virtual_vectors, f)

    big_group_indices = JIG(vectors=minhashed_virtual_vectors, thetaJ=thetaJ)
    with open(big_group_path, 'wb') as f:
        pickle.dump(big_group_indices, f)
    
    big_group_payloads = []
    non_big_group_paylaods = []

    for idx, payload in enumerate(payloads):
        if idx in big_group_indices:
            big_group_payloads.append(payload)
        else:
            non_big_group_paylaods.append(payload)
    
    # 시그니처 생성
    cluster_signatures = SG2(payloads=big_group_payloads, window_size=window_size, vector_size=vector_size, 
                             eps=eps, minpts=minpts, ngram=ngram, hh1_size=hh1_size, hh2_size=hh2_size, hh3_size=hh3_size, ratio=ratio)
    
    with open(signature_path, 'wb') as f:
        pickle.dump(cluster_signatures, f)
    
    stopwords = AWL(payloads=non_big_group_paylaods, ngram=ngram, hh1_size=hh1_size, hh2_size=hh2_size, hh3_size=hh3_size, ratio=ratio)

    with open(stopword_path, 'wb') as f:
        pickle.dump(stopwords, f)

In [7]:
# 파라미터

file_path = './datasets/pkl/pe_string.pkl'
virtual_vector_path = f'./res/PE_virtual_vector.pkl'
big_group_indices_path = f'./res/PE_big_group.pkl'
signature_path = f'./res/PE_signature.pkl'
stopword_path = f'./res/PE_stopword.pkl'

K = 64
M = 2 ** 14
thetaJ = 0.6
window_size = 4
vector_size = 512
eps = 0.4
minpts = 5
ngram = 4
hh1_size = 3000
hh2_size = 3000
hh3_size = 3000
ratio = 0.1

In [8]:
main(payload_path=file_path, signature_path=signature_path, stopword_path=stopword_path,
         virtual_vector_path=virtual_vector_path, big_group_path=big_group_indices_path,
         window_size=window_size, vector_size=vector_size, K=K, M=M, thetaJ=thetaJ,
         eps=eps, minpts=minpts, ngram=ngram, hh1_size=hh2_size, hh2_size=hh2_size, hh3_size=hh3_size, ratio=ratio)
    

29909
29909
make minhashed vector


100%|██████████| 29909/29909 [06:19<00:00, 78.80it/s] 


checking big group


100%|██████████| 29909/29909 [02:48<00:00, 177.06it/s]


chunking


100%|██████████| 23402/23402 [01:02<00:00, 372.43it/s]


start DBSCAN
end DBSCAN
make signature


100%|██████████| 3/3 [00:29<00:00,  9.83s/it]


end signature
start whtie list
end white list


In [9]:
with open('./res/PE_signature.pkl', 'rb') as f:
    signature = pickle.load(f)
    
with open('./res/PE_stopword.pkl', 'rb') as f:
    stopword = pickle.load(f)
    
with open('./res/PE_big_group.pkl', 'rb') as f:
    big_group = pickle.load(f)

In [10]:
print(len(big_group))

23402


In [11]:
print('SIGNAURE')

for i in signature.values():
    for j in i:
        print(j, end=" ")
    print()
    
print('STOPWORD')
for i in stopword:
    print(i)

SIGNAURE
('Delete', 17858) ('ExitProcess', 13513) ('Exception', 13364) ('System', 12775) ('GetProcAddress', 12421) ('LoadLibraryA', 12023) ('user32', 11764) ('advapi32', 10161) ('RegCloseKey', 9995) ('CloseHandle', 9885) ('!This program cannot be run in DOS mode.', 8859) ('kernel32', 8343) ('WriteFile', 8152) ('VirtualFree', 7675) ('VirtualAlloc', 7296) ('gram must be run ', 7120) ('GetCurrentThread', 6939) ('GetLastError', 6782) ('oleaut32.dll', 6375) ('ReadFile', 6369) ('KERNEL32.DLL', 6353) ('USER32', 6315) ('GetFileSize', 6314) ('FindClose', 6247) ('CharNextA', 6175) ('SetFilePointer', 6154) ('EnterCriticalSection', 5961) ('GetFileType', 5673) ('TObject', 5574) ('CreateFileA', 5569) ('RtlUnwind', 5425) ('LocalFree', 5279) ('TlsSetValue', 5199) ('WideCharToMultiByte', 5123) ('Windows', 5113) ('.reloc', 4369) ('KERNEL32.dll', 4004) ('ADVAPI32.dll', 3858) ('.rdata', 3675) ('`.data', 3434) ('StringX', 3378) ('CreateDirectoryA', 3041) ('ole32.dll', 2666) ('Interface', 2284) ('MSVBVM60.D

In [12]:
filters = [i[0] for i in stopword if i[1] > 99]
for cluster, sig in signature.items():
    for token in sig:
        if token[1] not in filters:
            print(token)

('Delete', 17858)
('ExitProcess', 13513)
('Exception', 13364)
('System', 12775)
('GetProcAddress', 12421)
('LoadLibraryA', 12023)
('user32', 11764)
('advapi32', 10161)
('RegCloseKey', 9995)
('CloseHandle', 9885)
('!This program cannot be run in DOS mode.', 8859)
('kernel32', 8343)
('WriteFile', 8152)
('VirtualFree', 7675)
('VirtualAlloc', 7296)
('gram must be run ', 7120)
('GetCurrentThread', 6939)
('GetLastError', 6782)
('oleaut32.dll', 6375)
('ReadFile', 6369)
('KERNEL32.DLL', 6353)
('USER32', 6315)
('GetFileSize', 6314)
('FindClose', 6247)
('CharNextA', 6175)
('SetFilePointer', 6154)
('EnterCriticalSection', 5961)
('GetFileType', 5673)
('TObject', 5574)
('CreateFileA', 5569)
('RtlUnwind', 5425)
('LocalFree', 5279)
('TlsSetValue', 5199)
('WideCharToMultiByte', 5123)
('Windows', 5113)
('.reloc', 4369)
('KERNEL32.dll', 4004)
('ADVAPI32.dll', 3858)
('.rdata', 3675)
('`.data', 3434)
('StringX', 3378)
('CreateDirectoryA', 3041)
('ole32.dll', 2666)
('Interface', 2284)
('MSVBVM60.DLL', 1969