In [7]:
# import module

import pickle
from core.MV2 import MV2
from core.JIG import JIG
from core.SG2 import SG2 
from core.AWL import AWL

In [8]:
# Original note (translated): convert the GIPS input strings into a list or a set.

def main(payload_path, signature_path, stopword_path, virtual_vector_path, big_group_path,
         window_size, K, M,  # MV2 parameters
         thetaJ,  # JIG parameter
         vector_size, eps, minpts, ngram, hh1_size, hh2_size, hh3_size, ratio  # SG2 / AWL parameters
        ):
    """Run the MV2 -> JIG -> SG2 / AWL pipeline and pickle each result.

    Steps performed, in order:
      1. Load the pickled payload collection from ``payload_path``.
      2. Call ``MV2`` on the payloads and pickle its result to
         ``virtual_vector_path``.
      3. Call ``JIG`` on the MV2 result and pickle the returned big-group
         indices to ``big_group_path``.
      4. Split the payloads by whether their position is in the big group.
      5. Call ``SG2`` on the big-group payloads and pickle the result to
         ``signature_path``.
      6. Call ``AWL`` on the remaining payloads and pickle the result to
         ``stopword_path``.

    Args:
        payload_path: Path of the input pickle holding the payloads.
        signature_path: Output path for the ``SG2`` result.
        stopword_path: Output path for the ``AWL`` result.
        virtual_vector_path: Output path for the ``MV2`` result.
        big_group_path: Output path for the ``JIG`` result.
        window_size: Forwarded to ``MV2`` and ``SG2``.
        K, M: Forwarded to ``MV2``.
        thetaJ: Forwarded to ``JIG``.
        vector_size, eps, minpts: Forwarded to ``SG2``.
        ngram, hh1_size, hh2_size, hh3_size, ratio: Forwarded to both
            ``SG2`` and ``AWL``.

    Returns:
        None. All results are written to the given paths.
    """
    # NOTE(review): pickle.load can execute arbitrary code while unpickling;
    # only point payload_path at files from a trusted source.
    with open(payload_path, 'rb') as f:
        payloads_data = pickle.load(f)

    print(len(payloads_data))
    # Materialise as a list so payloads can be addressed by position below.
    payloads = list(payloads_data)

    print(len(payloads))

    # Identify the big group.
    minhashed_virtual_vectors = MV2(payloads=payloads, window_size=window_size, K=K, M=M)
    with open(virtual_vector_path, 'wb') as f:
        pickle.dump(minhashed_virtual_vectors, f)

    big_group_indices = JIG(vectors=minhashed_virtual_vectors, thetaJ=thetaJ)
    with open(big_group_path, 'wb') as f:
        pickle.dump(big_group_indices, f)

    # Membership is tested once per payload; a set keeps each test O(1) even
    # if JIG returns a list. Assumes the indices are hashable (ints) — the
    # pickled object above is left exactly as JIG returned it.
    big_group_lookup = set(big_group_indices)

    big_group_payloads = []
    non_big_group_payloads = []

    for idx, payload in enumerate(payloads):
        if idx in big_group_lookup:
            big_group_payloads.append(payload)
        else:
            non_big_group_payloads.append(payload)

    # Generate signatures from the big-group payloads.
    cluster_signatures = SG2(payloads=big_group_payloads, window_size=window_size, vector_size=vector_size,
                             eps=eps, minpts=minpts, ngram=ngram, hh1_size=hh1_size, hh2_size=hh2_size, hh3_size=hh3_size, ratio=ratio)

    with open(signature_path, 'wb') as f:
        pickle.dump(cluster_signatures, f)

    # Build the stop-word list from the payloads outside the big group.
    stopwords = AWL(payloads=non_big_group_payloads, ngram=ngram, hh1_size=hh1_size, hh2_size=hh2_size, hh3_size=hh3_size, ratio=ratio)

    with open(stopword_path, 'wb') as f:
        pickle.dump(stopwords, f)

In [9]:
# Parameters

# Input payload pickle and output locations for each pipeline stage.
file_path = './datasets/pkl/pe_string.pkl'
virtual_vector_path = './res/PE_virtual_vector.pkl'
big_group_indices_path = './res/PE_big_group.pkl'
signature_path = './res/PE_signature.pkl'
stopword_path = './res/PE_stopword.pkl'

# Values passed to main() as its MV2 parameters.
window_size = 4
K = 64
M = 1 << 14  # 16384

# Value passed to main() as its JIG parameter.
thetaJ = 0.6

# Values passed to main() as its SG2 / AWL parameters.
vector_size = 512
eps = 0.4
minpts = 5
ngram = 4
hh1_size = hh2_size = hh3_size = 3000
ratio = 0.1

In [10]:
# Run the whole pipeline with the values from the parameter cell.
# Fix: hh1_size previously received hh2_size (copy-paste slip); it was harmless
# only while both were 3000.
main(payload_path=file_path, signature_path=signature_path, stopword_path=stopword_path,
     virtual_vector_path=virtual_vector_path, big_group_path=big_group_indices_path,
     window_size=window_size, vector_size=vector_size, K=K, M=M, thetaJ=thetaJ,
     eps=eps, minpts=minpts, ngram=ngram, hh1_size=hh1_size, hh2_size=hh2_size, hh3_size=hh3_size, ratio=ratio)
    

29909
29909
make minhashed vector


100%|██████████| 29909/29909 [06:20<00:00, 78.65it/s] 


checking big group


100%|██████████| 29909/29909 [03:00<00:00, 166.05it/s]


chunking


100%|██████████| 23402/23402 [01:05<00:00, 358.37it/s]


start DBSCAN
end DBSCAN
make signature


100%|██████████| 3/3 [00:32<00:00, 10.93s/it]


end signature
start whtie list
end white list


In [11]:
# Reload the pickled pipeline outputs. The paths come from the parameter cell
# so that the files written by main() and the files read here cannot drift
# apart; run that cell first.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files produced by a trusted run.
with open(signature_path, 'rb') as f:
    signature = pickle.load(f)

with open(stopword_path, 'rb') as f:
    stopword = pickle.load(f)

with open(big_group_indices_path, 'rb') as f:
    big_group = pickle.load(f)

In [12]:
# Number of entries in the big-group object reloaded from its pickle.
print(len(big_group))

23402


In [13]:
# Sum the second element of every signature entry per token (first element)
# across all clusters, echoing each cluster's entries as they are read.
signatures = {}

print('SIGNAURE')
for cluster_no, cluster_entries in enumerate(signature.values()):
    print(f'cluster: {cluster_no}, size: {len(cluster_entries)}')
    for entry in cluster_entries:
        token, amount = entry[0], entry[1]
        signatures[token] = signatures.get(token, 0) + amount
        print(entry, end=" ")
    print()

# List every stop-word entry, one per line.
print('\nSTOPWORD')
for word_entry in stopword:
    print(word_entry)

SIGNAURE
cluster: 0, size: 91
('Delete', 17858) ('ExitProcess', 13513) ('Exception', 13364) ('System', 12775) ('GetProcAddress', 12421) ('LoadLibraryA', 12023) ('user32', 11764) ('advapi32', 10161) ('RegCloseKey', 9995) ('CloseHandle', 9885) ('!This program cannot be run in DOS mode.', 8859) ('kernel32', 8343) ('WriteFile', 8152) ('VirtualFree', 7675) ('VirtualAlloc', 7296) ('gram must be run ', 7120) ('GetCurrentThread', 6939) ('GetLastError', 6782) ('oleaut32.dll', 6375) ('ReadFile', 6369) ('KERNEL32.DLL', 6353) ('USER32', 6315) ('GetFileSize', 6314) ('FindClose', 6247) ('CharNextA', 6175) ('SetFilePointer', 6154) ('EnterCriticalSection', 5961) ('GetFileType', 5673) ('TObject', 5574) ('CreateFileA', 5569) ('RtlUnwind', 5425) ('LocalFree', 5279) ('TlsSetValue', 5199) ('WideCharToMultiByte', 5123) ('Windows', 5113) ('.reloc', 4369) ('KERNEL32.dll', 4004) ('ADVAPI32.dll', 3858) ('.rdata', 3675) ('`.data', 3434) ('StringX', 3378) ('CreateDirectoryA', 3041) ('ole32.dll', 2666) ('Interface

In [14]:
# One "token:<TAB>total" line per aggregated signature, in insertion order.
for token, total in signatures.items():
    print(f'{token}:\t{total}')

Delete:	17858
ExitProcess:	13513
Exception:	13364
System:	12775
GetProcAddress:	12421
LoadLibraryA:	12023
user32:	11764
advapi32:	10161
RegCloseKey:	9995
CloseHandle:	9885
!This program cannot be run in DOS mode.:	8859
kernel32:	8343
WriteFile:	8152
VirtualFree:	7675
VirtualAlloc:	7296
gram must be run :	7120
GetCurrentThread:	6939
GetLastError:	6782
oleaut32.dll:	6375
ReadFile:	6369
KERNEL32.DLL:	6353
USER32:	6315
GetFileSize:	6314
FindClose:	6247
CharNextA:	6175
SetFilePointer:	6154
EnterCriticalSection:	5961
GetFileType:	5673
TObject:	5574
CreateFileA:	5569
RtlUnwind:	5425
LocalFree:	5279
TlsSetValue:	5199
WideCharToMultiByte:	5123
Windows:	5113
.reloc:	4369
KERNEL32.dll:	4004
ADVAPI32.dll:	3858
.rdata:	3675
`.data:	3434
StringX:	3378
CreateDirectoryA:	3041
ole32.dll:	2666
Interface:	2284
MSVBVM60.DLL:	1969
Boolean:	1897
`.rdat:	1709
SysFreeString:	1697
GetOEMCP:	1520
C:\Program Files\Microsoft Visual Studio\VB98\VB6.OLB:	1312
UpdateResourceA:	1298
September:	1295
version:	1290
[[[[

In [15]:
# Stop-word entries are read as (token, value) pairs; only tokens whose value
# exceeds this threshold are used as filters.
# NOTE(review): the value is presumably an occurrence count — confirm against AWL.
STOPWORD_THRESHOLD = 99

filters = [entry[0] for entry in stopword if entry[1] > STOPWORD_THRESHOLD]
# Set copy for O(1) membership tests inside the loop (assumes hashable tokens);
# `filters` itself stays a list.
filter_lookup = set(filters)
stopword_ = 0  # how many aggregated signatures were dropped by the filter

for sig, cnt in signatures.items():
    if sig in filter_lookup:
        stopword_ += 1
    else:
        print(f'{sig}:\t{cnt}')

ExitProcess:	13513
System:	12775
LoadLibraryA:	12023
user32:	11764
advapi32:	10161
kernel32:	8343
gram must be run :	7120
GetCurrentThread:	6939
GetLastError:	6782
oleaut32.dll:	6375
ReadFile:	6369
KERNEL32.DLL:	6353
USER32:	6315
GetFileSize:	6314
CharNextA:	6175
GetFileType:	5673
TObject:	5574
CreateFileA:	5569
RtlUnwind:	5425
LocalFree:	5279
TlsSetValue:	5199
WideCharToMultiByte:	5123
Windows:	5113
KERNEL32.dll:	4004
ADVAPI32.dll:	3858
StringX:	3378
CreateDirectoryA:	3041
ole32.dll:	2666
Interface:	2284
MSVBVM60.DLL:	1969
Boolean:	1897
GetOEMCP:	1520
C:\Program Files\Microsoft Visual Studio\VB98\VB6.OLB:	1312
UpdateResourceA:	1298
September:	1295
version:	1290
[[[[[[:	1243
~ExC[):	1193
{&8p^)j6:	1134
LookupPrivilegeValueA:	926
result:	687
MSVCRT.dll:	644
MM/dd/yy:	606
PathFileExistsA:	488
s`)L$4:	459
RtlMoveMemory:	456
t$t#t$l:	408
shlwapi:	390
! 6J[[:	361
lstrcmpiA:	358
__vbaStrCopy:	354
nKB\`lll:	334
<<<<<<<<<<<<<<<<<<<<<<<<<<u9l:	321
[fPFMlllll:	271
'HSplit:	259
VC20XC00:	247
Dvn+

In [16]:
# Number of aggregated signatures that matched the stop-word filter and were skipped.
print(stopword_)

17
