In [1]:
# import module

import pickle
import os
from tqdm import tqdm

from feature_extractor import extract_string
from core.MV2 import MV2
from core.JIG import JIG
from core.SG2 import SG2 
from core.AWL import AWL

In [2]:
def _load_pickle(path):
	"""Read and return the object pickled at ``path``."""
	# NOTE(security): pickle.load can execute arbitrary code embedded in the
	# file, so this must only be pointed at trusted, locally produced files.
	with open(path, 'rb') as f:
		return pickle.load(f)


def _dump_pickle(obj, path):
	"""Pickle ``obj`` to ``path``, overwriting any existing file."""
	with open(path, 'wb') as f:
		pickle.dump(obj, f)


def main(signature_path, stopword_path, virtual_vector_path, big_group_path,
		 window_size, K, M,  # MV2 parameters
		 thetaJ,  # JIG parameter
		 vector_size, eps, minpts, ngram, hh1_size, hh2_size, hh3_size, ratio,  # SG2 / AWL parameters
		 benign_payload_path='./datasets/pkl/pe_benign_string.pkl',
		 malware_payload_path='./datasets/pkl/pe_string.pkl',
		 benign_vector_path='./res/PE_benign_virtual_vector.pkl',
		 malware_vector_path='./res/PE_virtual_vector.pkl'
		):
	"""Run big-group identification, signature generation and stopword extraction.

	Steps, in order:
	  1. Load the benign and malware payload lists and concatenate them
	     (benign first), then print the combined length.
	  2. Load the precomputed benign and malware virtual vectors, concatenate
	     them in the same order and save the result to ``virtual_vector_path``.
	  3. Call ``JIG`` on the combined vectors and save the returned big-group
	     indices to ``big_group_path``.
	  4. Split the payloads by whether their position is in the big group.
	  5. Call ``SG2`` on the big-group payloads and save the result to
	     ``signature_path``.
	  6. Call ``AWL`` on the remaining payloads and save the result to
	     ``stopword_path``.

	Args:
		signature_path: Output pickle path for the ``SG2`` result.
		stopword_path: Output pickle path for the ``AWL`` result.
		virtual_vector_path: Output pickle path for the combined virtual vectors.
		big_group_path: Output pickle path for the ``JIG`` result.
		window_size: Forwarded to ``SG2``.
		K, M: Accepted for interface compatibility; not used by this function
			(the virtual vectors are loaded from disk rather than computed here).
		thetaJ: Forwarded to ``JIG``.
		vector_size, eps, minpts: Forwarded to ``SG2``.
		ngram, hh1_size, hh2_size, hh3_size, ratio: Forwarded to both ``SG2``
			and ``AWL``.
		benign_payload_path, malware_payload_path: Input pickles holding the
			payload lists. Defaults match the previously hard-coded paths.
		benign_vector_path, malware_vector_path: Input pickles holding the
			precomputed virtual vectors. Defaults match the previously
			hard-coded paths.

	Returns:
		None. All results are written to the given output paths.
	"""
	payloads_benign = _load_pickle(benign_payload_path)
	payloads_malware = _load_pickle(malware_payload_path)

	# Benign samples come first; positions in this list are what the
	# big-group indices below are matched against.
	payloads = payloads_benign + payloads_malware

	print(len(payloads))

	# Big-group identification
	vir_vec_benign = _load_pickle(benign_vector_path)
	vir_vec_malware = _load_pickle(malware_vector_path)

	minhashed_virtual_vectors = vir_vec_benign + vir_vec_malware
	_dump_pickle(minhashed_virtual_vectors, virtual_vector_path)

	big_group_indices = JIG(vectors=minhashed_virtual_vectors, thetaJ=thetaJ)
	_dump_pickle(big_group_indices, big_group_path)

	# Membership is tested once per payload; a set keeps each test O(1)
	# instead of scanning the whole index collection every time.
	big_group_lookup = set(big_group_indices)

	big_group_payloads = []
	non_big_group_payloads = []

	for idx, payload in enumerate(payloads):
		if idx in big_group_lookup:
			big_group_payloads.append(payload)
		else:
			non_big_group_payloads.append(payload)

	# Signature generation
	cluster_signatures = SG2(payloads=big_group_payloads, window_size=window_size, vector_size=vector_size,
							 eps=eps, minpts=minpts, ngram=ngram, hh1_size=hh1_size, hh2_size=hh2_size, hh3_size=hh3_size, ratio=ratio)
	_dump_pickle(cluster_signatures, signature_path)

	# Stopword (white list) extraction from everything outside the big group
	stopwords = AWL(payloads=non_big_group_payloads, ngram=ngram, hh1_size=hh1_size, hh2_size=hh2_size, hh3_size=hh3_size, ratio=ratio)
	_dump_pickle(stopwords, stopword_path)

In [3]:
# Parameters

# Output locations for the artifacts written by main().
# (Plain string literals: the previous f-prefix had no placeholders to format.)
virtual_vector_path = './res/PE_multi_virtual_vector.pkl'
big_group_indices_path = './res/PE_multi_big_group.pkl'
signature_path = './res/PE_multi_signature.pkl'
stopword_path = './res/PE_multi_stopword.pkl'

# Algorithm settings forwarded to main(); its signature groups them as
# MV2 (window_size, K, M), JIG (thetaJ) and SG2/AWL (the rest).
K = 64
M = 2 ** 14
thetaJ = 0.6
window_size = 4
vector_size = 512
eps = 0.4
minpts = 5
ngram = 4
hh1_size = 3000
hh2_size = 3000
hh3_size = 3000
ratio = 0.1

In [4]:
# Run the full pipeline with the parameters configured above.
# hh1_size now receives hh1_size (it was previously passed hh2_size by mistake;
# both are currently 3000, so existing results are unaffected).
main(signature_path=signature_path, stopword_path=stopword_path,
         virtual_vector_path=virtual_vector_path, big_group_path=big_group_indices_path,
         window_size=window_size, vector_size=vector_size, K=K, M=M, thetaJ=thetaJ,
         eps=eps, minpts=minpts, ngram=ngram, hh1_size=hh1_size, hh2_size=hh2_size, hh3_size=hh3_size, ratio=ratio)

59818
checking big group


100%|██████████| 59818/59818 [06:15<00:00, 159.38it/s]


chunking


100%|██████████| 30737/30737 [01:46<00:00, 288.47it/s]


start DBSCAN
end DBSCAN
make signature


100%|██████████| 3/3 [00:52<00:00, 17.42s/it]


end signature
start whtie list
end white list


In [5]:
# Reload the three artifacts that main() wrote to disk.
# NOTE: pickle files must come from a trusted source.
with open(signature_path, 'rb') as signature_file:
    signature = pickle.load(signature_file)

with open(stopword_path, 'rb') as stopword_file:
    stopword = pickle.load(stopword_file)

with open(big_group_indices_path, 'rb') as big_group_file:
    big_group = pickle.load(big_group_file)

In [6]:
# Accumulates, per first-field key, the sum of the second field over every
# entry of every cluster's signature list.
signatures = {}

print('SIGNAURE')
for cluster_no, cluster_entries in enumerate(signature.values()):
    print(f'cluster: {cluster_no}, size: {len(cluster_entries)}')
    for entry in cluster_entries:
        key = entry[0]
        # Missing keys start from 0 before the entry's value is added.
        signatures[key] = signatures.get(key, 0) + entry[1]
        print(entry, end=" ")
    print()

# Dump the stopword entries one per line.
print('\nSTOPWORD')
for stop_entry in stopword:
    print(stop_entry)

SIGNAURE
cluster: 0, size: 87
('Exception', 29228) ('System', 25455) ('GetProc', 23088) ('GetCurrentProcess', 19579) ('ExitProcess', 19445) ('.rdata', 16547) ('KERNEL32', 15089) ('CloseHandle', 14907) ('.idata', 14256) ('!This program cannot be run in DOS mode.', 13491) ('RegCloseKey', 12921) ('LoadLibraryA', 12705) ('WriteFile', 12613) ('GetCurrentThread', 12248) ('user32', 11718) ('InitializeCriticalSection', 11667) ('GetLastError', 11269) ('advapi32', 10222) ('ReadFile', 9881) ('SetFilePointer', 9781) ('GetWindow', 9528) ('FindClose', 9452) ('VirtualFree', 9371) ('USER32', 9228) ('kernel32', 9140) ('.reloc', 8791) ('VirtualAlloc', 8789) ('RtlUnwind', 8246) ('TlsSetValue', 8220) ('LocalFree', 7940) ('GetFileType', 7897) ('`.rdat', 7897) ('GetFileSize', 7668) ('ADVAPI32.dll', 7352) ('Windows', 7231) ('gram must be run ', 7150) ('oleaut32.dll', 6388) ('CreateFileA', 6220) ('CharNextA', 6075) ('TObject', 5552) ('ole32.dll', 5186) ('GetOEMCP', 4018) ('GlobalAlloc', 3783) ('September', 36

In [8]:
# Number of distinct keys collected in `signatures`, followed by one
# "key:<TAB>total" line per key in insertion order.
signature_total = len(signatures)
print(signature_total)

for token in signatures:
	total = signatures[token]
	print(f'{token}:\t{total}')

92
Exception:	29228
System:	25455
GetProc:	23088
GetCurrentProcess:	19579
ExitProcess:	19445
.rdata:	16547
KERNEL32:	15089
CloseHandle:	14907
.idata:	14256
!This program cannot be run in DOS mode.:	13491
RegCloseKey:	12921
LoadLibraryA:	12705
WriteFile:	12613
GetCurrentThread:	12248
user32:	11718
InitializeCriticalSection:	11667
GetLastError:	11269
advapi32:	10222
ReadFile:	9881
SetFilePointer:	9781
GetWindow:	9528
FindClose:	9452
VirtualFree:	9371
USER32:	9228
kernel32:	9140
.reloc:	8791
VirtualAlloc:	8789
RtlUnwind:	8246
TlsSetValue:	8220
LocalFree:	7940
GetFileType:	7897
`.rdat:	7897
GetFileSize:	7668
ADVAPI32.dll:	7352
Windows:	7231
gram must be run :	7150
oleaut32.dll:	6388
CreateFileA:	6220
CharNextA:	6075
TObject:	5552
ole32.dll:	5186
GetOEMCP:	4018
GlobalAlloc:	3783
September:	3600
`.data:	3525
RegOpenKeyA:	3338
CreateDirectoryA:	3295
StringX:	3076
version:	2883
Interface:	2360
MM/dd/yy:	1895
MSVBVM60.DLL:	1870
Boolean:	1677
C:\Program Files\Microsoft Visual Studio\VB98\VB6.OLB

In [9]:
# Keys of the stopword entries whose second field is greater than 1.
filters = [i[0] for i in stopword if i[1] > 1]
# Set copy used only for membership tests, so each lookup is O(1) instead of
# a scan over the whole `filters` list.
filter_lookup = set(filters)
stopword_ = 0  # how many aggregated signature keys are also in `filters`

# Print every aggregated signature that is NOT filtered out; count the rest.
for sig, cnt in signatures.items():
	if sig in filter_lookup:
		stopword_ += 1
	else:
		print(f'{sig}:\t{cnt}')

GetProc:	23088
GetCurrentProcess:	19579
ExitProcess:	19445
.idata:	14256
RegCloseKey:	12921
GetCurrentThread:	12248
user32:	11718
InitializeCriticalSection:	11667
advapi32:	10222
ReadFile:	9881
GetWindow:	9528
USER32:	9228
kernel32:	9140
VirtualAlloc:	8789
RtlUnwind:	8246
TlsSetValue:	8220
LocalFree:	7940
GetFileType:	7897
GetFileSize:	7668
ADVAPI32.dll:	7352
Windows:	7231
gram must be run :	7150
oleaut32.dll:	6388
CreateFileA:	6220
CharNextA:	6075
TObject:	5552
ole32.dll:	5186
GetOEMCP:	4018
GlobalAlloc:	3783
September:	3600
RegOpenKeyA:	3338
CreateDirectoryA:	3295
StringX:	3076
Interface:	2360
MM/dd/yy:	1895
MSVBVM60.DLL:	1870
Boolean:	1677
C:\Program Files\Microsoft Visual Studio\VB98\VB6.OLB:	1270
[[[[[[:	1253
result:	1217
DefWindowProcW:	864
lstrcmpiA:	608
MSVCRT.dll:	576
PathFileExistsA:	499
RtlMoveMemory:	483
shlwapi:	433
s`)L$4:	430
! 6J[[:	399
__vbaStrCopy:	354
<<<<<<<<<<<<<<<<<<<<<<<<<<u9l:	326
A><<<<<<<<<<<<<<<<<<<<<<<<<<:	303
[fPFMlllll:	281
'HSplit:	259
Dvn+|Ax:	249
8x1Mjc

In [10]:
# Number of aggregated signature keys that were found in `filters`
# (and therefore not printed by the filtering loop).
print(stopword_)

20


In [12]:
# Reload the big-group indices via the configured path rather than a second
# hard-coded copy of it, so the two cannot drift apart.
# NOTE: pickle files must come from a trusted source.
with open(big_group_indices_path, 'rb') as f:
    big_group = pickle.load(f)

In [17]:
# Collect the big-group indices that fall below 29909 and report how many
# there are, plus a 20-element sample of them.
# NOTE(review): main() concatenates payloads benign-first (benign + malware),
# so if 29909 is the benign sample count these indices would be BENIGN
# samples, not malware as the variable name suggests -- confirm the intent.
malware_big_group = [index for index in big_group if index < 29909]
cnt = len(malware_big_group)

print(cnt)
print(malware_big_group[2000: 2020])

8167
[11191, 11199, 11205, 11206, 11209, 11215, 11222, 11223, 11244, 11246, 11251, 11254, 11256, 11258, 11265, 11267, 11271, 11272, 11274, 11284]
