In [14]:
# import module

import pickle
import os
from tqdm import tqdm

from GIPS.core.feature_extractor import extract_string
from GIPS.core.MV2 import MV2
from GIPS.core.JIG import JIG
from GIPS.core.SG2 import SG2 
from GIPS.core.AWL import AWL

In [4]:
# preprocessing: read the pickled list of benign sample names (MD5 hex digests)

with open('./datasets/pkl/benign_md5.pkl', 'rb') as md5_file:
    benign_list = pickle.load(md5_file)

# Preview the first ten entries as the cell output.
benign_list[:10]

['74db80ed2730af4c13f906940ce3cbb0',
 'f921fa214cdd02fb064f977ea1389e42',
 'e1058f3fcb36755feb9b6c098e3954f0',
 'ddefdbb3e88d0ce0a623f92bdb69b020',
 'ad8b3fa73ea964a6c17cb0c0d6029c50',
 '9be6386206be19c741aeae34b1af8c90',
 '70f58da1960e7311101d92973008b1e0',
 '1abdf6bfbc01a93e775ba1c0b409e7e2',
 'cfa80ad59abfcaa79b7e9a76d1fc2625',
 'eb089d184db2d1da31e4e29503ae0bc0']

In [6]:
# Directory containing the benign PE files; each name in benign_list is
# joined onto this path before string extraction.

benign_path = "./datasets/ben/"

In [15]:
# Run extract_string on every benign sample (one result per entry of
# benign_list, in the same order), with a tqdm progress bar.
benign_strings = [
    extract_string(os.path.join(benign_path, sample_name))
    for sample_name in tqdm(benign_list)
]

# Preview: slicing a list with [:10] and then [:5] yields its first five items.
benign_strings[:5]

100%|██████████| 29909/29909 [09:02<00:00, 55.17it/s] 


[{'                          ',
  "        <requestedExecutionLevel level='asInvoker' uiAccess='false' />",
  '      </requestedPrivileges>',
  '      <requestedPrivileges>',
  '    </security>',
  '    <security>',
  '  </trustInfo>',
  '  <trustInfo xmlns="urn:schemas-microsoft-com:asm.v3">',
  ' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`ABCDEFGHIJKLMNOPQRSTUVWXYZ{|}~',
  ' !"#$%&\'()*+,-./0123456789:;<=>?@abcdefghijklmnopqrstuvwxyz[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~',
  ' A_A^A\\',
  ' A_A^A\\_^',
  ' A_A^A]A\\_',
  ' A_A^_',
  " Base Class Array'",
  ' Base Class Descriptor at (',
  " Class Hierarchy Descriptor'",
  " Complete Object Locator'",
  ' H3E H3E',
  " Type Descriptor'",
  ' delete',
  ' delete[]',
  ' new[]',
  ' t38S<u',
  '!,X< w',
  '!This program cannot be run in DOS mode.',
  '"e?<<<<<<l?',
  '%GoU?*',
  '&?PPPPPPP?',
  "'D8l$@",
  '(DigiCert SHA2 Assured ID Code Signing CA',
  '(DigiCert SHA2 Assured ID Code Signing CA0',
  '(DigiCert SHA2

In [17]:
# Persist the extraction results; this file is the payload input passed to main() later on.
with open("./datasets/pkl/pe_benign_string.pkl", "wb") as strings_file:
    pickle.dump(benign_strings, strings_file)

In [2]:
def main(payload_path, signature_path, stopword_path, virtual_vector_path, big_group_path,
         window_size, K, M,  # MV2 parameters
         thetaJ,  # JIG parameter
         vector_size, eps, minpts, ngram, hh1_size, hh2_size, hh3_size, ratio  # SG2 / AWL parameters
        ):
    """Run the MV2 -> JIG -> SG2 / AWL pipeline and pickle every intermediate result.

    Steps, in order:
      1. Load the pickled payload collection from ``payload_path``.
      2. Call ``MV2`` on all payloads and pickle its result to ``virtual_vector_path``.
      3. Call ``JIG`` on the MV2 result and pickle its result to ``big_group_path``.
      4. Split the payloads by whether their position is contained in the JIG result.
      5. Call ``SG2`` on the payloads inside the big group and pickle its
         result to ``signature_path``.
      6. Call ``AWL`` on the remaining payloads and pickle its result to
         ``stopword_path``.

    Args:
        payload_path: Pickle file holding an iterable, sized collection of payloads.
        signature_path: Output pickle path for the SG2 result.
        stopword_path: Output pickle path for the AWL result.
        virtual_vector_path: Output pickle path for the MV2 result.
        big_group_path: Output pickle path for the JIG result.
        window_size: Forwarded to MV2 and SG2.
        K, M: Forwarded to MV2.
        thetaJ: Forwarded to JIG.
        vector_size, eps, minpts: Forwarded to SG2.
        ngram, hh1_size, hh2_size, hh3_size, ratio: Forwarded to SG2 and AWL.

    Returns:
        None. All results are written to the given paths.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only point this at
    # files produced by a trusted run of this notebook.
    with open(payload_path, 'rb') as f:
        payloads_data = pickle.load(f)

    print(len(payloads_data))
    payloads = list(payloads_data)

    print(len(payloads))

    # Identify the big group.
    minhashed_virtual_vectors = MV2(payloads=payloads, window_size=window_size, K=K, M=M)
    with open(virtual_vector_path, 'wb') as f:
        pickle.dump(minhashed_virtual_vectors, f)

    big_group_indices = JIG(vectors=minhashed_virtual_vectors, thetaJ=thetaJ)
    with open(big_group_path, 'wb') as f:
        pickle.dump(big_group_indices, f)

    # Membership is tested once per payload; a set makes each test O(1) instead
    # of a scan when JIG hands back a list. Fall back to the original object if
    # its elements turn out not to be hashable.
    try:
        big_group_lookup = set(big_group_indices)
    except TypeError:
        big_group_lookup = big_group_indices

    big_group_payloads = []
    non_big_group_payloads = []

    for idx, payload in enumerate(payloads):
        if idx in big_group_lookup:
            big_group_payloads.append(payload)
        else:
            non_big_group_payloads.append(payload)

    # Generate signatures from the big-group payloads.
    cluster_signatures = SG2(payloads=big_group_payloads, window_size=window_size, vector_size=vector_size,
                             eps=eps, minpts=minpts, ngram=ngram, hh1_size=hh1_size, hh2_size=hh2_size,
                             hh3_size=hh3_size, ratio=ratio)

    with open(signature_path, 'wb') as f:
        pickle.dump(cluster_signatures, f)

    # Build the stopword list from everything outside the big group.
    stopwords = AWL(payloads=non_big_group_payloads, ngram=ngram, hh1_size=hh1_size, hh2_size=hh2_size,
                    hh3_size=hh3_size, ratio=ratio)

    with open(stopword_path, 'wb') as f:
        pickle.dump(stopwords, f)

In [18]:
# Parameters

# Input payloads and output locations handed to main().
file_path = './datasets/pkl/pe_benign_string.pkl'
virtual_vector_path = './res/PE_benign_virtual_vector.pkl'
big_group_indices_path = './res/PE_benign_big_group.pkl'
signature_path = './res/PE_benign_signature.pkl'
stopword_path = './res/PE_benign_stopword.pkl'

# MV2 parameters (window_size is also forwarded to SG2).
K = 64
M = 2 ** 14
# JIG parameter.
thetaJ = 0.6
window_size = 4
# SG2 / AWL parameters.
vector_size = 512
eps = 0.4
minpts = 5
ngram = 4
hh1_size = 3000
hh2_size = 3000
hh3_size = 3000
ratio = 0.1

In [19]:
# Run the full pipeline. Every keyword is bound to the same-named variable
# from the parameter cell; in particular hh1_size must receive hh1_size, not
# hh2_size, or changing hh1_size there would silently have no effect.
main(
    payload_path=file_path,
    signature_path=signature_path,
    stopword_path=stopword_path,
    virtual_vector_path=virtual_vector_path,
    big_group_path=big_group_indices_path,
    window_size=window_size,
    vector_size=vector_size,
    K=K,
    M=M,
    thetaJ=thetaJ,
    eps=eps,
    minpts=minpts,
    ngram=ngram,
    hh1_size=hh1_size,
    hh2_size=hh2_size,
    hh3_size=hh3_size,
    ratio=ratio,
)

29909
29909
make minhashed vector


100%|██████████| 29909/29909 [14:42<00:00, 33.91it/s]


checking big group


100%|██████████| 29909/29909 [03:34<00:00, 139.26it/s]


chunking


100%|██████████| 8167/8167 [00:40<00:00, 201.09it/s]


start DBSCAN
end DBSCAN
make signature


100%|██████████| 2/2 [00:23<00:00, 11.85s/it]


end signature
start whtie list
end white list


In [20]:
# Reload the three artefacts that main() pickled to disk.

# Result written to signature_path (the SG2 output).
with open(signature_path, 'rb') as signature_file:
    signature = pickle.load(signature_file)

# Result written to stopword_path (the AWL output).
with open(stopword_path, 'rb') as stopword_file:
    stopword = pickle.load(stopword_file)

# Result written to big_group_indices_path (the JIG output).
with open(big_group_indices_path, 'rb') as big_group_file:
    big_group = pickle.load(big_group_file)

In [21]:
# Accumulate, over every cluster, the second field of each entry keyed by its
# first field, while echoing the clusters (entries print as (key, number) pairs).
signatures = {}

print('SIGNAURE')
for cluster_no, cluster in enumerate(signature.values()):
    print(f'cluster: {cluster_no}, size: {len(cluster)}')
    for entry in cluster:
        key = entry[0]
        signatures[key] = signatures.get(key, 0) + entry[1]
        print(entry, end=" ")
    print()

SIGNAURE
cluster: 0, size: 28
('String', 15843) ('Thread', 14031) ('System', 11916) ('.rdata', 11717) ('.idata', 11167) ('Exception', 10937) ('GetCurrentProcess', 10378) ('Object', 8212) ('A_A^A]', 7181) ('Delete', 7141) ('      ', 7133) ('0123456789', 6026) ('Version', 5854) ('GetValue', 5598) ('!This program cannot be run in DOS mode.', 5285) ('ExitProcess', 5057) ('KERNEL32', 4878) ('.reloc', 4395) ('@.data', 3999) ('TerminateProcess', 3995) ('static ', 2846) ('atexit', 2700) ('Default', 2631) ('GetWindow', 2615) ('Unknown', 2524) ('@.rsrc', 2399) ('</trustInfo>', 1303) ('`.rsrc', 122) 
cluster: 1, size: 1
('F0@(BB', 13) 


In [32]:
# Collect every aggregated signature key into the stopword set, printing its
# accumulated count alongside.
PE_stopword = set()

for token, total in signatures.items():
    PE_stopword.add(token)
    print(f'{token}:\t{total}')

# Save the set for reuse outside this notebook session.
with open('./datasets/pkl/pe_stopword.pkl', 'wb') as stopword_out:
    pickle.dump(PE_stopword, stopword_out)

String:	15843
Thread:	14031
System:	11916
.rdata:	11717
.idata:	11167
Exception:	10937
GetCurrentProcess:	10378
Object:	8212
A_A^A]:	7181
Delete:	7141
      :	7133
0123456789:	6026
Version:	5854
GetValue:	5598
!This program cannot be run in DOS mode.:	5285
ExitProcess:	5057
KERNEL32:	4878
.reloc:	4395
@.data:	3999
TerminateProcess:	3995
static :	2846
atexit:	2700
Default:	2631
GetWindow:	2615
Unknown:	2524
@.rsrc:	2399
</trustInfo>:	1303
`.rsrc:	122
F0@(BB:	13


In [29]:
# Load a previously pickled signature result from ./res for comparison.
with open("./res/PE_signature.pkl", "rb") as pe_sig_file:
    pe_sig = pickle.load(pe_sig_file)

In [30]:
# Accumulate, over every cluster in pe_sig, the second field of each entry
# keyed by its first field, while echoing the clusters.
pe_signatures = {}

print('SIGNAURE')
for cluster_no, cluster in enumerate(pe_sig.values()):
    print(f'cluster: {cluster_no}, size: {len(cluster)}')
    for entry in cluster:
        key = entry[0]
        pe_signatures[key] = pe_signatures.get(key, 0) + entry[1]
        print(entry, end=" ")
    print()

SIGNAURE
cluster: 0, size: 91
('Delete', 17858) ('ExitProcess', 13513) ('Exception', 13364) ('System', 12775) ('GetProcAddress', 12421) ('LoadLibraryA', 12023) ('user32', 11764) ('advapi32', 10161) ('RegCloseKey', 9995) ('CloseHandle', 9885) ('!This program cannot be run in DOS mode.', 8859) ('kernel32', 8343) ('WriteFile', 8152) ('VirtualFree', 7675) ('VirtualAlloc', 7296) ('gram must be run ', 7120) ('GetCurrentThread', 6939) ('GetLastError', 6782) ('oleaut32.dll', 6375) ('ReadFile', 6369) ('KERNEL32.DLL', 6353) ('USER32', 6315) ('GetFileSize', 6314) ('FindClose', 6247) ('CharNextA', 6175) ('SetFilePointer', 6154) ('EnterCriticalSection', 5961) ('GetFileType', 5673) ('TObject', 5574) ('CreateFileA', 5569) ('RtlUnwind', 5425) ('LocalFree', 5279) ('TlsSetValue', 5199) ('WideCharToMultiByte', 5123) ('Windows', 5113) ('.reloc', 4369) ('KERNEL32.dll', 4004) ('ADVAPI32.dll', 3858) ('.rdata', 3675) ('`.data', 3434) ('StringX', 3378) ('CreateDirectoryA', 3041) ('ole32.dll', 2666) ('Interface

In [34]:
# Print only the PE signature keys that are absent from the stopword set,
# together with their accumulated counts.
for token, total in pe_signatures.items():
    if token in PE_stopword:
        continue
    print(token, total)

GetProcAddress 12421
LoadLibraryA 12023
user32 11764
advapi32 10161
RegCloseKey 9995
CloseHandle 9885
kernel32 8343
WriteFile 8152
VirtualFree 7675
VirtualAlloc 7296
gram must be run  7120
GetCurrentThread 6939
GetLastError 6782
oleaut32.dll 6375
ReadFile 6369
KERNEL32.DLL 6353
USER32 6315
GetFileSize 6314
FindClose 6247
CharNextA 6175
SetFilePointer 6154
EnterCriticalSection 5961
GetFileType 5673
TObject 5574
CreateFileA 5569
RtlUnwind 5425
LocalFree 5279
TlsSetValue 5199
WideCharToMultiByte 5123
Windows 5113
KERNEL32.dll 4004
ADVAPI32.dll 3858
`.data 3434
StringX 3378
CreateDirectoryA 3041
ole32.dll 2666
Interface 2284
MSVBVM60.DLL 1969
Boolean 1897
`.rdat 1709
SysFreeString 1697
GetOEMCP 1520
C:\Program Files\Microsoft Visual Studio\VB98\VB6.OLB 1312
UpdateResourceA 1298
September 1295
version 1290
[[[[[[ 1243
~ExC[) 1193
{&8p^)j6 1134
LookupPrivilegeValueA 926
result 687
MSVCRT.dll 644
MM/dd/yy 606
PathFileExistsA 488
s`)L$4 459
RtlMoveMemory 456
t$t#t$l 408
shlwapi 390
! 6J[[ 361
